From f5b5338f458945ef4fde494dc7bce598b17b4670 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 31 Mar 2026 13:24:56 +0200 Subject: [PATCH 01/32] feat(datafusion): add IcebergPartitionedTableProvider and IcebergPartitionedScan for parallel file scanning --- crates/integrations/datafusion/src/lib.rs | 2 + .../datafusion/src/physical_plan/mod.rs | 2 + .../src/physical_plan/partitioned_scan.rs | 174 ++++++++++ .../integrations/datafusion/src/table/mod.rs | 7 +- .../datafusion/src/table/partitioned.rs | 315 ++++++++++++++++++ 5 files changed, 499 insertions(+), 1 deletion(-) create mode 100644 crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs create mode 100644 crates/integrations/datafusion/src/table/partitioned.rs diff --git a/crates/integrations/datafusion/src/lib.rs b/crates/integrations/datafusion/src/lib.rs index 4b0ea8606d..9a84832d88 100644 --- a/crates/integrations/datafusion/src/lib.rs +++ b/crates/integrations/datafusion/src/lib.rs @@ -24,6 +24,8 @@ pub use error::*; pub mod physical_plan; mod schema; pub mod table; +pub use physical_plan::IcebergPartitionedScan; +pub use table::partitioned::IcebergPartitionedTableProvider; pub use table::table_provider_factory::IcebergTableProviderFactory; pub use table::*; diff --git a/crates/integrations/datafusion/src/physical_plan/mod.rs b/crates/integrations/datafusion/src/physical_plan/mod.rs index aeac30de32..a257fe9e20 100644 --- a/crates/integrations/datafusion/src/physical_plan/mod.rs +++ b/crates/integrations/datafusion/src/physical_plan/mod.rs @@ -18,6 +18,7 @@ pub(crate) mod commit; pub(crate) mod expr_to_predicate; pub(crate) mod metadata_scan; +pub(crate) mod partitioned_scan; pub(crate) mod project; pub(crate) mod repartition; pub(crate) mod scan; @@ -27,5 +28,6 @@ pub(crate) mod write; pub(crate) const DATA_FILES_COL_NAME: &str = "data_files"; pub use expr_to_predicate::convert_filters_to_predicate; +pub use partitioned_scan::IcebergPartitionedScan; pub use 
project::project_with_partition; pub use scan::IcebergTableScan; diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs new file mode 100644 index 0000000000..69d12b872b --- /dev/null +++ b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs @@ -0,0 +1,174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use datafusion::error::Result as DFResult; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties}; +use futures::TryStreamExt; +use iceberg::arrow::ArrowReaderBuilder; +use iceberg::io::FileIO; +use iceberg::scan::FileScanTask; + +use crate::to_datafusion_error; + +/// A DataFusion [`ExecutionPlan`] that reads one [`FileScanTask`] per partition. 
+/// +/// Display information (projection, predicate) is derived at runtime from the output schema and +/// the tasks rather than stored as dedicated struct fields. This keeps the node self-contained: +/// all state is already serializable via `FileScanTask`, which simplifies the DataFusion +/// distributed codec, adding dedicated fields would require encoding them separately in the +/// protobuf round-trip. +#[derive(Debug, Clone)] +pub struct IcebergPartitionedScan { + tasks: Vec, + file_io: FileIO, + plan_properties: Arc, +} + +impl IcebergPartitionedScan { + pub fn new(tasks: Vec, file_io: FileIO, schema: ArrowSchemaRef) -> Self { + let n_partitions = tasks.len(); + let plan_properties = Self::compute_properties(schema, n_partitions); + Self { + tasks, + file_io, + plan_properties, + } + } + + pub fn tasks(&self) -> &[FileScanTask] { + &self.tasks + } + + pub fn file_io(&self) -> &FileIO { + &self.file_io + } + + fn compute_properties(schema: ArrowSchemaRef, n_partitions: usize) -> Arc { + Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(n_partitions), + EmissionType::Incremental, + Boundedness::Bounded, + )) + } +} + +impl ExecutionPlan for IcebergPartitionedScan { + fn name(&self) -> &str { + "IcebergPartitionedScan" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> DFResult> { + Ok(self) + } + + fn properties(&self) -> &Arc { + &self.plan_properties + } + + fn execute( + &self, + partition: usize, + _context: Arc, + ) -> DFResult { + let task = self.tasks.get(partition).cloned().ok_or_else(|| { + datafusion::error::DataFusionError::Internal(format!( + "{}: partition index {partition} is out of bounds \ + (total tasks: {})", + self.name(), + self.tasks.len() + )) + })?; + + let file_io = self.file_io.clone(); + + let fut = async move { + let task_stream = 
futures::stream::once(futures::future::ready(Ok(task))); + let record_batch_stream = ArrowReaderBuilder::new(file_io) + .build() + .read(Box::pin(task_stream)) + .map_err(to_datafusion_error)? + .map_err(to_datafusion_error); + Ok::<_, datafusion::error::DataFusionError>(record_batch_stream) + }; + + let stream = futures::stream::once(fut).try_flatten(); + + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema(), + stream, + ))) + } +} + +impl DisplayAs for IcebergPartitionedScan { + fn fmt_as( + &self, + _t: datafusion::physical_plan::DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + let projection = self + .schema() + .fields() + .iter() + .map(|f| f.name().as_str()) + .collect::>() + .join(","); + // All tasks share the same predicate (they come from a single scan plan build), + // so reading it from the first task is sufficient. + let predicate = self + .tasks + .first() + .and_then(|t| t.predicate()) + .map_or(String::new(), |p| format!("{p}")); + let file_count = self.tasks.len(); + write!( + f, + "{} projection:[{projection}] predicate:[{predicate}] file_count:[{file_count}]", + self.name() + )?; + if self.tasks.len() <= 5 { + let files = self + .tasks + .iter() + .map(|t| t.data_file_path()) + .collect::>() + .join(", "); + write!(f, " files:[{files}]")?; + } + Ok(()) + } +} diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 75b7988d8d..5ae41b86c1 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -17,15 +17,20 @@ //! Iceberg table providers for DataFusion. //! -//! This module provides two table provider implementations: +//! This module provides three table provider implementations: //! //! - [`IcebergTableProvider`]: Catalog-backed provider with automatic metadata refresh. //! Use for write operations and when you need to see the latest table state. //! //! 
- [`IcebergStaticTableProvider`]: Static provider for read-only access to a specific //! table snapshot. Use for consistent analytical queries or time-travel scenarios. +//! +//! - [`IcebergPartitionedTableProvider`]: Catalog-backed provider that assigns one +//! DataFusion partition per data file, enabling parallel file-level scanning. +//! Read-only; use [`IcebergTableProvider`] for write operations. pub mod metadata_table; +pub mod partitioned; pub mod table_provider_factory; use std::any::Any; diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs new file mode 100644 index 0000000000..2e9aa22628 --- /dev/null +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -0,0 +1,315 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use std::any::Any; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use datafusion::catalog::Session; +use datafusion::common::DataFusionError; +use datafusion::datasource::{TableProvider, TableType}; +use datafusion::error::Result as DFResult; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; +use datafusion::physical_plan::ExecutionPlan; +use futures::TryStreamExt; +use iceberg::arrow::schema_to_arrow_schema; +use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; + +use crate::error::to_datafusion_error; +use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; +use crate::physical_plan::partitioned_scan::IcebergPartitionedScan; + +/// Catalog-backed table provider that scans each data file in a separate DataFusion partition. +/// +/// This provider reloads table metadata from the catalog on every [`scan`][Self::scan] call +/// to guarantee freshness, then issues one DataFusion partition per data file so that +/// DataFusion's scheduler can execute file reads in parallel. +/// +/// Write operations are not supported. Use [`IcebergTableProvider`] for write access. +/// +/// For consistent read-only access to a fixed snapshot without per-scan catalog overhead, +/// use [`IcebergStaticTableProvider`] instead. +#[derive(Debug, Clone)] +pub struct IcebergPartitionedTableProvider { + catalog: Arc, + table_ident: TableIdent, + schema: ArrowSchemaRef, +} + +impl IcebergPartitionedTableProvider { + pub async fn try_new( + catalog: Arc, + namespace: NamespaceIdent, + name: impl Into, + ) -> Result { + let table_ident = TableIdent::new(namespace, name.into()); + // First load: used only to snapshot the Arrow schema for DataFusion planning. + // A second load_table is issued at scan time to guarantee the freshest snapshot. 
+ let table = catalog.load_table(&table_ident).await?; + let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); + Ok(Self { + catalog, + table_ident, + schema, + }) + } +} + +#[async_trait] +impl TableProvider for IcebergPartitionedTableProvider { + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> ArrowSchemaRef { + self.schema.clone() + } + + fn table_type(&self) -> TableType { + TableType::Base + } + + async fn scan( + &self, + _state: &dyn Session, + projection: Option<&Vec>, + filters: &[Expr], + _limit: Option, + ) -> DFResult> { + // Per-partition row limits are not yet implemented for IcebergPartitionedScan. + // DataFusion will apply a GlobalLimitExec on top of this node when needed. + + // Second load: fetch the latest snapshot so scans always reflect current table state. + let table = self + .catalog + .load_table(&self.table_ident) + .await + .map_err(to_datafusion_error)?; + + // Projection indices are resolved against self.schema (captured at try_new time), + // same as IcebergTableProvider / IcebergTableScan. + let col_names = projection.map(|indices| { + indices + .iter() + .map(|&i| self.schema.field(i).name().clone()) + .collect::>() + }); + + let predicate = convert_filters_to_predicate(filters); + + let mut builder = table.scan(); + builder = match col_names { + Some(names) => builder.select(names), + None => builder.select_all(), + }; + if let Some(pred) = predicate { + builder = builder.with_filter(pred); + } + + let tasks = builder + .build() + .map_err(to_datafusion_error)? + .plan_files() + .await + .map_err(to_datafusion_error)? 
+ .try_collect::>() + .await + .map_err(to_datafusion_error)?; + + let output_schema = match projection { + None => self.schema.clone(), + Some(indices) => Arc::new(self.schema.project(indices).map_err(|e| { + DataFusionError::Internal(format!("schema projection failed: {e}")) + })?), + }; + + Ok(Arc::new(IcebergPartitionedScan::new( + tasks, + table.file_io().clone(), + output_schema, + ))) + } + + fn supports_filters_pushdown( + &self, + filters: &[&Expr], + ) -> DFResult> { + Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()]) + } + + async fn insert_into( + &self, + _state: &dyn Session, + _input: Arc, + _insert_op: datafusion::logical_expr::dml::InsertOp, + ) -> DFResult> { + Err(to_datafusion_error(Error::new( + ErrorKind::FeatureUnsupported, + "IcebergPartitionedTableProvider does not support writes; \ + use IcebergTableProvider instead", + ))) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use datafusion::prelude::SessionContext; + use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; + use iceberg::spec::{ + DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, Type, + }; + use iceberg::transaction::{ApplyTransactionAction, Transaction}; + use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; + use tempfile::TempDir; + + use super::*; + + async fn make_catalog_and_table() -> (Arc, NamespaceIdent, String, TempDir) { + let temp_dir = TempDir::new().unwrap(); + let warehouse = temp_dir.path().to_str().unwrap().to_string(); + + let catalog = Arc::new( + MemoryCatalogBuilder::default() + .load( + "memory", + HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse.clone())]), + ) + .await + .unwrap(), + ); + + let namespace = NamespaceIdent::new("ns".to_string()); + catalog + .create_namespace(&namespace, HashMap::new()) + .await + .unwrap(); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + 
NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + catalog + .create_table( + &namespace, + TableCreation::builder() + .name("t".to_string()) + .location(format!("{warehouse}/t")) + .schema(schema) + .properties(HashMap::new()) + .build(), + ) + .await + .unwrap(); + + (catalog, namespace, "t".to_string(), temp_dir) + } + + /// Registers `n` synthetic data files in the table metadata via the iceberg + /// transaction API. No actual parquet files are written, only the metadata + /// entries that `plan_files()` reads are created. + async fn append_fake_data_files( + catalog: &Arc, + namespace: &NamespaceIdent, + table_name: &str, + n: usize, + ) { + let table = catalog + .load_table(&TableIdent::new(namespace.clone(), table_name.to_string())) + .await + .unwrap(); + + let data_files = (0..n) + .map(|i| { + DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "{}/data/fake_{i}.parquet", + table.metadata().location() + )) + .file_format(DataFileFormat::Parquet) + .file_size_in_bytes(128) + .record_count(1) + .partition_spec_id(table.metadata().default_partition_spec_id()) + .build() + .unwrap() + }) + .collect::>(); + + let tx = Transaction::new(&table); + let action = tx.fast_append().add_data_files(data_files); + action + .apply(tx) + .unwrap() + .commit(catalog.as_ref()) + .await + .unwrap(); + } + + /// An empty table must produce a zero-partition scan so DataFusion never calls + /// execute(0), which would otherwise return an out-of-bounds error. 
+ #[tokio::test] + async fn test_empty_table_zero_partitions() { + let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; + // no files appended + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&SessionContext::new().state(), None, &[], None) + .await + .unwrap(); + let scan = plan + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(scan.tasks().len(), 0); + assert_eq!(scan.properties().partitioning.partition_count(), 0); + } + + /// Each data file in the table must become exactly one DataFusion partition + /// in IcebergPartitionedScan, enabling parallel file reads. + #[tokio::test] + async fn test_one_partition_per_file() { + let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; + append_fake_data_files(&catalog, &namespace, &table_name, 3).await; + + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&SessionContext::new().state(), None, &[], None) + .await + .unwrap(); + let scan = plan + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(scan.tasks().len(), 3); + assert_eq!(scan.properties().partitioning.partition_count(), 3); + } +} From 5076f9e1512e3e66a7f1453b2c467a8b1259aa16 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 31 Mar 2026 16:40:57 +0200 Subject: [PATCH 02/32] docs(datafusion): update comment in IcebergPartitionedScan --- .../datafusion/src/physical_plan/partitioned_scan.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs index 69d12b872b..5083f068b2 100644 --- a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs @@ -35,10 +35,7 @@ 
use crate::to_datafusion_error; /// A DataFusion [`ExecutionPlan`] that reads one [`FileScanTask`] per partition. /// /// Display information (projection, predicate) is derived at runtime from the output schema and -/// the tasks rather than stored as dedicated struct fields. This keeps the node self-contained: -/// all state is already serializable via `FileScanTask`, which simplifies the DataFusion -/// distributed codec, adding dedicated fields would require encoding them separately in the -/// protobuf round-trip. +/// the tasks rather than stored as dedicated struct fields. #[derive(Debug, Clone)] pub struct IcebergPartitionedScan { tasks: Vec, From 4c4b962f16ff1db9cfdfb680f10402434754aaf1 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 20 Apr 2026 14:45:58 +0200 Subject: [PATCH 03/32] Update crates/integrations/datafusion/src/table/mod.rs Co-authored-by: Tim Saucer --- crates/integrations/datafusion/src/table/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 5ae41b86c1..e2c9ca6efa 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -17,7 +17,7 @@ //! Iceberg table providers for DataFusion. //! -//! This module provides three table provider implementations: +//! This module provides various table provider implementations: //! //! - [`IcebergTableProvider`]: Catalog-backed provider with automatic metadata refresh. //! Use for write operations and when you need to see the latest table state. 
From 6a8e1e323a500dca72d34e9a6a967e2ca5b8151e Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 20 Apr 2026 15:25:59 +0200 Subject: [PATCH 04/32] fix(datafusion): reject non-empty children in IcebergPartitionedScan::with_new_children --- .../datafusion/src/physical_plan/partitioned_scan.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs index 5083f068b2..e0e2fd272c 100644 --- a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs @@ -87,8 +87,15 @@ impl ExecutionPlan for IcebergPartitionedScan { fn with_new_children( self: Arc, - _children: Vec>, + children: Vec>, ) -> DFResult> { + if !children.is_empty() { + return Err(datafusion::error::DataFusionError::Internal(format!( + "{} is a leaf node and expects no children, but {} were provided", + self.name(), + children.len() + ))); + } Ok(self) } From bf9d689275659ac919bca622d9a048e6caa7b358 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 21 Apr 2026 11:37:28 +0200 Subject: [PATCH 05/32] fix(datafusion): use ArrowReaderBuilder existing configuration path --- crates/iceberg/src/scan/mod.rs | 28 +++- .../src/physical_plan/partitioned_scan.rs | 140 +++++++++++++----- .../datafusion/src/physical_plan/scan.rs | 2 +- .../datafusion/src/table/partitioned.rs | 19 +-- 4 files changed, 133 insertions(+), 56 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 368e8143e2..2d8edfce29 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -462,6 +462,29 @@ impl TableScan { /// Returns an [`ArrowRecordBatchStream`]. pub async fn to_arrow(&self) -> Result { + self.to_arrow_with_tasks(self.plan_files().await?) 
+ } + + /// Consumes an externally-planned [`FileScanTask`] stream and returns an + /// [`ArrowRecordBatchStream`] using this scan's [`ArrowReaderBuilder`] + /// configuration (row-group filtering, row selection, data-file + /// concurrency limit, batch size). + /// + /// Equivalent to [`TableScan::to_arrow`] — which delegates to this method + /// after awaiting [`TableScan::plan_files`] — but lets the caller supply + /// a pre-computed task stream. This decouples planning from reading, so + /// external executors (e.g. a DataFusion partitioned scan) can plan once, + /// distribute tasks across workers, and replay them here without + /// re-running `plan_files()`. + /// + /// # Correctness + /// + /// The tasks passed in must have been produced by a [`TableScan`] whose + /// projection and filter match `self`: filters are already baked into + /// each [`FileScanTask::predicate`] at planning time and are not + /// re-applied here. Using tasks from a scan with a different projection + /// or filter yields undefined behavior. + pub fn to_arrow_with_tasks(&self, tasks: FileScanTaskStream) -> Result { let mut arrow_reader_builder = ArrowReaderBuilder::new(self.file_io.clone(), self.runtime.clone()) .with_data_file_concurrency_limit(self.concurrency_limit_data_files) @@ -472,10 +495,7 @@ impl TableScan { arrow_reader_builder = arrow_reader_builder.with_batch_size(batch_size); } - arrow_reader_builder - .build() - .read(self.plan_files().await?) - .map(|result| result.stream()) + Ok(arrow_reader_builder.build().read(tasks)?.stream()) } /// Returns a reference to the column names of the table scan. 
diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs index e0e2fd272c..bf6a1d3465 100644 --- a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs @@ -19,47 +19,99 @@ use std::any::Any; use std::sync::Arc; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; -use datafusion::error::Result as DFResult; +use datafusion::error::{DataFusionError, Result as DFResult}; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_expr::EquivalenceProperties; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties}; +use datafusion::prelude::Expr; use futures::TryStreamExt; -use iceberg::arrow::ArrowReaderBuilder; -use iceberg::io::FileIO; +use iceberg::expr::Predicate; use iceberg::scan::FileScanTask; +use iceberg::table::Table; +use super::expr_to_predicate::convert_filters_to_predicate; +use super::scan::get_column_names; use crate::to_datafusion_error; /// A DataFusion [`ExecutionPlan`] that reads one [`FileScanTask`] per partition. /// -/// Display information (projection, predicate) is derived at runtime from the output schema and -/// the tasks rather than stored as dedicated struct fields. -#[derive(Debug, Clone)] +/// Arrow reader configuration (row-group filtering, row selection, concurrency +/// limit, batch size) matches [`IcebergTableScan`][super::scan::IcebergTableScan]: +/// it is sourced from the underlying [`TableScan`][iceberg::scan::TableScan] +/// rebuilt in [`execute`](ExecutionPlan::execute) and applied via +/// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. 
+/// +/// Note: the `TableScan` is rebuilt on every `execute(partition)` call rather +/// than cached as an `Arc` on the struct. Caching would avoid +/// redundant schema resolution and predicate binding per partition, but +/// `TableScan` carries a `PlanContext` with `Arc`-shared evaluator caches +/// which is awkward to serialize if this plan ever needs to be shipped across +/// workers. The per-build cost is bounded (no I/O), so the rebuild is kept +/// for now; revisit once the cross-worker story is clearer. +#[derive(Debug)] pub struct IcebergPartitionedScan { - tasks: Vec, - file_io: FileIO, + /// A table in the catalog. + table: Table, + /// Snapshot of the table to scan. + snapshot_id: Option, + /// Stores certain, often expensive to compute, + /// plan properties used in query optimization. plan_properties: Arc, + /// Projection column names, None means all columns. + projection: Option>, + /// Filters to apply to the table scan. + predicates: Option, + /// Pre-planned file scan tasks, one per DataFusion partition. 
+ tasks: Vec, } impl IcebergPartitionedScan { - pub fn new(tasks: Vec, file_io: FileIO, schema: ArrowSchemaRef) -> Self { - let n_partitions = tasks.len(); - let plan_properties = Self::compute_properties(schema, n_partitions); + pub(crate) fn new( + table: Table, + snapshot_id: Option, + schema: ArrowSchemaRef, + projection: Option<&Vec>, + filters: &[Expr], + tasks: Vec, + ) -> Self { + let output_schema = match projection { + None => schema.clone(), + Some(projection) => Arc::new(schema.project(projection).unwrap()), + }; + let plan_properties = Self::compute_properties(output_schema, tasks.len()); + let projection = get_column_names(schema, projection); + let predicates = convert_filters_to_predicate(filters); + Self { - tasks, - file_io, + table, + snapshot_id, plan_properties, + projection, + predicates, + tasks, } } - pub fn tasks(&self) -> &[FileScanTask] { - &self.tasks + pub fn table(&self) -> &Table { + &self.table } - pub fn file_io(&self) -> &FileIO { - &self.file_io + pub fn snapshot_id(&self) -> Option { + self.snapshot_id + } + + pub fn projection(&self) -> Option<&[String]> { + self.projection.as_deref() + } + + pub fn predicates(&self) -> Option<&Predicate> { + self.predicates.as_ref() + } + + pub fn tasks(&self) -> &[FileScanTask] { + &self.tasks } fn compute_properties(schema: ArrowSchemaRef, n_partitions: usize) -> Arc { @@ -90,7 +142,7 @@ impl ExecutionPlan for IcebergPartitionedScan { children: Vec>, ) -> DFResult> { if !children.is_empty() { - return Err(datafusion::error::DataFusionError::Internal(format!( + return Err(DataFusionError::Internal(format!( "{} is a leaf node and expects no children, but {} were provided", self.name(), children.len() @@ -109,24 +161,40 @@ impl ExecutionPlan for IcebergPartitionedScan { _context: Arc, ) -> DFResult { let task = self.tasks.get(partition).cloned().ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "{}: partition index {partition} is out of bounds \ - (total tasks: {})", + 
DataFusionError::Internal(format!( + "{}: partition index {partition} is out of bounds (total tasks: {})", self.name(), self.tasks.len() )) })?; - let file_io = self.file_io.clone(); + let table = self.table.clone(); + let snapshot_id = self.snapshot_id; + let column_names = self.projection.clone(); + let predicates = self.predicates.clone(); let fut = async move { - let task_stream = futures::stream::once(futures::future::ready(Ok(task))); - let record_batch_stream = ArrowReaderBuilder::new(file_io) - .build() - .read(Box::pin(task_stream)) + // Rebuild a TableScan mirroring IcebergTableScan::get_batch_stream so we + // inherit the same defaults (row-group filtering, batch size, concurrency, ...). + let scan_builder = match snapshot_id { + Some(id) => table.scan().snapshot_id(id), + None => table.scan(), + }; + let mut scan_builder = match column_names { + Some(names) => scan_builder.select(names), + None => scan_builder.select_all(), + }; + if let Some(pred) = predicates { + scan_builder = scan_builder.with_filter(pred); + } + let table_scan = scan_builder.build().map_err(to_datafusion_error)?; + + let task_stream = Box::pin(futures::stream::once(futures::future::ready(Ok(task)))); + let record_batch_stream = table_scan + .to_arrow_with_tasks(task_stream) .map_err(to_datafusion_error)? .map_err(to_datafusion_error); - Ok::<_, datafusion::error::DataFusionError>(record_batch_stream) + Ok::<_, DataFusionError>(record_batch_stream) }; let stream = futures::stream::once(fut).try_flatten(); @@ -145,18 +213,12 @@ impl DisplayAs for IcebergPartitionedScan { f: &mut std::fmt::Formatter, ) -> std::fmt::Result { let projection = self - .schema() - .fields() - .iter() - .map(|f| f.name().as_str()) - .collect::>() - .join(","); - // All tasks share the same predicate (they come from a single scan plan build), - // so reading it from the first task is sufficient. 
+ .projection + .clone() + .map_or(String::new(), |v| v.join(",")); let predicate = self - .tasks - .first() - .and_then(|t| t.predicate()) + .predicates + .clone() .map_or(String::new(), |p| format!("{p}")); let file_count = self.tasks.len(); write!( @@ -164,7 +226,7 @@ impl DisplayAs for IcebergPartitionedScan { "{} projection:[{projection}] predicate:[{predicate}] file_count:[{file_count}]", self.name() )?; - if self.tasks.len() <= 5 { + if file_count <= 5 { let files = self .tasks .iter() diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 36539ae503..c5a892abeb 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -237,7 +237,7 @@ async fn get_batch_stream( Ok(Box::pin(stream)) } -fn get_column_names( +pub(super) fn get_column_names( schema: ArrowSchemaRef, projection: Option<&Vec>, ) -> Option> { diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index 2e9aa22628..00e08efb94 100644 --- a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -21,7 +21,6 @@ use std::sync::Arc; use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; use datafusion::catalog::Session; -use datafusion::common::DataFusionError; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::Result as DFResult; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; @@ -101,8 +100,8 @@ impl TableProvider for IcebergPartitionedTableProvider { .await .map_err(to_datafusion_error)?; - // Projection indices are resolved against self.schema (captured at try_new time), - // same as IcebergTableProvider / IcebergTableScan. 
+ // Build a TableScan mirroring the inputs we'll hand to IcebergPartitionedScan, + // so plan_files() uses the same projection/filters the scan will replay in execute(). let col_names = projection.map(|indices| { indices .iter() @@ -131,17 +130,13 @@ impl TableProvider for IcebergPartitionedTableProvider { .await .map_err(to_datafusion_error)?; - let output_schema = match projection { - None => self.schema.clone(), - Some(indices) => Arc::new(self.schema.project(indices).map_err(|e| { - DataFusionError::Internal(format!("schema projection failed: {e}")) - })?), - }; - Ok(Arc::new(IcebergPartitionedScan::new( + table, + None, // Always use current snapshot for catalog-backed provider + self.schema.clone(), + projection, + filters, tasks, - table.file_io().clone(), - output_schema, ))) } From e7c1a48400af241a4b3bf69209e48c3399f69e17 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 24 Apr 2026 19:24:09 -0400 Subject: [PATCH 06/32] feat(datafusion): bucket FileScanTasks across target_partitions with identity-hash partitioning Replace the one-task-per-partition layout in IcebergPartitionedScan with N buckets sized from the session's target_partitions. When the table's default spec exposes identity-transform columns and every task carries the corresponding partition values, tasks are bucketed by hashing those values via DataFusion's REPARTITION_RANDOM_STATE so the resulting partitioning matches what RepartitionExec would produce. The scan then declares Partitioning::Hash(exprs, N), letting downstream joins and aggregates skip an extra repartition. Hash declaration is conservative and only stands when: - the table has a single partition spec (no spec evolution) - every identity source column is present in the output projection - every column type is supported by literal_to_array - every task supplied a full identity key Any miss collapses to UnknownPartitioning(N) while bucketing falls back to a hash of data_file_path so partitions still distribute. 
IcebergPartitionedScan now stores Vec<Vec<FileScanTask>> and execute(i) streams every task in buckets[i] through to_arrow_with_tasks. Bucket count is capped at min(target_partitions, num_files), and an empty table still yields zero partitions to avoid out-of-bounds execute calls. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/physical_plan/partitioned_scan.rs | 59 ++-- .../datafusion/src/table/partitioned.rs | 291 +++++++++++++++++- 2 files changed, 312 insertions(+), 38 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs index bf6a1d3465..159665dddd 100644 --- a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs @@ -35,7 +35,14 @@ use super::expr_to_predicate::convert_filters_to_predicate; use super::scan::get_column_names; use crate::to_datafusion_error; -/// A DataFusion [`ExecutionPlan`] that reads one [`FileScanTask`] per partition. +/// A DataFusion [`ExecutionPlan`] that reads a bucket of [`FileScanTask`]s per partition. +/// +/// Each DataFusion partition `i` streams every [`FileScanTask`] in `buckets[i]`, +/// concatenated into a single Arrow record-batch stream. The caller decides how +/// tasks are assigned to buckets and supplies the resulting [`Partitioning`] +/// (typically [`Partitioning::Hash`] when files are bucketed by identity-partition +/// values matching DataFusion's repartition hash, otherwise +/// [`Partitioning::UnknownPartitioning`]). /// /// Arrow reader configuration (row-group filtering, row selection, concurrency /// limit, batch size) matches [`IcebergTableScan`][super::scan::IcebergTableScan]: @@ -63,8 +70,9 @@ pub struct IcebergPartitionedScan { projection: Option>, /// Filters to apply to the table scan. predicates: Option, - /// Pre-planned file scan tasks, one per DataFusion partition.
- tasks: Vec, + /// Pre-planned file scan tasks grouped by output DataFusion partition. + /// `buckets[i]` holds every task that `execute(i)` will read. + buckets: Vec>, } impl IcebergPartitionedScan { @@ -74,13 +82,19 @@ impl IcebergPartitionedScan { schema: ArrowSchemaRef, projection: Option<&Vec>, filters: &[Expr], - tasks: Vec, + buckets: Vec>, + partitioning: Partitioning, ) -> Self { let output_schema = match projection { None => schema.clone(), Some(projection) => Arc::new(schema.project(projection).unwrap()), }; - let plan_properties = Self::compute_properties(output_schema, tasks.len()); + let plan_properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema), + partitioning, + EmissionType::Incremental, + Boundedness::Bounded, + )); let projection = get_column_names(schema, projection); let predicates = convert_filters_to_predicate(filters); @@ -90,7 +104,7 @@ impl IcebergPartitionedScan { plan_properties, projection, predicates, - tasks, + buckets, } } @@ -110,17 +124,12 @@ impl IcebergPartitionedScan { self.predicates.as_ref() } - pub fn tasks(&self) -> &[FileScanTask] { - &self.tasks + pub fn buckets(&self) -> &[Vec] { + &self.buckets } - fn compute_properties(schema: ArrowSchemaRef, n_partitions: usize) -> Arc { - Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema), - Partitioning::UnknownPartitioning(n_partitions), - EmissionType::Incremental, - Boundedness::Bounded, - )) + fn total_file_count(&self) -> usize { + self.buckets.iter().map(|b| b.len()).sum() } } @@ -160,11 +169,11 @@ impl ExecutionPlan for IcebergPartitionedScan { partition: usize, _context: Arc, ) -> DFResult { - let task = self.tasks.get(partition).cloned().ok_or_else(|| { + let bucket = self.buckets.get(partition).cloned().ok_or_else(|| { DataFusionError::Internal(format!( - "{}: partition index {partition} is out of bounds (total tasks: {})", + "{}: partition index {partition} is out of bounds (total buckets: {})", self.name(), - 
self.tasks.len() + self.buckets.len() )) })?; @@ -189,7 +198,9 @@ impl ExecutionPlan for IcebergPartitionedScan { } let table_scan = scan_builder.build().map_err(to_datafusion_error)?; - let task_stream = Box::pin(futures::stream::once(futures::future::ready(Ok(task)))); + let task_stream = Box::pin(futures::stream::iter( + bucket.into_iter().map(Ok::<_, iceberg::Error>), + )); let record_batch_stream = table_scan .to_arrow_with_tasks(task_stream) .map_err(to_datafusion_error)? @@ -220,17 +231,19 @@ impl DisplayAs for IcebergPartitionedScan { .predicates .clone() .map_or(String::new(), |p| format!("{p}")); - let file_count = self.tasks.len(); + let file_count = self.total_file_count(); + let bucket_count = self.buckets.len(); write!( f, - "{} projection:[{projection}] predicate:[{predicate}] file_count:[{file_count}]", + "{} projection:[{projection}] predicate:[{predicate}] \ + buckets:[{bucket_count}] file_count:[{file_count}]", self.name() )?; if file_count <= 5 { let files = self - .tasks + .buckets .iter() - .map(|t| t.data_file_path()) + .flat_map(|b| b.iter().map(|t| t.data_file_path())) .collect::>() .join(", "); write!(f, " files:[{files}]")?; diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index 00e08efb94..580cf68a47 100644 --- a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -19,14 +19,25 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; -use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; +use datafusion::arrow::array::{ + ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, + StringArray, +}; +use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; use datafusion::catalog::Session; +use datafusion::common::hash_utils::create_hashes; use datafusion::datasource::{TableProvider, TableType}; use 
datafusion::error::Result as DFResult; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; -use datafusion::physical_plan::ExecutionPlan; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; +use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use futures::TryStreamExt; use iceberg::arrow::schema_to_arrow_schema; +use iceberg::scan::FileScanTask; +use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; +use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; use crate::error::to_datafusion_error; @@ -85,7 +96,7 @@ impl TableProvider for IcebergPartitionedTableProvider { async fn scan( &self, - _state: &dyn Session, + state: &dyn Session, projection: Option<&Vec>, filters: &[Expr], _limit: Option, @@ -120,7 +131,7 @@ impl TableProvider for IcebergPartitionedTableProvider { builder = builder.with_filter(pred); } - let tasks = builder + let tasks: Vec = builder .build() .map_err(to_datafusion_error)? .plan_files() @@ -130,13 +141,51 @@ impl TableProvider for IcebergPartitionedTableProvider { .await .map_err(to_datafusion_error)?; + // Output schema after projection: column indices in `Hash` exprs and any + // Arrow array we hash must reference this schema, not the full table schema. 
+ let output_schema = match projection { + None => self.schema.clone(), + Some(p) => Arc::new(self.schema.project(p).map_err(|e| { + to_datafusion_error(Error::new(ErrorKind::DataInvalid, e.to_string())) + })?), + }; + + let target_partitions = state.config().target_partitions(); + let n_partitions = if tasks.is_empty() { + 0 + } else { + target_partitions.min(tasks.len()).max(1) + }; + + // identity_cols is Some(non-empty) iff every condition for declaring + // Partitioning::Hash is met: the table's default spec has identity-transform + // fields, every such source column is present in the output projection, and + // every column type is supported by literal_to_array. Any miss collapses to + // None, which forces UnknownPartitioning regardless of bucketing strategy. + let identity_cols = compute_identity_cols(&table, &output_schema); + + let (buckets, all_had_full_key) = + bucket_tasks(tasks, n_partitions, identity_cols.as_deref()); + + let partitioning = match identity_cols { + Some(cols) if !cols.is_empty() && all_had_full_key && n_partitions > 0 => { + let exprs: Vec> = cols + .iter() + .map(|c| Arc::new(Column::new(&c.name, c.output_idx)) as Arc) + .collect(); + Partitioning::Hash(exprs, n_partitions) + } + _ => Partitioning::UnknownPartitioning(n_partitions), + }; + Ok(Arc::new(IcebergPartitionedScan::new( table, None, // Always use current snapshot for catalog-backed provider self.schema.clone(), projection, filters, - tasks, + buckets, + partitioning, ))) } @@ -161,12 +210,167 @@ impl TableProvider for IcebergPartitionedTableProvider { } } +/// Identity-partitioned column that is also present in the output projection +/// and whose Arrow type can be reconstructed from a `Literal` for hashing. +struct IdentityCol { + name: String, + /// Position of this column in the *output* schema (after projection). + output_idx: usize, + /// Position of this column inside the partition spec's `fields()` slice, + /// matching the slot order of `FileScanTask::partition`. 
+ spec_field_idx: usize, + output_dtype: DataType, +} + +/// Inspect the table's default partition spec and return the list of identity +/// columns that can support a [`Partitioning::Hash`] declaration. Returns +/// `None` if any condition is violated: +/// - the source column for an identity field is not in the output projection +/// - the source column's Arrow type is not currently supported by +/// [`literal_to_array`] +/// - the table has spec evolution (>1 historical specs), since older files +/// may carry a partition tuple that does not align with the default spec +/// +/// Returning `None` forces the scan to declare `UnknownPartitioning` even if +/// bucketing succeeds. +fn compute_identity_cols(table: &Table, output_schema: &ArrowSchema) -> Option> { + let metadata = table.metadata(); + if metadata.partition_specs_iter().len() > 1 { + return None; + } + let spec = metadata.default_partition_spec(); + let table_schema = metadata.current_schema(); + + let mut cols = Vec::new(); + for (spec_field_idx, pf) in spec.fields().iter().enumerate() { + if pf.transform != Transform::Identity { + continue; + } + let source_field = table_schema.field_by_id(pf.source_id)?; + let output_idx = output_schema.index_of(source_field.name.as_str()).ok()?; + let output_dtype = output_schema.field(output_idx).data_type().clone(); + if !is_supported_dtype(&output_dtype) { + return None; + } + cols.push(IdentityCol { + name: source_field.name.clone(), + output_idx, + spec_field_idx, + output_dtype, + }); + } + Some(cols) +} + +fn is_supported_dtype(dt: &DataType) -> bool { + matches!( + dt, + DataType::Boolean + | DataType::Int32 + | DataType::Int64 + | DataType::Float32 + | DataType::Float64 + | DataType::Utf8 + | DataType::Date32 + ) +} + +/// Distribute `tasks` across `n_partitions` buckets. 
When `identity_cols` +/// describes a non-empty, hashable identity key, each task is hashed on +/// that key using DataFusion's repartition hash so the resulting partitioning +/// matches what `RepartitionExec` would produce on the same data. Tasks +/// missing partition data fall back to hashing `data_file_path`, which still +/// distributes evenly but breaks the `Hash` contract — the second tuple +/// element flags whether every task supplied a full identity key. +fn bucket_tasks( + tasks: Vec, + n_partitions: usize, + identity_cols: Option<&[IdentityCol]>, +) -> (Vec>, bool) { + if n_partitions == 0 { + return (Vec::new(), tasks.is_empty()); + } + let mut buckets: Vec> = (0..n_partitions).map(|_| Vec::new()).collect(); + let mut all_full_key = true; + let cols = identity_cols.unwrap_or(&[]); + + for task in tasks { + let bucket_idx = match identity_hash(&task, cols) { + Some(h) => (h % n_partitions as u64) as usize, + None => { + all_full_key = false; + fallback_hash(&task) as usize % n_partitions + } + }; + buckets[bucket_idx].push(task); + } + (buckets, all_full_key) +} + +/// Hash the identity-partition values of `task` using +/// [`REPARTITION_RANDOM_STATE`] so the bucket assignment matches DataFusion's +/// hash-repartition convention. Returns `None` if the task lacks partition +/// data or any required slot is null/unsupported. +fn identity_hash(task: &FileScanTask, cols: &[IdentityCol]) -> Option { + if cols.is_empty() { + return None; + } + let partition = task.partition.as_ref()?; + let mut arrays: Vec = Vec::with_capacity(cols.len()); + for col in cols { + let lit = partition.fields().get(col.spec_field_idx)?.as_ref()?; + arrays.push(literal_to_array(lit, &col.output_dtype)?); + } + let mut hashes = vec![0u64; 1]; + create_hashes( + &arrays, + REPARTITION_RANDOM_STATE.random_state(), + &mut hashes, + ) + .ok()?; + Some(hashes[0]) +} + +/// Deterministic per-file fallback used when `identity_hash` cannot produce a +/// bucket. 
The hash function does not need to match DataFusion's because any +/// task taking this path causes the scan to drop to `UnknownPartitioning`. +fn fallback_hash(task: &FileScanTask) -> u64 { + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + let mut hasher = DefaultHasher::new(); + task.data_file_path.hash(&mut hasher); + hasher.finish() +} + +/// Materialize a single-element Arrow array of `dt` holding the value of +/// `lit`. The Arrow type must match what DataFusion will see for this column +/// at scan time, otherwise `create_hashes` would dispatch on a different type +/// and produce a hash that disagrees with DataFusion's row-wise hashing. +fn literal_to_array(lit: &Literal, dt: &DataType) -> Option { + let prim = match lit { + Literal::Primitive(p) => p, + _ => return None, + }; + Some(match (prim, dt) { + (PrimitiveLiteral::Boolean(v), DataType::Boolean) => Arc::new(BooleanArray::from(vec![*v])), + (PrimitiveLiteral::Int(v), DataType::Int32) => Arc::new(Int32Array::from(vec![*v])), + (PrimitiveLiteral::Int(v), DataType::Date32) => Arc::new(Date32Array::from(vec![*v])), + (PrimitiveLiteral::Long(v), DataType::Int64) => Arc::new(Int64Array::from(vec![*v])), + (PrimitiveLiteral::Float(v), DataType::Float32) => Arc::new(Float32Array::from(vec![v.0])), + (PrimitiveLiteral::Double(v), DataType::Float64) => Arc::new(Float64Array::from(vec![v.0])), + (PrimitiveLiteral::String(v), DataType::Utf8) => { + Arc::new(StringArray::from(vec![v.as_str()])) + } + _ => return None, + }) +} + #[cfg(test)] mod tests { use std::collections::HashMap; use std::sync::Arc; - use datafusion::prelude::SessionContext; + use datafusion::prelude::{SessionConfig, SessionContext}; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, Type, @@ -263,6 +467,10 @@ mod tests { .unwrap(); } + fn ctx_with_target_partitions(n: usize) -> 
SessionContext { + SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n)) + } + /// An empty table must produce a zero-partition scan so DataFusion never calls /// execute(0), which would otherwise return an out-of-bounds error. #[tokio::test] @@ -273,7 +481,7 @@ mod tests { .await .unwrap(); let plan = provider - .scan(&SessionContext::new().state(), None, &[], None) + .scan(&ctx_with_target_partitions(8).state(), None, &[], None) .await .unwrap(); let scan = plan @@ -281,22 +489,75 @@ mod tests { .downcast_ref::() .unwrap(); - assert_eq!(scan.tasks().len(), 0); + assert_eq!(scan.buckets().len(), 0); assert_eq!(scan.properties().partitioning.partition_count(), 0); } - /// Each data file in the table must become exactly one DataFusion partition - /// in IcebergPartitionedScan, enabling parallel file reads. + /// When the table has no identity-partition columns, every task takes the + /// fallback (file_path) bucket path, so the declaration must drop to + /// `UnknownPartitioning`. The bucket count should still equal + /// min(target_partitions, num_files). 
+ #[tokio::test] + async fn test_unpartitioned_falls_back_to_unknown() { + let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; + append_fake_data_files(&catalog, &namespace, &table_name, 5).await; + + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(3).state(), None, &[], None) + .await + .unwrap(); + let scan = plan + .as_any() + .downcast_ref::() + .unwrap(); + + let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); + assert_eq!(total_files, 5); + assert_eq!(scan.buckets().len(), 3); + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(3) + )); + } + + /// Bucket count must be capped at the number of files: spinning up more + /// DataFusion partitions than there are tasks would just leave empty + /// streams, wasting scheduler slots. + #[tokio::test] + async fn test_bucket_count_capped_at_file_count() { + let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; + append_fake_data_files(&catalog, &namespace, &table_name, 2).await; + + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(16).state(), None, &[], None) + .await + .unwrap(); + let scan = plan + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(scan.buckets().len(), 2); + } + + /// target_partitions = 1 collapses every task into a single bucket, giving + /// the same execution profile as `IcebergTableScan`. 
#[tokio::test] - async fn test_one_partition_per_file() { + async fn test_single_target_partition_single_bucket() { let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - append_fake_data_files(&catalog, &namespace, &table_name, 3).await; + append_fake_data_files(&catalog, &namespace, &table_name, 4).await; let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) .await .unwrap(); let plan = provider - .scan(&SessionContext::new().state(), None, &[], None) + .scan(&ctx_with_target_partitions(1).state(), None, &[], None) .await .unwrap(); let scan = plan @@ -304,7 +565,7 @@ mod tests { .downcast_ref::() .unwrap(); - assert_eq!(scan.tasks().len(), 3); - assert_eq!(scan.properties().partitioning.partition_count(), 3); + assert_eq!(scan.buckets().len(), 1); + assert_eq!(scan.buckets()[0].len(), 4); } } From 8770e0ca75ce0dea925561a457d1dc4e16adce86 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 24 Apr 2026 19:37:22 -0400 Subject: [PATCH 07/32] feat(datafusion): mark identity-partition filters as Exact pushdown `IcebergPartitionedTableProvider::supports_filters_pushdown` previously returned `Inexact` for every filter, forcing DataFusion to re-evaluate even filters that Iceberg's manifest-level pruning has fully resolved. Per-filter the provider now returns `Exact` when both: - the iceberg conversion can represent the filter, so manifest pruning will remove every row that fails it, and - every leaf is a comparison or null check against an identity- partition column with a literal RHS. Identity-partitioned column names are cached at `try_new` from the table's default spec; tables with spec evolution (>1 historical specs) fall back to an empty set so all filters stay `Inexact`. Supported shapes: =, !=, <, <=, >, >=, IS NULL, IS NOT NULL, IN/NOT IN, plus AND/OR/NOT compositions of the above. Every other shape is `Inexact`. 
`convert_filter_to_predicate` is promoted to `pub(crate)` so the provider can probe convertibility per filter without rebuilding the whole AND-collapsed predicate. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/physical_plan/expr_to_predicate.rs | 2 +- .../datafusion/src/table/partitioned.rs | 249 +++++++++++++++++- 2 files changed, 246 insertions(+), 5 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index 17c9416d54..b426b1228f 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -49,7 +49,7 @@ pub fn convert_filters_to_predicate(filters: &[Expr]) -> Option { .reduce(Predicate::and) } -fn convert_filter_to_predicate(expr: &Expr) -> Option { +pub(crate) fn convert_filter_to_predicate(expr: &Expr) -> Option { match to_iceberg_predicate(expr) { TransformedResult::Predicate(predicate) => Some(predicate), TransformedResult::Column(column) => { diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index 580cf68a47..34b8cf18f6 100644 --- a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -16,6 +16,7 @@ // under the License. 
use std::any::Any; +use std::collections::HashSet; use std::sync::Arc; use async_trait::async_trait; @@ -28,7 +29,7 @@ use datafusion::catalog::Session; use datafusion::common::hash_utils::create_hashes; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::Result as DFResult; -use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; +use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_expr::expressions::Column; use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; @@ -41,7 +42,9 @@ use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; use crate::error::to_datafusion_error; -use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; +use crate::physical_plan::expr_to_predicate::{ + convert_filter_to_predicate, convert_filters_to_predicate, +}; use crate::physical_plan::partitioned_scan::IcebergPartitionedScan; /// Catalog-backed table provider that scans each data file in a separate DataFusion partition. @@ -59,6 +62,17 @@ pub struct IcebergPartitionedTableProvider { catalog: Arc, table_ident: TableIdent, schema: ArrowSchemaRef, + /// Source-column names that are identity-partitioned in the table's + /// default spec, captured at construction. Used by + /// `supports_filters_pushdown` to mark filters as `Exact` when they + /// only reference identity-partition columns. Empty when the table + /// has spec evolution (>1 historical specs) or no identity transforms, + /// which forces every filter to `Inexact`. + /// + /// This is a snapshot: if the table's default spec changes between + /// `try_new` and a later scan, the cached set may be stale. Spec + /// evolution is rare in practice and the next `try_new` will refresh. 
+ identity_partition_cols: HashSet, } impl IcebergPartitionedTableProvider { @@ -72,10 +86,12 @@ impl IcebergPartitionedTableProvider { // A second load_table is issued at scan time to guarantee the freshest snapshot. let table = catalog.load_table(&table_ident).await?; let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); + let identity_partition_cols = identity_partition_col_names(&table); Ok(Self { catalog, table_ident, schema, + identity_partition_cols, }) } } @@ -193,7 +209,24 @@ impl TableProvider for IcebergPartitionedTableProvider { &self, filters: &[&Expr], ) -> DFResult> { - Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()]) + Ok(filters + .iter() + .map(|f| { + // `Exact` is only safe when (1) the filter touches nothing but + // identity-partition columns and operators preserved by the + // identity transform, and (2) the iceberg conversion can + // actually represent the filter, so manifest pruning will + // remove every row that fails it. Either miss falls back to + // `Inexact` and DataFusion adds a FilterExec on top. + if convert_filter_to_predicate(f).is_some() + && is_exact_on_identity(f, &self.identity_partition_cols) + { + TableProviderFilterPushDown::Exact + } else { + TableProviderFilterPushDown::Inexact + } + }) + .collect()) } async fn insert_into( @@ -342,6 +375,76 @@ fn fallback_hash(task: &FileScanTask) -> u64 { hasher.finish() } +/// Source-column names of every identity-transform field in the table's +/// default partition spec. Returns the empty set when the table has spec +/// evolution (>1 historical specs) — older files may carry partition tuples +/// whose identity status differs from the current spec, so the safe choice +/// is to refuse all `Exact` pushdowns until each task carries its own spec. 
+fn identity_partition_col_names(table: &Table) -> HashSet { + let metadata = table.metadata(); + if metadata.partition_specs_iter().len() > 1 { + return HashSet::new(); + } + let spec = metadata.default_partition_spec(); + let table_schema = metadata.current_schema(); + let mut names = HashSet::new(); + for pf in spec.fields() { + if pf.transform != Transform::Identity { + continue; + } + if let Some(field) = table_schema.field_by_id(pf.source_id) { + names.insert(field.name.clone()); + } + } + names +} + +/// Returns `true` when every leaf of `expr` is a comparison or null check +/// against an identity-partition column. Such filters are fully resolvable +/// by manifest-level partition pruning, so DataFusion does not need to +/// re-apply them post-scan. +/// +/// Safe operators: `=`, `!=`, `<`, `<=`, `>`, `>=`, `IS NULL`, `IS NOT NULL`, +/// `IN (..)`, `NOT IN (..)`, plus `AND` / `OR` / `NOT` of any of those. Every +/// other shape returns `false` (caller falls back to `Inexact`). +fn is_exact_on_identity(expr: &Expr, cols: &HashSet) -> bool { + if cols.is_empty() { + return false; + } + match expr { + Expr::BinaryExpr(b) => match b.op { + Operator::And | Operator::Or => { + is_exact_on_identity(&b.left, cols) && is_exact_on_identity(&b.right, cols) + } + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq => is_simple_compare_on_identity(&b.left, &b.right, cols), + _ => false, + }, + Expr::Not(inner) => is_exact_on_identity(inner, cols), + Expr::IsNull(inner) | Expr::IsNotNull(inner) => is_identity_col(inner, cols), + Expr::InList(l) => { + is_identity_col(&l.expr, cols) && l.list.iter().all(|e| matches!(e, Expr::Literal(..))) + } + _ => false, + } +} + +fn is_simple_compare_on_identity(l: &Expr, r: &Expr, cols: &HashSet) -> bool { + let l_col = is_identity_col(l, cols); + let r_col = is_identity_col(r, cols); + let l_lit = matches!(l, Expr::Literal(..)); + let r_lit = matches!(r, Expr::Literal(..)); + 
(l_col && r_lit) || (r_col && l_lit) +} + +fn is_identity_col(e: &Expr, cols: &HashSet) -> bool { + matches!(e, Expr::Column(c) if cols.contains(&c.name)) +} + /// Materialize a single-element Arrow array of `dt` holding the value of /// `lit`. The Arrow type must match what DataFusion will see for this column /// at scan time, otherwise `create_hashes` would dispatch on a different type @@ -370,10 +473,12 @@ mod tests { use std::collections::HashMap; use std::sync::Arc; + use datafusion::logical_expr::{col, lit}; use datafusion::prelude::{SessionConfig, SessionContext}; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, Type, + DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, + Transform, Type, UnboundPartitionSpec, }; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; @@ -381,6 +486,58 @@ mod tests { use super::*; + async fn make_catalog_and_partitioned_table( + partition_spec: Option, + ) -> (Arc, NamespaceIdent, String, TempDir) { + let temp_dir = TempDir::new().unwrap(); + let warehouse = temp_dir.path().to_str().unwrap().to_string(); + + let catalog = Arc::new( + MemoryCatalogBuilder::default() + .load( + "memory", + HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse.clone())]), + ) + .await + .unwrap(), + ); + + let namespace = NamespaceIdent::new("ns".to_string()); + catalog + .create_namespace(&namespace, HashMap::new()) + .await + .unwrap(); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let creation = match partition_spec { + Some(spec) => TableCreation::builder() 
+ .name("t".to_string()) + .location(format!("{warehouse}/t")) + .schema(schema) + .partition_spec(spec) + .properties(HashMap::new()) + .build(), + None => TableCreation::builder() + .name("t".to_string()) + .location(format!("{warehouse}/t")) + .schema(schema) + .properties(HashMap::new()) + .build(), + }; + + catalog.create_table(&namespace, creation).await.unwrap(); + + (catalog, namespace, "t".to_string(), temp_dir) + } + async fn make_catalog_and_table() -> (Arc, NamespaceIdent, String, TempDir) { let temp_dir = TempDir::new().unwrap(); let warehouse = temp_dir.path().to_str().unwrap().to_string(); @@ -546,6 +703,90 @@ mod tests { assert_eq!(scan.buckets().len(), 2); } + /// Filters that only touch identity-partition columns with literal RHS + /// can be marked `Exact` because Iceberg's manifest-level pruning already + /// removes every file whose partition value fails the predicate. + #[tokio::test] + async fn test_pushdown_exact_on_identity_column() { + let spec = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + let (catalog, namespace, table_name, _temp_dir) = + make_catalog_and_partitioned_table(Some(spec)).await; + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + + let f_eq = col("id").eq(lit(5_i32)); + let f_neq = col("id").not_eq(lit(5_i32)); + let f_isnull = col("id").is_null(); + let f_and = col("id").eq(lit(5_i32)).and(col("id").lt(lit(10_i32))); + + let supports = provider + .supports_filters_pushdown(&[&f_eq, &f_neq, &f_isnull, &f_and]) + .unwrap(); + for (i, s) in supports.iter().enumerate() { + assert!( + matches!(s, TableProviderFilterPushDown::Exact), + "filter index {i} should be Exact, got {s:?}" + ); + } + } + + /// Filters touching non-partition columns or columns with non-identity + /// transforms must remain `Inexact`: the partition value is either + /// missing or lossy (bucket/truncate/etc.), so DataFusion 
still needs to + /// re-apply the filter against actual row values. + #[tokio::test] + async fn test_pushdown_inexact_on_non_identity_column() { + let spec = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + let (catalog, namespace, table_name, _temp_dir) = + make_catalog_and_partitioned_table(Some(spec)).await; + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + + // `name` is not partitioned — manifest pruning cannot eliminate files + // by it, so the filter must re-execute post-scan. + let f_name = col("name").eq(lit("alice")); + // Mixed AND: even though `id` is identity-partitioned, the `name` arm + // is not exact, so the whole expression is Inexact. + let f_mixed = col("id").eq(lit(5_i32)).and(col("name").eq(lit("alice"))); + + let supports = provider + .supports_filters_pushdown(&[&f_name, &f_mixed]) + .unwrap(); + for (i, s) in supports.iter().enumerate() { + assert!( + matches!(s, TableProviderFilterPushDown::Inexact), + "filter index {i} should be Inexact, got {s:?}" + ); + } + } + + /// Unpartitioned tables must mark every filter `Inexact` regardless of + /// shape: there is no partition pruning that could make the scan + /// authoritative. 
+ #[tokio::test] + async fn test_pushdown_unpartitioned_table_all_inexact() { + let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; + let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + + let f_id = col("id").eq(lit(5_i32)); + let f_name = col("name").eq(lit("alice")); + let supports = provider + .supports_filters_pushdown(&[&f_id, &f_name]) + .unwrap(); + assert!(matches!(supports[0], TableProviderFilterPushDown::Inexact)); + assert!(matches!(supports[1], TableProviderFilterPushDown::Inexact)); + } + /// target_partitions = 1 collapses every task into a single bucket, giving /// the same execution profile as `IcebergTableScan`. #[tokio::test] From e0d6add40f067b95608ee1dfbd69155690797ff1 Mon Sep 17 00:00:00 2001 From: Tim Saucer Date: Fri, 24 Apr 2026 19:46:27 -0400 Subject: [PATCH 08/32] feat(datafusion): allow Exact pushdown across spec evolution via per-column intersection Previously identity_partition_col_names returned an empty set whenever the table had more than one historical partition spec, forcing every filter back to Inexact under spec evolution. This was overly conservative: Iceberg evaluates partition predicates against each manifest's own spec, so a column that is identity-partitioned in every spec is fully prunable across the entire table regardless of which spec a given file was written under. Replace the multi-spec gate with an intersection across every spec's identity-source set. A column survives only if every spec includes it with Transform::Identity; columns that appear with non-identity transforms in some spec, or are missing from a spec entirely, are dropped. The result remains an honest set of columns for which Exact pushdown is provably safe across all surviving files. 
Hash bucketing (compute_identity_cols) keeps its single-spec gate because slot-order alignment with the table's default spec depends on each task carrying its own spec id, which the native plan flow does not yet do. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../datafusion/src/table/partitioned.rs | 202 +++++++++++++++--- 1 file changed, 173 insertions(+), 29 deletions(-) diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index 34b8cf18f6..ba2ee2b615 100644 --- a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -37,7 +37,7 @@ use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use futures::TryStreamExt; use iceberg::arrow::schema_to_arrow_schema; use iceberg::scan::FileScanTask; -use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; +use iceberg::spec::{Literal, PartitionSpec, PrimitiveLiteral, Transform}; use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; @@ -62,16 +62,21 @@ pub struct IcebergPartitionedTableProvider { catalog: Arc, table_ident: TableIdent, schema: ArrowSchemaRef, - /// Source-column names that are identity-partitioned in the table's - /// default spec, captured at construction. Used by - /// `supports_filters_pushdown` to mark filters as `Exact` when they - /// only reference identity-partition columns. Empty when the table - /// has spec evolution (>1 historical specs) or no identity transforms, - /// which forces every filter to `Inexact`. + /// Source-column names that are identity-partitioned in *every* historical + /// partition spec of the table, captured at construction. 
Used by + /// `supports_filters_pushdown` to mark filters as `Exact` when they only + /// reference these columns: Iceberg evaluates partition predicates against + /// each manifest's own spec, so a column that is identity-partitioned in + /// every spec is fully prunable across the full table regardless of which + /// spec a given file was written under. /// - /// This is a snapshot: if the table's default spec changes between - /// `try_new` and a later scan, the cached set may be stale. Spec - /// evolution is rare in practice and the next `try_new` will refresh. + /// Columns that appear in some specs with non-identity transforms + /// (`bucket`, `truncate`, `year`/`month`/etc.), or that are missing from + /// any spec entirely, are dropped from the set — those files cannot be + /// pruned exactly, so DataFusion must keep its FilterExec. + /// + /// This is a snapshot: if the table's specs change between `try_new` and + /// a later scan, the cached set may be stale. The next `try_new` refreshes. identity_partition_cols: HashSet, } @@ -375,28 +380,44 @@ fn fallback_hash(task: &FileScanTask) -> u64 { hasher.finish() } -/// Source-column names of every identity-transform field in the table's -/// default partition spec. Returns the empty set when the table has spec -/// evolution (>1 historical specs) — older files may carry partition tuples -/// whose identity status differs from the current spec, so the safe choice -/// is to refuse all `Exact` pushdowns until each task carries its own spec. +/// Intersection of identity-partitioned source-column names across every +/// historical partition spec. A column is included only if every spec +/// includes that column with `Transform::Identity`; any spec where the column +/// is absent or has a non-identity transform drops it from the result. +/// +/// Why intersection: Iceberg evaluates partition predicates against each +/// manifest's own spec. 
A file written under spec A can only be exactly +/// pruned by columns identity-partitioned in spec A. To guarantee Exact +/// pushdown for *every* file in the table, the column must be identity in +/// *every* spec. Otherwise some surviving files would still need DataFusion's +/// FilterExec to enforce the predicate. fn identity_partition_col_names(table: &Table) -> HashSet { let metadata = table.metadata(); - if metadata.partition_specs_iter().len() > 1 { - return HashSet::new(); - } - let spec = metadata.default_partition_spec(); let table_schema = metadata.current_schema(); - let mut names = HashSet::new(); - for pf in spec.fields() { - if pf.transform != Transform::Identity { - continue; - } - if let Some(field) = table_schema.field_by_id(pf.source_id) { - names.insert(field.name.clone()); + let identity_set = |spec: &PartitionSpec| -> HashSet { + spec.fields() + .iter() + .filter(|pf| pf.transform == Transform::Identity) + .filter_map(|pf| { + table_schema + .field_by_id(pf.source_id) + .map(|f| f.name.clone()) + }) + .collect() + }; + + let mut iter = metadata.partition_specs_iter(); + let Some(first) = iter.next() else { + return HashSet::new(); + }; + let mut acc = identity_set(first); + for spec in iter { + if acc.is_empty() { + break; } + acc = acc.intersection(&identity_set(spec)).cloned().collect(); } - names + acc } /// Returns `true` when every leaf of `expr` is a comparison or null check @@ -475,11 +496,14 @@ mod tests { use datafusion::logical_expr::{col, lit}; use datafusion::prelude::{SessionConfig, SessionContext}; + use iceberg::io::FileIO; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, - Transform, Type, UnboundPartitionSpec, + DataContentType, DataFileBuilder, DataFileFormat, FormatVersion, NestedField, + PrimitiveType, Schema, SortOrder, TableMetadataBuilder, Transform, Type, + UnboundPartitionSpec, }; + use 
iceberg::table::Table; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; use tempfile::TempDir; @@ -628,6 +652,126 @@ mod tests { SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n)) } + /// Build a `Table` carrying `specs.len()` historical partition specs. The + /// first spec is the table's initial spec; each subsequent spec is added + /// via `into_builder().add_partition_spec(...)`. No catalog round-trip, + /// no real I/O — `FileIO::new_with_memory()` is sufficient because the + /// helper under test only reads metadata. + fn build_table_with_specs(specs: Vec) -> Table { + assert!(!specs.is_empty(), "need at least one spec"); + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let mut iter = specs.into_iter(); + let first = iter.next().unwrap(); + let mut metadata = TableMetadataBuilder::new( + schema, + first, + SortOrder::unsorted_order(), + "memory:///t".to_string(), + FormatVersion::V2, + HashMap::new(), + ) + .unwrap() + .build() + .unwrap() + .metadata; + + for spec in iter { + metadata = metadata + .into_builder(None) + .add_partition_spec(spec) + .unwrap() + .build() + .unwrap() + .metadata; + } + + Table::builder() + .file_io(FileIO::new_with_memory()) + .metadata(Arc::new(metadata)) + .identifier(TableIdent::new( + NamespaceIdent::new("ns".to_string()), + "t".to_string(), + )) + .build() + .unwrap() + } + + /// Multi-spec table where every historical spec keeps `id` as identity: + /// the column survives the intersection and remains Exact-pushdown safe. + /// `name` is never identity-partitioned, so it is excluded. 
+ #[test] + fn test_identity_cols_preserved_across_compatible_specs() { + let spec_v0 = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + // Evolved spec: still identity on `id`, plus a non-identity transform + // on `name`. The latter must not pollute the result. + let spec_v1 = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .add_partition_field(2, "name_bucket", Transform::Bucket(8)) + .unwrap() + .build(); + let table = build_table_with_specs(vec![spec_v0, spec_v1]); + + let cols = identity_partition_col_names(&table); + assert_eq!(cols, HashSet::from(["id".to_string()])); + } + + /// Multi-spec table where the evolved spec replaces `identity(id)` with + /// `bucket(id)`. Files written under the evolved spec cannot be exactly + /// pruned on `id`, so `id` must be dropped from the Exact-safe set. + #[test] + fn test_identity_cols_dropped_when_transform_changes() { + let spec_v0 = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + let spec_v1 = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_bucket", Transform::Bucket(8)) + .unwrap() + .build(); + let table = build_table_with_specs(vec![spec_v0, spec_v1]); + + let cols = identity_partition_col_names(&table); + assert!( + cols.is_empty(), + "expected empty set after non-identity replacement, got {cols:?}" + ); + } + + /// Multi-spec table where the second spec omits `id` from partitioning + /// entirely. Files under that spec carry no `id` partition tuple, so + /// pruning is a no-op for them — `id` must be dropped from the + /// Exact-safe set. + #[test] + fn test_identity_cols_dropped_when_column_missing_from_some_spec() { + let spec_v0 = UnboundPartitionSpec::builder() + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + // Evolved spec only partitions on `name`, omitting `id`. 
+ let spec_v1 = UnboundPartitionSpec::builder() + .add_partition_field(2, "name_part", Transform::Identity) + .unwrap() + .build(); + let table = build_table_with_specs(vec![spec_v0, spec_v1]); + + let cols = identity_partition_col_names(&table); + // Neither column survives the intersection: `id` missing from v1, + // `name` missing from v0. + assert!(cols.is_empty(), "got {cols:?}"); + } + /// An empty table must produce a zero-partition scan so DataFusion never calls /// execute(0), which would otherwise return an out-of-bounds error. #[tokio::test] From f25c911ca9d293f87ca06098a0d2dd296d0ac9cc Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 28 Apr 2026 16:40:25 +0200 Subject: [PATCH 09/32] Revert "feat(datafusion): allow Exact pushdown across spec evolution via per-column intersection" This reverts commit b2613e347ac3367f95220f7d55701d3e9db8b02f. (cherry picked from commit 826f054e368e6e6d7543c1f1f7cbc781f3bf6da1) --- .../datafusion/src/table/partitioned.rs | 202 +++--------------- 1 file changed, 29 insertions(+), 173 deletions(-) diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index ba2ee2b615..34b8cf18f6 100644 --- a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -37,7 +37,7 @@ use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use futures::TryStreamExt; use iceberg::arrow::schema_to_arrow_schema; use iceberg::scan::FileScanTask; -use iceberg::spec::{Literal, PartitionSpec, PrimitiveLiteral, Transform}; +use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; @@ -62,21 +62,16 @@ pub struct IcebergPartitionedTableProvider { catalog: Arc, table_ident: TableIdent, schema: ArrowSchemaRef, - /// Source-column names that are identity-partitioned in *every* historical - /// 
partition spec of the table, captured at construction. Used by - /// `supports_filters_pushdown` to mark filters as `Exact` when they only - /// reference these columns: Iceberg evaluates partition predicates against - /// each manifest's own spec, so a column that is identity-partitioned in - /// every spec is fully prunable across the full table regardless of which - /// spec a given file was written under. + /// Source-column names that are identity-partitioned in the table's + /// default spec, captured at construction. Used by + /// `supports_filters_pushdown` to mark filters as `Exact` when they + /// only reference identity-partition columns. Empty when the table + /// has spec evolution (>1 historical specs) or no identity transforms, + /// which forces every filter to `Inexact`. /// - /// Columns that appear in some specs with non-identity transforms - /// (`bucket`, `truncate`, `year`/`month`/etc.), or that are missing from - /// any spec entirely, are dropped from the set — those files cannot be - /// pruned exactly, so DataFusion must keep its FilterExec. - /// - /// This is a snapshot: if the table's specs change between `try_new` and - /// a later scan, the cached set may be stale. The next `try_new` refreshes. + /// This is a snapshot: if the table's default spec changes between + /// `try_new` and a later scan, the cached set may be stale. Spec + /// evolution is rare in practice and the next `try_new` will refresh. identity_partition_cols: HashSet, } @@ -380,44 +375,28 @@ fn fallback_hash(task: &FileScanTask) -> u64 { hasher.finish() } -/// Intersection of identity-partitioned source-column names across every -/// historical partition spec. A column is included only if every spec -/// includes that column with `Transform::Identity`; any spec where the column -/// is absent or has a non-identity transform drops it from the result. -/// -/// Why intersection: Iceberg evaluates partition predicates against each -/// manifest's own spec. 
A file written under spec A can only be exactly -/// pruned by columns identity-partitioned in spec A. To guarantee Exact -/// pushdown for *every* file in the table, the column must be identity in -/// *every* spec. Otherwise some surviving files would still need DataFusion's -/// FilterExec to enforce the predicate. +/// Source-column names of every identity-transform field in the table's +/// default partition spec. Returns the empty set when the table has spec +/// evolution (>1 historical specs) — older files may carry partition tuples +/// whose identity status differs from the current spec, so the safe choice +/// is to refuse all `Exact` pushdowns until each task carries its own spec. fn identity_partition_col_names(table: &Table) -> HashSet { let metadata = table.metadata(); - let table_schema = metadata.current_schema(); - let identity_set = |spec: &PartitionSpec| -> HashSet { - spec.fields() - .iter() - .filter(|pf| pf.transform == Transform::Identity) - .filter_map(|pf| { - table_schema - .field_by_id(pf.source_id) - .map(|f| f.name.clone()) - }) - .collect() - }; - - let mut iter = metadata.partition_specs_iter(); - let Some(first) = iter.next() else { + if metadata.partition_specs_iter().len() > 1 { return HashSet::new(); - }; - let mut acc = identity_set(first); - for spec in iter { - if acc.is_empty() { - break; + } + let spec = metadata.default_partition_spec(); + let table_schema = metadata.current_schema(); + let mut names = HashSet::new(); + for pf in spec.fields() { + if pf.transform != Transform::Identity { + continue; + } + if let Some(field) = table_schema.field_by_id(pf.source_id) { + names.insert(field.name.clone()); } - acc = acc.intersection(&identity_set(spec)).cloned().collect(); } - acc + names } /// Returns `true` when every leaf of `expr` is a comparison or null check @@ -496,14 +475,11 @@ mod tests { use datafusion::logical_expr::{col, lit}; use datafusion::prelude::{SessionConfig, SessionContext}; - use iceberg::io::FileIO; use 
iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, FormatVersion, NestedField, - PrimitiveType, Schema, SortOrder, TableMetadataBuilder, Transform, Type, - UnboundPartitionSpec, + DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, + Transform, Type, UnboundPartitionSpec, }; - use iceberg::table::Table; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; use tempfile::TempDir; @@ -652,126 +628,6 @@ mod tests { SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n)) } - /// Build a `Table` carrying `specs.len()` historical partition specs. The - /// first spec is the table's initial spec; each subsequent spec is added - /// via `into_builder().add_partition_spec(...)`. No catalog round-trip, - /// no real I/O — `FileIO::new_with_memory()` is sufficient because the - /// helper under test only reads metadata. 
- fn build_table_with_specs(specs: Vec) -> Table { - assert!(!specs.is_empty(), "need at least one spec"); - let schema = Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(); - - let mut iter = specs.into_iter(); - let first = iter.next().unwrap(); - let mut metadata = TableMetadataBuilder::new( - schema, - first, - SortOrder::unsorted_order(), - "memory:///t".to_string(), - FormatVersion::V2, - HashMap::new(), - ) - .unwrap() - .build() - .unwrap() - .metadata; - - for spec in iter { - metadata = metadata - .into_builder(None) - .add_partition_spec(spec) - .unwrap() - .build() - .unwrap() - .metadata; - } - - Table::builder() - .file_io(FileIO::new_with_memory()) - .metadata(Arc::new(metadata)) - .identifier(TableIdent::new( - NamespaceIdent::new("ns".to_string()), - "t".to_string(), - )) - .build() - .unwrap() - } - - /// Multi-spec table where every historical spec keeps `id` as identity: - /// the column survives the intersection and remains Exact-pushdown safe. - /// `name` is never identity-partitioned, so it is excluded. - #[test] - fn test_identity_cols_preserved_across_compatible_specs() { - let spec_v0 = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .build(); - // Evolved spec: still identity on `id`, plus a non-identity transform - // on `name`. The latter must not pollute the result. 
- let spec_v1 = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .add_partition_field(2, "name_bucket", Transform::Bucket(8)) - .unwrap() - .build(); - let table = build_table_with_specs(vec![spec_v0, spec_v1]); - - let cols = identity_partition_col_names(&table); - assert_eq!(cols, HashSet::from(["id".to_string()])); - } - - /// Multi-spec table where the evolved spec replaces `identity(id)` with - /// `bucket(id)`. Files written under the evolved spec cannot be exactly - /// pruned on `id`, so `id` must be dropped from the Exact-safe set. - #[test] - fn test_identity_cols_dropped_when_transform_changes() { - let spec_v0 = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .build(); - let spec_v1 = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_bucket", Transform::Bucket(8)) - .unwrap() - .build(); - let table = build_table_with_specs(vec![spec_v0, spec_v1]); - - let cols = identity_partition_col_names(&table); - assert!( - cols.is_empty(), - "expected empty set after non-identity replacement, got {cols:?}" - ); - } - - /// Multi-spec table where the second spec omits `id` from partitioning - /// entirely. Files under that spec carry no `id` partition tuple, so - /// pruning is a no-op for them — `id` must be dropped from the - /// Exact-safe set. - #[test] - fn test_identity_cols_dropped_when_column_missing_from_some_spec() { - let spec_v0 = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .build(); - // Evolved spec only partitions on `name`, omitting `id`. 
- let spec_v1 = UnboundPartitionSpec::builder() - .add_partition_field(2, "name_part", Transform::Identity) - .unwrap() - .build(); - let table = build_table_with_specs(vec![spec_v0, spec_v1]); - - let cols = identity_partition_col_names(&table); - // Neither column survives the intersection: `id` missing from v1, - // `name` missing from v0. - assert!(cols.is_empty(), "got {cols:?}"); - } - /// An empty table must produce a zero-partition scan so DataFusion never calls /// execute(0), which would otherwise return an out-of-bounds error. #[tokio::test] From 5093b823d9e16e4e260b976a18b912ceefb1ffe4 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 28 Apr 2026 16:40:32 +0200 Subject: [PATCH 10/32] Revert "feat(datafusion): mark identity-partition filters as Exact pushdown" This reverts commit 6d0ed4c7c5ba4a0d53de6bca5c1321e744edc8fa. (cherry picked from commit 4381f004dc0d2aff5616e4cdd474f595d4ef1f8c) --- .../src/physical_plan/expr_to_predicate.rs | 2 +- .../datafusion/src/table/partitioned.rs | 249 +----------------- 2 files changed, 5 insertions(+), 246 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs index b426b1228f..17c9416d54 100644 --- a/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs +++ b/crates/integrations/datafusion/src/physical_plan/expr_to_predicate.rs @@ -49,7 +49,7 @@ pub fn convert_filters_to_predicate(filters: &[Expr]) -> Option { .reduce(Predicate::and) } -pub(crate) fn convert_filter_to_predicate(expr: &Expr) -> Option { +fn convert_filter_to_predicate(expr: &Expr) -> Option { match to_iceberg_predicate(expr) { TransformedResult::Predicate(predicate) => Some(predicate), TransformedResult::Column(column) => { diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs index 34b8cf18f6..580cf68a47 100644 --- 
a/crates/integrations/datafusion/src/table/partitioned.rs +++ b/crates/integrations/datafusion/src/table/partitioned.rs @@ -16,7 +16,6 @@ // under the License. use std::any::Any; -use std::collections::HashSet; use std::sync::Arc; use async_trait::async_trait; @@ -29,7 +28,7 @@ use datafusion::catalog::Session; use datafusion::common::hash_utils::create_hashes; use datafusion::datasource::{TableProvider, TableType}; use datafusion::error::Result as DFResult; -use datafusion::logical_expr::{Expr, Operator, TableProviderFilterPushDown}; +use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_expr::expressions::Column; use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; @@ -42,9 +41,7 @@ use iceberg::table::Table; use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; use crate::error::to_datafusion_error; -use crate::physical_plan::expr_to_predicate::{ - convert_filter_to_predicate, convert_filters_to_predicate, -}; +use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; use crate::physical_plan::partitioned_scan::IcebergPartitionedScan; /// Catalog-backed table provider that scans each data file in a separate DataFusion partition. @@ -62,17 +59,6 @@ pub struct IcebergPartitionedTableProvider { catalog: Arc, table_ident: TableIdent, schema: ArrowSchemaRef, - /// Source-column names that are identity-partitioned in the table's - /// default spec, captured at construction. Used by - /// `supports_filters_pushdown` to mark filters as `Exact` when they - /// only reference identity-partition columns. Empty when the table - /// has spec evolution (>1 historical specs) or no identity transforms, - /// which forces every filter to `Inexact`. - /// - /// This is a snapshot: if the table's default spec changes between - /// `try_new` and a later scan, the cached set may be stale. 
Spec - /// evolution is rare in practice and the next `try_new` will refresh. - identity_partition_cols: HashSet, } impl IcebergPartitionedTableProvider { @@ -86,12 +72,10 @@ impl IcebergPartitionedTableProvider { // A second load_table is issued at scan time to guarantee the freshest snapshot. let table = catalog.load_table(&table_ident).await?; let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); - let identity_partition_cols = identity_partition_col_names(&table); Ok(Self { catalog, table_ident, schema, - identity_partition_cols, }) } } @@ -209,24 +193,7 @@ impl TableProvider for IcebergPartitionedTableProvider { &self, filters: &[&Expr], ) -> DFResult> { - Ok(filters - .iter() - .map(|f| { - // `Exact` is only safe when (1) the filter touches nothing but - // identity-partition columns and operators preserved by the - // identity transform, and (2) the iceberg conversion can - // actually represent the filter, so manifest pruning will - // remove every row that fails it. Either miss falls back to - // `Inexact` and DataFusion adds a FilterExec on top. - if convert_filter_to_predicate(f).is_some() - && is_exact_on_identity(f, &self.identity_partition_cols) - { - TableProviderFilterPushDown::Exact - } else { - TableProviderFilterPushDown::Inexact - } - }) - .collect()) + Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()]) } async fn insert_into( @@ -375,76 +342,6 @@ fn fallback_hash(task: &FileScanTask) -> u64 { hasher.finish() } -/// Source-column names of every identity-transform field in the table's -/// default partition spec. Returns the empty set when the table has spec -/// evolution (>1 historical specs) — older files may carry partition tuples -/// whose identity status differs from the current spec, so the safe choice -/// is to refuse all `Exact` pushdowns until each task carries its own spec. 
-fn identity_partition_col_names(table: &Table) -> HashSet { - let metadata = table.metadata(); - if metadata.partition_specs_iter().len() > 1 { - return HashSet::new(); - } - let spec = metadata.default_partition_spec(); - let table_schema = metadata.current_schema(); - let mut names = HashSet::new(); - for pf in spec.fields() { - if pf.transform != Transform::Identity { - continue; - } - if let Some(field) = table_schema.field_by_id(pf.source_id) { - names.insert(field.name.clone()); - } - } - names -} - -/// Returns `true` when every leaf of `expr` is a comparison or null check -/// against an identity-partition column. Such filters are fully resolvable -/// by manifest-level partition pruning, so DataFusion does not need to -/// re-apply them post-scan. -/// -/// Safe operators: `=`, `!=`, `<`, `<=`, `>`, `>=`, `IS NULL`, `IS NOT NULL`, -/// `IN (..)`, `NOT IN (..)`, plus `AND` / `OR` / `NOT` of any of those. Every -/// other shape returns `false` (caller falls back to `Inexact`). -fn is_exact_on_identity(expr: &Expr, cols: &HashSet) -> bool { - if cols.is_empty() { - return false; - } - match expr { - Expr::BinaryExpr(b) => match b.op { - Operator::And | Operator::Or => { - is_exact_on_identity(&b.left, cols) && is_exact_on_identity(&b.right, cols) - } - Operator::Eq - | Operator::NotEq - | Operator::Lt - | Operator::LtEq - | Operator::Gt - | Operator::GtEq => is_simple_compare_on_identity(&b.left, &b.right, cols), - _ => false, - }, - Expr::Not(inner) => is_exact_on_identity(inner, cols), - Expr::IsNull(inner) | Expr::IsNotNull(inner) => is_identity_col(inner, cols), - Expr::InList(l) => { - is_identity_col(&l.expr, cols) && l.list.iter().all(|e| matches!(e, Expr::Literal(..))) - } - _ => false, - } -} - -fn is_simple_compare_on_identity(l: &Expr, r: &Expr, cols: &HashSet) -> bool { - let l_col = is_identity_col(l, cols); - let r_col = is_identity_col(r, cols); - let l_lit = matches!(l, Expr::Literal(..)); - let r_lit = matches!(r, Expr::Literal(..)); - 
(l_col && r_lit) || (r_col && l_lit) -} - -fn is_identity_col(e: &Expr, cols: &HashSet) -> bool { - matches!(e, Expr::Column(c) if cols.contains(&c.name)) -} - /// Materialize a single-element Arrow array of `dt` holding the value of /// `lit`. The Arrow type must match what DataFusion will see for this column /// at scan time, otherwise `create_hashes` would dispatch on a different type @@ -473,12 +370,10 @@ mod tests { use std::collections::HashMap; use std::sync::Arc; - use datafusion::logical_expr::{col, lit}; use datafusion::prelude::{SessionConfig, SessionContext}; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, - Transform, Type, UnboundPartitionSpec, + DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, Type, }; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; @@ -486,58 +381,6 @@ mod tests { use super::*; - async fn make_catalog_and_partitioned_table( - partition_spec: Option, - ) -> (Arc, NamespaceIdent, String, TempDir) { - let temp_dir = TempDir::new().unwrap(); - let warehouse = temp_dir.path().to_str().unwrap().to_string(); - - let catalog = Arc::new( - MemoryCatalogBuilder::default() - .load( - "memory", - HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse.clone())]), - ) - .await - .unwrap(), - ); - - let namespace = NamespaceIdent::new("ns".to_string()); - catalog - .create_namespace(&namespace, HashMap::new()) - .await - .unwrap(); - - let schema = Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(); - - let creation = match partition_spec { - Some(spec) => TableCreation::builder() 
- .name("t".to_string()) - .location(format!("{warehouse}/t")) - .schema(schema) - .partition_spec(spec) - .properties(HashMap::new()) - .build(), - None => TableCreation::builder() - .name("t".to_string()) - .location(format!("{warehouse}/t")) - .schema(schema) - .properties(HashMap::new()) - .build(), - }; - - catalog.create_table(&namespace, creation).await.unwrap(); - - (catalog, namespace, "t".to_string(), temp_dir) - } - async fn make_catalog_and_table() -> (Arc, NamespaceIdent, String, TempDir) { let temp_dir = TempDir::new().unwrap(); let warehouse = temp_dir.path().to_str().unwrap().to_string(); @@ -703,90 +546,6 @@ mod tests { assert_eq!(scan.buckets().len(), 2); } - /// Filters that only touch identity-partition columns with literal RHS - /// can be marked `Exact` because Iceberg's manifest-level pruning already - /// removes every file whose partition value fails the predicate. - #[tokio::test] - async fn test_pushdown_exact_on_identity_column() { - let spec = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .build(); - let (catalog, namespace, table_name, _temp_dir) = - make_catalog_and_partitioned_table(Some(spec)).await; - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - - let f_eq = col("id").eq(lit(5_i32)); - let f_neq = col("id").not_eq(lit(5_i32)); - let f_isnull = col("id").is_null(); - let f_and = col("id").eq(lit(5_i32)).and(col("id").lt(lit(10_i32))); - - let supports = provider - .supports_filters_pushdown(&[&f_eq, &f_neq, &f_isnull, &f_and]) - .unwrap(); - for (i, s) in supports.iter().enumerate() { - assert!( - matches!(s, TableProviderFilterPushDown::Exact), - "filter index {i} should be Exact, got {s:?}" - ); - } - } - - /// Filters touching non-partition columns or columns with non-identity - /// transforms must remain `Inexact`: the partition value is either - /// missing or lossy (bucket/truncate/etc.), so DataFusion 
still needs to - /// re-apply the filter against actual row values. - #[tokio::test] - async fn test_pushdown_inexact_on_non_identity_column() { - let spec = UnboundPartitionSpec::builder() - .add_partition_field(1, "id_part", Transform::Identity) - .unwrap() - .build(); - let (catalog, namespace, table_name, _temp_dir) = - make_catalog_and_partitioned_table(Some(spec)).await; - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - - // `name` is not partitioned — manifest pruning cannot eliminate files - // by it, so the filter must re-execute post-scan. - let f_name = col("name").eq(lit("alice")); - // Mixed AND: even though `id` is identity-partitioned, the `name` arm - // is not exact, so the whole expression is Inexact. - let f_mixed = col("id").eq(lit(5_i32)).and(col("name").eq(lit("alice"))); - - let supports = provider - .supports_filters_pushdown(&[&f_name, &f_mixed]) - .unwrap(); - for (i, s) in supports.iter().enumerate() { - assert!( - matches!(s, TableProviderFilterPushDown::Inexact), - "filter index {i} should be Inexact, got {s:?}" - ); - } - } - - /// Unpartitioned tables must mark every filter `Inexact` regardless of - /// shape: there is no partition pruning that could make the scan - /// authoritative. 
- #[tokio::test] - async fn test_pushdown_unpartitioned_table_all_inexact() { - let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - - let f_id = col("id").eq(lit(5_i32)); - let f_name = col("name").eq(lit("alice")); - let supports = provider - .supports_filters_pushdown(&[&f_id, &f_name]) - .unwrap(); - assert!(matches!(supports[0], TableProviderFilterPushDown::Inexact)); - assert!(matches!(supports[1], TableProviderFilterPushDown::Inexact)); - } - /// target_partitions = 1 collapses every task into a single bucket, giving /// the same execution profile as `IcebergTableScan`. #[tokio::test] From 34d6f4c02715254ec3b48f33697393c28fe1c905 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 28 Apr 2026 17:29:19 +0200 Subject: [PATCH 11/32] refactor(datafusion): merge IcebergPartitionedTableProvider into IcebergTableProvider IcebergPartitionedTableProvider and IcebergPartitionedScan were introduced to enable parallel file scanning by bucketing FileScanTasks across DataFusion partitions. However, maintaining two TableProvider implementations is redundant: the new provider is strictly more capable, and its degenerate case (target_partitions=1) reproduces the old single-partition behavior exactly. This commit folds the partitioned provider into IcebergTableProvider and the partitioned scan into IcebergTableScan, eliminating the parallel types. Changes: - IcebergTableProvider::scan() now eagerly calls plan_files() and distributes FileScanTasks into buckets using the same identity-hash strategy (REPARTITION_RANDOM_STATE + create_hashes) that was in IcebergPartitionedTableProvider, enabling Partitioning::Hash declarations that align with DataFusion's RepartitionExec. - IcebergTableScan gains a new_with_tasks() constructor that accepts pre-planned buckets and a caller-supplied Partitioning. 
execute(i) streams the tasks in buckets[i] via TableScan::to_arrow_with_tasks, rebuilding the TableScan per-partition to avoid serializing PlanContext Arc-shared caches across workers. - The original new() constructor and the to_arrow() lazy path are kept unchanged for IcebergStaticTableProvider, which does not pre-plan tasks. - Limit slicing (try_filter_map truncation) from the old IcebergTableScan is preserved in both execution paths. - Bucketing helpers (IdentityCol, compute_identity_cols, bucket_tasks, identity_hash, fallback_hash, literal_to_array, is_supported_dtype) are moved verbatim into a new private table/bucketing.rs module. - Unit tests from partitioned.rs are migrated to table/mod.rs and updated to use IcebergTableProvider and IcebergTableScan. - integration_datafusion_test.rs: fix test_provider_plan_stream_schema to call execute(0) instead of execute(1). The old call worked only because the previous IcebergTableScan silently ignored the partition index. (cherry picked from commit d2e5e0412c1e7e17f85f9fa549af1544d07eaae1) --- crates/integrations/datafusion/src/lib.rs | 2 - .../datafusion/src/physical_plan/mod.rs | 2 - .../src/physical_plan/partitioned_scan.rs | 253 -------- .../datafusion/src/physical_plan/scan.rs | 241 ++++++-- .../datafusion/src/table/bucketing.rs | 189 ++++++ .../integrations/datafusion/src/table/mod.rs | 301 ++++++++- .../datafusion/src/table/partitioned.rs | 571 ------------------ .../tests/integration_datafusion_test.rs | 2 +- 8 files changed, 676 insertions(+), 885 deletions(-) delete mode 100644 crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs create mode 100644 crates/integrations/datafusion/src/table/bucketing.rs delete mode 100644 crates/integrations/datafusion/src/table/partitioned.rs diff --git a/crates/integrations/datafusion/src/lib.rs b/crates/integrations/datafusion/src/lib.rs index 9a84832d88..4b0ea8606d 100644 --- a/crates/integrations/datafusion/src/lib.rs +++ 
b/crates/integrations/datafusion/src/lib.rs @@ -24,8 +24,6 @@ pub use error::*; pub mod physical_plan; mod schema; pub mod table; -pub use physical_plan::IcebergPartitionedScan; -pub use table::partitioned::IcebergPartitionedTableProvider; pub use table::table_provider_factory::IcebergTableProviderFactory; pub use table::*; diff --git a/crates/integrations/datafusion/src/physical_plan/mod.rs b/crates/integrations/datafusion/src/physical_plan/mod.rs index a257fe9e20..aeac30de32 100644 --- a/crates/integrations/datafusion/src/physical_plan/mod.rs +++ b/crates/integrations/datafusion/src/physical_plan/mod.rs @@ -18,7 +18,6 @@ pub(crate) mod commit; pub(crate) mod expr_to_predicate; pub(crate) mod metadata_scan; -pub(crate) mod partitioned_scan; pub(crate) mod project; pub(crate) mod repartition; pub(crate) mod scan; @@ -28,6 +27,5 @@ pub(crate) mod write; pub(crate) const DATA_FILES_COL_NAME: &str = "data_files"; pub use expr_to_predicate::convert_filters_to_predicate; -pub use partitioned_scan::IcebergPartitionedScan; pub use project::project_with_partition; pub use scan::IcebergTableScan; diff --git a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs b/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs deleted file mode 100644 index 159665dddd..0000000000 --- a/crates/integrations/datafusion/src/physical_plan/partitioned_scan.rs +++ /dev/null @@ -1,253 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::sync::Arc; - -use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; -use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::{SendableRecordBatchStream, TaskContext}; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; -use datafusion::physical_plan::stream::RecordBatchStreamAdapter; -use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties}; -use datafusion::prelude::Expr; -use futures::TryStreamExt; -use iceberg::expr::Predicate; -use iceberg::scan::FileScanTask; -use iceberg::table::Table; - -use super::expr_to_predicate::convert_filters_to_predicate; -use super::scan::get_column_names; -use crate::to_datafusion_error; - -/// A DataFusion [`ExecutionPlan`] that reads a bucket of [`FileScanTask`]s per partition. -/// -/// Each DataFusion partition `i` streams every [`FileScanTask`] in `buckets[i]`, -/// concatenated into a single Arrow record-batch stream. The caller decides how -/// tasks are assigned to buckets and supplies the resulting [`Partitioning`] -/// (typically [`Partitioning::Hash`] when files are bucketed by identity-partition -/// values matching DataFusion's repartition hash, otherwise -/// [`Partitioning::UnknownPartitioning`]). 
-/// -/// Arrow reader configuration (row-group filtering, row selection, concurrency -/// limit, batch size) matches [`IcebergTableScan`][super::scan::IcebergTableScan]: -/// it is sourced from the underlying [`TableScan`][iceberg::scan::TableScan] -/// rebuilt in [`execute`](ExecutionPlan::execute) and applied via -/// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. -/// -/// Note: the `TableScan` is rebuilt on every `execute(partition)` call rather -/// than cached as an `Arc` on the struct. Caching would avoid -/// redundant schema resolution and predicate binding per partition, but -/// `TableScan` carries a `PlanContext` with `Arc`-shared evaluator caches -/// which is awkward to serialize if this plan ever needs to be shipped across -/// workers. The per-build cost is bounded (no I/O), so the rebuild is kept -/// for now; revisit once the cross-worker story is clearer. -#[derive(Debug)] -pub struct IcebergPartitionedScan { - /// A table in the catalog. - table: Table, - /// Snapshot of the table to scan. - snapshot_id: Option, - /// Stores certain, often expensive to compute, - /// plan properties used in query optimization. - plan_properties: Arc, - /// Projection column names, None means all columns. - projection: Option>, - /// Filters to apply to the table scan. - predicates: Option, - /// Pre-planned file scan tasks grouped by output DataFusion partition. - /// `buckets[i]` holds every task that `execute(i)` will read. 
- buckets: Vec>, -} - -impl IcebergPartitionedScan { - pub(crate) fn new( - table: Table, - snapshot_id: Option, - schema: ArrowSchemaRef, - projection: Option<&Vec>, - filters: &[Expr], - buckets: Vec>, - partitioning: Partitioning, - ) -> Self { - let output_schema = match projection { - None => schema.clone(), - Some(projection) => Arc::new(schema.project(projection).unwrap()), - }; - let plan_properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(output_schema), - partitioning, - EmissionType::Incremental, - Boundedness::Bounded, - )); - let projection = get_column_names(schema, projection); - let predicates = convert_filters_to_predicate(filters); - - Self { - table, - snapshot_id, - plan_properties, - projection, - predicates, - buckets, - } - } - - pub fn table(&self) -> &Table { - &self.table - } - - pub fn snapshot_id(&self) -> Option { - self.snapshot_id - } - - pub fn projection(&self) -> Option<&[String]> { - self.projection.as_deref() - } - - pub fn predicates(&self) -> Option<&Predicate> { - self.predicates.as_ref() - } - - pub fn buckets(&self) -> &[Vec] { - &self.buckets - } - - fn total_file_count(&self) -> usize { - self.buckets.iter().map(|b| b.len()).sum() - } -} - -impl ExecutionPlan for IcebergPartitionedScan { - fn name(&self) -> &str { - "IcebergPartitionedScan" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn children(&self) -> Vec<&Arc> { - vec![] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if !children.is_empty() { - return Err(DataFusionError::Internal(format!( - "{} is a leaf node and expects no children, but {} were provided", - self.name(), - children.len() - ))); - } - Ok(self) - } - - fn properties(&self) -> &Arc { - &self.plan_properties - } - - fn execute( - &self, - partition: usize, - _context: Arc, - ) -> DFResult { - let bucket = self.buckets.get(partition).cloned().ok_or_else(|| { - DataFusionError::Internal(format!( - "{}: partition index {partition} is out of 
bounds (total buckets: {})", - self.name(), - self.buckets.len() - )) - })?; - - let table = self.table.clone(); - let snapshot_id = self.snapshot_id; - let column_names = self.projection.clone(); - let predicates = self.predicates.clone(); - - let fut = async move { - // Rebuild a TableScan mirroring IcebergTableScan::get_batch_stream so we - // inherit the same defaults (row-group filtering, batch size, concurrency, ...). - let scan_builder = match snapshot_id { - Some(id) => table.scan().snapshot_id(id), - None => table.scan(), - }; - let mut scan_builder = match column_names { - Some(names) => scan_builder.select(names), - None => scan_builder.select_all(), - }; - if let Some(pred) = predicates { - scan_builder = scan_builder.with_filter(pred); - } - let table_scan = scan_builder.build().map_err(to_datafusion_error)?; - - let task_stream = Box::pin(futures::stream::iter( - bucket.into_iter().map(Ok::<_, iceberg::Error>), - )); - let record_batch_stream = table_scan - .to_arrow_with_tasks(task_stream) - .map_err(to_datafusion_error)? 
- .map_err(to_datafusion_error); - Ok::<_, DataFusionError>(record_batch_stream) - }; - - let stream = futures::stream::once(fut).try_flatten(); - - Ok(Box::pin(RecordBatchStreamAdapter::new( - self.schema(), - stream, - ))) - } -} - -impl DisplayAs for IcebergPartitionedScan { - fn fmt_as( - &self, - _t: datafusion::physical_plan::DisplayFormatType, - f: &mut std::fmt::Formatter, - ) -> std::fmt::Result { - let projection = self - .projection - .clone() - .map_or(String::new(), |v| v.join(",")); - let predicate = self - .predicates - .clone() - .map_or(String::new(), |p| format!("{p}")); - let file_count = self.total_file_count(); - let bucket_count = self.buckets.len(); - write!( - f, - "{} projection:[{projection}] predicate:[{predicate}] \ - buckets:[{bucket_count}] file_count:[{file_count}]", - self.name() - )?; - if file_count <= 5 { - let files = self - .buckets - .iter() - .flat_map(|b| b.iter().map(|t| t.data_file_path())) - .collect::>() - .join(", "); - write!(f, " files:[{files}]")?; - } - Ok(()) - } -} diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index c5a892abeb..49ff61e999 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -18,11 +18,10 @@ use std::any::Any; use std::pin::Pin; use std::sync::Arc; -use std::vec; use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; -use datafusion::error::Result as DFResult; +use datafusion::error::{DataFusionError, Result as DFResult}; use datafusion::execution::{SendableRecordBatchStream, TaskContext}; use datafusion::physical_expr::EquivalenceProperties; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; @@ -31,6 +30,7 @@ use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProp use datafusion::prelude::Expr; use futures::{Stream, TryStreamExt}; use 
iceberg::expr::Predicate; +use iceberg::scan::FileScanTask; use iceberg::table::Table; use super::expr_to_predicate::convert_filters_to_predicate; @@ -38,6 +38,26 @@ use crate::to_datafusion_error; /// Manages the scanning process of an Iceberg [`Table`], encapsulating the /// necessary details and computed properties required for execution planning. +/// +/// When constructed with pre-planned [`FileScanTask`] buckets via +/// [`IcebergTableScan::new_with_tasks`], each DataFusion partition `i` streams +/// every task in `buckets[i]` using +/// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. +/// +/// When constructed via [`IcebergTableScan::new`] (no pre-planned tasks), the +/// full table is scanned lazily in a single partition using +/// [`TableScan::to_arrow`][iceberg::scan::TableScan::to_arrow]. This mode is +/// used by [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. +/// +/// In both modes the optional `limit` field truncates the output stream to at +/// most that many rows. +/// +/// Note: when using pre-planned tasks, the `TableScan` is rebuilt on every +/// `execute(partition)` call rather than cached. `TableScan` carries a +/// `PlanContext` with `Arc`-shared evaluator caches which is awkward to +/// serialize if this plan ever needs to be shipped across workers. The +/// per-build cost is bounded (no I/O), so the rebuild is kept for now; +/// revisit once the cross-worker story is clearer. #[derive(Debug)] pub struct IcebergTableScan { /// A table in the catalog. @@ -47,16 +67,25 @@ pub struct IcebergTableScan { /// Stores certain, often expensive to compute, /// plan properties used in query optimization. plan_properties: Arc, - /// Projection column names, None means all columns + /// Projection column names, None means all columns. projection: Option>, - /// Filters to apply to the table scan + /// Filters to apply to the table scan. 
predicates: Option, - /// Optional limit on the number of rows to return + /// Pre-planned file scan tasks grouped by output DataFusion partition. + /// `None` in lazy mode (single-partition scan via `to_arrow()`). + /// `Some(buckets)` in eager mode: `buckets[i]` holds every task that + /// `execute(i)` will read. + buckets: Option>>, + /// Optional limit on the number of rows to return. limit: Option, } impl IcebergTableScan { - /// Creates a new [`IcebergTableScan`] object. + /// Creates a lazy single-partition scan. + /// + /// All file tasks are discovered and read inside `execute(0)` via + /// [`TableScan::to_arrow`][iceberg::scan::TableScan::to_arrow]. + /// Used by [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. pub(crate) fn new( table: Table, snapshot_id: Option, @@ -69,8 +98,13 @@ impl IcebergTableScan { None => schema.clone(), Some(projection) => Arc::new(schema.project(projection).unwrap()), }; - let plan_properties = Self::compute_properties(output_schema.clone()); - let projection = get_column_names(schema.clone(), projection); + let plan_properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema), + Partitioning::UnknownPartitioning(1), + EmissionType::Incremental, + Boundedness::Bounded, + )); + let projection = get_column_names(schema, projection); let predicates = convert_filters_to_predicate(filters); Self { @@ -79,6 +113,49 @@ impl IcebergTableScan { plan_properties, projection, predicates, + buckets: None, + limit, + } + } + + /// Creates an eager multi-partition scan from pre-planned file task buckets. + /// + /// Each DataFusion partition `i` streams the tasks in `buckets[i]` via + /// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. + /// The `partitioning` argument is used directly for [`PlanProperties`], so the + /// caller is responsible for ensuring it matches the bucketing strategy. 
+ /// Used by [`IcebergTableProvider`][crate::table::IcebergTableProvider]. + #[allow(clippy::too_many_arguments)] + pub(crate) fn new_with_tasks( + table: Table, + snapshot_id: Option, + schema: ArrowSchemaRef, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + buckets: Vec>, + partitioning: Partitioning, + ) -> Self { + let output_schema = match projection { + None => schema.clone(), + Some(projection) => Arc::new(schema.project(projection).unwrap()), + }; + let plan_properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema), + partitioning, + EmissionType::Incremental, + Boundedness::Bounded, + )); + let projection = get_column_names(schema, projection); + let predicates = convert_filters_to_predicate(filters); + + Self { + table, + snapshot_id, + plan_properties, + projection, + predicates, + buckets: Some(buckets), limit, } } @@ -99,21 +176,17 @@ impl IcebergTableScan { self.predicates.as_ref() } + /// Returns the pre-planned file task buckets, or an empty slice in lazy mode. + pub fn buckets(&self) -> &[Vec] { + self.buckets.as_deref().unwrap_or(&[]) + } + pub fn limit(&self) -> Option { self.limit } - /// Computes [`PlanProperties`] used in query optimization. 
- fn compute_properties(schema: ArrowSchemaRef) -> Arc { - // TODO: - // This is more or less a placeholder, to be replaced - // once we support output-partitioning - Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema), - Partitioning::UnknownPartitioning(1), - EmissionType::Incremental, - Boundedness::Bounded, - )) + fn total_file_count(&self) -> usize { + self.buckets().iter().map(|b| b.len()).sum() } } @@ -132,8 +205,15 @@ impl ExecutionPlan for IcebergTableScan { fn with_new_children( self: Arc, - _children: Vec>, + children: Vec>, ) -> DFResult> { + if !children.is_empty() { + return Err(DataFusionError::Internal(format!( + "{} is a leaf node and expects no children, but {} were provided", + self.name(), + children.len() + ))); + } Ok(self) } @@ -143,20 +223,66 @@ impl ExecutionPlan for IcebergTableScan { fn execute( &self, - _partition: usize, + partition: usize, _context: Arc, ) -> DFResult { - let fut = get_batch_stream( - self.table.clone(), - self.snapshot_id, - self.projection.clone(), - self.predicates.clone(), - ); - let stream = futures::stream::once(fut).try_flatten(); - - // Apply limit if specified + let table = self.table.clone(); + let snapshot_id = self.snapshot_id; + let column_names = self.projection.clone(); + let predicates = self.predicates.clone(); + let limit = self.limit; + + let stream = match &self.buckets { + Some(buckets) => { + // Eager mode: stream the pre-planned bucket for this partition. + let bucket = buckets.get(partition).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "{}: partition index {partition} is out of bounds (total buckets: {})", + self.name(), + buckets.len() + )) + })?; + + let fut = async move { + // Rebuild a TableScan so we inherit the same defaults + // (row-group filtering, batch size, concurrency, ...). 
+ let scan_builder = match snapshot_id { + Some(id) => table.scan().snapshot_id(id), + None => table.scan(), + }; + let mut scan_builder = match column_names { + Some(names) => scan_builder.select(names), + None => scan_builder.select_all(), + }; + if let Some(pred) = predicates { + scan_builder = scan_builder.with_filter(pred); + } + let table_scan = scan_builder.build().map_err(to_datafusion_error)?; + + let task_stream = Box::pin(futures::stream::iter( + bucket.into_iter().map(Ok::<_, iceberg::Error>), + )); + let record_batch_stream = table_scan + .to_arrow_with_tasks(task_stream) + .map_err(to_datafusion_error)? + .map_err(to_datafusion_error); + Ok::<_, DataFusionError>(record_batch_stream) + }; + + let s = futures::stream::once(fut).try_flatten(); + Box::pin(s) as Pin> + Send>> + } + None => { + // Lazy mode: discover and read all tasks inside execute(). + let fut = get_batch_stream(table, snapshot_id, column_names, predicates); + let s = futures::stream::once(fut).try_flatten(); + Box::pin(s) + } + }; + + // Apply limit if specified. 
let limited_stream: Pin> + Send>> = - if let Some(limit) = self.limit { + if let Some(limit) = limit { let mut remaining = limit; Box::pin(stream.try_filter_map(move |batch| { futures::future::ready(if remaining == 0 { @@ -171,7 +297,7 @@ impl ExecutionPlan for IcebergTableScan { }) })) } else { - Box::pin(stream) + stream }; Ok(Box::pin(RecordBatchStreamAdapter::new( @@ -187,16 +313,40 @@ impl DisplayAs for IcebergTableScan { _t: datafusion::physical_plan::DisplayFormatType, f: &mut std::fmt::Formatter, ) -> std::fmt::Result { - write!( - f, - "IcebergTableScan projection:[{}] predicate:[{}]", - self.projection - .clone() - .map_or(String::new(), |v| v.join(",")), - self.predicates - .clone() - .map_or(String::from(""), |p| format!("{p}")) - )?; + let projection = self + .projection + .clone() + .map_or(String::new(), |v| v.join(",")); + let predicate = self + .predicates + .clone() + .map_or(String::new(), |p| format!("{p}")); + + match &self.buckets { + Some(buckets) => { + let file_count = self.total_file_count(); + let bucket_count = buckets.len(); + write!( + f, + "{} projection:[{projection}] predicate:[{predicate}] \ + buckets:[{bucket_count}] file_count:[{file_count}]", + self.name() + )?; + if file_count <= 5 { + let files = buckets + .iter() + .flat_map(|b| b.iter().map(|t| t.data_file_path())) + .collect::>() + .join(", "); + write!(f, " files:[{files}]")?; + } + } + None => write!( + f, + "{} projection:[{projection}] predicate:[{predicate}]", + self.name() + )?, + } if let Some(limit) = self.limit { write!(f, " limit:[{limit}]")?; } @@ -204,11 +354,8 @@ impl DisplayAs for IcebergTableScan { } } -/// Asynchronously retrieves a stream of [`RecordBatch`] instances -/// from a given table. -/// -/// This function initializes a [`TableScan`], builds it, -/// and then converts it into a stream of Arrow [`RecordBatch`]es. +/// Asynchronously retrieves a stream of [`RecordBatch`] instances from a +/// given table. Used in lazy (single-partition) scan mode. 
async fn get_batch_stream( table: Table, snapshot_id: Option, diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs new file mode 100644 index 0000000000..4c58730ce5 --- /dev/null +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -0,0 +1,189 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion::arrow::array::{ + ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, + StringArray, +}; +use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema}; +use datafusion::common::hash_utils::create_hashes; +use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; +use iceberg::scan::FileScanTask; +use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; +use iceberg::table::Table; + +/// Identity-partitioned column that is also present in the output projection +/// and whose Arrow type can be reconstructed from a `Literal` for hashing. +pub(super) struct IdentityCol { + pub(super) name: String, + /// Position of this column in the *output* schema (after projection). 
+    pub(super) output_idx: usize,
+    /// Position of this column inside the partition spec's `fields()` slice,
+    /// matching the slot order of `FileScanTask::partition`.
+    pub(super) spec_field_idx: usize,
+    pub(super) output_dtype: DataType,
+}
+
+/// Inspect the table's default partition spec and return the identity columns
+/// that can support a `Partitioning::Hash` declaration.
+///
+/// Non-identity partition fields are skipped, so a default spec without any
+/// identity field yields `Some(vec![])`; an empty list gives nothing to hash
+/// on and must not be turned into a `Hash` declaration.
+///
+/// Returns `None` if any condition is violated:
+/// - the table has spec evolution (>1 historical specs), since older files
+///   may carry a partition tuple that does not align with the default spec
+/// - the source field of an identity partition field is not found in the
+///   current table schema
+/// - the source column for an identity field is not in the output projection
+/// - the source column's Arrow type is not currently supported by
+///   [`literal_to_array`]
+///
+/// Returning `None` forces the scan to declare `UnknownPartitioning` even if
+/// bucketing succeeds.
+pub(super) fn compute_identity_cols(
+    table: &Table,
+    output_schema: &ArrowSchema,
+) -> Option<Vec<IdentityCol>> {
+    let metadata = table.metadata();
+    // More than one spec on record: a file's partition tuple may follow a
+    // spec other than the default, so slot indices cannot be trusted.
+    if metadata.partition_specs_iter().len() > 1 {
+        return None;
+    }
+    let spec = metadata.default_partition_spec();
+    let table_schema = metadata.current_schema();
+
+    let mut cols = Vec::new();
+    // Enumerate over *all* spec fields so `spec_field_idx` stays aligned with
+    // the partition tuple even though non-identity fields are skipped.
+    for (spec_field_idx, pf) in spec.fields().iter().enumerate() {
+        if pf.transform != Transform::Identity {
+            continue;
+        }
+        let source_field = table_schema.field_by_id(pf.source_id)?;
+        // The column must be projected: `output_idx` is what a `Hash`
+        // expression would reference in the scan's output schema.
+        let output_idx = output_schema.index_of(source_field.name.as_str()).ok()?;
+        let output_dtype = output_schema.field(output_idx).data_type().clone();
+        if !is_supported_dtype(&output_dtype) {
+            return None;
+        }
+        cols.push(IdentityCol {
+            name: source_field.name.clone(),
+            output_idx,
+            spec_field_idx,
+            output_dtype,
+        });
+    }
+    Some(cols)
+}
+
+/// Arrow types for which [`literal_to_array`] can build a single-value array.
+fn is_supported_dtype(dt: &DataType) -> bool {
+    matches!(
+        dt,
+        DataType::Boolean
+            | DataType::Int32
+            | DataType::Int64
+            | DataType::Float32
+            | DataType::Float64
+            | DataType::Utf8
+            | DataType::Date32
+    )
+}
+
+/// Distribute `tasks` across
+/// `n_partitions` buckets. When `identity_cols`
+/// describes a non-empty, hashable identity key, each task is hashed on
+/// that key using DataFusion's repartition hash so the resulting partitioning
+/// matches what `RepartitionExec` would produce on the same data. Tasks
+/// missing partition data fall back to hashing `data_file_path`, which still
+/// distributes evenly but breaks the `Hash` contract — the second tuple
+/// element flags whether every task supplied a full identity key.
+///
+/// With `n_partitions == 0` there is no bucket to receive a task: the bucket
+/// list is empty and the flag is `true` only when `tasks` is empty as well.
+pub(super) fn bucket_tasks(
+    tasks: Vec<FileScanTask>,
+    n_partitions: usize,
+    identity_cols: Option<&[IdentityCol]>,
+) -> (Vec<Vec<FileScanTask>>, bool) {
+    if n_partitions == 0 {
+        return (Vec::new(), tasks.is_empty());
+    }
+    let cols = identity_cols.unwrap_or(&[]);
+    // Reduce in `u64` space on both paths; casting the 64-bit hash to `usize`
+    // before the modulo would discard its high bits on 32-bit targets.
+    let modulus = n_partitions as u64;
+    let mut buckets: Vec<Vec<FileScanTask>> = (0..n_partitions).map(|_| Vec::new()).collect();
+    let mut all_full_key = true;
+
+    for task in tasks {
+        let hash = match identity_hash(&task, cols) {
+            Some(h) => h,
+            None => {
+                // One task without a full identity key is enough to void the
+                // `Hash` contract for the whole scan.
+                all_full_key = false;
+                fallback_hash(&task)
+            }
+        };
+        // The remainder is strictly below `n_partitions`, so it fits `usize`.
+        buckets[(hash % modulus) as usize].push(task);
+    }
+    (buckets, all_full_key)
+}
+
+/// Hash the identity-partition values of `task` using
+/// [`REPARTITION_RANDOM_STATE`] so the bucket assignment matches DataFusion's
+/// hash-repartition convention. Returns `None` if the task lacks partition
+/// data or any required slot is null/unsupported.
+fn identity_hash(task: &FileScanTask, cols: &[IdentityCol]) -> Option<u64> {
+    if cols.is_empty() {
+        return None;
+    }
+    let partition = task.partition.as_ref()?;
+    // One single-row array per identity column, in `cols` order. A missing
+    // slot, a null value or an unsupported literal aborts the whole key.
+    let arrays = cols
+        .iter()
+        .map(|col| {
+            let lit = partition.fields().get(col.spec_field_idx)?.as_ref()?;
+            literal_to_array(lit, &col.output_dtype)
+        })
+        .collect::<Option<Vec<ArrayRef>>>()?;
+    // `create_hashes` combines the columns row-wise into the buffer; there is
+    // exactly one row, hence a buffer of length one.
+    let mut hashes = vec![0u64; 1];
+    create_hashes(
+        &arrays,
+        REPARTITION_RANDOM_STATE.random_state(),
+        &mut hashes,
+    )
+    .ok()?;
+    hashes.first().copied()
+}
+
+/// Deterministic per-file fallback used when `identity_hash` cannot produce a
+/// bucket. The hash function does not need to match DataFusion's because any
+/// task taking this path causes the scan to drop to `UnknownPartitioning`.
+fn fallback_hash(task: &FileScanTask) -> u64 {
+    use std::collections::hash_map::DefaultHasher;
+    use std::hash::{Hash, Hasher};
+    // Every `DefaultHasher::new()` instance hashes identically, so the same
+    // path always lands in the same bucket.
+    let mut hasher = DefaultHasher::new();
+    task.data_file_path.hash(&mut hasher);
+    hasher.finish()
+}
+
+/// Materialize a single-element Arrow array of `dt` holding the value of
+/// `lit`. The Arrow type must match what DataFusion will see for this column
+/// at scan time, otherwise `create_hashes` would dispatch on a different type
+/// and produce a hash that disagrees with DataFusion's row-wise hashing.
+fn literal_to_array(lit: &Literal, dt: &DataType) -> Option<ArrayRef> {
+    // Only primitive literals are handled; any other literal kind has no
+    // array representation here.
+    let prim = match lit {
+        Literal::Primitive(p) => p,
+        _ => return None,
+    };
+    // The array type is chosen by `dt`, not by the literal, so that the hash
+    // is computed over the same Arrow type the column has in the output schema.
+    let array: ArrayRef = match (prim, dt) {
+        (PrimitiveLiteral::Boolean(v), DataType::Boolean) => {
+            Arc::new(BooleanArray::from(vec![*v]))
+        }
+        (PrimitiveLiteral::Int(v), DataType::Int32) => Arc::new(Int32Array::from(vec![*v])),
+        (PrimitiveLiteral::Int(v), DataType::Date32) => Arc::new(Date32Array::from(vec![*v])),
+        (PrimitiveLiteral::Long(v), DataType::Int64) => Arc::new(Int64Array::from(vec![*v])),
+        (PrimitiveLiteral::Float(v), DataType::Float32) => {
+            Arc::new(Float32Array::from(vec![v.0]))
+        }
+        (PrimitiveLiteral::Double(v), DataType::Float64) => {
+            Arc::new(Float64Array::from(vec![v.0]))
+        }
+        (PrimitiveLiteral::String(v), DataType::Utf8) => {
+            Arc::new(StringArray::from(vec![v.as_str()]))
+        }
+        // Iceberg allows `int -> long` and `float -> double` promotion, so a
+        // file written before the promotion may still carry the narrower
+        // literal. Widening is lossless and yields the value of the promoted column.
+        (PrimitiveLiteral::Int(v), DataType::Int64) => {
+            Arc::new(Int64Array::from(vec![i64::from(*v)]))
+        }
+        (PrimitiveLiteral::Float(v), DataType::Float64) => {
+            Arc::new(Float64Array::from(vec![f64::from(v.0)]))
+        }
+        _ => return None,
+    };
+    Some(array)
+}
diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs
index e2c9ca6efa..df9631cb3c 100644
--- a/crates/integrations/datafusion/src/table/mod.rs
+++ b/crates/integrations/datafusion/src/table/mod.rs
@@ -21,16 +21,14 @@
 //!
 //! - [`IcebergTableProvider`]: Catalog-backed provider with automatic metadata refresh.
 //!   Use for write operations and when you need to see the latest table state.
+//!   On each scan, file tasks are eagerly planned and distributed across DataFusion
+//!   partitions for parallel execution.
 //!
 //! - [`IcebergStaticTableProvider`]: Static provider for read-only access to a specific
 //!   table snapshot. Use for consistent analytical queries or time-travel scenarios.
-//!
-//! - [`IcebergPartitionedTableProvider`]: Catalog-backed provider that assigns one
-//!   DataFusion partition per data file, enabling parallel file-level scanning.
-//!   Read-only; use [`IcebergTableProvider`] for write operations.
+mod bucketing;
 pub mod metadata_table;
-pub mod partitioned;
 pub mod table_provider_factory;
 
 use std::any::Any;
@@ -45,10 +43,15 @@ use datafusion::datasource::{TableProvider, TableType};
 use datafusion::error::Result as DFResult;
 use datafusion::logical_expr::dml::InsertOp;
 use datafusion::logical_expr::{Expr, TableProviderFilterPushDown};
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_expr::expressions::Column;
 use datafusion::physical_plan::ExecutionPlan;
+use datafusion::physical_plan::Partitioning;
 use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec;
+use futures::TryStreamExt;
 use iceberg::arrow::schema_to_arrow_schema;
 use iceberg::inspect::MetadataTableType;
+use iceberg::scan::FileScanTask;
 use iceberg::spec::TableProperties;
 use iceberg::table::Table;
 use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent};
@@ -56,6 +59,7 @@ use metadata_table::IcebergMetadataTableProvider;
 
 use crate::error::to_datafusion_error;
 use crate::physical_plan::commit::IcebergCommitExec;
+use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate;
 use crate::physical_plan::project::project_with_partition;
 use crate::physical_plan::repartition::repartition;
 use crate::physical_plan::scan::IcebergTableScan;
@@ -129,26 +133,93 @@ impl TableProvider for IcebergTableProvider {
 
     async fn scan(
         &self,
-        _state: &dyn Session,
+        state: &dyn Session,
         projection: Option<&Vec<usize>>,
         filters: &[Expr],
         limit: Option<usize>,
     ) -> DFResult<Arc<dyn ExecutionPlan>> {
-        // Load fresh table metadata from catalog
+        // Reload the table from the catalog so every scan reflects the latest table state.
         let table = self
             .catalog
             .load_table(&self.table_ident)
             .await
             .map_err(to_datafusion_error)?;
 
-        // Create scan with fresh metadata (always use current snapshot)
-        Ok(Arc::new(IcebergTableScan::new(
+        // Build a TableScan mirroring the inputs we'll hand to IcebergTableScan,
+        // so plan_files() uses the same projection/filters the scan will replay in execute().
+        let col_names = projection.map(|indices| {
+            indices
+                .iter()
+                .map(|&i| self.schema.field(i).name().clone())
+                .collect::<Vec<_>>()
+        });
+
+        let predicate = convert_filters_to_predicate(filters);
+
+        let mut builder = table.scan();
+        builder = match col_names {
+            Some(names) => builder.select(names),
+            None => builder.select_all(),
+        };
+        if let Some(pred) = predicate {
+            builder = builder.with_filter(pred);
+        }
+
+        let tasks: Vec<FileScanTask> = builder
+            .build()
+            .map_err(to_datafusion_error)?
+            .plan_files()
+            .await
+            .map_err(to_datafusion_error)?
+            .try_collect::<Vec<_>>()
+            .await
+            .map_err(to_datafusion_error)?;
+
+        // Output schema after projection: column indices in `Hash` exprs and any
+        // Arrow array we hash must reference this schema, not the full table schema.
+        let output_schema = match projection {
+            None => self.schema.clone(),
+            Some(p) => Arc::new(self.schema.project(p).map_err(|e| {
+                to_datafusion_error(Error::new(ErrorKind::DataInvalid, e.to_string()))
+            })?),
+        };
+
+        let target_partitions = state.config().target_partitions();
+        // Always produce at least 1 partition so that DataFusion can schedule
+        // the plan normally and callers can safely call execute(0). An empty
+        // bucket simply yields an empty record-batch stream.
+        let n_partitions = target_partitions.min(tasks.len()).max(1);
+
+        // identity_cols lists the identity-transform fields of the default spec. It is
+        // None when the table has more than one partition spec, or when an identity
+        // source column is missing from the output projection or has a type that
+        // literal_to_array cannot build; it is Some(empty) when the spec has no
+        // identity fields. Only Some(non-empty) can back Partitioning::Hash below.
+        let identity_cols = bucketing::compute_identity_cols(&table, &output_schema);
+
+        let (buckets, all_had_full_key) =
+            bucketing::bucket_tasks(tasks, n_partitions, identity_cols.as_deref());
+
+        let partitioning = match identity_cols {
+            Some(cols) if !cols.is_empty() && all_had_full_key => {
+                let exprs: Vec<Arc<dyn PhysicalExpr>> = cols
+                    .iter()
+                    .map(|c| Arc::new(Column::new(&c.name, c.output_idx)) as Arc<dyn PhysicalExpr>)
+                    .collect();
+                Partitioning::Hash(exprs, n_partitions)
+            }
+            _ => Partitioning::UnknownPartitioning(n_partitions),
+        };
+
+        Ok(Arc::new(IcebergTableScan::new_with_tasks(
             table,
             None, // Always use current snapshot for catalog-backed provider
             self.schema.clone(),
             projection,
             filters,
             limit,
+            buckets,
+            partitioning,
         )))
     }
 
@@ -870,4 +941,216 @@ mod tests {
             "Limit should be None when not specified"
         );
     }
+
+    // ── IcebergTableProvider bucketed scan tests ─────────────────────────────
+    // (Originally from table/partitioned.rs; updated to use IcebergTableProvider
+    // and IcebergTableScan after the IcebergPartitionedTableProvider merge.)
+
+    /// Creates a memory catalog whose warehouse lives in a fresh temp dir, plus a
+    /// namespace `ns` and a table `t` with columns `id: int` and `name: string`.
+    /// No partition spec is supplied at creation. The `TempDir` is returned so
+    /// the caller decides how long the warehouse directory stays alive.
+    async fn make_catalog_and_table_for_bucketing(
+    ) -> (Arc<dyn Catalog>, NamespaceIdent, String, tempfile::TempDir) {
+        use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder};
+        use iceberg::spec::{NestedField, PrimitiveType, Schema, Type};
+        use iceberg::{CatalogBuilder, TableCreation};
+
+        let temp_dir = tempfile::TempDir::new().unwrap();
+        let warehouse = temp_dir.path().to_str().unwrap().to_string();
+
+        let catalog = Arc::new(
+            MemoryCatalogBuilder::default()
+                .load(
+                    "memory",
+                    std::collections::HashMap::from([(
+                        MEMORY_CATALOG_WAREHOUSE.to_string(),
+                        warehouse.clone(),
+                    )]),
+                )
+                .await
+                .unwrap(),
+        );
+
+        let namespace = NamespaceIdent::new("ns".to_string());
+        catalog
+            .create_namespace(&namespace, std::collections::HashMap::new())
+            .await
+            .unwrap();
+
+        let schema = Schema::builder()
+            .with_schema_id(0)
+            .with_fields(vec![
+                NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+                NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(),
+            ])
+            .build()
+            .unwrap();
+
+        catalog
+            .create_table(
+                &namespace,
+                TableCreation::builder()
+                    .name("t".to_string())
+                    .location(format!("{warehouse}/t"))
+                    .schema(schema)
+                    .properties(std::collections::HashMap::new())
+                    .build(),
+            )
+            .await
+            .unwrap();
+
+        (catalog, namespace, "t".to_string(), temp_dir)
+    }
+
+    /// Registers `n` synthetic data files in the table metadata via the iceberg
+    /// transaction API. No actual parquet files are written, only the metadata
+    /// entries that `plan_files()` reads are created.
+    async fn append_fake_data_files(
+        catalog: &Arc<dyn Catalog>,
+        namespace: &NamespaceIdent,
+        table_name: &str,
+        n: usize,
+    ) {
+        use iceberg::spec::{DataContentType, DataFileBuilder, DataFileFormat};
+        use iceberg::transaction::{ApplyTransactionAction, Transaction};
+
+        let table = catalog
+            .load_table(&TableIdent::new(namespace.clone(), table_name.to_string()))
+            .await
+            .unwrap();
+
+        // Both values are the same for every file, so read them once.
+        let location = table.metadata().location();
+        let spec_id = table.metadata().default_partition_spec_id();
+
+        let data_files = (0..n)
+            .map(|i| {
+                DataFileBuilder::default()
+                    .content(DataContentType::Data)
+                    .file_path(format!("{location}/data/fake_{i}.parquet"))
+                    .file_format(DataFileFormat::Parquet)
+                    .file_size_in_bytes(128)
+                    .record_count(1)
+                    .partition_spec_id(spec_id)
+                    .build()
+                    .unwrap()
+            })
+            .collect::<Vec<_>>();
+
+        let tx = Transaction::new(&table);
+        let action = tx.fast_append().add_data_files(data_files);
+        action
+            .apply(tx)
+            .unwrap()
+            .commit(catalog.as_ref())
+            .await
+            .unwrap();
+    }
+
+    /// Session context whose `target_partitions` setting is `n`, which is the
+    /// upper bound `scan()` uses for the number of buckets.
+    fn ctx_with_target_partitions(n: usize) -> SessionContext {
+        use datafusion::prelude::SessionConfig;
+        SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n))
+    }
+
+    /// An empty table must produce a single empty-bucket scan so that DataFusion
+    /// can schedule the plan normally. execute(0) on an empty bucket simply
+    /// returns an empty record-batch stream.
+    #[tokio::test]
+    async fn test_empty_table_single_empty_bucket() {
+        let (catalog, namespace, table_name, _temp_dir) =
+            make_catalog_and_table_for_bucketing().await;
+        // no files appended
+        let provider = IcebergTableProvider::try_new(catalog, namespace, table_name)
+            .await
+            .unwrap();
+        let plan = provider
+            .scan(&ctx_with_target_partitions(8).state(), None, &[], None)
+            .await
+            .unwrap();
+        let scan = plan
+            .as_any()
+            .downcast_ref::<IcebergTableScan>()
+            .expect("scan() should return an IcebergTableScan");
+
+        assert_eq!(scan.buckets().len(), 1);
+        assert_eq!(scan.buckets()[0].len(), 0);
+        assert_eq!(scan.properties().partitioning.partition_count(), 1);
+    }
+
+    /// When the table has no identity-partition columns, every task takes the
+    /// fallback (file_path) bucket path, so the declaration must drop to
+    /// `UnknownPartitioning`. The bucket count should still equal
+    /// min(target_partitions, num_files).
+    #[tokio::test]
+    async fn test_unpartitioned_falls_back_to_unknown() {
+        use datafusion::physical_plan::Partitioning;
+
+        let (catalog, namespace, table_name, _temp_dir) =
+            make_catalog_and_table_for_bucketing().await;
+        append_fake_data_files(&catalog, &namespace, &table_name, 5).await;
+
+        let provider = IcebergTableProvider::try_new(catalog, namespace, table_name)
+            .await
+            .unwrap();
+        let plan = provider
+            .scan(&ctx_with_target_partitions(3).state(), None, &[], None)
+            .await
+            .unwrap();
+        let scan = plan
+            .as_any()
+            .downcast_ref::<IcebergTableScan>()
+            .expect("scan() should return an IcebergTableScan");
+
+        // Every planned file must land in exactly one bucket.
+        let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum();
+        assert_eq!(total_files, 5);
+        assert_eq!(scan.buckets().len(), 3);
+        assert!(matches!(
+            scan.properties().partitioning,
+            Partitioning::UnknownPartitioning(3)
+        ));
+    }
+
+    /// Bucket count must be capped at the number of files: spinning up more
+    /// DataFusion partitions than there are tasks would just leave empty
+    /// streams, wasting scheduler slots.
+    #[tokio::test]
+    async fn test_bucket_count_capped_at_file_count() {
+        let (catalog, namespace, table_name, _temp_dir) =
+            make_catalog_and_table_for_bucketing().await;
+        append_fake_data_files(&catalog, &namespace, &table_name, 2).await;
+
+        let provider = IcebergTableProvider::try_new(catalog, namespace, table_name)
+            .await
+            .unwrap();
+        let plan = provider
+            .scan(&ctx_with_target_partitions(16).state(), None, &[], None)
+            .await
+            .unwrap();
+        let scan = plan
+            .as_any()
+            .downcast_ref::<IcebergTableScan>()
+            .expect("scan() should return an IcebergTableScan");
+
+        assert_eq!(scan.buckets().len(), 2);
+        // Capping the bucket count must not drop tasks, and the declared
+        // partition count has to agree with the number of buckets.
+        let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum();
+        assert_eq!(total_files, 2);
+        assert_eq!(scan.properties().partitioning.partition_count(), 2);
+    }
+
+    /// target_partitions = 1 collapses every task into a single bucket, giving
+    /// the same execution profile as a single-partition scan.
+    #[tokio::test]
+    async fn test_single_target_partition_single_bucket() {
+        let (catalog, namespace, table_name, _temp_dir) =
+            make_catalog_and_table_for_bucketing().await;
+        append_fake_data_files(&catalog, &namespace, &table_name, 4).await;
+
+        let provider = IcebergTableProvider::try_new(catalog, namespace, table_name)
+            .await
+            .unwrap();
+        let plan = provider
+            .scan(&ctx_with_target_partitions(1).state(), None, &[], None)
+            .await
+            .unwrap();
+        let scan = plan
+            .as_any()
+            .downcast_ref::<IcebergTableScan>()
+            .expect("scan() should return an IcebergTableScan");
+
+        assert_eq!(scan.buckets().len(), 1);
+        assert_eq!(scan.buckets()[0].len(), 4);
+    }
 }
diff --git a/crates/integrations/datafusion/src/table/partitioned.rs b/crates/integrations/datafusion/src/table/partitioned.rs
deleted file mode 100644
index 580cf68a47..0000000000
--- a/crates/integrations/datafusion/src/table/partitioned.rs
+++ /dev/null
@@ -1,571 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use std::any::Any; -use std::sync::Arc; - -use async_trait::async_trait; -use datafusion::arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, - StringArray, -}; -use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; -use datafusion::catalog::Session; -use datafusion::common::hash_utils::create_hashes; -use datafusion::datasource::{TableProvider, TableType}; -use datafusion::error::Result as DFResult; -use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; -use datafusion::physical_expr::PhysicalExpr; -use datafusion::physical_expr::expressions::Column; -use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; -use datafusion::physical_plan::{ExecutionPlan, Partitioning}; -use futures::TryStreamExt; -use iceberg::arrow::schema_to_arrow_schema; -use iceberg::scan::FileScanTask; -use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; -use iceberg::table::Table; -use iceberg::{Catalog, Error, ErrorKind, NamespaceIdent, Result, TableIdent}; - -use crate::error::to_datafusion_error; -use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; -use crate::physical_plan::partitioned_scan::IcebergPartitionedScan; - -/// Catalog-backed table provider that scans each data file in a separate DataFusion partition. 
-/// -/// This provider reloads table metadata from the catalog on every [`scan`][Self::scan] call -/// to guarantee freshness, then issues one DataFusion partition per data file so that -/// DataFusion's scheduler can execute file reads in parallel. -/// -/// Write operations are not supported. Use [`IcebergTableProvider`] for write access. -/// -/// For consistent read-only access to a fixed snapshot without per-scan catalog overhead, -/// use [`IcebergStaticTableProvider`] instead. -#[derive(Debug, Clone)] -pub struct IcebergPartitionedTableProvider { - catalog: Arc, - table_ident: TableIdent, - schema: ArrowSchemaRef, -} - -impl IcebergPartitionedTableProvider { - pub async fn try_new( - catalog: Arc, - namespace: NamespaceIdent, - name: impl Into, - ) -> Result { - let table_ident = TableIdent::new(namespace, name.into()); - // First load: used only to snapshot the Arrow schema for DataFusion planning. - // A second load_table is issued at scan time to guarantee the freshest snapshot. - let table = catalog.load_table(&table_ident).await?; - let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); - Ok(Self { - catalog, - table_ident, - schema, - }) - } -} - -#[async_trait] -impl TableProvider for IcebergPartitionedTableProvider { - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> ArrowSchemaRef { - self.schema.clone() - } - - fn table_type(&self) -> TableType { - TableType::Base - } - - async fn scan( - &self, - state: &dyn Session, - projection: Option<&Vec>, - filters: &[Expr], - _limit: Option, - ) -> DFResult> { - // Per-partition row limits are not yet implemented for IcebergPartitionedScan. - // DataFusion will apply a GlobalLimitExec on top of this node when needed. - - // Second load: fetch the latest snapshot so scans always reflect current table state. 
- let table = self - .catalog - .load_table(&self.table_ident) - .await - .map_err(to_datafusion_error)?; - - // Build a TableScan mirroring the inputs we'll hand to IcebergPartitionedScan, - // so plan_files() uses the same projection/filters the scan will replay in execute(). - let col_names = projection.map(|indices| { - indices - .iter() - .map(|&i| self.schema.field(i).name().clone()) - .collect::>() - }); - - let predicate = convert_filters_to_predicate(filters); - - let mut builder = table.scan(); - builder = match col_names { - Some(names) => builder.select(names), - None => builder.select_all(), - }; - if let Some(pred) = predicate { - builder = builder.with_filter(pred); - } - - let tasks: Vec = builder - .build() - .map_err(to_datafusion_error)? - .plan_files() - .await - .map_err(to_datafusion_error)? - .try_collect::>() - .await - .map_err(to_datafusion_error)?; - - // Output schema after projection: column indices in `Hash` exprs and any - // Arrow array we hash must reference this schema, not the full table schema. - let output_schema = match projection { - None => self.schema.clone(), - Some(p) => Arc::new(self.schema.project(p).map_err(|e| { - to_datafusion_error(Error::new(ErrorKind::DataInvalid, e.to_string())) - })?), - }; - - let target_partitions = state.config().target_partitions(); - let n_partitions = if tasks.is_empty() { - 0 - } else { - target_partitions.min(tasks.len()).max(1) - }; - - // identity_cols is Some(non-empty) iff every condition for declaring - // Partitioning::Hash is met: the table's default spec has identity-transform - // fields, every such source column is present in the output projection, and - // every column type is supported by literal_to_array. Any miss collapses to - // None, which forces UnknownPartitioning regardless of bucketing strategy. 
- let identity_cols = compute_identity_cols(&table, &output_schema); - - let (buckets, all_had_full_key) = - bucket_tasks(tasks, n_partitions, identity_cols.as_deref()); - - let partitioning = match identity_cols { - Some(cols) if !cols.is_empty() && all_had_full_key && n_partitions > 0 => { - let exprs: Vec> = cols - .iter() - .map(|c| Arc::new(Column::new(&c.name, c.output_idx)) as Arc) - .collect(); - Partitioning::Hash(exprs, n_partitions) - } - _ => Partitioning::UnknownPartitioning(n_partitions), - }; - - Ok(Arc::new(IcebergPartitionedScan::new( - table, - None, // Always use current snapshot for catalog-backed provider - self.schema.clone(), - projection, - filters, - buckets, - partitioning, - ))) - } - - fn supports_filters_pushdown( - &self, - filters: &[&Expr], - ) -> DFResult> { - Ok(vec![TableProviderFilterPushDown::Inexact; filters.len()]) - } - - async fn insert_into( - &self, - _state: &dyn Session, - _input: Arc, - _insert_op: datafusion::logical_expr::dml::InsertOp, - ) -> DFResult> { - Err(to_datafusion_error(Error::new( - ErrorKind::FeatureUnsupported, - "IcebergPartitionedTableProvider does not support writes; \ - use IcebergTableProvider instead", - ))) - } -} - -/// Identity-partitioned column that is also present in the output projection -/// and whose Arrow type can be reconstructed from a `Literal` for hashing. -struct IdentityCol { - name: String, - /// Position of this column in the *output* schema (after projection). - output_idx: usize, - /// Position of this column inside the partition spec's `fields()` slice, - /// matching the slot order of `FileScanTask::partition`. - spec_field_idx: usize, - output_dtype: DataType, -} - -/// Inspect the table's default partition spec and return the list of identity -/// columns that can support a [`Partitioning::Hash`] declaration. 
Returns -/// `None` if any condition is violated: -/// - the source column for an identity field is not in the output projection -/// - the source column's Arrow type is not currently supported by -/// [`literal_to_array`] -/// - the table has spec evolution (>1 historical specs), since older files -/// may carry a partition tuple that does not align with the default spec -/// -/// Returning `None` forces the scan to declare `UnknownPartitioning` even if -/// bucketing succeeds. -fn compute_identity_cols(table: &Table, output_schema: &ArrowSchema) -> Option> { - let metadata = table.metadata(); - if metadata.partition_specs_iter().len() > 1 { - return None; - } - let spec = metadata.default_partition_spec(); - let table_schema = metadata.current_schema(); - - let mut cols = Vec::new(); - for (spec_field_idx, pf) in spec.fields().iter().enumerate() { - if pf.transform != Transform::Identity { - continue; - } - let source_field = table_schema.field_by_id(pf.source_id)?; - let output_idx = output_schema.index_of(source_field.name.as_str()).ok()?; - let output_dtype = output_schema.field(output_idx).data_type().clone(); - if !is_supported_dtype(&output_dtype) { - return None; - } - cols.push(IdentityCol { - name: source_field.name.clone(), - output_idx, - spec_field_idx, - output_dtype, - }); - } - Some(cols) -} - -fn is_supported_dtype(dt: &DataType) -> bool { - matches!( - dt, - DataType::Boolean - | DataType::Int32 - | DataType::Int64 - | DataType::Float32 - | DataType::Float64 - | DataType::Utf8 - | DataType::Date32 - ) -} - -/// Distribute `tasks` across `n_partitions` buckets. When `identity_cols` -/// describes a non-empty, hashable identity key, each task is hashed on -/// that key using DataFusion's repartition hash so the resulting partitioning -/// matches what `RepartitionExec` would produce on the same data. 
Tasks -/// missing partition data fall back to hashing `data_file_path`, which still -/// distributes evenly but breaks the `Hash` contract — the second tuple -/// element flags whether every task supplied a full identity key. -fn bucket_tasks( - tasks: Vec, - n_partitions: usize, - identity_cols: Option<&[IdentityCol]>, -) -> (Vec>, bool) { - if n_partitions == 0 { - return (Vec::new(), tasks.is_empty()); - } - let mut buckets: Vec> = (0..n_partitions).map(|_| Vec::new()).collect(); - let mut all_full_key = true; - let cols = identity_cols.unwrap_or(&[]); - - for task in tasks { - let bucket_idx = match identity_hash(&task, cols) { - Some(h) => (h % n_partitions as u64) as usize, - None => { - all_full_key = false; - fallback_hash(&task) as usize % n_partitions - } - }; - buckets[bucket_idx].push(task); - } - (buckets, all_full_key) -} - -/// Hash the identity-partition values of `task` using -/// [`REPARTITION_RANDOM_STATE`] so the bucket assignment matches DataFusion's -/// hash-repartition convention. Returns `None` if the task lacks partition -/// data or any required slot is null/unsupported. -fn identity_hash(task: &FileScanTask, cols: &[IdentityCol]) -> Option { - if cols.is_empty() { - return None; - } - let partition = task.partition.as_ref()?; - let mut arrays: Vec = Vec::with_capacity(cols.len()); - for col in cols { - let lit = partition.fields().get(col.spec_field_idx)?.as_ref()?; - arrays.push(literal_to_array(lit, &col.output_dtype)?); - } - let mut hashes = vec![0u64; 1]; - create_hashes( - &arrays, - REPARTITION_RANDOM_STATE.random_state(), - &mut hashes, - ) - .ok()?; - Some(hashes[0]) -} - -/// Deterministic per-file fallback used when `identity_hash` cannot produce a -/// bucket. The hash function does not need to match DataFusion's because any -/// task taking this path causes the scan to drop to `UnknownPartitioning`. 
-fn fallback_hash(task: &FileScanTask) -> u64 { - use std::collections::hash_map::DefaultHasher; - use std::hash::{Hash, Hasher}; - let mut hasher = DefaultHasher::new(); - task.data_file_path.hash(&mut hasher); - hasher.finish() -} - -/// Materialize a single-element Arrow array of `dt` holding the value of -/// `lit`. The Arrow type must match what DataFusion will see for this column -/// at scan time, otherwise `create_hashes` would dispatch on a different type -/// and produce a hash that disagrees with DataFusion's row-wise hashing. -fn literal_to_array(lit: &Literal, dt: &DataType) -> Option { - let prim = match lit { - Literal::Primitive(p) => p, - _ => return None, - }; - Some(match (prim, dt) { - (PrimitiveLiteral::Boolean(v), DataType::Boolean) => Arc::new(BooleanArray::from(vec![*v])), - (PrimitiveLiteral::Int(v), DataType::Int32) => Arc::new(Int32Array::from(vec![*v])), - (PrimitiveLiteral::Int(v), DataType::Date32) => Arc::new(Date32Array::from(vec![*v])), - (PrimitiveLiteral::Long(v), DataType::Int64) => Arc::new(Int64Array::from(vec![*v])), - (PrimitiveLiteral::Float(v), DataType::Float32) => Arc::new(Float32Array::from(vec![v.0])), - (PrimitiveLiteral::Double(v), DataType::Float64) => Arc::new(Float64Array::from(vec![v.0])), - (PrimitiveLiteral::String(v), DataType::Utf8) => { - Arc::new(StringArray::from(vec![v.as_str()])) - } - _ => return None, - }) -} - -#[cfg(test)] -mod tests { - use std::collections::HashMap; - use std::sync::Arc; - - use datafusion::prelude::{SessionConfig, SessionContext}; - use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; - use iceberg::spec::{ - DataContentType, DataFileBuilder, DataFileFormat, NestedField, PrimitiveType, Schema, Type, - }; - use iceberg::transaction::{ApplyTransactionAction, Transaction}; - use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; - use tempfile::TempDir; - - use super::*; - - async fn make_catalog_and_table() -> (Arc, NamespaceIdent, 
String, TempDir) { - let temp_dir = TempDir::new().unwrap(); - let warehouse = temp_dir.path().to_str().unwrap().to_string(); - - let catalog = Arc::new( - MemoryCatalogBuilder::default() - .load( - "memory", - HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), warehouse.clone())]), - ) - .await - .unwrap(), - ); - - let namespace = NamespaceIdent::new("ns".to_string()); - catalog - .create_namespace(&namespace, HashMap::new()) - .await - .unwrap(); - - let schema = Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(); - - catalog - .create_table( - &namespace, - TableCreation::builder() - .name("t".to_string()) - .location(format!("{warehouse}/t")) - .schema(schema) - .properties(HashMap::new()) - .build(), - ) - .await - .unwrap(); - - (catalog, namespace, "t".to_string(), temp_dir) - } - - /// Registers `n` synthetic data files in the table metadata via the iceberg - /// transaction API. No actual parquet files are written, only the metadata - /// entries that `plan_files()` reads are created. 
- async fn append_fake_data_files( - catalog: &Arc, - namespace: &NamespaceIdent, - table_name: &str, - n: usize, - ) { - let table = catalog - .load_table(&TableIdent::new(namespace.clone(), table_name.to_string())) - .await - .unwrap(); - - let data_files = (0..n) - .map(|i| { - DataFileBuilder::default() - .content(DataContentType::Data) - .file_path(format!( - "{}/data/fake_{i}.parquet", - table.metadata().location() - )) - .file_format(DataFileFormat::Parquet) - .file_size_in_bytes(128) - .record_count(1) - .partition_spec_id(table.metadata().default_partition_spec_id()) - .build() - .unwrap() - }) - .collect::>(); - - let tx = Transaction::new(&table); - let action = tx.fast_append().add_data_files(data_files); - action - .apply(tx) - .unwrap() - .commit(catalog.as_ref()) - .await - .unwrap(); - } - - fn ctx_with_target_partitions(n: usize) -> SessionContext { - SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n)) - } - - /// An empty table must produce a zero-partition scan so DataFusion never calls - /// execute(0), which would otherwise return an out-of-bounds error. - #[tokio::test] - async fn test_empty_table_zero_partitions() { - let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - // no files appended - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - let plan = provider - .scan(&ctx_with_target_partitions(8).state(), None, &[], None) - .await - .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!(scan.buckets().len(), 0); - assert_eq!(scan.properties().partitioning.partition_count(), 0); - } - - /// When the table has no identity-partition columns, every task takes the - /// fallback (file_path) bucket path, so the declaration must drop to - /// `UnknownPartitioning`. The bucket count should still equal - /// min(target_partitions, num_files). 
- #[tokio::test] - async fn test_unpartitioned_falls_back_to_unknown() { - let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - append_fake_data_files(&catalog, &namespace, &table_name, 5).await; - - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - let plan = provider - .scan(&ctx_with_target_partitions(3).state(), None, &[], None) - .await - .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); - - let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); - assert_eq!(total_files, 5); - assert_eq!(scan.buckets().len(), 3); - assert!(matches!( - scan.properties().partitioning, - Partitioning::UnknownPartitioning(3) - )); - } - - /// Bucket count must be capped at the number of files: spinning up more - /// DataFusion partitions than there are tasks would just leave empty - /// streams, wasting scheduler slots. - #[tokio::test] - async fn test_bucket_count_capped_at_file_count() { - let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - append_fake_data_files(&catalog, &namespace, &table_name, 2).await; - - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - let plan = provider - .scan(&ctx_with_target_partitions(16).state(), None, &[], None) - .await - .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!(scan.buckets().len(), 2); - } - - /// target_partitions = 1 collapses every task into a single bucket, giving - /// the same execution profile as `IcebergTableScan`. 
- #[tokio::test] - async fn test_single_target_partition_single_bucket() { - let (catalog, namespace, table_name, _temp_dir) = make_catalog_and_table().await; - append_fake_data_files(&catalog, &namespace, &table_name, 4).await; - - let provider = IcebergPartitionedTableProvider::try_new(catalog, namespace, table_name) - .await - .unwrap(); - let plan = provider - .scan(&ctx_with_target_partitions(1).state(), None, &[], None) - .await - .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!(scan.buckets().len(), 1); - assert_eq!(scan.buckets()[0].len(), 4); - } -} diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index cebac75dd9..8a58e94577 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -131,7 +131,7 @@ async fn test_provider_plan_stream_schema() -> Result<()> { let task_ctx = Arc::new(df.task_ctx()); let plan = df.create_physical_plan().await.unwrap(); - let stream = plan.execute(1, task_ctx).unwrap(); + let stream = plan.execute(0, task_ctx).unwrap(); // Ensure both the plan and the stream conform to the same schema assert_eq!(plan.schema(), stream.schema()); From 00527f808bc73b2aa5f03b78bd00327187e6cf81 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Wed, 29 Apr 2026 14:40:51 +0200 Subject: [PATCH 12/32] refactor(datafusion): polish scan API and add bucketing tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review pass over the partitioned-scan branch ahead of upstream contribution. - Rename `TableScan::to_arrow_with_tasks` to `to_arrow_from_tasks` — `from` better signals that the tasks are the input source rather than a builder-style modifier. 
- Restructure the doc with a `# Correctness` section that calls out the projection/filter contract while clarifying that reader-side configuration (concurrency, batch size, row-group filtering, row selection) is taken from `self`. - Make `IcebergTableScan::new` and `new_with_tasks` `pub` (were `pub(crate)`) so external users can construct the node directly, matching the public visibility of the struct itself. - Drop the `convert_filters_to_predicate` re-export from `physical_plan/mod.rs`: it was unused outside the module. - Extract a private `new_inner` constructor on `IcebergTableScan` so `new` and `new_with_tasks` share a single source of truth for the `PlanProperties` / projection / predicate setup. - Split `IcebergTableScan::execute` into a linear pipeline backed by three helpers: `build_table_scan` (synchronous scan-builder plumbing), `build_record_batch_stream` (async stream construction for the lazy/eager modes), and `apply_limit`. - Trim the `IcebergTableScan` struct doc and field comments to match the rest of the file's style; drop the verbose `to_arrow_with_tasks` rationale (the `# Correctness` doc carries the load-bearing info). - Tighten `DisplayAs::fmt_as`: remove the file-path enumeration (file count alone is enough for `EXPLAIN`) and factor the common prefix. - Trim several narrating comments in `table/mod.rs` and the module doc that duplicated information already evident from the code. - Add `test_identity_partitioned_declares_hash`: verifies the happy path where an identity-partitioned table with the partition column in the projection produces `Partitioning::Hash` referencing that column. This was the main missing coverage for the bucketing logic. - Add `test_projection_without_partition_col_falls_back_to_unknown`: verifies the `compute_identity_cols → None` branch when the projection omits the partition source column. 
- Add helpers (`make_partitioned_catalog_and_table_for_bucketing`, `append_partitioned_fake_data_files`) to build identity-partitioned fixtures without writing real Parquet files. (cherry picked from commit b1f2d6632509d6af474fe5a4be16341bd54ff13e) --- crates/iceberg/src/scan/mod.rs | 28 +- .../datafusion/src/physical_plan/mod.rs | 1 - .../datafusion/src/physical_plan/scan.rs | 315 ++++++++---------- .../datafusion/src/table/bucketing.rs | 4 +- .../integrations/datafusion/src/table/mod.rs | 220 ++++++++++-- .../tests/integration_datafusion_test.rs | 18 +- 6 files changed, 352 insertions(+), 234 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 2d8edfce29..9e321dc81b 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -462,29 +462,21 @@ impl TableScan { /// Returns an [`ArrowRecordBatchStream`]. pub async fn to_arrow(&self) -> Result { - self.to_arrow_with_tasks(self.plan_files().await?) + self.to_arrow_from_tasks(self.plan_files().await?) } - /// Consumes an externally-planned [`FileScanTask`] stream and returns an - /// [`ArrowRecordBatchStream`] using this scan's [`ArrowReaderBuilder`] - /// configuration (row-group filtering, row selection, data-file - /// concurrency limit, batch size). - /// - /// Equivalent to [`TableScan::to_arrow`] — which delegates to this method - /// after awaiting [`TableScan::plan_files`] — but lets the caller supply - /// a pre-computed task stream. This decouples planning from reading, so - /// external executors (e.g. a DataFusion partitioned scan) can plan once, - /// distribute tasks across workers, and replay them here without - /// re-running `plan_files()`. + /// Like [`TableScan::to_arrow`], but accepts a caller-supplied + /// [`FileScanTask`] stream instead of running [`TableScan::plan_files`] + /// internally. 
/// /// # Correctness /// - /// The tasks passed in must have been produced by a [`TableScan`] whose - /// projection and filter match `self`: filters are already baked into - /// each [`FileScanTask::predicate`] at planning time and are not - /// re-applied here. Using tasks from a scan with a different projection - /// or filter yields undefined behavior. - pub fn to_arrow_with_tasks(&self, tasks: FileScanTaskStream) -> Result { + /// Tasks must come from a [`TableScan`] with the same projection and + /// filter as `self`: predicates are baked into each task at planning + /// time and are not re-applied here. Reader-side configuration + /// (concurrency, batch size, row-group filtering, row selection) is + /// taken from `self` and may differ from the planning scan. + pub fn to_arrow_from_tasks(&self, tasks: FileScanTaskStream) -> Result { let mut arrow_reader_builder = ArrowReaderBuilder::new(self.file_io.clone(), self.runtime.clone()) .with_data_file_concurrency_limit(self.concurrency_limit_data_files) diff --git a/crates/integrations/datafusion/src/physical_plan/mod.rs b/crates/integrations/datafusion/src/physical_plan/mod.rs index aeac30de32..5a9845cde0 100644 --- a/crates/integrations/datafusion/src/physical_plan/mod.rs +++ b/crates/integrations/datafusion/src/physical_plan/mod.rs @@ -26,6 +26,5 @@ pub(crate) mod write; pub(crate) const DATA_FILES_COL_NAME: &str = "data_files"; -pub use expr_to_predicate::convert_filters_to_predicate; pub use project::project_with_partition; pub use scan::IcebergTableScan; diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 49ff61e999..0a386bded7 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -30,63 +30,45 @@ use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProp use datafusion::prelude::Expr; use futures::{Stream, TryStreamExt}; 
use iceberg::expr::Predicate; -use iceberg::scan::FileScanTask; +use iceberg::scan::{FileScanTask, TableScan}; use iceberg::table::Table; use super::expr_to_predicate::convert_filters_to_predicate; use crate::to_datafusion_error; -/// Manages the scanning process of an Iceberg [`Table`], encapsulating the -/// necessary details and computed properties required for execution planning. +/// Iceberg [`Table`] scan as a DataFusion [`ExecutionPlan`]. /// -/// When constructed with pre-planned [`FileScanTask`] buckets via -/// [`IcebergTableScan::new_with_tasks`], each DataFusion partition `i` streams -/// every task in `buckets[i]` using -/// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. +/// Has two construction modes: [`IcebergTableScan::new`] for a lazy +/// single-partition scan, and [`IcebergTableScan::new_with_tasks`] for an +/// eager multi-partition scan over pre-planned [`FileScanTask`] buckets. /// -/// When constructed via [`IcebergTableScan::new`] (no pre-planned tasks), the -/// full table is scanned lazily in a single partition using -/// [`TableScan::to_arrow`][iceberg::scan::TableScan::to_arrow]. This mode is -/// used by [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. -/// -/// In both modes the optional `limit` field truncates the output stream to at -/// most that many rows. -/// -/// Note: when using pre-planned tasks, the `TableScan` is rebuilt on every -/// `execute(partition)` call rather than cached. `TableScan` carries a -/// `PlanContext` with `Arc`-shared evaluator caches which is awkward to -/// serialize if this plan ever needs to be shipped across workers. The -/// per-build cost is bounded (no I/O), so the rebuild is kept for now; -/// revisit once the cross-worker story is clearer. +/// Note: in eager mode the underlying `TableScan` is rebuilt on every +/// `execute(partition)` call. 
The per-build cost is bounded (no I/O) and +/// keeps the plan free of `Arc`-shared evaluator caches that are awkward to +/// serialize across workers. #[derive(Debug)] pub struct IcebergTableScan { /// A table in the catalog. table: Table, /// Snapshot of the table to scan. snapshot_id: Option, - /// Stores certain, often expensive to compute, - /// plan properties used in query optimization. + /// Cached plan properties used by query optimization. plan_properties: Arc, /// Projection column names, None means all columns. projection: Option>, /// Filters to apply to the table scan. predicates: Option, - /// Pre-planned file scan tasks grouped by output DataFusion partition. - /// `None` in lazy mode (single-partition scan via `to_arrow()`). - /// `Some(buckets)` in eager mode: `buckets[i]` holds every task that - /// `execute(i)` will read. + /// Pre-planned file scan tasks per partition (eager mode), or `None` (lazy mode). buckets: Option>>, /// Optional limit on the number of rows to return. limit: Option, } impl IcebergTableScan { - /// Creates a lazy single-partition scan. - /// - /// All file tasks are discovered and read inside `execute(0)` via - /// [`TableScan::to_arrow`][iceberg::scan::TableScan::to_arrow]. - /// Used by [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. - pub(crate) fn new( + /// Creates a lazy single-partition scan that plans and reads all tasks + /// inside `execute(0)`. Used by + /// [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. 
+ pub fn new( table: Table, snapshot_id: Option, schema: ArrowSchemaRef, @@ -94,47 +76,55 @@ impl IcebergTableScan { filters: &[Expr], limit: Option, ) -> Self { - let output_schema = match projection { - None => schema.clone(), - Some(projection) => Arc::new(schema.project(projection).unwrap()), - }; - let plan_properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(output_schema), + Self::new_inner( + table, + snapshot_id, + schema, + projection, + filters, + limit, Partitioning::UnknownPartitioning(1), - EmissionType::Incremental, - Boundedness::Bounded, - )); - let projection = get_column_names(schema, projection); - let predicates = convert_filters_to_predicate(filters); + None, + ) + } - Self { + /// Creates an eager multi-partition scan over pre-planned task buckets. + /// Partition `i` streams `buckets[i]`. The caller is responsible for + /// ensuring `partitioning` matches the bucketing. Used by + /// [`IcebergTableProvider`][crate::table::IcebergTableProvider]. + #[allow(clippy::too_many_arguments)] + pub fn new_with_tasks( + table: Table, + snapshot_id: Option, + schema: ArrowSchemaRef, + projection: Option<&Vec>, + filters: &[Expr], + limit: Option, + buckets: Vec>, + partitioning: Partitioning, + ) -> Self { + Self::new_inner( table, snapshot_id, - plan_properties, + schema, projection, - predicates, - buckets: None, + filters, limit, - } + partitioning, + Some(buckets), + ) } - /// Creates an eager multi-partition scan from pre-planned file task buckets. - /// - /// Each DataFusion partition `i` streams the tasks in `buckets[i]` via - /// [`TableScan::to_arrow_with_tasks`][iceberg::scan::TableScan::to_arrow_with_tasks]. - /// The `partitioning` argument is used directly for [`PlanProperties`], so the - /// caller is responsible for ensuring it matches the bucketing strategy. - /// Used by [`IcebergTableProvider`][crate::table::IcebergTableProvider]. 
#[allow(clippy::too_many_arguments)] - pub(crate) fn new_with_tasks( + fn new_inner( table: Table, snapshot_id: Option, schema: ArrowSchemaRef, projection: Option<&Vec>, filters: &[Expr], limit: Option, - buckets: Vec>, partitioning: Partitioning, + buckets: Option>>, ) -> Self { let output_schema = match projection { None => schema.clone(), @@ -155,7 +145,7 @@ impl IcebergTableScan { plan_properties, projection, predicates, - buckets: Some(buckets), + buckets, limit, } } @@ -226,79 +216,31 @@ impl ExecutionPlan for IcebergTableScan { partition: usize, _context: Arc, ) -> DFResult { - let table = self.table.clone(); - let snapshot_id = self.snapshot_id; - let column_names = self.projection.clone(); - let predicates = self.predicates.clone(); - let limit = self.limit; - - let stream = match &self.buckets { - Some(buckets) => { - // Eager mode: stream the pre-planned bucket for this partition. - let bucket = buckets.get(partition).cloned().ok_or_else(|| { - DataFusionError::Internal(format!( - "{}: partition index {partition} is out of bounds (total buckets: {})", - self.name(), - buckets.len() - )) - })?; - - let fut = async move { - // Rebuild a TableScan so we inherit the same defaults - // (row-group filtering, batch size, concurrency, ...). - let scan_builder = match snapshot_id { - Some(id) => table.scan().snapshot_id(id), - None => table.scan(), - }; - let mut scan_builder = match column_names { - Some(names) => scan_builder.select(names), - None => scan_builder.select_all(), - }; - if let Some(pred) = predicates { - scan_builder = scan_builder.with_filter(pred); - } - let table_scan = scan_builder.build().map_err(to_datafusion_error)?; - - let task_stream = Box::pin(futures::stream::iter( - bucket.into_iter().map(Ok::<_, iceberg::Error>), - )); - let record_batch_stream = table_scan - .to_arrow_with_tasks(task_stream) - .map_err(to_datafusion_error)? 
- .map_err(to_datafusion_error); - Ok::<_, DataFusionError>(record_batch_stream) - }; - - let s = futures::stream::once(fut).try_flatten(); - Box::pin(s) as Pin> + Send>> - } - None => { - // Lazy mode: discover and read all tasks inside execute(). - let fut = get_batch_stream(table, snapshot_id, column_names, predicates); - let s = futures::stream::once(fut).try_flatten(); - Box::pin(s) - } + let bucket = match &self.buckets { + Some(buckets) => Some(buckets.get(partition).cloned().ok_or_else(|| { + DataFusionError::Internal(format!( + "{}: partition index {partition} is out of bounds (total buckets: {})", + self.name(), + buckets.len() + )) + })?), + None => None, }; - // Apply limit if specified. - let limited_stream: Pin> + Send>> = - if let Some(limit) = limit { - let mut remaining = limit; - Box::pin(stream.try_filter_map(move |batch| { - futures::future::ready(if remaining == 0 { - Ok(None) - } else if batch.num_rows() <= remaining { - remaining -= batch.num_rows(); - Ok(Some(batch)) - } else { - let limited_batch = batch.slice(0, remaining); - remaining = 0; - Ok(Some(limited_batch)) - }) - })) - } else { - stream - }; + let fut = build_record_batch_stream( + self.table.clone(), + self.snapshot_id, + self.projection.clone(), + self.predicates.clone(), + bucket, + ); + let stream = Box::pin(futures::stream::once(fut).try_flatten()) + as Pin> + Send>>; + + let limited_stream = match self.limit { + Some(limit) => apply_limit(stream, limit), + None => stream, + }; Ok(Box::pin(RecordBatchStreamAdapter::new( self.schema(), @@ -315,37 +257,22 @@ impl DisplayAs for IcebergTableScan { ) -> std::fmt::Result { let projection = self .projection - .clone() + .as_deref() .map_or(String::new(), |v| v.join(",")); let predicate = self .predicates - .clone() - .map_or(String::new(), |p| format!("{p}")); - - match &self.buckets { - Some(buckets) => { - let file_count = self.total_file_count(); - let bucket_count = buckets.len(); - write!( - f, - "{} projection:[{projection}] 
predicate:[{predicate}] \ - buckets:[{bucket_count}] file_count:[{file_count}]", - self.name() - )?; - if file_count <= 5 { - let files = buckets - .iter() - .flat_map(|b| b.iter().map(|t| t.data_file_path())) - .collect::>() - .join(", "); - write!(f, " files:[{files}]")?; - } - } - None => write!( - f, - "{} projection:[{projection}] predicate:[{predicate}]", - self.name() - )?, + .as_ref() + .map_or(String::new(), |p| p.to_string()); + + write!( + f, + "{} projection:[{projection}] predicate:[{predicate}]", + self.name() + )?; + if let Some(buckets) = &self.buckets { + let file_count = self.total_file_count(); + let bucket_count = buckets.len(); + write!(f, " buckets:[{bucket_count}] file_count:[{file_count}]")?; } if let Some(limit) = self.limit { write!(f, " limit:[{limit}]")?; @@ -354,34 +281,78 @@ impl DisplayAs for IcebergTableScan { } } -/// Asynchronously retrieves a stream of [`RecordBatch`] instances from a -/// given table. Used in lazy (single-partition) scan mode. -async fn get_batch_stream( +fn build_table_scan( table: Table, snapshot_id: Option, column_names: Option>, predicates: Option, -) -> DFResult> + Send>>> { +) -> DFResult { let scan_builder = match snapshot_id { - Some(snapshot_id) => table.scan().snapshot_id(snapshot_id), + Some(id) => table.scan().snapshot_id(id), None => table.scan(), }; - let mut scan_builder = match column_names { - Some(column_names) => scan_builder.select(column_names), + Some(names) => scan_builder.select(names), None => scan_builder.select_all(), }; if let Some(pred) = predicates { scan_builder = scan_builder.with_filter(pred); } - let table_scan = scan_builder.build().map_err(to_datafusion_error)?; + scan_builder.build().map_err(to_datafusion_error) +} + +/// Builds the `RecordBatch` stream for a single partition. When `bucket` is +/// `Some`, streams the pre-planned tasks via `to_arrow_from_tasks`; when +/// `None`, plans and reads the full scan via `to_arrow`. 
+async fn build_record_batch_stream( + table: Table, + snapshot_id: Option, + column_names: Option>, + predicates: Option, + bucket: Option>, +) -> DFResult> + Send>>> { + let table_scan = build_table_scan(table, snapshot_id, column_names, predicates)?; + let stream: Pin> + Send>> = match bucket { + Some(bucket) => { + let task_stream = Box::pin(futures::stream::iter( + bucket.into_iter().map(Ok::<_, iceberg::Error>), + )); + Box::pin( + table_scan + .to_arrow_from_tasks(task_stream) + .map_err(to_datafusion_error)? + .map_err(to_datafusion_error), + ) + } + None => Box::pin( + table_scan + .to_arrow() + .await + .map_err(to_datafusion_error)? + .map_err(to_datafusion_error), + ), + }; + Ok(stream) +} - let stream = table_scan - .to_arrow() - .await - .map_err(to_datafusion_error)? - .map_err(to_datafusion_error); - Ok(Box::pin(stream)) +/// Truncates a stream of `RecordBatch` to at most `limit` rows. +fn apply_limit( + stream: Pin> + Send>>, + limit: usize, +) -> Pin> + Send>> { + let mut remaining = limit; + Box::pin(stream.try_filter_map(move |batch| { + futures::future::ready(if remaining == 0 { + Ok(None) + } else if batch.num_rows() <= remaining { + remaining -= batch.num_rows(); + Ok(Some(batch)) + } else { + let limited_batch = batch.slice(0, remaining); + remaining = 0; + Ok(Some(limited_batch)) + }) + })) } pub(super) fn get_column_names( diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs index 4c58730ce5..55ab23b6fd 100644 --- a/crates/integrations/datafusion/src/table/bucketing.rs +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -178,9 +178,7 @@ fn literal_to_array(lit: &Literal, dt: &DataType) -> Option { (PrimitiveLiteral::Int(v), DataType::Date32) => Arc::new(Date32Array::from(vec![*v])), (PrimitiveLiteral::Long(v), DataType::Int64) => Arc::new(Int64Array::from(vec![*v])), (PrimitiveLiteral::Float(v), DataType::Float32) => Arc::new(Float32Array::from(vec![v.0])), - 
(PrimitiveLiteral::Double(v), DataType::Float64) => { - Arc::new(Float64Array::from(vec![v.0])) - } + (PrimitiveLiteral::Double(v), DataType::Float64) => Arc::new(Float64Array::from(vec![v.0])), (PrimitiveLiteral::String(v), DataType::Utf8) => { Arc::new(StringArray::from(vec![v.as_str()])) } diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index df9631cb3c..14aff7af3a 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -17,12 +17,10 @@ //! Iceberg table providers for DataFusion. //! -//! This module provides various table provider implementations: +//! This module provides two table provider implementations: //! //! - [`IcebergTableProvider`]: Catalog-backed provider with automatic metadata refresh. //! Use for write operations and when you need to see the latest table state. -//! On each scan, file tasks are eagerly planned and distributed across DataFusion -//! partitions for parallel execution. //! //! - [`IcebergStaticTableProvider`]: Static provider for read-only access to a specific //! table snapshot. Use for consistent analytical queries or time-travel scenarios. 
@@ -45,9 +43,8 @@ use datafusion::logical_expr::dml::InsertOp; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_expr::expressions::Column; -use datafusion::physical_plan::ExecutionPlan; -use datafusion::physical_plan::Partitioning; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use futures::TryStreamExt; use iceberg::arrow::schema_to_arrow_schema; use iceberg::inspect::MetadataTableType; @@ -96,7 +93,6 @@ impl IcebergTableProvider { ) -> Result { let table_ident = TableIdent::new(namespace, name.into()); - // Load table once to get initial schema let table = catalog.load_table(&table_ident).await?; let schema = Arc::new(schema_to_arrow_schema(table.metadata().current_schema())?); @@ -111,7 +107,6 @@ impl IcebergTableProvider { &self, r#type: MetadataTableType, ) -> Result { - // Load fresh table metadata for metadata table access let table = self.catalog.load_table(&self.table_ident).await?; Ok(IcebergMetadataTableProvider { table, r#type }) } @@ -237,7 +232,6 @@ impl TableProvider for IcebergTableProvider { input: Arc, _insert_op: InsertOp, ) -> DFResult> { - // Load fresh table metadata from catalog let table = self .catalog .load_table(&self.table_ident) @@ -264,7 +258,6 @@ impl TableProvider for IcebergTableProvider { let repartitioned_plan = repartition(plan_with_partition, table.metadata_ref(), target_partitions)?; - // Apply sort node when it's not fanout mode let fanout_enabled = table .metadata() .properties() @@ -390,7 +383,6 @@ impl TableProvider for IcebergStaticTableProvider { filters: &[Expr], limit: Option, ) -> DFResult> { - // Use cached table (no refresh) Ok(Arc::new(IcebergTableScan::new( self.table.clone(), self.snapshot_id, @@ -942,12 +934,10 @@ mod tests { ); } - // ── IcebergTableProvider bucketed scan tests ───────────────────────────── - // (Originally from 
table/partitioned.rs; updated to use IcebergTableProvider - // and IcebergTableScan after the IcebergPartitionedTableProvider merge.) + // ── Bucketed scan tests ────────────────────────────────────────────────── - async fn make_catalog_and_table_for_bucketing( - ) -> (Arc, NamespaceIdent, String, tempfile::TempDir) { + async fn make_catalog_and_table_for_bucketing() + -> (Arc, NamespaceIdent, String, tempfile::TempDir) { use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::{CatalogBuilder, TableCreation}; @@ -1063,10 +1053,7 @@ mod tests { .scan(&ctx_with_target_partitions(8).state(), None, &[], None) .await .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); assert_eq!(scan.buckets().len(), 1); assert_eq!(scan.buckets()[0].len(), 0); @@ -1092,10 +1079,7 @@ mod tests { .scan(&ctx_with_target_partitions(3).state(), None, &[], None) .await .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); assert_eq!(total_files, 5); @@ -1122,10 +1106,7 @@ mod tests { .scan(&ctx_with_target_partitions(16).state(), None, &[], None) .await .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); assert_eq!(scan.buckets().len(), 2); } @@ -1145,12 +1126,189 @@ mod tests { .scan(&ctx_with_target_partitions(1).state(), None, &[], None) .await .unwrap(); - let scan = plan - .as_any() - .downcast_ref::() - .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); assert_eq!(scan.buckets().len(), 1); assert_eq!(scan.buckets()[0].len(), 4); } + + async fn make_partitioned_catalog_and_table_for_bucketing() + -> (Arc, NamespaceIdent, String, tempfile::TempDir) { + use 
iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; + use iceberg::spec::{ + NestedField, PrimitiveType, Schema, Transform, Type, UnboundPartitionSpec, + }; + use iceberg::{CatalogBuilder, TableCreation}; + + let temp_dir = tempfile::TempDir::new().unwrap(); + let warehouse = temp_dir.path().to_str().unwrap().to_string(); + + let catalog = Arc::new( + MemoryCatalogBuilder::default() + .load( + "memory", + std::collections::HashMap::from([( + MEMORY_CATALOG_WAREHOUSE.to_string(), + warehouse.clone(), + )]), + ) + .await + .unwrap(), + ); + + let namespace = NamespaceIdent::new("ns".to_string()); + catalog + .create_namespace(&namespace, std::collections::HashMap::new()) + .await + .unwrap(); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let partition_spec = UnboundPartitionSpec::builder() + .with_spec_id(0) + .add_partition_field(2, "name_part", Transform::Identity) + .unwrap() + .build(); + + catalog + .create_table( + &namespace, + TableCreation::builder() + .name("t".to_string()) + .location(format!("{warehouse}/t")) + .schema(schema) + .partition_spec(partition_spec) + .properties(std::collections::HashMap::new()) + .build(), + ) + .await + .unwrap(); + + (catalog, namespace, "t".to_string(), temp_dir) + } + + /// Like [`append_fake_data_files`] but each file carries a partition tuple + /// matching the table's identity-partition spec on `name`. 
+ async fn append_partitioned_fake_data_files( + catalog: &Arc, + namespace: &NamespaceIdent, + table_name: &str, + partition_values: Vec<&str>, + ) { + use iceberg::spec::{DataContentType, DataFileBuilder, DataFileFormat, Literal, Struct}; + use iceberg::transaction::{ApplyTransactionAction, Transaction}; + + let table = catalog + .load_table(&TableIdent::new(namespace.clone(), table_name.to_string())) + .await + .unwrap(); + + let data_files = partition_values + .iter() + .enumerate() + .map(|(i, value)| { + DataFileBuilder::default() + .content(DataContentType::Data) + .file_path(format!( + "{}/data/fake_{i}.parquet", + table.metadata().location() + )) + .file_format(DataFileFormat::Parquet) + .file_size_in_bytes(128) + .record_count(1) + .partition_spec_id(table.metadata().default_partition_spec_id()) + .partition(Struct::from_iter(vec![Some(Literal::string(*value))])) + .build() + .unwrap() + }) + .collect::>(); + + let tx = Transaction::new(&table); + let action = tx.fast_append().add_data_files(data_files); + action + .apply(tx) + .unwrap() + .commit(catalog.as_ref()) + .await + .unwrap(); + } + + /// Identity-partitioned table whose source column is in the projection + /// must produce `Partitioning::Hash` referencing that column. 
+ #[tokio::test] + async fn test_identity_partitioned_declares_hash() { + use datafusion::physical_expr::expressions::Column; + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files(&catalog, &namespace, &table_name, vec![ + "a", "b", "c", "a", "b", "c", + ]) + .await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(3).state(), None, &[], None) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + + let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); + assert_eq!(total_files, 6); + + match &scan.properties().partitioning { + Partitioning::Hash(exprs, n) => { + assert_eq!(*n, 3); + assert_eq!(exprs.len(), 1); + let col = exprs[0] + .as_any() + .downcast_ref::() + .expect("expected Column expr"); + assert_eq!(col.name(), "name"); + } + other => panic!("expected Partitioning::Hash, got {other:?}"), + } + } + + /// A projection that omits the partition source column drops + /// `compute_identity_cols` to `None`, collapsing to `UnknownPartitioning`. + #[tokio::test] + async fn test_projection_without_partition_col_falls_back_to_unknown() { + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files(&catalog, &namespace, &table_name, vec!["a", "b"]).await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + // Project only "id" (idx 0), excluding the partition column "name" (idx 1). 
+ let projection = vec![0_usize]; + let plan = provider + .scan( + &ctx_with_target_partitions(3).state(), + Some(&projection), + &[], + None, + ) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(_) + )); + } } diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index 8a58e94577..7603c8b7ab 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -600,8 +600,8 @@ async fn test_insert_into_nested() -> Result<()> { // Insert data with nested structs let insert_sql = r#" INSERT INTO catalog.test_insert_nested.nested_table - SELECT - 1 as id, + SELECT + 1 as id, 'Alice' as name, named_struct( 'address', named_struct( @@ -615,8 +615,8 @@ async fn test_insert_into_nested() -> Result<()> { ) ) as profile UNION ALL - SELECT - 2 as id, + SELECT + 2 as id, 'Bob' as name, named_struct( 'address', named_struct( @@ -738,15 +738,15 @@ async fn test_insert_into_nested() -> Result<()> { let df = ctx .sql( r#" - SELECT - id, + SELECT + id, name, profile.address.street, profile.address.city, profile.address.zip, profile.contact.email, profile.contact.phone - FROM catalog.test_insert_nested.nested_table + FROM catalog.test_insert_nested.nested_table ORDER BY id "#, ) @@ -852,8 +852,8 @@ async fn test_insert_into_partitioned() -> Result<()> { let df = ctx .sql( r#" - INSERT INTO catalog.test_partitioned_write.partitioned_table - VALUES + INSERT INTO catalog.test_partitioned_write.partitioned_table + VALUES (1, 'electronics', 'laptop'), (2, 'electronics', 'phone'), (3, 'books', 'novel'), From c0ffb36146acecdec805b1bb5b5f4b91cd0d2110 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Wed, 29 Apr 2026 15:36:04 +0200 Subject: [PATCH 13/32] test(sqllogictest): update 
EXPLAIN snapshots for eager bucketing output IcebergTableProvider::scan now plans files eagerly and buckets them across DataFusion partitions before returning the ExecutionPlan. As a result, IcebergTableScan's DisplayAs output always includes `buckets:[N] file_count:[M]` - even for unpartitioned tables where N = 1. Update the four .slt files whose EXPLAIN snapshots were missing this suffix, and fix the like_predicate_pushdown snapshots that also had a stale input_partitions count on RepartitionExec (the table now has multiple files across multiple buckets). (cherry picked from commit 6ae4a71247a158380257a4b6339348c8ff232f1c) --- .../sqllogictest/testdata/slts/df_test/basic_queries.slt | 2 +- .../testdata/slts/df_test/binary_predicate_pushdown.slt | 2 +- .../testdata/slts/df_test/boolean_predicate_pushdown.slt | 6 +++--- .../testdata/slts/df_test/like_predicate_pushdown.slt | 8 ++++---- .../slts/df_test/timestamp_predicate_pushdown.slt | 8 ++++---- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt index a5ca4de46a..d9933e0f87 100644 --- a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt +++ b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt @@ -53,7 +53,7 @@ logical_plan physical_plan 01)GlobalLimitExec: skip=0, fetch=3 02)--CooperativeExec -03)----IcebergTableScan projection:[id,name,score,category] predicate:[] limit:[3] +03)----IcebergTableScan projection:[id,name,score,category] predicate:[] buckets:[1] file_count:[1] limit:[3] # Test SELECT * with ORDER BY and LIMIT query ITRT diff --git a/crates/sqllogictest/testdata/slts/df_test/binary_predicate_pushdown.slt b/crates/sqllogictest/testdata/slts/df_test/binary_predicate_pushdown.slt index aa68ab2762..249d52edd0 100644 --- a/crates/sqllogictest/testdata/slts/df_test/binary_predicate_pushdown.slt +++ 
b/crates/sqllogictest/testdata/slts/df_test/binary_predicate_pushdown.slt @@ -28,7 +28,7 @@ logical_plan physical_plan 01)FilterExec: data@1 = 0102 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,data] predicate:[data = 0102] +03)----IcebergTableScan projection:[id,data] predicate:[data = 0102] buckets:[1] file_count:[0] # Verify empty result from empty table query I? diff --git a/crates/sqllogictest/testdata/slts/df_test/boolean_predicate_pushdown.slt b/crates/sqllogictest/testdata/slts/df_test/boolean_predicate_pushdown.slt index 496f719261..b4596ba6ba 100644 --- a/crates/sqllogictest/testdata/slts/df_test/boolean_predicate_pushdown.slt +++ b/crates/sqllogictest/testdata/slts/df_test/boolean_predicate_pushdown.slt @@ -39,7 +39,7 @@ logical_plan physical_plan 01)FilterExec: is_active@1 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = true] +03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = true] buckets:[1] file_count:[1] # Query with is_active = true query ITT rowsort @@ -59,7 +59,7 @@ logical_plan physical_plan 01)FilterExec: NOT is_active@1 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false] +03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false] buckets:[1] file_count:[1] # Query with is_active = false query ITT rowsort @@ -78,7 +78,7 @@ logical_plan physical_plan 01)FilterExec: NOT is_active@1 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false] +03)----IcebergTableScan projection:[id,is_active,description] predicate:[is_active = false] buckets:[1] file_count:[1] # Query with is_active != 
true (includes false and NULL) query ITT rowsort diff --git a/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt index 3d8b151aa9..698046046a 100644 --- a/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt +++ b/crates/sqllogictest/testdata/slts/df_test/like_predicate_pushdown.slt @@ -36,8 +36,8 @@ logical_plan 02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name LIKE Utf8("Al%")] physical_plan 01)FilterExec: name@1 LIKE Al% -02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,name] predicate:[name STARTS WITH "Al"] +02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2 +03)----IcebergTableScan projection:[id,name] predicate:[name STARTS WITH "Al"] buckets:[2] file_count:[2] # Test LIKE filtering with case-sensitive match query IT rowsort @@ -55,8 +55,8 @@ logical_plan 02)--TableScan: default.default.test_unpartitioned_table projection=[id, name], partial_filters=[default.default.test_unpartitioned_table.name NOT LIKE Utf8("Al%")] physical_plan 01)FilterExec: name@1 NOT LIKE Al% -02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,name] predicate:[name NOT STARTS WITH "Al"] +02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=3 +03)----IcebergTableScan projection:[id,name] predicate:[name NOT STARTS WITH "Al"] buckets:[3] file_count:[3] # Test NOT LIKE filtering query IT rowsort diff --git a/crates/sqllogictest/testdata/slts/df_test/timestamp_predicate_pushdown.slt b/crates/sqllogictest/testdata/slts/df_test/timestamp_predicate_pushdown.slt index ffa74173dc..47100cc36d 100644 --- a/crates/sqllogictest/testdata/slts/df_test/timestamp_predicate_pushdown.slt +++ 
b/crates/sqllogictest/testdata/slts/df_test/timestamp_predicate_pushdown.slt @@ -50,7 +50,7 @@ logical_plan physical_plan 01)FilterExec: ts@1 = 1672921800000000000 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,ts] predicate:[ts = 2023-01-05 12:30:00] +03)----IcebergTableScan projection:[id,ts] predicate:[ts = 2023-01-05 12:30:00] buckets:[1] file_count:[1] # Verify timestamp equality filtering works query I? @@ -68,7 +68,7 @@ logical_plan physical_plan 01)FilterExec: ts@1 > 1673308800000000000 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,ts] predicate:[ts > 2023-01-10 00:00:00] +03)----IcebergTableScan projection:[id,ts] predicate:[ts > 2023-01-10 00:00:00] buckets:[1] file_count:[1] # Verify timestamp greater than filtering query I? rowsort @@ -97,7 +97,7 @@ logical_plan physical_plan 01)FilterExec: ts@1 >= 1672876800000000000 AND ts@1 <= 1673827199000000000 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,ts] predicate:[(ts >= 2023-01-05 00:00:00) AND (ts <= 2023-01-15 23:59:59)] +03)----IcebergTableScan projection:[id,ts] predicate:[(ts >= 2023-01-05 00:00:00) AND (ts <= 2023-01-15 23:59:59)] buckets:[1] file_count:[1] # Test timestamp range predicate filtering query I? rowsort @@ -162,7 +162,7 @@ logical_plan physical_plan 01)FilterExec: ts@1 > 1672531200000000 02)--RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -03)----IcebergTableScan projection:[id,ts] predicate:[ts > 2023-01-01 00:00:00] +03)----IcebergTableScan projection:[id,ts] predicate:[ts > 2023-01-01 00:00:00] buckets:[1] file_count:[1] query I? 
SELECT * FROM default.default.test_timestamp_micros WHERE ts > CAST('2023-01-01 00:00:00' AS TIMESTAMP) From 89bc410826056193c9c3b2cbf6eb458725f66040 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Wed, 29 Apr 2026 16:47:46 +0200 Subject: [PATCH 14/32] fix(datafusion): resolve conflicts --- crates/iceberg/src/scan/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 9e321dc81b..21cd3bb571 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -487,7 +487,7 @@ impl TableScan { arrow_reader_builder = arrow_reader_builder.with_batch_size(batch_size); } - Ok(arrow_reader_builder.build().read(tasks)?.stream()) + arrow_reader_builder.build().read(tasks).map(|r| r.stream()) } /// Returns a reference to the column names of the table scan. From d83b29b307075f7a2266723eb9cdfc68917dd78e Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 15:28:01 +0200 Subject: [PATCH 15/32] refactor(datafusion): read preplanned tasks with ArrowReaderBuilder --- crates/iceberg/src/scan/mod.rs | 5 ++- .../datafusion/src/physical_plan/scan.rs | 42 +++++++++++++------ 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 21cd3bb571..03effb1496 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -487,7 +487,10 @@ impl TableScan { arrow_reader_builder = arrow_reader_builder.with_batch_size(batch_size); } - arrow_reader_builder.build().read(tasks).map(|r| r.stream()) + arrow_reader_builder + .build() + .read(tasks) + .map(|result| result.stream()) } /// Returns a reference to the column names of the table scan. 
diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 0a386bded7..5211629394 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -16,6 +16,7 @@ // under the License. use std::any::Any; +use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::Arc; @@ -29,12 +30,18 @@ use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{DisplayAs, ExecutionPlan, Partitioning, PlanProperties}; use datafusion::prelude::Expr; use futures::{Stream, TryStreamExt}; +use iceberg::arrow::ArrowReaderBuilder; use iceberg::expr::Predicate; use iceberg::scan::{FileScanTask, TableScan}; use iceberg::table::Table; use super::expr_to_predicate::convert_filters_to_predicate; use crate::to_datafusion_error; +const DEFAULT_PARALLELISM: usize = 1; +fn available_parallelism() -> NonZeroUsize { + std::thread::available_parallelism() + .unwrap_or_else(|_err| NonZeroUsize::new(DEFAULT_PARALLELISM).unwrap()) +} /// Iceberg [`Table`] scan as a DataFusion [`ExecutionPlan`]. /// @@ -117,7 +124,7 @@ impl IcebergTableScan { #[allow(clippy::too_many_arguments)] fn new_inner( - table: Table, + table: Table, // could we remove the Table from here ? snapshot_id: Option, schema: ArrowSchemaRef, projection: Option<&Vec>, @@ -234,6 +241,7 @@ impl ExecutionPlan for IcebergTableScan { self.predicates.clone(), bucket, ); + let stream = Box::pin(futures::stream::once(fut).try_flatten()) as Pin> + Send>>; @@ -302,8 +310,8 @@ fn build_table_scan( } /// Builds the `RecordBatch` stream for a single partition. When `bucket` is -/// `Some`, streams the pre-planned tasks via `to_arrow_from_tasks`; when -/// `None`, plans and reads the full scan via `to_arrow`. +/// `Some`, streams the pre-planned tasks directly through an `ArrowReader`; +/// when `None`, plans and reads the full scan via `to_arrow`. 
async fn build_record_batch_stream( table: Table, snapshot_id: Option, @@ -311,26 +319,36 @@ async fn build_record_batch_stream( predicates: Option, bucket: Option>, ) -> DFResult> + Send>>> { - let table_scan = build_table_scan(table, snapshot_id, column_names, predicates)?; let stream: Pin> + Send>> = match bucket { Some(bucket) => { let task_stream = Box::pin(futures::stream::iter( bucket.into_iter().map(Ok::<_, iceberg::Error>), )); + let num_cpus = available_parallelism().get(); + let arrow_reader_builder = ArrowReaderBuilder::new(table.file_io().clone()) + .with_data_file_concurrency_limit(num_cpus) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true); + + Box::pin( + arrow_reader_builder + .build() + .read(task_stream) + .map_err(to_datafusion_error)? + .stream() + .map_err(to_datafusion_error), + ) + } + None => { + let table_scan = build_table_scan(table, snapshot_id, column_names, predicates)?; Box::pin( table_scan - .to_arrow_from_tasks(task_stream) + .to_arrow() + .await .map_err(to_datafusion_error)? .map_err(to_datafusion_error), ) } - None => Box::pin( - table_scan - .to_arrow() - .await - .map_err(to_datafusion_error)? 
- .map_err(to_datafusion_error), - ), }; Ok(stream) } From c29c0594447390654addfac6851394c60481785f Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 15:41:54 +0200 Subject: [PATCH 16/32] refactor(datafusion): distinguish lazy and eager scan buckets --- .../datafusion/src/physical_plan/scan.rs | 12 +++++++---- .../integrations/datafusion/src/table/mod.rs | 21 ++++++++++++------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 5211629394..ea254c9146 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -173,9 +173,12 @@ impl IcebergTableScan { self.predicates.as_ref() } - /// Returns the pre-planned file task buckets, or an empty slice in lazy mode. - pub fn buckets(&self) -> &[Vec] { - self.buckets.as_deref().unwrap_or(&[]) + /// Returns the pre-planned file task buckets. + /// + /// `None` means lazy mode, where file tasks are planned inside `execute`; + /// `Some` means eager mode, where `execute` reads from pre-planned buckets. 
+ pub fn buckets(&self) -> Option<&[Vec]> { + self.buckets.as_deref() } pub fn limit(&self) -> Option { @@ -183,7 +186,8 @@ impl IcebergTableScan { } fn total_file_count(&self) -> usize { - self.buckets().iter().map(|b| b.len()).sum() + self.buckets() + .map_or(0, |buckets| buckets.iter().map(|b| b.len()).sum()) } } diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 14aff7af3a..b4ade5b648 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -1054,9 +1054,10 @@ mod tests { .await .unwrap(); let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); - assert_eq!(scan.buckets().len(), 1); - assert_eq!(scan.buckets()[0].len(), 0); + assert_eq!(buckets.len(), 1); + assert_eq!(buckets[0].len(), 0); assert_eq!(scan.properties().partitioning.partition_count(), 1); } @@ -1080,10 +1081,11 @@ mod tests { .await .unwrap(); let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); - let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); + let total_files: usize = buckets.iter().map(|b| b.len()).sum(); assert_eq!(total_files, 5); - assert_eq!(scan.buckets().len(), 3); + assert_eq!(buckets.len(), 3); assert!(matches!( scan.properties().partitioning, Partitioning::UnknownPartitioning(3) @@ -1107,8 +1109,9 @@ mod tests { .await .unwrap(); let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); - assert_eq!(scan.buckets().len(), 2); + assert_eq!(buckets.len(), 2); } /// target_partitions = 1 collapses every task into a single bucket, giving @@ -1127,9 +1130,10 @@ mod tests { .await .unwrap(); let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); - assert_eq!(scan.buckets().len(), 
1); - assert_eq!(scan.buckets()[0].len(), 4); + assert_eq!(buckets.len(), 1); + assert_eq!(buckets[0].len(), 4); } async fn make_partitioned_catalog_and_table_for_bucketing() @@ -1262,8 +1266,9 @@ mod tests { .await .unwrap(); let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); - let total_files: usize = scan.buckets().iter().map(|b| b.len()).sum(); + let total_files: usize = buckets.iter().map(|b| b.len()).sum(); assert_eq!(total_files, 6); match &scan.properties().partitioning { From c748e2322492f85fbeefa7df025a62904905cbdb Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 15:53:39 +0200 Subject: [PATCH 17/32] refactor(datafusion): introduce IcebergTableScanBuilder --- .../datafusion/src/physical_plan/scan.rs | 153 ++++++++++-------- .../integrations/datafusion/src/table/mod.rs | 39 ++--- 2 files changed, 105 insertions(+), 87 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index ea254c9146..79e87a9a4f 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -45,9 +45,9 @@ fn available_parallelism() -> NonZeroUsize { /// Iceberg [`Table`] scan as a DataFusion [`ExecutionPlan`]. /// -/// Has two construction modes: [`IcebergTableScan::new`] for a lazy -/// single-partition scan, and [`IcebergTableScan::new_with_tasks`] for an -/// eager multi-partition scan over pre-planned [`FileScanTask`] buckets. +/// Has two construction modes: lazy single-partition scans that plan files +/// inside `execute(0)`, and eager multi-partition scans over pre-planned +/// [`FileScanTask`] buckets. /// /// Note: in eager mode the underlying `TableScan` is rebuilt on every /// `execute(partition)` call. 
The per-build cost is bounded (no I/O) and @@ -71,92 +71,109 @@ pub struct IcebergTableScan { limit: Option, } -impl IcebergTableScan { - /// Creates a lazy single-partition scan that plans and reads all tasks - /// inside `execute(0)`. Used by - /// [`IcebergStaticTableProvider`][crate::table::IcebergStaticTableProvider]. - pub fn new( - table: Table, - snapshot_id: Option, - schema: ArrowSchemaRef, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, - ) -> Self { - Self::new_inner( +/// Builder to create an [`IcebergTableScan`]. +pub struct IcebergTableScanBuilder { + table: Table, + snapshot_id: Option, + schema: ArrowSchemaRef, + projection: Option>, + filters: Vec, + limit: Option, + partitioning: Partitioning, + buckets: Option>>, +} + +impl IcebergTableScanBuilder { + /// Creates a builder for a lazy single-partition scan. + pub fn new(table: Table, schema: ArrowSchemaRef) -> Self { + Self { table, - snapshot_id, schema, - projection, - filters, - limit, - Partitioning::UnknownPartitioning(1), - None, - ) + snapshot_id: None, + projection: None, + filters: vec![], + limit: None, + partitioning: Partitioning::UnknownPartitioning(1), + buckets: None, + } + } + + /// Sets the snapshot to scan. When not set, it uses current snapshot. + pub fn with_snapshot_id(mut self, snapshot_id: Option) -> Self { + self.snapshot_id = snapshot_id; + self + } + + /// Sets the projected output columns. + pub fn with_projection(mut self, projection: Option<&Vec>) -> Self { + self.projection = projection.cloned(); + self + } + + /// Sets the filters to apply to the table scan. + pub fn with_filters(mut self, filters: &[Expr]) -> Self { + self.filters = filters.to_vec(); + self } - /// Creates an eager multi-partition scan over pre-planned task buckets. - /// Partition `i` streams `buckets[i]`. The caller is responsible for - /// ensuring `partitioning` matches the bucketing. Used by - /// [`IcebergTableProvider`][crate::table::IcebergTableProvider]. 
- #[allow(clippy::too_many_arguments)] - pub fn new_with_tasks( - table: Table, - snapshot_id: Option, - schema: ArrowSchemaRef, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, + /// Sets the optional row limit. + pub fn with_limit(mut self, limit: Option) -> Self { + self.limit = limit; + self + } + + /// Sets pre-planned task buckets for eager multi-partition scans. + pub fn with_task_buckets( + mut self, buckets: Vec>, partitioning: Partitioning, ) -> Self { - Self::new_inner( - table, - snapshot_id, - schema, - projection, - filters, - limit, - partitioning, - Some(buckets), - ) + self.buckets = Some(buckets); + self.partitioning = partitioning; + self } - #[allow(clippy::too_many_arguments)] - fn new_inner( - table: Table, // could we remove the Table from here ? - snapshot_id: Option, - schema: ArrowSchemaRef, - projection: Option<&Vec>, - filters: &[Expr], - limit: Option, - partitioning: Partitioning, - buckets: Option>>, - ) -> Self { - let output_schema = match projection { - None => schema.clone(), - Some(projection) => Arc::new(schema.project(projection).unwrap()), + /// Builds the [`IcebergTableScan`]. 
+ pub fn build(self) -> DFResult { + if let Some(buckets) = &self.buckets { + let partition_count = self.partitioning.partition_count(); + if buckets.len() != partition_count { + return Err(DataFusionError::Internal(format!( + "IcebergTableScan expected {} task buckets to match partitioning, got {}", + partition_count, + buckets.len() + ))); + } + } + + let output_schema = match &self.projection { + None => self.schema.clone(), + Some(projection) => Arc::new(self.schema.project(projection).map_err(|err| { + DataFusionError::Plan(format!("Failed to project Iceberg table schema: {err}")) + })?), }; let plan_properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(output_schema), - partitioning, + self.partitioning, EmissionType::Incremental, Boundedness::Bounded, )); - let projection = get_column_names(schema, projection); - let predicates = convert_filters_to_predicate(filters); + let projection = get_column_names(self.schema, self.projection.as_ref()); + let predicates = convert_filters_to_predicate(&self.filters); - Self { - table, - snapshot_id, + Ok(IcebergTableScan { + table: self.table, + snapshot_id: self.snapshot_id, plan_properties, projection, predicates, - buckets, - limit, - } + buckets: self.buckets, + limit: self.limit, + }) } +} +impl IcebergTableScan { pub fn table(&self) -> &Table { &self.table } diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index b4ade5b648..9302a5add4 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -59,7 +59,7 @@ use crate::physical_plan::commit::IcebergCommitExec; use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; use crate::physical_plan::project::project_with_partition; use crate::physical_plan::repartition::repartition; -use crate::physical_plan::scan::IcebergTableScan; +use crate::physical_plan::scan::IcebergTableScanBuilder; use 
crate::physical_plan::sort::sort_by_partition; use crate::physical_plan::write::IcebergWriteExec; @@ -206,16 +206,16 @@ impl TableProvider for IcebergTableProvider { _ => Partitioning::UnknownPartitioning(n_partitions), }; - Ok(Arc::new(IcebergTableScan::new_with_tasks( - table, - None, // Always use current snapshot for catalog-backed provider - self.schema.clone(), - projection, - filters, - limit, - buckets, - partitioning, - ))) + Ok(Arc::new( + IcebergTableScanBuilder::new(table, self.schema.clone()) + // Always use current snapshot for catalog-backed provider. + .with_snapshot_id(None) + .with_projection(projection) + .with_filters(filters) + .with_limit(limit) + .with_task_buckets(buckets, partitioning) + .build()?, + )) } fn supports_filters_pushdown( @@ -383,14 +383,14 @@ impl TableProvider for IcebergStaticTableProvider { filters: &[Expr], limit: Option, ) -> DFResult> { - Ok(Arc::new(IcebergTableScan::new( - self.table.clone(), - self.snapshot_id, - self.schema.clone(), - projection, - filters, - limit, - ))) + Ok(Arc::new( + IcebergTableScanBuilder::new(self.table.clone(), self.schema.clone()) + .with_snapshot_id(self.snapshot_id) + .with_projection(projection) + .with_filters(filters) + .with_limit(limit) + .build()?, + )) } fn supports_filters_pushdown( @@ -432,6 +432,7 @@ mod tests { use tempfile::TempDir; use super::*; + use crate::physical_plan::scan::IcebergTableScan; async fn get_test_table_from_metadata_file() -> Table { let metadata_file_name = "TableMetadataV2Valid.json"; From cd2b68b4736bb267d4bbdc47f1364ffccd96570b Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 16:31:12 +0200 Subject: [PATCH 18/32] perf(datafusion): avoid cloning task buckets during execute --- .../datafusion/src/physical_plan/scan.rs | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 
79e87a9a4f..b6e0fb4177 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -66,7 +66,7 @@ pub struct IcebergTableScan { /// Filters to apply to the table scan. predicates: Option, /// Pre-planned file scan tasks per partition (eager mode), or `None` (lazy mode). - buckets: Option>>, + buckets: Option]>>, /// Optional limit on the number of rows to return. limit: Option, } @@ -80,7 +80,7 @@ pub struct IcebergTableScanBuilder { filters: Vec, limit: Option, partitioning: Partitioning, - buckets: Option>>, + buckets: Option]>>, } impl IcebergTableScanBuilder { @@ -128,7 +128,11 @@ impl IcebergTableScanBuilder { buckets: Vec>, partitioning: Partitioning, ) -> Self { - self.buckets = Some(buckets); + let buckets = buckets + .into_iter() + .map(Arc::<[FileScanTask]>::from) + .collect::>(); + self.buckets = Some(Arc::<[Arc<[FileScanTask]>]>::from(buckets)); self.partitioning = partitioning; self } @@ -194,7 +198,7 @@ impl IcebergTableScan { /// /// `None` means lazy mode, where file tasks are planned inside `execute`; /// `Some` means eager mode, where `execute` reads from pre-planned buckets. 
- pub fn buckets(&self) -> Option<&[Vec]> { + pub fn buckets(&self) -> Option<&[Arc<[FileScanTask]>]> { self.buckets.as_deref() } @@ -245,13 +249,13 @@ impl ExecutionPlan for IcebergTableScan { _context: Arc, ) -> DFResult { let bucket = match &self.buckets { - Some(buckets) => Some(buckets.get(partition).cloned().ok_or_else(|| { + Some(buckets) => Some(Arc::clone(buckets.get(partition).ok_or_else(|| { DataFusionError::Internal(format!( "{}: partition index {partition} is out of bounds (total buckets: {})", self.name(), buckets.len() )) - })?), + })?)), None => None, }; @@ -338,12 +342,12 @@ async fn build_record_batch_stream( snapshot_id: Option, column_names: Option>, predicates: Option, - bucket: Option>, + bucket: Option>, ) -> DFResult> + Send>>> { let stream: Pin> + Send>> = match bucket { Some(bucket) => { let task_stream = Box::pin(futures::stream::iter( - bucket.into_iter().map(Ok::<_, iceberg::Error>), + (0..bucket.len()).map(move |idx| Ok::<_, iceberg::Error>(bucket[idx].clone())), )); let num_cpus = available_parallelism().get(); let arrow_reader_builder = ArrowReaderBuilder::new(table.file_io().clone()) From 8fe57aa24e951eed8016ba7e27de9e4b78b3d0c1 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 16:36:36 +0200 Subject: [PATCH 19/32] refactor(datafusion): centralize scan stream error mapping --- .../datafusion/src/physical_plan/scan.rs | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index b6e0fb4177..d7b689d0cc 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -344,7 +344,7 @@ async fn build_record_batch_stream( predicates: Option, bucket: Option>, ) -> DFResult> + Send>>> { - let stream: Pin> + Send>> = match bucket { + let stream: Pin> + Send>> = match bucket { Some(bucket) => { let 
task_stream = Box::pin(futures::stream::iter( (0..bucket.len()).map(move |idx| Ok::<_, iceberg::Error>(bucket[idx].clone())), @@ -360,22 +360,15 @@ async fn build_record_batch_stream( .build() .read(task_stream) .map_err(to_datafusion_error)? - .stream() - .map_err(to_datafusion_error), + .stream(), ) } None => { let table_scan = build_table_scan(table, snapshot_id, column_names, predicates)?; - Box::pin( - table_scan - .to_arrow() - .await - .map_err(to_datafusion_error)? - .map_err(to_datafusion_error), - ) + Box::pin(table_scan.to_arrow().await.map_err(to_datafusion_error)?) } }; - Ok(stream) + Ok(Box::pin(stream.map_err(to_datafusion_error))) } /// Truncates a stream of `RecordBatch` to at most `limit` rows. From ace9b7e17133e10d8ac5d9de78cbd0cbbcde32c6 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 16:57:54 +0200 Subject: [PATCH 20/32] test(datafusion): verify identity bucket hashes match repartitioning --- .../integrations/datafusion/src/table/mod.rs | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 9302a5add4..5b8be4b766 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -201,6 +201,13 @@ impl TableProvider for IcebergTableProvider { .iter() .map(|c| Arc::new(Column::new(&c.name, c.output_idx)) as Arc) .collect(); + // This declaration is only sound if the Arrow arrays built from + // partition literals hash identically to the column arrays the + // reader emits at scan time. DataFusion's hash dispatch is + // dtype-specific, so any drift in the reader output type (for + // example Utf8 vs Utf8View) must either update the bucketing + // path to materialize that exact dtype or fall back to + // UnknownPartitioning. 
Partitioning::Hash(exprs, n_partitions) } _ => Partitioning::UnknownPartitioning(n_partitions), @@ -1286,6 +1293,80 @@ mod tests { } } + /// Identity partition task buckets must match DataFusion's own hash + /// repartition bucket calculation for the same concrete Arrow array type. + #[tokio::test] + async fn test_identity_partitioned_hash_buckets_match_datafusion_repartition() { + use datafusion::arrow::array::{ArrayRef, StringArray}; + use datafusion::common::hash_utils::create_hashes; + use datafusion::physical_plan::Partitioning; + use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; + + let partition_values = vec!["a", "b", "c", "a", "b", "c", "z"]; + let n_partitions = 4_usize; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files( + &catalog, + &namespace, + &table_name, + partition_values.clone(), + ) + .await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan( + &ctx_with_target_partitions(n_partitions).state(), + None, + &[], + None, + ) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); + + assert!(matches!( + scan.properties().partitioning, + Partitioning::Hash(_, 4) + )); + + let arrays: Vec = vec![Arc::new(StringArray::from(partition_values))]; + let mut hashes = vec![0_u64; arrays[0].len()]; + create_hashes( + &arrays, + REPARTITION_RANDOM_STATE.random_state(), + &mut hashes, + ) + .unwrap(); + + let mut actual_bucket_by_file = vec![None; hashes.len()]; + for (bucket_idx, bucket) in buckets.iter().enumerate() { + for task in bucket.iter() { + let file_idx = task + .data_file_path() + .strip_suffix(".parquet") + .and_then(|path| path.rsplit_once("fake_").map(|(_, idx)| idx)) + .and_then(|idx| idx.parse::().ok()) + .expect("fake data file path should include its row 
index"); + actual_bucket_by_file[file_idx] = Some(bucket_idx); + } + } + + for (file_idx, hash) in hashes.iter().enumerate() { + let expected_bucket = (hash % n_partitions as u64) as usize; + assert_eq!( + actual_bucket_by_file[file_idx], + Some(expected_bucket), + "file {file_idx} should be assigned to DataFusion hash bucket {expected_bucket}" + ); + } + } + /// A projection that omits the partition source column drops /// `compute_identity_cols` to `None`, collapsing to `UnknownPartitioning`. #[tokio::test] From f3ac17d1a2cca2faeb0dc03c1a37a6bb422e93a7 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 17:40:51 +0200 Subject: [PATCH 21/32] fix(datafusion): reuse eager scan predicates during execution --- .../datafusion/src/physical_plan/scan.rs | 12 +++++++++++- crates/integrations/datafusion/src/table/mod.rs | 7 ++++--- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index d7b689d0cc..b41721c9de 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -77,6 +77,7 @@ pub struct IcebergTableScanBuilder { snapshot_id: Option, schema: ArrowSchemaRef, projection: Option>, + predicates: Option, filters: Vec, limit: Option, partitioning: Partitioning, @@ -91,6 +92,7 @@ impl IcebergTableScanBuilder { schema, snapshot_id: None, projection: None, + predicates: None, filters: vec![], limit: None, partitioning: Partitioning::UnknownPartitioning(1), @@ -110,6 +112,12 @@ impl IcebergTableScanBuilder { self } + /// Sets the predicates + pub fn with_predicates(mut self, predicates: Option) -> Self { + self.predicates = predicates; + self + } + /// Sets the filters to apply to the table scan. 
pub fn with_filters(mut self, filters: &[Expr]) -> Self { self.filters = filters.to_vec(); @@ -163,7 +171,9 @@ impl IcebergTableScanBuilder { Boundedness::Bounded, )); let projection = get_column_names(self.schema, self.projection.as_ref()); - let predicates = convert_filters_to_predicate(&self.filters); + let predicates = self + .predicates + .or_else(|| convert_filters_to_predicate(&self.filters)); Ok(IcebergTableScan { table: self.table, diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 5b8be4b766..34c89c204b 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -149,15 +149,15 @@ impl TableProvider for IcebergTableProvider { .collect::>() }); - let predicate = convert_filters_to_predicate(filters); + let predicates = convert_filters_to_predicate(filters); let mut builder = table.scan(); builder = match col_names { Some(names) => builder.select(names), None => builder.select_all(), }; - if let Some(pred) = predicate { - builder = builder.with_filter(pred); + if let Some(pred) = &predicates { + builder = builder.with_filter(pred.clone()); } let tasks: Vec = builder @@ -218,6 +218,7 @@ impl TableProvider for IcebergTableProvider { // Always use current snapshot for catalog-backed provider. 
.with_snapshot_id(None) .with_projection(projection) + .with_predicates(predicates) .with_filters(filters) .with_limit(limit) .with_task_buckets(buckets, partitioning) From b6369e2687f4b7254d0a799f57109e02052499df Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 17:46:03 +0200 Subject: [PATCH 22/32] perf(datafusion): batch identity partition hashing --- crates/iceberg/src/arrow/value.rs | 534 ++++++++++++------ .../datafusion/src/physical_plan/scan.rs | 2 + .../datafusion/src/table/bucketing.rs | 341 +++++++++-- .../integrations/datafusion/src/table/mod.rs | 5 +- 4 files changed, 664 insertions(+), 218 deletions(-) diff --git a/crates/iceberg/src/arrow/value.rs b/crates/iceberg/src/arrow/value.rs index d07233c420..ed465c0dca 100644 --- a/crates/iceberg/src/arrow/value.rs +++ b/crates/iceberg/src/arrow/value.rs @@ -17,6 +17,12 @@ use std::sync::Arc; +use arrow_array::builder::{ + BinaryBuilder, BooleanBuilder, Date32Builder, Decimal128Builder, FixedSizeBinaryBuilder, + Float32Builder, Float64Builder, Int32Builder, Int64Builder, LargeBinaryBuilder, + LargeStringBuilder, StringBuilder, Time64MicrosecondBuilder, TimestampMicrosecondBuilder, + TimestampNanosecondBuilder, +}; use arrow_array::{ Array, ArrayRef, BinaryArray, BooleanArray, Date32Array, Decimal128Array, FixedSizeBinaryArray, FixedSizeListArray, Float32Array, Float64Array, Int32Array, Int64Array, LargeBinaryArray, @@ -620,187 +626,309 @@ pub fn arrow_primitive_to_literal( ) } +enum PrimitiveLiteralArrayBuilderInner { + Boolean(BooleanBuilder), + Int32(Int32Builder), + Date32(Date32Builder), + Int64(Int64Builder), + Time64Microsecond(Time64MicrosecondBuilder), + TimestampMicrosecond(TimestampMicrosecondBuilder), + TimestampNanosecond(TimestampNanosecondBuilder), + Float32(Float32Builder), + Float64(Float64Builder), + Utf8(StringBuilder), + LargeUtf8(LargeStringBuilder), + Binary(BinaryBuilder), + LargeBinary(LargeBinaryBuilder), + Decimal128(Decimal128Builder), + 
FixedSizeBinary(FixedSizeBinaryBuilder), +} + +/// Incrementally build an Arrow array from Iceberg primitive literals. +/// +/// The builder's Arrow type is fixed at construction and must match the type +/// DataFusion or the reader will consume. `append_or_null` returns `true` only +/// when the provided literal matched that Arrow type and was appended as a +/// non-null value. +pub struct PrimitiveLiteralArrayBuilder { + inner: PrimitiveLiteralArrayBuilderInner, +} + +impl PrimitiveLiteralArrayBuilder { + /// Create a builder for supported primitive Arrow types. + pub fn try_new(data_type: &DataType, capacity: usize) -> Result { + let inner = match data_type { + DataType::Boolean => { + PrimitiveLiteralArrayBuilderInner::Boolean(BooleanBuilder::with_capacity(capacity)) + } + DataType::Int32 => { + PrimitiveLiteralArrayBuilderInner::Int32(Int32Builder::with_capacity(capacity)) + } + DataType::Date32 => { + PrimitiveLiteralArrayBuilderInner::Date32(Date32Builder::with_capacity(capacity)) + } + DataType::Int64 => { + PrimitiveLiteralArrayBuilderInner::Int64(Int64Builder::with_capacity(capacity)) + } + DataType::Time64(TimeUnit::Microsecond) => { + PrimitiveLiteralArrayBuilderInner::Time64Microsecond( + Time64MicrosecondBuilder::with_capacity(capacity), + ) + } + DataType::Timestamp(TimeUnit::Microsecond, _) => { + PrimitiveLiteralArrayBuilderInner::TimestampMicrosecond( + TimestampMicrosecondBuilder::with_capacity(capacity) + .with_data_type(data_type.clone()), + ) + } + DataType::Timestamp(TimeUnit::Nanosecond, _) => { + PrimitiveLiteralArrayBuilderInner::TimestampNanosecond( + TimestampNanosecondBuilder::with_capacity(capacity) + .with_data_type(data_type.clone()), + ) + } + DataType::Float32 => { + PrimitiveLiteralArrayBuilderInner::Float32(Float32Builder::with_capacity(capacity)) + } + DataType::Float64 => { + PrimitiveLiteralArrayBuilderInner::Float64(Float64Builder::with_capacity(capacity)) + } + DataType::Utf8 => PrimitiveLiteralArrayBuilderInner::Utf8( + 
StringBuilder::with_capacity(capacity, capacity), + ), + DataType::LargeUtf8 => PrimitiveLiteralArrayBuilderInner::LargeUtf8( + LargeStringBuilder::with_capacity(capacity, capacity), + ), + DataType::Binary => PrimitiveLiteralArrayBuilderInner::Binary( + BinaryBuilder::with_capacity(capacity, capacity), + ), + DataType::LargeBinary => PrimitiveLiteralArrayBuilderInner::LargeBinary( + LargeBinaryBuilder::with_capacity(capacity, capacity), + ), + DataType::Decimal128(_, _) => PrimitiveLiteralArrayBuilderInner::Decimal128( + Decimal128Builder::with_capacity(capacity).with_data_type(data_type.clone()), + ), + DataType::FixedSizeBinary(width) if *width >= 0 => { + PrimitiveLiteralArrayBuilderInner::FixedSizeBinary( + FixedSizeBinaryBuilder::with_capacity(capacity, *width), + ) + } + _ => { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Unsupported primitive literal array type: {data_type:?}"), + )); + } + }; + + Ok(Self { inner }) + } + + /// Append a primitive literal or a null value. + /// + /// Returns `false` when `prim_lit` is null or does not match the builder's + /// Arrow type. In either case a null is appended so all columns retain the + /// same row count. 
+ pub fn append_or_null(&mut self, prim_lit: Option<&PrimitiveLiteral>) -> Result { + let Some(prim_lit) = prim_lit else { + self.append_null(); + return Ok(false); + }; + + let appended = match (&mut self.inner, prim_lit) { + (PrimitiveLiteralArrayBuilderInner::Boolean(builder), PrimitiveLiteral::Boolean(v)) => { + builder.append_value(*v); + true + } + (PrimitiveLiteralArrayBuilderInner::Int32(builder), PrimitiveLiteral::Int(v)) => { + builder.append_value(*v); + true + } + (PrimitiveLiteralArrayBuilderInner::Date32(builder), PrimitiveLiteral::Int(v)) => { + builder.append_value(*v); + true + } + (PrimitiveLiteralArrayBuilderInner::Int64(builder), PrimitiveLiteral::Long(v)) => { + builder.append_value(*v); + true + } + ( + PrimitiveLiteralArrayBuilderInner::Time64Microsecond(builder), + PrimitiveLiteral::Long(v), + ) => { + builder.append_value(*v); + true + } + ( + PrimitiveLiteralArrayBuilderInner::TimestampMicrosecond(builder), + PrimitiveLiteral::Long(v), + ) => { + builder.append_value(*v); + true + } + ( + PrimitiveLiteralArrayBuilderInner::TimestampNanosecond(builder), + PrimitiveLiteral::Long(v), + ) => { + builder.append_value(*v); + true + } + (PrimitiveLiteralArrayBuilderInner::Float32(builder), PrimitiveLiteral::Float(v)) => { + builder.append_value(v.0); + true + } + (PrimitiveLiteralArrayBuilderInner::Float64(builder), PrimitiveLiteral::Double(v)) => { + builder.append_value(v.0); + true + } + (PrimitiveLiteralArrayBuilderInner::Utf8(builder), PrimitiveLiteral::String(v)) => { + builder.append_value(v.as_str()); + true + } + ( + PrimitiveLiteralArrayBuilderInner::LargeUtf8(builder), + PrimitiveLiteral::String(v), + ) => { + builder.append_value(v.as_str()); + true + } + (PrimitiveLiteralArrayBuilderInner::Binary(builder), PrimitiveLiteral::Binary(v)) => { + builder.append_value(v.as_slice()); + true + } + ( + PrimitiveLiteralArrayBuilderInner::LargeBinary(builder), + PrimitiveLiteral::Binary(v), + ) => { + builder.append_value(v.as_slice()); + true 
+ } + ( + PrimitiveLiteralArrayBuilderInner::Decimal128(builder), + PrimitiveLiteral::Int128(v), + ) => { + builder.append_value(*v); + true + } + ( + PrimitiveLiteralArrayBuilderInner::Decimal128(builder), + PrimitiveLiteral::UInt128(v), + ) => { + builder.append_value(*v as i128); + true + } + ( + PrimitiveLiteralArrayBuilderInner::FixedSizeBinary(builder), + PrimitiveLiteral::Binary(v), + ) => append_fixed_size_binary_or_null(builder, v.as_slice()), + ( + PrimitiveLiteralArrayBuilderInner::FixedSizeBinary(builder), + PrimitiveLiteral::UInt128(v), + ) => { + let bytes = Uuid::from_u128(*v).into_bytes(); + append_fixed_size_binary_or_null(builder, bytes.as_slice()) + } + (builder, _) => { + append_null_to_inner(builder); + false + } + }; + + Ok(appended) + } + + fn append_null(&mut self) { + append_null_to_inner(&mut self.inner); + } + + /// Finish the builder and return the typed Arrow array. + pub fn finish(mut self) -> Result { + Ok(match &mut self.inner { + PrimitiveLiteralArrayBuilderInner::Boolean(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Int32(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Date32(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Int64(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Time64Microsecond(builder) => { + Arc::new(builder.finish()) + } + PrimitiveLiteralArrayBuilderInner::TimestampMicrosecond(builder) => { + Arc::new(builder.finish()) + } + PrimitiveLiteralArrayBuilderInner::TimestampNanosecond(builder) => { + Arc::new(builder.finish()) + } + PrimitiveLiteralArrayBuilderInner::Float32(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Float64(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Utf8(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::LargeUtf8(builder) => Arc::new(builder.finish()), + 
PrimitiveLiteralArrayBuilderInner::Binary(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::LargeBinary(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::Decimal128(builder) => Arc::new(builder.finish()), + PrimitiveLiteralArrayBuilderInner::FixedSizeBinary(builder) => { + Arc::new(builder.finish()) + } + }) + } +} + +fn append_null_to_inner(builder: &mut PrimitiveLiteralArrayBuilderInner) { + match builder { + PrimitiveLiteralArrayBuilderInner::Boolean(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Int32(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Date32(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Int64(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Time64Microsecond(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::TimestampMicrosecond(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::TimestampNanosecond(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Float32(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Float64(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Utf8(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::LargeUtf8(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Binary(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::LargeBinary(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::Decimal128(builder) => builder.append_null(), + PrimitiveLiteralArrayBuilderInner::FixedSizeBinary(builder) => builder.append_null(), + } +} + +fn append_fixed_size_binary_or_null(builder: &mut FixedSizeBinaryBuilder, value: &[u8]) -> bool { + if builder.append_value(value).is_ok() { + true + } else { + builder.append_null(); + false + } +} + /// Create a single-element array from a primitive literal. 
/// /// This is used for creating constant arrays (Run-End Encoded arrays) where we need /// a single value that represents all rows. -pub(crate) fn create_primitive_array_single_element( +pub fn create_primitive_array_single_element( data_type: &DataType, prim_lit: &Option, ) -> Result { - match (data_type, prim_lit) { - (DataType::Boolean, Some(PrimitiveLiteral::Boolean(v))) => { - Ok(Arc::new(BooleanArray::from(vec![*v]))) - } - (DataType::Boolean, None) => Ok(Arc::new(BooleanArray::from(vec![Option::::None]))), - (DataType::Int32, Some(PrimitiveLiteral::Int(v))) => { - Ok(Arc::new(Int32Array::from(vec![*v]))) - } - (DataType::Int32, None) => Ok(Arc::new(Int32Array::from(vec![Option::::None]))), - (DataType::Date32, Some(PrimitiveLiteral::Int(v))) => { - Ok(Arc::new(Date32Array::from(vec![*v]))) - } - (DataType::Date32, None) => Ok(Arc::new(Date32Array::from(vec![Option::::None]))), - (DataType::Int64, Some(PrimitiveLiteral::Long(v))) => { - Ok(Arc::new(Int64Array::from(vec![*v]))) - } - (DataType::Int64, None) => Ok(Arc::new(Int64Array::from(vec![Option::::None]))), - (DataType::Timestamp(TimeUnit::Microsecond, timezone), Some(PrimitiveLiteral::Long(v))) => { - let array = TimestampMicrosecondArray::from(vec![*v]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone()))) - } else { - Ok(Arc::new(array)) - } - } - (DataType::Timestamp(TimeUnit::Microsecond, timezone), None) => { - let array = TimestampMicrosecondArray::from(vec![Option::::None]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone()))) - } else { - Ok(Arc::new(array)) - } - } - (DataType::Timestamp(TimeUnit::Nanosecond, timezone), Some(PrimitiveLiteral::Long(v))) => { - let array = TimestampNanosecondArray::from(vec![*v]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone()))) - } else { - Ok(Arc::new(array)) - } - } - (DataType::Timestamp(TimeUnit::Nanosecond, timezone), None) => { - let 
array = TimestampNanosecondArray::from(vec![Option::::None]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone()))) - } else { - Ok(Arc::new(array)) - } - } - (DataType::Float32, Some(PrimitiveLiteral::Float(v))) => { - Ok(Arc::new(Float32Array::from(vec![v.0]))) - } - (DataType::Float32, None) => Ok(Arc::new(Float32Array::from(vec![Option::::None]))), - (DataType::Float64, Some(PrimitiveLiteral::Double(v))) => { - Ok(Arc::new(Float64Array::from(vec![v.0]))) - } - (DataType::Float64, None) => Ok(Arc::new(Float64Array::from(vec![Option::::None]))), - (DataType::Utf8, Some(PrimitiveLiteral::String(v))) => { - Ok(Arc::new(StringArray::from(vec![v.as_str()]))) - } - (DataType::Utf8, None) => Ok(Arc::new(StringArray::from(vec![Option::<&str>::None]))), - (DataType::Binary, Some(PrimitiveLiteral::Binary(v))) => { - Ok(Arc::new(BinaryArray::from_vec(vec![v.as_slice()]))) - } - (DataType::Binary, None) => Ok(Arc::new(BinaryArray::from_opt_vec(vec![ - Option::<&[u8]>::None, - ]))), - (DataType::Decimal128(precision, scale), Some(PrimitiveLiteral::Int128(v))) => { - let array = Decimal128Array::from(vec![{ *v }]) - .with_precision_and_scale(*precision, *scale) - .map_err(|e| { - Error::new( - ErrorKind::DataInvalid, - format!( - "Failed to create Decimal128Array with precision {precision} and scale {scale}: {e}" - ), - ) - })?; - Ok(Arc::new(array)) - } - (DataType::Decimal128(precision, scale), Some(PrimitiveLiteral::UInt128(v))) => { - let array = Decimal128Array::from(vec![*v as i128]) - .with_precision_and_scale(*precision, *scale) - .map_err(|e| { - Error::new( - ErrorKind::DataInvalid, - format!( - "Failed to create Decimal128Array with precision {precision} and scale {scale}: {e}" - ), - ) - })?; - Ok(Arc::new(array)) - } - (DataType::Decimal128(precision, scale), None) => { - let array = Decimal128Array::from(vec![Option::::None]) - .with_precision_and_scale(*precision, *scale) - .map_err(|e| { - Error::new( - 
ErrorKind::DataInvalid, - format!( - "Failed to create Decimal128Array with precision {precision} and scale {scale}: {e}" - ), - ) - })?; - Ok(Arc::new(array)) - } - (DataType::Struct(fields), None) => { - // Create a single-element StructArray with nulls - let null_arrays: Vec = fields - .iter() - .map(|f| { - // Recursively create null arrays for struct fields - // For primitive fields in structs, use simple null arrays (not REE within struct) - match f.data_type() { - DataType::Boolean => { - Ok(Arc::new(BooleanArray::from(vec![Option::::None])) - as ArrayRef) - } - DataType::Int32 | DataType::Date32 => { - Ok(Arc::new(Int32Array::from(vec![Option::::None])) as ArrayRef) - } - DataType::Int64 => { - Ok(Arc::new(Int64Array::from(vec![Option::::None])) as ArrayRef) - } - DataType::Timestamp(TimeUnit::Microsecond, timezone) => { - let array = TimestampMicrosecondArray::from(vec![Option::::None]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone())) as ArrayRef) - } else { - Ok(Arc::new(array) as ArrayRef) - } - } - DataType::Timestamp(TimeUnit::Nanosecond, timezone) => { - let array = TimestampNanosecondArray::from(vec![Option::::None]); - if let Some(timezone) = timezone { - Ok(Arc::new(array.with_timezone(timezone.clone())) as ArrayRef) - } else { - Ok(Arc::new(array) as ArrayRef) - } - } - DataType::Float32 => { - Ok(Arc::new(Float32Array::from(vec![Option::::None])) as ArrayRef) - } - DataType::Float64 => { - Ok(Arc::new(Float64Array::from(vec![Option::::None])) as ArrayRef) - } - DataType::Utf8 => { - Ok(Arc::new(StringArray::from(vec![Option::<&str>::None])) as ArrayRef) - } - DataType::Binary => { - Ok( - Arc::new(BinaryArray::from_opt_vec(vec![Option::<&[u8]>::None])) - as ArrayRef, - ) - } - _ => Err(Error::new( - ErrorKind::Unexpected, - format!("Unsupported struct field type: {:?}", f.data_type()), - )), - } - }) - .collect::>>()?; - Ok(Arc::new(arrow_array::StructArray::new( - fields.clone(), - null_arrays, - 
Some(arrow_buffer::NullBuffer::new_null(1)), - ))) - } - _ => Err(Error::new( + if let (DataType::Struct(fields), None) = (data_type, prim_lit) { + let null_arrays = fields + .iter() + .map(|f| create_primitive_array_single_element(f.data_type(), &None)) + .collect::>>()?; + return Ok(Arc::new(arrow_array::StructArray::new( + fields.clone(), + null_arrays, + Some(arrow_buffer::NullBuffer::new_null(1)), + ))); + } + + let mut builder = PrimitiveLiteralArrayBuilder::try_new(data_type, 1)?; + let appended = builder.append_or_null(prim_lit.as_ref())?; + if prim_lit.is_some() && !appended { + return Err(Error::new( ErrorKind::Unexpected, format!("Unsupported constant type combination: {data_type:?} with {prim_lit:?}"), - )), + )); } + builder.finish() } /// Create a repeated array from a primitive literal for a given number of rows. @@ -1847,6 +1975,70 @@ mod test { } } + #[test] + fn test_primitive_literal_array_builder_timestamp_timezone_and_null() { + let target_type = DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())); + let mut builder = PrimitiveLiteralArrayBuilder::try_new(&target_type, 2).unwrap(); + let value = PrimitiveLiteral::Long(1_740_600_000_000_000); + + assert!(builder.append_or_null(Some(&value)).unwrap()); + assert!(!builder.append_or_null(None).unwrap()); + + let array = builder.finish().unwrap(); + assert_eq!(array.data_type(), &target_type); + assert_eq!(array.len(), 2); + assert!(array.is_null(1)); + } + + #[test] + fn test_primitive_literal_array_builder_large_binary() { + let mut builder = PrimitiveLiteralArrayBuilder::try_new(&DataType::LargeBinary, 2).unwrap(); + let value = PrimitiveLiteral::Binary(vec![1, 2, 3]); + + assert!(builder.append_or_null(Some(&value)).unwrap()); + assert!(!builder.append_or_null(None).unwrap()); + + let array = builder.finish().unwrap(); + let binary_array = array + .as_any() + .downcast_ref::() + .expect("expected LargeBinaryArray"); + assert_eq!(binary_array.value(0), &[1, 2, 3]); + 
assert!(binary_array.is_null(1)); + } + + #[test] + fn test_primitive_literal_array_builder_fixed_size_binary_uuid() { + let mut builder = + PrimitiveLiteralArrayBuilder::try_new(&DataType::FixedSizeBinary(16), 2).unwrap(); + let uuid_bytes = [7_u8; 16]; + let uuid = Uuid::from_bytes(uuid_bytes); + let uuid_value = PrimitiveLiteral::UInt128(uuid.as_u128()); + let wrong_width_value = PrimitiveLiteral::Binary(vec![1, 2]); + + assert!(builder.append_or_null(Some(&uuid_value)).unwrap()); + assert!(!builder.append_or_null(Some(&wrong_width_value)).unwrap()); + + let array = builder.finish().unwrap(); + let fixed_array = array + .as_any() + .downcast_ref::() + .expect("expected FixedSizeBinaryArray"); + assert_eq!(fixed_array.value(0), uuid_bytes.as_slice()); + assert!(fixed_array.is_null(1)); + } + + #[test] + fn test_create_single_element_errors_on_mismatched_literal() { + let value = PrimitiveLiteral::String("not an int".to_string()); + + assert!(create_primitive_array_single_element(&DataType::Int32, &Some(value)).is_err()); + + let null_array = create_primitive_array_single_element(&DataType::Int32, &None).unwrap(); + assert_eq!(null_array.len(), 1); + assert!(null_array.is_null(0)); + } + #[test] fn test_create_decimal_array_repeated_respects_precision() { // Ensure repeated arrays also respect target precision, not Arrow's default. 
diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index b41721c9de..ad0ae2b150 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -37,6 +37,8 @@ use iceberg::table::Table; use super::expr_to_predicate::convert_filters_to_predicate; use crate::to_datafusion_error; + +// TODO: use crate::util for available_parallelism const DEFAULT_PARALLELISM: usize = 1; fn available_parallelism() -> NonZeroUsize { std::thread::available_parallelism() diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs index 55ab23b6fd..dfa43306a8 100644 --- a/crates/integrations/datafusion/src/table/bucketing.rs +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -15,17 +15,12 @@ // specific language governing permissions and limitations // under the License. -use std::sync::Arc; - -use datafusion::arrow::array::{ - ArrayRef, BooleanArray, Date32Array, Float32Array, Float64Array, Int32Array, Int64Array, - StringArray, -}; -use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema}; +use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, TimeUnit}; use datafusion::common::hash_utils::create_hashes; use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; +use iceberg::arrow::PrimitiveLiteralArrayBuilder; use iceberg::scan::FileScanTask; -use iceberg::spec::{Literal, PrimitiveLiteral, Transform}; +use iceberg::spec::{Literal, Transform}; use iceberg::table::Table; /// Identity-partitioned column that is also present in the output projection @@ -45,7 +40,7 @@ pub(super) struct IdentityCol { /// `None` if any condition is violated: /// - the source column for an identity field is not in the output projection /// - the source column's Arrow type is not currently supported by -/// [`literal_to_array`] +/// the identity hash 
materialization path /// - the table has spec evolution (>1 historical specs), since older files /// may carry a partition tuple that does not align with the default spec /// @@ -92,7 +87,15 @@ fn is_supported_dtype(dt: &DataType) -> bool { | DataType::Float32 | DataType::Float64 | DataType::Utf8 + | DataType::LargeUtf8 | DataType::Date32 + | DataType::Time64(TimeUnit::Microsecond) + | DataType::Timestamp(TimeUnit::Microsecond, _) + | DataType::Timestamp(TimeUnit::Nanosecond, _) + | DataType::Binary + | DataType::LargeBinary + | DataType::Decimal128(_, _) + | DataType::FixedSizeBinary(_) ) } @@ -101,7 +104,7 @@ fn is_supported_dtype(dt: &DataType) -> bool { /// that key using DataFusion's repartition hash so the resulting partitioning /// matches what `RepartitionExec` would produce on the same data. Tasks /// missing partition data fall back to hashing `data_file_path`, which still -/// distributes evenly but breaks the `Hash` contract — the second tuple +/// distributes evenly but breaks the `Hash` contract; the second tuple /// element flags whether every task supplied a full identity key. 
pub(super) fn bucket_tasks( tasks: Vec, @@ -114,45 +117,88 @@ pub(super) fn bucket_tasks( let mut buckets: Vec> = (0..n_partitions).map(|_| Vec::new()).collect(); let mut all_full_key = true; let cols = identity_cols.unwrap_or(&[]); + let identity_hashes = identity_hashes_for_tasks(&tasks, cols); - for task in tasks { - let bucket_idx = match identity_hash(&task, cols) { - Some(h) => (h % n_partitions as u64) as usize, + for (task_idx, task) in tasks.into_iter().enumerate() { + let bucket_idx = match &identity_hashes { + Some(identity_hashes) if identity_hashes.full_key_by_task[task_idx] => { + (identity_hashes.hashes[task_idx] % n_partitions as u64) as usize + } None => { all_full_key = false; fallback_hash(&task) as usize % n_partitions } + Some(_) => { + all_full_key = false; + fallback_hash(&task) as usize % n_partitions + } }; buckets[bucket_idx].push(task); } (buckets, all_full_key) } -/// Hash the identity-partition values of `task` using -/// [`REPARTITION_RANDOM_STATE`] so the bucket assignment matches DataFusion's -/// hash-repartition convention. Returns `None` if the task lacks partition -/// data or any required slot is null/unsupported. -fn identity_hash(task: &FileScanTask, cols: &[IdentityCol]) -> Option { +struct IdentityHashes { + hashes: Vec, + full_key_by_task: Vec, +} + +/// Hash all identity-partition values using [`REPARTITION_RANDOM_STATE`] so the +/// bucket assignment matches DataFusion's hash-repartition convention. The +/// returned `full_key_by_task` marks rows whose task supplied every identity key +/// slot with a supported non-null literal. 
+fn identity_hashes_for_tasks( + tasks: &[FileScanTask], + cols: &[IdentityCol], +) -> Option { if cols.is_empty() { return None; } - let partition = task.partition.as_ref()?; - let mut arrays: Vec = Vec::with_capacity(cols.len()); - for col in cols { - let lit = partition.fields().get(col.spec_field_idx)?.as_ref()?; - arrays.push(literal_to_array(lit, &col.output_dtype)?); + + let mut builders = cols + .iter() + .map(|col| PrimitiveLiteralArrayBuilder::try_new(&col.output_dtype, tasks.len())) + .collect::>>() + .ok()?; + let mut full_key_by_task = Vec::with_capacity(tasks.len()); + + for task in tasks { + let partition_fields = task.partition.as_ref().map(|partition| partition.fields()); + let mut full_key = partition_fields.is_some(); + + for (builder, col) in builders.iter_mut().zip(cols) { + let lit = partition_fields + .and_then(|fields| fields.get(col.spec_field_idx)) + .and_then(|lit| lit.as_ref()); + let prim_lit = lit.and_then(|lit| match lit { + Literal::Primitive(prim) => Some(prim), + _ => None, + }); + let appended = builder.append_or_null(prim_lit).ok()?; + full_key = full_key && appended; + } + full_key_by_task.push(full_key); } - let mut hashes = vec![0u64; 1]; + + let arrays = builders + .into_iter() + .map(PrimitiveLiteralArrayBuilder::finish) + .collect::>>() + .ok()?; + let mut hashes = vec![0u64; tasks.len()]; create_hashes( &arrays, REPARTITION_RANDOM_STATE.random_state(), &mut hashes, ) .ok()?; - Some(hashes[0]) + Some(IdentityHashes { + hashes, + full_key_by_task, + }) } -/// Deterministic per-file fallback used when `identity_hash` cannot produce a +/// Deterministic per-file fallback used when identity hashing cannot produce a /// bucket. The hash function does not need to match DataFusion's because any /// task taking this path causes the scan to drop to `UnknownPartitioning`. 
fn fallback_hash(task: &FileScanTask) -> u64 { @@ -163,25 +209,230 @@ fn fallback_hash(task: &FileScanTask) -> u64 { hasher.finish() } -/// Materialize a single-element Arrow array of `dt` holding the value of -/// `lit`. The Arrow type must match what DataFusion will see for this column -/// at scan time, otherwise `create_hashes` would dispatch on a different type -/// and produce a hash that disagrees with DataFusion's row-wise hashing. -fn literal_to_array(lit: &Literal, dt: &DataType) -> Option { - let prim = match lit { - Literal::Primitive(p) => p, - _ => return None, +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::array::{ + ArrayRef, Decimal128Array, Int32Array, StringArray, TimestampMicrosecondArray, + }; + use iceberg::spec::{ + DataFileFormat, Literal, NestedField, PrimitiveType, Schema, Struct, Type, }; - Some(match (prim, dt) { - (PrimitiveLiteral::Boolean(v), DataType::Boolean) => Arc::new(BooleanArray::from(vec![*v])), - (PrimitiveLiteral::Int(v), DataType::Int32) => Arc::new(Int32Array::from(vec![*v])), - (PrimitiveLiteral::Int(v), DataType::Date32) => Arc::new(Date32Array::from(vec![*v])), - (PrimitiveLiteral::Long(v), DataType::Int64) => Arc::new(Int64Array::from(vec![*v])), - (PrimitiveLiteral::Float(v), DataType::Float32) => Arc::new(Float32Array::from(vec![v.0])), - (PrimitiveLiteral::Double(v), DataType::Float64) => Arc::new(Float64Array::from(vec![v.0])), - (PrimitiveLiteral::String(v), DataType::Utf8) => { - Arc::new(StringArray::from(vec![v.as_str()])) + + use super::*; + + fn scan_task(file_idx: usize, partition: Option) -> FileScanTask { + FileScanTask { + file_size_in_bytes: 1, + start: 0, + length: 1, + record_count: Some(1), + data_file_path: format!("/tmp/file_{file_idx}.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: Arc::new( + Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + 
NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)) + .into(), + ]) + .build() + .unwrap(), + ), + project_field_ids: vec![1, 2], + predicate: None, + deletes: Vec::new(), + partition, + partition_spec: None, + name_mapping: None, + case_sensitive: true, } - _ => return None, - }) + } + + fn bucket_by_file_index( + buckets: &[Vec], + file_count: usize, + ) -> Vec> { + let mut actual_bucket_by_file = vec![None; file_count]; + for (bucket_idx, bucket) in buckets.iter().enumerate() { + for task in bucket { + let file_idx = task + .data_file_path() + .strip_suffix(".parquet") + .and_then(|path| path.rsplit_once("file_").map(|(_, idx)| idx)) + .and_then(|idx| idx.parse::().ok()) + .expect("test data file path should include its row index"); + actual_bucket_by_file[file_idx] = Some(bucket_idx); + } + } + actual_bucket_by_file + } + + #[test] + fn bucket_tasks_hashes_multiple_identity_columns() { + let rows = vec![(1, "a"), (2, "b"), (1, "b"), (3, "c"), (2, "a")]; + let tasks = rows + .iter() + .enumerate() + .map(|(idx, (id, name))| { + scan_task( + idx, + Some(Struct::from_iter(vec![ + Some(Literal::int(*id)), + Some(Literal::string(*name)), + ])), + ) + }) + .collect::>(); + let cols = vec![ + IdentityCol { + name: "id".to_string(), + output_idx: 0, + spec_field_idx: 0, + output_dtype: DataType::Int32, + }, + IdentityCol { + name: "name".to_string(), + output_idx: 1, + spec_field_idx: 1, + output_dtype: DataType::Utf8, + }, + ]; + let n_partitions = 4_usize; + + let (buckets, all_full_key) = bucket_tasks(tasks, n_partitions, Some(&cols)); + + assert!(all_full_key); + let arrays: Vec = vec![ + Arc::new(Int32Array::from( + rows.iter().map(|(id, _)| *id).collect::>(), + )), + Arc::new(StringArray::from( + rows.iter().map(|(_, name)| *name).collect::>(), + )), + ]; + let mut hashes = vec![0_u64; rows.len()]; + create_hashes( + &arrays, + REPARTITION_RANDOM_STATE.random_state(), + &mut hashes, + ) + .unwrap(); + + let actual_bucket_by_file = 
bucket_by_file_index(&buckets, rows.len()); + for (file_idx, hash) in hashes.iter().enumerate() { + let expected_bucket = (hash % n_partitions as u64) as usize; + assert_eq!(actual_bucket_by_file[file_idx], Some(expected_bucket)); + } + } + + #[test] + fn bucket_tasks_hashes_decimal_and_timestamp_identity_columns() { + let rows = vec![ + (100_i128, 1_740_600_000_000_000_i64), + (200_i128, 1_740_600_100_000_000_i64), + (100_i128, 1_740_600_200_000_000_i64), + ]; + let tasks = rows + .iter() + .enumerate() + .map(|(idx, (price, ts))| { + scan_task( + idx, + Some(Struct::from_iter(vec![ + Some(Literal::decimal(*price)), + Some(Literal::timestamp(*ts)), + ])), + ) + }) + .collect::>(); + let timestamp_type = DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())); + let cols = vec![ + IdentityCol { + name: "price".to_string(), + output_idx: 0, + spec_field_idx: 0, + output_dtype: DataType::Decimal128(18, 2), + }, + IdentityCol { + name: "ts".to_string(), + output_idx: 1, + spec_field_idx: 1, + output_dtype: timestamp_type, + }, + ]; + let n_partitions = 4_usize; + + let (buckets, all_full_key) = bucket_tasks(tasks, n_partitions, Some(&cols)); + + assert!(all_full_key); + let decimal_array = + Decimal128Array::from(rows.iter().map(|(price, _)| *price).collect::>()) + .with_precision_and_scale(18, 2) + .unwrap(); + let timestamp_array = + TimestampMicrosecondArray::from(rows.iter().map(|(_, ts)| *ts).collect::>()) + .with_timezone("UTC"); + let arrays: Vec = vec![Arc::new(decimal_array), Arc::new(timestamp_array)]; + let mut hashes = vec![0_u64; rows.len()]; + create_hashes( + &arrays, + REPARTITION_RANDOM_STATE.random_state(), + &mut hashes, + ) + .unwrap(); + + let actual_bucket_by_file = bucket_by_file_index(&buckets, rows.len()); + for (file_idx, hash) in hashes.iter().enumerate() { + let expected_bucket = (hash % n_partitions as u64) as usize; + assert_eq!(actual_bucket_by_file[file_idx], Some(expected_bucket)); + } + } + + #[test] + fn 
bucket_tasks_falls_back_per_task_for_missing_identity_key() { + let tasks = vec![ + scan_task(0, Some(Struct::from_iter(vec![Some(Literal::string("a"))]))), + scan_task(1, Some(Struct::from_iter(vec![None::]))), + scan_task(2, Some(Struct::from_iter(vec![Some(Literal::string("c"))]))), + scan_task(3, None), + ]; + let expected_tasks = tasks.clone(); + let cols = vec![IdentityCol { + name: "name".to_string(), + output_idx: 1, + spec_field_idx: 0, + output_dtype: DataType::Utf8, + }]; + let n_partitions = 5_usize; + + let (buckets, all_full_key) = bucket_tasks(tasks, n_partitions, Some(&cols)); + + assert!(!all_full_key); + let arrays: Vec = vec![Arc::new(StringArray::from(vec![ + Some("a"), + None, + Some("c"), + None, + ]))]; + let mut hashes = vec![0_u64; expected_tasks.len()]; + create_hashes( + &arrays, + REPARTITION_RANDOM_STATE.random_state(), + &mut hashes, + ) + .unwrap(); + + let actual_bucket_by_file = bucket_by_file_index(&buckets, expected_tasks.len()); + for file_idx in [0_usize, 2] { + let expected_bucket = (hashes[file_idx] % n_partitions as u64) as usize; + assert_eq!(actual_bucket_by_file[file_idx], Some(expected_bucket)); + } + for file_idx in [1_usize, 3] { + let expected_bucket = fallback_hash(&expected_tasks[file_idx]) as usize % n_partitions; + assert_eq!(actual_bucket_by_file[file_idx], Some(expected_bucket)); + } + } } diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 34c89c204b..c168f2ae07 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -188,8 +188,9 @@ impl TableProvider for IcebergTableProvider { // identity_cols is Some(non-empty) iff every condition for declaring // Partitioning::Hash is met: the table's default spec has identity-transform // fields, every such source column is present in the output projection, and - // every column type is supported by literal_to_array. 
Any miss collapses to - // None, which forces UnknownPartitioning regardless of bucketing strategy. + // every column type is supported by the identity hash materialization path. + // Any miss collapses to None, which forces UnknownPartitioning regardless + // of bucketing strategy. let identity_cols = bucketing::compute_identity_cols(&table, &output_schema); let (buckets, all_had_full_key) = From 6f17acf6474c2433f738d416eb4125a70f16fada Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Mon, 15 Jun 2026 18:31:21 +0200 Subject: [PATCH 23/32] refactor(datafusion): share Iceberg scan config --- .../datafusion/src/physical_plan/scan.rs | 88 ++++++++++++------- .../integrations/datafusion/src/table/mod.rs | 88 +++++++++++-------- 2 files changed, 108 insertions(+), 68 deletions(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index ad0ae2b150..250c2878f6 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -79,13 +79,18 @@ pub struct IcebergTableScanBuilder { snapshot_id: Option, schema: ArrowSchemaRef, projection: Option>, - predicates: Option, filters: Vec, limit: Option, partitioning: Partitioning, buckets: Option]>>, } +pub(crate) struct TableScanConfig { + snapshot_id: Option, + column_names: Option>, + predicates: Option, +} + impl IcebergTableScanBuilder { /// Creates a builder for a lazy single-partition scan. 
pub fn new(table: Table, schema: ArrowSchemaRef) -> Self { @@ -94,7 +99,6 @@ impl IcebergTableScanBuilder { schema, snapshot_id: None, projection: None, - predicates: None, filters: vec![], limit: None, partitioning: Partitioning::UnknownPartitioning(1), @@ -114,12 +118,6 @@ impl IcebergTableScanBuilder { self } - /// Sets the predicates - pub fn with_predicates(mut self, predicates: Option) -> Self { - self.predicates = predicates; - self - } - /// Sets the filters to apply to the table scan. pub fn with_filters(mut self, filters: &[Expr]) -> Self { self.filters = filters.to_vec(); @@ -147,8 +145,44 @@ impl IcebergTableScanBuilder { self } + pub(crate) fn table_scan_config(&self) -> TableScanConfig { + TableScanConfig { + snapshot_id: self.snapshot_id, + column_names: get_column_names(self.schema.clone(), self.projection.as_ref()), + predicates: convert_filters_to_predicate(&self.filters), + } + } + + /// Returns the Arrow schema produced by this scan after projection. + pub(crate) fn output_schema(&self) -> DFResult { + match &self.projection { + None => Ok(self.schema.clone()), + Some(projection) => Ok(Arc::new(self.schema.project(projection).map_err( + |err| { + DataFusionError::Plan(format!("Failed to project Iceberg table schema: {err}")) + }, + )?)), + } + } + + /// Builds the underlying Iceberg [`TableScan`] using the same inputs as this plan. + pub(crate) fn build_iceberg_table_scan( + &self, + table_scan_config: &TableScanConfig, + ) -> DFResult { + build_iceberg_table_scan_from_config(&self.table, table_scan_config) + } + /// Builds the [`IcebergTableScan`]. 
pub fn build(self) -> DFResult { + let table_scan_config = self.table_scan_config(); + self.build_with_table_scan_config(table_scan_config) + } + + pub(crate) fn build_with_table_scan_config( + self, + table_scan_config: TableScanConfig, + ) -> DFResult { if let Some(buckets) = &self.buckets { let partition_count = self.partitioning.partition_count(); if buckets.len() != partition_count { @@ -160,29 +194,20 @@ impl IcebergTableScanBuilder { } } - let output_schema = match &self.projection { - None => self.schema.clone(), - Some(projection) => Arc::new(self.schema.project(projection).map_err(|err| { - DataFusionError::Plan(format!("Failed to project Iceberg table schema: {err}")) - })?), - }; + let output_schema = self.output_schema()?; let plan_properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(output_schema), self.partitioning, EmissionType::Incremental, Boundedness::Bounded, )); - let projection = get_column_names(self.schema, self.projection.as_ref()); - let predicates = self - .predicates - .or_else(|| convert_filters_to_predicate(&self.filters)); Ok(IcebergTableScan { table: self.table, - snapshot_id: self.snapshot_id, + snapshot_id: table_scan_config.snapshot_id, plan_properties, - projection, - predicates, + projection: table_scan_config.column_names, + predicates: table_scan_config.predicates, buckets: self.buckets, limit: self.limit, }) @@ -326,21 +351,19 @@ impl DisplayAs for IcebergTableScan { } } -fn build_table_scan( - table: Table, - snapshot_id: Option, - column_names: Option>, - predicates: Option, +fn build_iceberg_table_scan_from_config( + table: &Table, + table_scan_config: &TableScanConfig, ) -> DFResult { - let scan_builder = match snapshot_id { + let scan_builder = match table_scan_config.snapshot_id { Some(id) => table.scan().snapshot_id(id), None => table.scan(), }; - let mut scan_builder = match column_names { + let mut scan_builder = match table_scan_config.column_names.clone() { Some(names) => 
scan_builder.select(names), None => scan_builder.select_all(), }; - if let Some(pred) = predicates { + if let Some(pred) = table_scan_config.predicates.clone() { scan_builder = scan_builder.with_filter(pred); } scan_builder.build().map_err(to_datafusion_error) @@ -376,7 +399,12 @@ async fn build_record_batch_stream( ) } None => { - let table_scan = build_table_scan(table, snapshot_id, column_names, predicates)?; + let table_scan_config = TableScanConfig { + snapshot_id, + column_names, + predicates, + }; + let table_scan = build_iceberg_table_scan_from_config(&table, &table_scan_config)?; Box::pin(table_scan.to_arrow().await.map_err(to_datafusion_error)?) } }; diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index c168f2ae07..379f3c23b1 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -56,7 +56,6 @@ use metadata_table::IcebergMetadataTableProvider; use crate::error::to_datafusion_error; use crate::physical_plan::commit::IcebergCommitExec; -use crate::physical_plan::expr_to_predicate::convert_filters_to_predicate; use crate::physical_plan::project::project_with_partition; use crate::physical_plan::repartition::repartition; use crate::physical_plan::scan::IcebergTableScanBuilder; @@ -140,29 +139,18 @@ impl TableProvider for IcebergTableProvider { .await .map_err(to_datafusion_error)?; - // Build a TableScan mirroring the inputs we'll hand to IcebergTableScan, - // so plan_files() uses the same projection/filters the scan will replay in execute(). 
- let col_names = projection.map(|indices| { - indices - .iter() - .map(|&i| self.schema.field(i).name().clone()) - .collect::>() - }); - - let predicates = convert_filters_to_predicate(filters); - - let mut builder = table.scan(); - builder = match col_names { - Some(names) => builder.select(names), - None => builder.select_all(), - }; - if let Some(pred) = &predicates { - builder = builder.with_filter(pred.clone()); - } - - let tasks: Vec = builder - .build() - .map_err(to_datafusion_error)? + // Use the same builder path for eager file planning and execution so + // snapshot, projection, and filter handling cannot drift. + let scan_builder = IcebergTableScanBuilder::new(table.clone(), self.schema.clone()) + // Always use current snapshot for catalog-backed provider. + .with_snapshot_id(None) + .with_projection(projection) + .with_filters(filters) + .with_limit(limit); + let table_scan_config = scan_builder.table_scan_config(); + + let tasks: Vec = scan_builder + .build_iceberg_table_scan(&table_scan_config)? .plan_files() .await .map_err(to_datafusion_error)? @@ -172,12 +160,7 @@ impl TableProvider for IcebergTableProvider { // Output schema after projection: column indices in `Hash` exprs and any // Arrow array we hash must reference this schema, not the full table schema. - let output_schema = match projection { - None => self.schema.clone(), - Some(p) => Arc::new(self.schema.project(p).map_err(|e| { - to_datafusion_error(Error::new(ErrorKind::DataInvalid, e.to_string())) - })?), - }; + let output_schema = scan_builder.output_schema()?; let target_partitions = state.config().target_partitions(); // Always produce at least 1 partition so that DataFusion can schedule @@ -215,15 +198,9 @@ impl TableProvider for IcebergTableProvider { }; Ok(Arc::new( - IcebergTableScanBuilder::new(table, self.schema.clone()) - // Always use current snapshot for catalog-backed provider. 
- .with_snapshot_id(None) - .with_projection(projection) - .with_predicates(predicates) - .with_filters(filters) - .with_limit(limit) + scan_builder .with_task_buckets(buckets, partitioning) - .build()?, + .build_with_table_scan_config(table_scan_config)?, )) } @@ -1146,6 +1123,41 @@ mod tests { assert_eq!(buckets[0].len(), 4); } + #[tokio::test] + async fn test_catalog_backed_eager_scan_uses_builder_projection_and_predicate() { + use datafusion::prelude::{col, lit}; + use iceberg::expr::Reference; + use iceberg::spec::Datum; + + let (catalog, namespace, table_name, _temp_dir) = + make_catalog_and_table_for_bucketing().await; + append_fake_data_files(&catalog, &namespace, &table_name, 2).await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let projection = vec![1_usize]; + let filters = vec![col("id").eq(lit(1_i32))]; + + let plan = provider + .scan( + &ctx_with_target_partitions(2).state(), + Some(&projection), + &filters, + None, + ) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + + assert!(scan.buckets().is_some(), "expected eager scan buckets"); + assert_eq!(scan.projection().unwrap(), &["name".to_string()]); + assert_eq!( + scan.predicates(), + Some(&Reference::new("id").equal_to(Datum::int(1))) + ); + } + async fn make_partitioned_catalog_and_table_for_bucketing() -> (Arc, NamespaceIdent, String, tempfile::TempDir) { use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; From 971aa0f055d576b5ef339597970884964cad701f Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 09:39:08 +0200 Subject: [PATCH 24/32] fix(datafusion): use unknown partitioning for empty eager scans --- .../integrations/datafusion/src/table/mod.rs | 66 ++++++++++++++----- 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 379f3c23b1..9054815eb7 100644 
--- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -163,10 +163,11 @@ impl TableProvider for IcebergTableProvider { let output_schema = scan_builder.output_schema()?; let target_partitions = state.config().target_partitions(); + let task_count = tasks.len(); // Always produce at least 1 partition so that DataFusion can schedule // the plan normally and callers can safely call execute(0). An empty // bucket simply yields an empty record-batch stream. - let n_partitions = target_partitions.min(tasks.len()).max(1); + let n_partitions = target_partitions.min(task_count).max(1); // identity_cols is Some(non-empty) iff every condition for declaring // Partitioning::Hash is met: the table's default spec has identity-transform @@ -179,22 +180,28 @@ impl TableProvider for IcebergTableProvider { let (buckets, all_had_full_key) = bucketing::bucket_tasks(tasks, n_partitions, identity_cols.as_deref()); - let partitioning = match identity_cols { - Some(cols) if !cols.is_empty() && all_had_full_key && n_partitions > 0 => { - let exprs: Vec> = cols - .iter() - .map(|c| Arc::new(Column::new(&c.name, c.output_idx)) as Arc) - .collect(); - // This declaration is only sound if the Arrow arrays built from - // partition literals hash identically to the column arrays the - // reader emits at scan time. DataFusion's hash dispatch is - // dtype-specific, so any drift in the reader output type (for - // example Utf8 vs Utf8View) must either update the bucketing - // path to materialize that exact dtype or fall back to - // UnknownPartitioning. 
- Partitioning::Hash(exprs, n_partitions) + let partitioning = if task_count == 0 { + Partitioning::UnknownPartitioning(n_partitions) + } else { + match identity_cols { + Some(cols) if !cols.is_empty() && all_had_full_key => { + let exprs: Vec> = cols + .iter() + .map(|c| { + Arc::new(Column::new(&c.name, c.output_idx)) as Arc + }) + .collect(); + // This declaration is only sound if the Arrow arrays built from + // partition literals hash identically to the column arrays the + // reader emits at scan time. DataFusion's hash dispatch is + // dtype-specific, so any drift in the reader output type (for + // example Utf8 vs Utf8View) must either update the bucketing + // path to materialize that exact dtype or fall back to + // UnknownPartitioning. + Partitioning::Hash(exprs, n_partitions) + } + _ => Partitioning::UnknownPartitioning(n_partitions), } - _ => Partitioning::UnknownPartitioning(n_partitions), }; Ok(Arc::new( @@ -1307,6 +1314,33 @@ mod tests { } } + /// Empty identity-partitioned tables still use one empty bucket, but do not + /// claim hash partitioning because there are no tasks proving a full key. 
+ #[tokio::test] + async fn test_empty_identity_partitioned_table_falls_back_to_unknown() { + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(8).state(), None, &[], None) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); + + assert_eq!(buckets.len(), 1); + assert_eq!(buckets[0].len(), 0); + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(1) + )); + } + /// Identity partition task buckets must match DataFusion's own hash /// repartition bucket calculation for the same concrete Arrow array type. #[tokio::test] From 2802af4384da211b90aae3d76f1f25da4f8be5bd Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 10:06:45 +0200 Subject: [PATCH 25/32] test(datafusion): cover hash partitioning fallback cases --- .../integrations/datafusion/src/table/mod.rs | 265 +++++++++++++++++- 1 file changed, 262 insertions(+), 3 deletions(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index 9054815eb7..a13e8beb15 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -414,6 +414,7 @@ mod tests { use std::collections::HashMap; use std::sync::Arc; + use async_trait::async_trait; use datafusion::common::Column; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::SessionContext; @@ -421,7 +422,9 @@ mod tests { use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::table::{StaticTable, Table}; - use iceberg::{Catalog, 
CatalogBuilder, NamespaceIdent, TableCreation, TableIdent}; + use iceberg::{ + Catalog, CatalogBuilder, Namespace, NamespaceIdent, TableCommit, TableCreation, TableIdent, + }; use tempfile::TempDir; use super::*; @@ -928,7 +931,7 @@ mod tests { ); } - // ── Bucketed scan tests ────────────────────────────────────────────────── + // Bucketed scan tests async fn make_catalog_and_table_for_bucketing() -> (Arc, NamespaceIdent, String, tempfile::TempDir) { @@ -1032,6 +1035,106 @@ mod tests { SessionContext::new_with_config(SessionConfig::new().with_target_partitions(n)) } + #[derive(Debug, Clone)] + struct SingleTableCatalog { + table: Table, + } + + impl SingleTableCatalog { + fn new(table: Table) -> Self { + Self { table } + } + } + + #[async_trait] + impl Catalog for SingleTableCatalog { + async fn list_namespaces( + &self, + _parent: Option<&NamespaceIdent>, + ) -> Result> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn create_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn get_namespace(&self, _namespace: &NamespaceIdent) -> Result { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn namespace_exists(&self, _namespace: &NamespaceIdent) -> Result { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn update_namespace( + &self, + _namespace: &NamespaceIdent, + _properties: HashMap, + ) -> Result<()> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn drop_namespace(&self, _namespace: &NamespaceIdent) -> Result<()> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn list_tables(&self, _namespace: &NamespaceIdent) -> Result> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + 
} + + async fn create_table( + &self, + _namespace: &NamespaceIdent, + _creation: TableCreation, + ) -> Result { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn load_table(&self, table: &TableIdent) -> Result
{ + if table == self.table.identifier() { + Ok(self.table.clone()) + } else { + Err(Error::new( + ErrorKind::DataInvalid, + format!("Unknown test table: {table}"), + )) + } + } + + async fn drop_table(&self, _table: &TableIdent) -> Result<()> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn purge_table(&self, _table: &TableIdent) -> Result<()> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn table_exists(&self, table: &TableIdent) -> Result { + Ok(table == self.table.identifier()) + } + + async fn rename_table(&self, _src: &TableIdent, _dest: &TableIdent) -> Result<()> { + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn register_table( + &self, + _table: &TableIdent, + _metadata_location: String, + ) -> Result
{ + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + + async fn update_table(&self, _commit: TableCommit) -> Result
{ + unimplemented!("SingleTableCatalog only supports load_table in these tests") + } + } + /// An empty table must produce a single empty-bucket scan so that DataFusion /// can schedule the plan normally. execute(0) on an empty bucket simply /// returns an empty record-batch stream. @@ -1234,6 +1337,21 @@ mod tests { namespace: &NamespaceIdent, table_name: &str, partition_values: Vec<&str>, + ) { + append_partitioned_fake_data_files_with_optional_values( + catalog, + namespace, + table_name, + partition_values.into_iter().map(Some).collect(), + ) + .await; + } + + async fn append_partitioned_fake_data_files_with_optional_values( + catalog: &Arc, + namespace: &NamespaceIdent, + table_name: &str, + partition_values: Vec>, ) { use iceberg::spec::{DataContentType, DataFileBuilder, DataFileFormat, Literal, Struct}; use iceberg::transaction::{ApplyTransactionAction, Transaction}; @@ -1257,7 +1375,9 @@ mod tests { .file_size_in_bytes(128) .record_count(1) .partition_spec_id(table.metadata().default_partition_spec_id()) - .partition(Struct::from_iter(vec![Some(Literal::string(*value))])) + .partition(Struct::from_iter(vec![ + value.as_ref().map(|value| Literal::string(*value)), + ])) .build() .unwrap() }) @@ -1415,6 +1535,145 @@ mod tests { } } + fn table_with_additional_partition_spec(table: &Table) -> Table { + use iceberg::TableUpdate; + use iceberg::spec::{Transform, UnboundPartitionSpec}; + + let extra_spec = UnboundPartitionSpec::builder() + .with_spec_id(1) + .add_partition_field(1, "id_part", Transform::Identity) + .unwrap() + .build(); + let metadata = TableUpdate::AddSpec { spec: extra_spec } + .apply(table.metadata().clone().into_builder(None)) + .unwrap() + .build() + .unwrap() + .metadata; + + let mut builder = Table::builder() + .file_io(table.file_io().clone()) + .metadata(Arc::new(metadata)) + .identifier(table.identifier().clone()); + if let Some(metadata_location) = table.metadata_location() { + builder = builder.metadata_location(metadata_location); + } 
+ builder.build().unwrap() + } + + /// If a table has partition spec evolution, older files may have partition + /// tuples that do not align with the default spec. The scan must therefore + /// keep the eager buckets but avoid declaring hash partitioning. + #[tokio::test] + async fn test_spec_evolution_falls_back_to_unknown_partitioning() { + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files(&catalog, &namespace, &table_name, vec![ + "a", "b", "c", "d", + ]) + .await; + + let table_ident = TableIdent::new(namespace.clone(), table_name.clone()); + let table = catalog.load_table(&table_ident).await.unwrap(); + let evolved_table = table_with_additional_partition_spec(&table); + assert_eq!(evolved_table.metadata().partition_specs_iter().len(), 2); + + let provider = IcebergTableProvider::try_new( + Arc::new(SingleTableCatalog::new(evolved_table)), + namespace, + table_name, + ) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(4).state(), None, &[], None) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(4) + )); + } + + /// If the scan output dtype for an identity partition source cannot be + /// materialized for DataFusion-compatible hashing, the hash declaration is + /// unsound. Timestamp dtypes are supported here, so this uses `Utf8View` as + /// a deliberately unsupported output dtype. 
+ #[tokio::test] + async fn test_unsupported_output_partition_dtype_falls_back_to_unknown_partitioning() { + use datafusion::arrow::datatypes::{DataType, Field as ArrowField, Schema as ArrowSchema}; + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files(&catalog, &namespace, &table_name, vec![ + "a", "b", "c", "d", + ]) + .await; + + let table_ident = TableIdent::new(namespace.clone(), table_name.clone()); + let table = catalog.load_table(&table_ident).await.unwrap(); + let schema = Arc::new(ArrowSchema::new(vec![ + ArrowField::new("id", DataType::Int32, false), + ArrowField::new("name", DataType::Utf8View, false), + ])); + let provider = IcebergTableProvider { + catalog: Arc::new(SingleTableCatalog::new(table)), + table_ident, + schema, + }; + + let plan = provider + .scan(&ctx_with_target_partitions(4).state(), None, &[], None) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(4) + )); + } + + /// A null identity partition value forces that task through fallback hashing. + /// Since at least one task did not have a full hash key, the scan must not + /// claim DataFusion hash partitioning. 
+ #[tokio::test] + async fn test_null_partition_value_falls_back_to_unknown_partitioning() { + use datafusion::physical_plan::Partitioning; + + let (catalog, namespace, table_name, _temp_dir) = + make_partitioned_catalog_and_table_for_bucketing().await; + append_partitioned_fake_data_files_with_optional_values( + &catalog, + &namespace, + &table_name, + vec![Some("a"), None, Some("c"), Some("d")], + ) + .await; + + let provider = IcebergTableProvider::try_new(catalog, namespace, table_name) + .await + .unwrap(); + let plan = provider + .scan(&ctx_with_target_partitions(4).state(), None, &[], None) + .await + .unwrap(); + let scan = plan.as_any().downcast_ref::().unwrap(); + let buckets = scan.buckets().expect("expected eager scan buckets"); + + let total_files: usize = buckets.iter().map(|bucket| bucket.len()).sum(); + assert_eq!(total_files, 4); + assert!(matches!( + scan.properties().partitioning, + Partitioning::UnknownPartitioning(4) + )); + } + /// A projection that omits the partition source column drops /// `compute_identity_cols` to `None`, collapsing to `UnknownPartitioning`. #[tokio::test] From f4c407f736200a60394429e3fe3828ed23885558 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 12:21:44 +0200 Subject: [PATCH 26/32] fix(datafusion): give table runtime to ArrowReaderBuilder::new() --- crates/iceberg/src/table.rs | 2 +- crates/integrations/datafusion/src/physical_plan/scan.rs | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/crates/iceberg/src/table.rs b/crates/iceberg/src/table.rs index 31feade038..d12a92f452 100644 --- a/crates/iceberg/src/table.rs +++ b/crates/iceberg/src/table.rs @@ -287,7 +287,7 @@ impl Table { } /// Returns the [`Runtime`] for this table. 
- pub(crate) fn runtime(&self) -> &Runtime { + pub fn runtime(&self) -> &Runtime { &self.runtime } diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 250c2878f6..73c331d2d5 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -385,10 +385,11 @@ async fn build_record_batch_stream( (0..bucket.len()).map(move |idx| Ok::<_, iceberg::Error>(bucket[idx].clone())), )); let num_cpus = available_parallelism().get(); - let arrow_reader_builder = ArrowReaderBuilder::new(table.file_io().clone()) - .with_data_file_concurrency_limit(num_cpus) - .with_row_group_filtering_enabled(true) - .with_row_selection_enabled(true); + let arrow_reader_builder = + ArrowReaderBuilder::new(table.file_io().clone(), table.runtime().clone()) + .with_data_file_concurrency_limit(num_cpus) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true); Box::pin( arrow_reader_builder From 277ee9ca87950e86f51494c074d79f12b123d1c4 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 12:45:52 +0200 Subject: [PATCH 27/32] fix(datafusion): give table runtime to table_with_additional_partition_spec --- crates/integrations/datafusion/src/table/mod.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index a13e8beb15..af4512308a 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -1554,7 +1554,8 @@ mod tests { let mut builder = Table::builder() .file_io(table.file_io().clone()) .metadata(Arc::new(metadata)) - .identifier(table.identifier().clone()); + .identifier(table.identifier().clone()) + .runtime(table.runtime().clone()); if let Some(metadata_location) = table.metadata_location() { builder = 
builder.metadata_location(metadata_location); } From 41ae275dc8e6c94107eed56d3628586fef266638 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 12:58:12 +0200 Subject: [PATCH 28/32] fix(datafusion): make check green --- crates/integrations/datafusion/src/table/bucketing.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs index dfa43306a8..cd8e334d60 100644 --- a/crates/integrations/datafusion/src/table/bucketing.rs +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -272,7 +272,7 @@ mod tests { #[test] fn bucket_tasks_hashes_multiple_identity_columns() { - let rows = vec![(1, "a"), (2, "b"), (1, "b"), (3, "c"), (2, "a")]; + let rows = [(1, "a"), (2, "b"), (1, "b"), (3, "c"), (2, "a")]; let tasks = rows .iter() .enumerate() @@ -330,7 +330,7 @@ mod tests { #[test] fn bucket_tasks_hashes_decimal_and_timestamp_identity_columns() { - let rows = vec![ + let rows = [ (100_i128, 1_740_600_000_000_000_i64), (200_i128, 1_740_600_100_000_000_i64), (100_i128, 1_740_600_200_000_000_i64), From 3d0297bcc7a2910b65f56e3373c509b3b4b2042b Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 14:21:05 +0200 Subject: [PATCH 29/32] refactor(datafusion): reuse iceberg::util::available_parallelism --- crates/iceberg/src/util/mod.rs | 2 +- crates/integrations/datafusion/src/physical_plan/scan.rs | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/crates/iceberg/src/util/mod.rs b/crates/iceberg/src/util/mod.rs index 3cf2eef9b0..3532b5e62c 100644 --- a/crates/iceberg/src/util/mod.rs +++ b/crates/iceberg/src/util/mod.rs @@ -34,7 +34,7 @@ const DEFAULT_PARALLELISM: usize = 1; /// are circumstances where the level of available /// parallelism can change during the lifetime of an executing /// process, but this should not be called in a hot loop. 
-pub(crate) fn available_parallelism() -> NonZeroUsize { +pub fn available_parallelism() -> NonZeroUsize { std::thread::available_parallelism().unwrap_or_else(|err| { tracing::warn!( error = %err, diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 73c331d2d5..762ee5590e 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -16,7 +16,6 @@ // under the License. use std::any::Any; -use std::num::NonZeroUsize; use std::pin::Pin; use std::sync::Arc; @@ -34,17 +33,11 @@ use iceberg::arrow::ArrowReaderBuilder; use iceberg::expr::Predicate; use iceberg::scan::{FileScanTask, TableScan}; use iceberg::table::Table; +use iceberg::util::available_parallelism; use super::expr_to_predicate::convert_filters_to_predicate; use crate::to_datafusion_error; -// TODO: use crate::util for available_parallelism -const DEFAULT_PARALLELISM: usize = 1; -fn available_parallelism() -> NonZeroUsize { - std::thread::available_parallelism() - .unwrap_or_else(|_err| NonZeroUsize::new(DEFAULT_PARALLELISM).unwrap()) -} - /// Iceberg [`Table`] scan as a DataFusion [`ExecutionPlan`]. /// /// Has two construction modes: lazy single-partition scans that plan files From 39da3c064dfa2141de9496086984c496c9354437 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 14:32:47 +0200 Subject: [PATCH 30/32] docs(datafusion): note count-based bucketing limitation Document that task bucketing distributes by file count, not file_size_in_bytes, and track the size-based bin-packing follow-up in https://github.com/apache/iceberg-rust/issues/128. 
--- .../datafusion/src/table/bucketing.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs index cd8e334d60..457e4146d1 100644 --- a/crates/integrations/datafusion/src/table/bucketing.rs +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -15,6 +15,23 @@ // specific language governing permissions and limitations // under the License. +//! Distribution of pre-planned [`FileScanTask`]s into per-partition buckets for +//! eager multi-partition scans. +//! +//! Tasks are distributed by *count*: each task is hashed (on its identity +//! partition key when available, otherwise on its data file path) and placed in +//! `hash % n_partitions`. This evens out the number of files per bucket but is +//! unaware of `file_size_in_bytes`, so a table mixing one large file with many +//! small ones can pile most of the bytes into a single bucket and serialize the +//! query on that partition. +//! +//! A size-aware strategy — first-fit-decreasing bin-packing on +//! `file_size_in_bytes` (optionally with a target split size), mirroring +//! iceberg-java's `TableScanUtil.planTaskGroups` / `BinPacking` — would spread +//! the work more evenly. The byte size is already carried on each +//! [`FileScanTask`], so this is a fairly contained extension; it is tracked as a +//! follow-up in <https://github.com/apache/iceberg-rust/issues/128>.
+ use datafusion::arrow::datatypes::{DataType, Schema as ArrowSchema, TimeUnit}; use datafusion::common::hash_utils::create_hashes; use datafusion::physical_plan::repartition::REPARTITION_RANDOM_STATE; From eff04fb1f33f0ff448a13eab36b5a87ebfb66e5d Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 14:55:01 +0200 Subject: [PATCH 31/32] docs(datafusion): explain conservative multi-spec bucketing gate Note that compute_identity_cols intentionally bails out on any partition spec evolution rather than intersecting identity fields across specs like iceberg-java does. Link the follow-up issue tracking that relaxation. Closes review comment on #2298. Tracked in apache/iceberg-rust#2658. --- crates/integrations/datafusion/src/table/bucketing.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/integrations/datafusion/src/table/bucketing.rs b/crates/integrations/datafusion/src/table/bucketing.rs index 457e4146d1..819ba3b3f3 100644 --- a/crates/integrations/datafusion/src/table/bucketing.rs +++ b/crates/integrations/datafusion/src/table/bucketing.rs @@ -68,6 +68,14 @@ pub(super) fn compute_identity_cols( output_schema: &ArrowSchema, ) -> Option> { let metadata = table.metadata(); + // iceberg-java is less conservative here: it intersects the identity fields + // present in every spec (`Partitioning.groupingKeyType` / + // `commonActiveFieldIds`) and still reports a grouping key on the columns + // that are identity-partitioned across all of them. We deliberately bail + // out on any spec evolution instead, because the bucketing path aligns each + // task's partition slot to the *default* spec and `FileScanTask` does not + // yet carry its own spec id to disambiguate. Tracked as a follow-up in + // <https://github.com/apache/iceberg-rust/issues/2658>.
if metadata.partition_specs_iter().len() > 1 { return None; } From d54684d411d96c86fb590f0f0df90e80022154da Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Tue, 16 Jun 2026 15:59:31 +0200 Subject: [PATCH 32/32] fix: generate-public-api --- crates/iceberg/public-api.txt | 9 +++++++++ crates/integrations/datafusion/public-api.txt | 10 +++++----- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/crates/iceberg/public-api.txt b/crates/iceberg/public-api.txt index eb2d4f932b..bb418b14e9 100644 --- a/crates/iceberg/public-api.txt +++ b/crates/iceberg/public-api.txt @@ -86,6 +86,11 @@ pub fn iceberg::arrow::PartitionValueCalculator::partition_type(&self) -> &icebe pub fn iceberg::arrow::PartitionValueCalculator::try_new(partition_spec: &iceberg::spec::PartitionSpec, table_schema: &iceberg::spec::Schema) -> iceberg::Result impl core::fmt::Debug for iceberg::arrow::PartitionValueCalculator pub fn iceberg::arrow::PartitionValueCalculator::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result +pub struct iceberg::arrow::PrimitiveLiteralArrayBuilder +impl iceberg::arrow::PrimitiveLiteralArrayBuilder +pub fn iceberg::arrow::PrimitiveLiteralArrayBuilder::append_or_null(&mut self, prim_lit: core::option::Option<&iceberg::spec::PrimitiveLiteral>) -> iceberg::Result +pub fn iceberg::arrow::PrimitiveLiteralArrayBuilder::finish(self) -> iceberg::Result +pub fn iceberg::arrow::PrimitiveLiteralArrayBuilder::try_new(data_type: &arrow_schema::datatype::DataType, capacity: usize) -> iceberg::Result pub struct iceberg::arrow::RecordBatchPartitionSplitter impl iceberg::arrow::RecordBatchPartitionSplitter pub fn iceberg::arrow::RecordBatchPartitionSplitter::split(&self, batch: &arrow_array::record_batch::RecordBatch) -> iceberg::Result> @@ -127,6 +132,7 @@ pub fn iceberg::arrow::arrow_schema_to_schema(schema: &arrow_schema::schema::Sch pub fn iceberg::arrow::arrow_schema_to_schema_auto_assign_ids(schema: &arrow_schema::schema::Schema) -> iceberg::Result 
pub fn iceberg::arrow::arrow_struct_to_literal(struct_array: &arrow_array::array::ArrayRef, ty: &iceberg::spec::StructType) -> iceberg::Result>> pub fn iceberg::arrow::arrow_type_to_type(ty: &arrow_schema::datatype::DataType) -> iceberg::Result +pub fn iceberg::arrow::create_primitive_array_single_element(data_type: &arrow_schema::datatype::DataType, prim_lit: &core::option::Option) -> iceberg::Result pub fn iceberg::arrow::datum_to_arrow_type_with_ree(datum: &iceberg::spec::Datum) -> arrow_schema::datatype::DataType pub fn iceberg::arrow::schema_to_arrow_schema(schema: &iceberg::spec::Schema) -> iceberg::Result pub fn iceberg::arrow::strip_metadata_from_schema(schema: &arrow_schema::schema::Schema) -> iceberg::Result @@ -1289,6 +1295,7 @@ pub fn iceberg::scan::TableScan::column_names(&self) -> core::option::Option<&[a pub async fn iceberg::scan::TableScan::plan_files(&self) -> iceberg::Result pub fn iceberg::scan::TableScan::snapshot(&self) -> core::option::Option<&iceberg::spec::SnapshotRef> pub async fn iceberg::scan::TableScan::to_arrow(&self) -> iceberg::Result +pub fn iceberg::scan::TableScan::to_arrow_from_tasks(&self, tasks: iceberg::scan::FileScanTaskStream) -> iceberg::Result impl core::fmt::Debug for iceberg::scan::TableScan pub fn iceberg::scan::TableScan::fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result pub struct iceberg::scan::TableScanBuilder<'a> @@ -3034,6 +3041,7 @@ pub fn iceberg::table::Table::metadata_location_result(&self) -> iceberg::Result pub fn iceberg::table::Table::metadata_ref(&self) -> iceberg::spec::TableMetadataRef pub fn iceberg::table::Table::reader_builder(&self) -> iceberg::arrow::ArrowReaderBuilder pub fn iceberg::table::Table::readonly(&self) -> bool +pub fn iceberg::table::Table::runtime(&self) -> &iceberg::Runtime pub fn iceberg::table::Table::scan(&self) -> iceberg::scan::TableScanBuilder<'_> impl core::clone::Clone for iceberg::table::Table pub fn iceberg::table::Table::clone(&self) -> iceberg::table::Table 
@@ -3094,6 +3102,7 @@ pub mod iceberg::util pub mod iceberg::util::snapshot pub fn iceberg::util::snapshot::ancestors_between(table_metadata: &iceberg::spec::TableMetadataRef, latest_snapshot_id: i64, oldest_snapshot_id: core::option::Option) -> impl core::iter::traits::iterator::Iterator + core::marker::Send pub fn iceberg::util::snapshot::ancestors_of(table_metadata: &iceberg::spec::TableMetadataRef, snapshot_id: i64) -> impl core::iter::traits::iterator::Iterator + core::marker::Send +pub fn iceberg::util::available_parallelism() -> core::num::nonzero::NonZeroUsize pub mod iceberg::writer pub mod iceberg::writer::base_writer pub mod iceberg::writer::base_writer::data_file_writer diff --git a/crates/integrations/datafusion/public-api.txt b/crates/integrations/datafusion/public-api.txt index d24bd9fc9e..40bb36f3b5 100644 --- a/crates/integrations/datafusion/public-api.txt +++ b/crates/integrations/datafusion/public-api.txt @@ -15,6 +15,7 @@ pub fn iceberg_datafusion::metadata_table::IcebergMetadataTableProvider::table_t pub mod iceberg_datafusion::physical_plan pub struct iceberg_datafusion::physical_plan::IcebergTableScan impl iceberg_datafusion::physical_plan::IcebergTableScan +pub fn iceberg_datafusion::physical_plan::IcebergTableScan::buckets(&self) -> core::option::Option<&[alloc::sync::Arc<[iceberg::scan::task::FileScanTask]>]> pub fn iceberg_datafusion::physical_plan::IcebergTableScan::limit(&self) -> core::option::Option pub fn iceberg_datafusion::physical_plan::IcebergTableScan::predicates(&self) -> core::option::Option<&iceberg::expr::predicate::Predicate> pub fn iceberg_datafusion::physical_plan::IcebergTableScan::projection(&self) -> core::option::Option<&[alloc::string::String]> @@ -27,11 +28,10 @@ pub fn iceberg_datafusion::physical_plan::IcebergTableScan::fmt_as(&self, _t: da impl datafusion_physical_plan::execution_plan::ExecutionPlan for iceberg_datafusion::physical_plan::IcebergTableScan pub fn 
iceberg_datafusion::physical_plan::IcebergTableScan::as_any(&self) -> &dyn core::any::Any pub fn iceberg_datafusion::physical_plan::IcebergTableScan::children(&self) -> alloc::vec::Vec<&alloc::sync::Arc<(dyn datafusion_physical_plan::execution_plan::ExecutionPlan + 'static)>> -pub fn iceberg_datafusion::physical_plan::IcebergTableScan::execute(&self, _partition: usize, _context: alloc::sync::Arc) -> datafusion_common::error::Result +pub fn iceberg_datafusion::physical_plan::IcebergTableScan::execute(&self, partition: usize, _context: alloc::sync::Arc) -> datafusion_common::error::Result pub fn iceberg_datafusion::physical_plan::IcebergTableScan::name(&self) -> &str pub fn iceberg_datafusion::physical_plan::IcebergTableScan::properties(&self) -> &alloc::sync::Arc -pub fn iceberg_datafusion::physical_plan::IcebergTableScan::with_new_children(self: alloc::sync::Arc, _children: alloc::vec::Vec>) -> datafusion_common::error::Result> -pub fn iceberg_datafusion::physical_plan::convert_filters_to_predicate(filters: &[datafusion_expr::expr::Expr]) -> core::option::Option +pub fn iceberg_datafusion::physical_plan::IcebergTableScan::with_new_children(self: alloc::sync::Arc, children: alloc::vec::Vec>) -> datafusion_common::error::Result> pub fn iceberg_datafusion::physical_plan::project_with_partition(input: alloc::sync::Arc, table: &iceberg::table::Table) -> datafusion_common::error::Result> pub mod iceberg_datafusion::table pub mod iceberg_datafusion::table::metadata_table @@ -81,7 +81,7 @@ pub fn iceberg_datafusion::IcebergTableProvider::fmt(&self, f: &mut core::fmt::F impl datafusion_catalog::table::TableProvider for iceberg_datafusion::IcebergTableProvider pub fn iceberg_datafusion::IcebergTableProvider::as_any(&self) -> &dyn core::any::Any pub fn iceberg_datafusion::IcebergTableProvider::insert_into<'life0, 'life1, 'async_trait>(&'life0 self, state: &'life1 dyn datafusion_session::session::Session, input: alloc::sync::Arc, _insert_op: 
datafusion_expr::logical_plan::dml::InsertOp) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait -pub fn iceberg_datafusion::IcebergTableProvider::scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(&'life0 self, _state: &'life1 dyn datafusion_session::session::Session, projection: core::option::Option<&'life2 alloc::vec::Vec>, filters: &'life3 [datafusion_expr::expr::Expr], limit: core::option::Option) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait +pub fn iceberg_datafusion::IcebergTableProvider::scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(&'life0 self, state: &'life1 dyn datafusion_session::session::Session, projection: core::option::Option<&'life2 alloc::vec::Vec>, filters: &'life3 [datafusion_expr::expr::Expr], limit: core::option::Option) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait pub fn iceberg_datafusion::IcebergTableProvider::schema(&self) -> arrow_schema::schema::SchemaRef pub fn iceberg_datafusion::IcebergTableProvider::supports_filters_pushdown(&self, filters: &[&datafusion_expr::expr::Expr]) -> datafusion_common::error::Result> pub fn iceberg_datafusion::IcebergTableProvider::table_type(&self) -> datafusion_expr::table_source::TableType @@ -128,7 +128,7 @@ pub fn iceberg_datafusion::IcebergTableProvider::fmt(&self, f: &mut core::fmt::F impl datafusion_catalog::table::TableProvider for iceberg_datafusion::IcebergTableProvider pub fn iceberg_datafusion::IcebergTableProvider::as_any(&self) -> &dyn core::any::Any pub fn iceberg_datafusion::IcebergTableProvider::insert_into<'life0, 'life1, 'async_trait>(&'life0 self, state: &'life1 dyn datafusion_session::session::Session, input: alloc::sync::Arc, _insert_op: 
datafusion_expr::logical_plan::dml::InsertOp) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait -pub fn iceberg_datafusion::IcebergTableProvider::scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(&'life0 self, _state: &'life1 dyn datafusion_session::session::Session, projection: core::option::Option<&'life2 alloc::vec::Vec>, filters: &'life3 [datafusion_expr::expr::Expr], limit: core::option::Option) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait +pub fn iceberg_datafusion::IcebergTableProvider::scan<'life0, 'life1, 'life2, 'life3, 'async_trait>(&'life0 self, state: &'life1 dyn datafusion_session::session::Session, projection: core::option::Option<&'life2 alloc::vec::Vec>, filters: &'life3 [datafusion_expr::expr::Expr], limit: core::option::Option) -> core::pin::Pin>> + core::marker::Send + 'async_trait)>> where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait, 'life3: 'async_trait pub fn iceberg_datafusion::IcebergTableProvider::schema(&self) -> arrow_schema::schema::SchemaRef pub fn iceberg_datafusion::IcebergTableProvider::supports_filters_pushdown(&self, filters: &[&datafusion_expr::expr::Expr]) -> datafusion_common::error::Result> pub fn iceberg_datafusion::IcebergTableProvider::table_type(&self) -> datafusion_expr::table_source::TableType