From 411f617a5a0eda5ca7d91768f3ad471f1b2760b3 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 18:47:20 -0600 Subject: [PATCH 01/12] ParallelWindow optimizer skeleton: probe per-partition stats for RANGE-frame windows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new physical optimizer rule (run just before EnsureRequirements) that finds BoundedWindowAggExec nodes with a single ORDER BY column, no PARTITION BY, and a RANGE frame, then logs per-partition min/max on the order column via partition_statistics(). No transformation yet — this confirms that the per-input-partition Exact stats are available at the right spot in the pipeline to drive a future range-repartitioning step. Also adds a sqllogictest fixture (parallel_window.slt) with four scrambled parquet files, overlapping seq ranges, so the routing problem is non-trivial and stats remain Exact per partition. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + datafusion/physical-optimizer/Cargo.toml | 1 + datafusion/physical-optimizer/src/lib.rs | 1 + .../physical-optimizer/src/optimizer.rs | 6 + .../physical-optimizer/src/parallel_window.rs | 105 ++++++++++++++++++ .../test_files/parallel_window.slt | 103 +++++++++++++++++ 6 files changed, 217 insertions(+) create mode 100644 datafusion/physical-optimizer/src/parallel_window.rs create mode 100644 datafusion/sqllogictest/test_files/parallel_window.slt diff --git a/Cargo.lock b/Cargo.lock index fdca3237b71d0..5f9af4edc8f56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2433,6 +2433,7 @@ dependencies = [ "datafusion-pruning", "insta", "itertools 0.14.0", + "log", "recursive", "tokio", ] diff --git a/datafusion/physical-optimizer/Cargo.toml b/datafusion/physical-optimizer/Cargo.toml index 38c8a7c37211f..eb201d0c2f655 100644 --- a/datafusion/physical-optimizer/Cargo.toml +++ b/datafusion/physical-optimizer/Cargo.toml @@ -51,6 +51,7 @@ datafusion-physical-expr-common = { workspace = true } datafusion-physical-plan = { workspace = true } datafusion-pruning = { workspace = true } itertools = { workspace = true } +log = { workspace = true } recursive = { workspace = true, optional = true } [dev-dependencies] diff --git a/datafusion/physical-optimizer/src/lib.rs b/datafusion/physical-optimizer/src/lib.rs index b9eb248f6e843..a7c0a4a1396df 100644 --- a/datafusion/physical-optimizer/src/lib.rs +++ b/datafusion/physical-optimizer/src/lib.rs @@ -40,6 +40,7 @@ pub mod limit_pushdown_past_window; pub mod limited_distinct_aggregation; pub mod optimizer; pub mod output_requirements; +pub mod parallel_window; pub mod projection_pushdown; pub use datafusion_pruning as pruning; pub mod hash_join_buffering; diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 0f81512b61c8e..3bb5495bbca87 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -29,6 +29,7 @@ use crate::join_selection::JoinSelection; use crate::limit_pushdown::LimitPushdown; use crate::limited_distinct_aggregation::LimitedDistinctAggregation; use crate::output_requirements::OutputRequirements; +use crate::parallel_window::ParallelWindow; use crate::projection_pushdown::ProjectionPushdown; use crate::sanity_checker::SanityCheckPlan; use crate::topk_aggregation::TopKAggregation; @@ -187,6 +188,11 @@ impl PhysicalOptimizer { // [`EnsureRequirements`](crate::ensure_requirements) for the per-phase // breakdown, and // for the original failure mode. + // Probe candidate BoundedWindowAggExec nodes for per-partition stats + // on their ORDER BY column. Skeleton step toward range-repartitioning + // unpartitioned RANGE-frame windows. Must run before EnsureRequirements + // so the window's child is still the raw source (no inserted Sort/SPM). + Arc::new(ParallelWindow::new()), Arc::new(EnsureRequirements::new()), // The CombinePartialFinalAggregate rule should be applied after distribution enforcement Arc::new(CombinePartialFinalAggregate::new()), diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs new file mode 100644 index 0000000000000..98d1e10083972 --- /dev/null +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -0,0 +1,105 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parallelize bounded RANGE-frame window functions that have an ORDER BY but +//! no PARTITION BY by range-partitioning their input. +//! +//! Skeleton: this pass finds candidate `BoundedWindowAggExec` nodes and probes +//! the child's `partition_statistics(Some(i))` for per-partition min/max on the +//! single ORDER BY column. It does not transform the plan yet. + +use crate::PhysicalOptimizerRule; +use datafusion_common::config::ConfigOptions; +use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_expr::WindowFrameUnits; +use datafusion_physical_expr::expressions::Column; +use datafusion_physical_plan::windows::BoundedWindowAggExec; +use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; +use log::info; +use std::sync::Arc; + +#[derive(Default, Clone, Debug)] +pub struct ParallelWindow; + +impl ParallelWindow { + pub fn new() -> Self { + Self + } +} + +impl PhysicalOptimizerRule for ParallelWindow { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> datafusion_common::Result> { + let out = plan.transform_down(|node| { + if let Some(window) = node.downcast_ref::() { + probe_candidate(window)?; + } + Ok(Transformed::no(node)) + })?; + Ok(out.data) + } + + fn name(&self) -> &str { + "ParallelWindow" + } + + fn schema_check(&self) -> bool { + true + } +} + +fn probe_candidate(window: &BoundedWindowAggExec) -> datafusion_common::Result<()> { + // v1 scope: single ORDER BY column, no PARTITION BY, RANGE frame. + if !window.partition_keys().is_empty() { + return Ok(()); + } + let order_by = window.window_expr()[0].order_by(); + if order_by.len() != 1 { + return Ok(()); + } + let frame = window.window_expr()[0].get_window_frame(); + if frame.units != WindowFrameUnits::Range { + return Ok(()); + } + + let child = window.input(); + let Some(col) = order_by[0].expr.downcast_ref::() else { + return Ok(()); + }; + let col_idx = col.index(); + let col_name = col.name().to_string(); + let n = child.output_partitioning().partition_count(); + + info!( + "ParallelWindow: candidate BoundedWindowAggExec on `{col_name}` (RANGE frame, no PARTITION BY); \ + child has {n} partitions" + ); + + for i in 0..n { + let stats = child.partition_statistics(Some(i))?; + let col_stats = &stats.column_statistics[col_idx]; + info!( + " partition {i}: min={:?} max={:?} rows={:?}", + col_stats.min_value, col_stats.max_value, stats.num_rows + ); + } + + Ok(()) +} diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt new file mode 100644 index 0000000000000..4aeb1d9df510a --- /dev/null +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -0,0 +1,103 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Goal: parallelize window functions that have ORDER BY but no PARTITION BY, +# over a bounded RANGE frame. Today this collapses to a single partition via +# BoundedWindowAggExec::required_input_distribution -> Distribution::SinglePartition +# (datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs:333-340). + +statement ok +set datafusion.execution.target_partitions = 4; + +statement ok +set datafusion.explain.show_statistics = true; + +# Build four parquet files with OVERLAPPING seq ranges so range-repartitioning +# actually has to move rows around. Each file has seq congruent to its index +# mod 4 in [0, 100); a deterministic scramble (seq * 37 mod 100) defeats the +# natural generate_series ordering so DataFusion does not claim output_ordering +# on seq. Per-file min/max still come back Exact from the parquet footer. +query I +COPY (SELECT seq, seq % 7 AS amount FROM ( + SELECT value AS seq FROM generate_series(0, 99, 4) + ) ORDER BY (seq * 37) % 100) +TO 'test_files/scratch/parallel_window/events/0.parquet' +STORED AS PARQUET; +---- +25 + +query I +COPY (SELECT seq, seq % 7 AS amount FROM ( + SELECT value AS seq FROM generate_series(1, 99, 4) + ) ORDER BY (seq * 37) % 100) +TO 'test_files/scratch/parallel_window/events/1.parquet' +STORED AS PARQUET; +---- +25 + +query I +COPY (SELECT seq, seq % 7 AS amount FROM ( + SELECT value AS seq FROM generate_series(2, 99, 4) + ) ORDER BY (seq * 37) % 100) +TO 'test_files/scratch/parallel_window/events/2.parquet' +STORED AS PARQUET; +---- +25 + +query I +COPY (SELECT seq, seq % 7 AS amount FROM ( + SELECT value AS seq FROM generate_series(3, 99, 4) + ) ORDER BY (seq * 37) % 100) +TO 'test_files/scratch/parallel_window/events/3.parquet' +STORED AS PARQUET; +---- +25 + +statement ok +CREATE EXTERNAL TABLE events +STORED AS PARQUET +LOCATION 'test_files/scratch/parallel_window/events'; + +# Bounded RANGE frame, ORDER BY only (no PARTITION BY). Canonical shape. +# Each input partition should report Exact min/max on seq from its parquet footer. +query TT +EXPLAIN SELECT + seq, + SUM(amount) OVER ( + ORDER BY seq + RANGE BETWEEN 5 PRECEDING AND CURRENT ROW + ) AS rolling_sum +FROM events +ORDER BY seq +LIMIT 5; +---- +logical_plan +01)Sort: events.seq ASC NULLS LAST, fetch=5 +02)--Projection: events.seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW AS rolling_sum +03)----WindowAggr: windowExpr=[[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW]] +04)------TableScan: events projection=[seq, amount] +physical_plan +01)ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Exact(5), Bytes=Exact(80), [(Col[0]: Min=Inexact(Int64(0)) Max=Inexact(Int64(99)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[1]:)]] +02)--GlobalLimitExec: skip=0, fetch=5, statistics=[Rows=Exact(5), Bytes=Absent, [(Col[0]: Min=Inexact(Int64(0)) Max=Inexact(Int64(99)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[1]: Min=Inexact(Int64(0)) Max=Inexact(Int64(6)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[2]:)]] +03)----BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Exact(100), Bytes=Absent, [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800)),(Col[2]:)]] +04)------SortPreservingMergeExec: [seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +05)--------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] + +# Reset session settings so this file doesn't leak config into the rest of the run. +statement ok +set datafusion.explain.show_statistics = false; From fac7c25f70126c8752984b95bb2f37b503734dd5 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 18:52:18 -0600 Subject: [PATCH 02/12] ParallelWindow: compute global min/max + equal-width interior boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reduces per-input-partition Exact stats into a global (min, max) on the ORDER BY column, then splits that range into target_partitions equal-width buckets and logs the N-1 interior cut points. Int64-only for now to keep the boundary-math API tractable; types we don't handle log a skip message and fall through to today's plan. Still no plan transformation — boundaries are computed and printed only. With the test fixture (min=0, max=99, target_partitions=4) we now log: interior boundaries: [Int64(24), Int64(49), Int64(74)] Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-optimizer/src/parallel_window.rs | 67 ++++++++++++++++--- 1 file changed, 58 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index 98d1e10083972..88380d398d734 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -23,7 +23,9 @@ //! single ORDER BY column. It does not transform the plan yet. use crate::PhysicalOptimizerRule; +use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; +use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_expr::WindowFrameUnits; use datafusion_physical_expr::expressions::Column; @@ -45,11 +47,12 @@ impl PhysicalOptimizerRule for ParallelWindow { fn optimize( &self, plan: Arc, - _config: &ConfigOptions, + config: &ConfigOptions, ) -> datafusion_common::Result> { + let target_partitions = config.execution.target_partitions; let out = plan.transform_down(|node| { if let Some(window) = node.downcast_ref::() { - probe_candidate(window)?; + probe_candidate(window, target_partitions)?; } Ok(Transformed::no(node)) })?; @@ -65,7 +68,10 @@ impl PhysicalOptimizerRule for ParallelWindow { } } -fn probe_candidate(window: &BoundedWindowAggExec) -> datafusion_common::Result<()> { +fn probe_candidate( + window: &BoundedWindowAggExec, + target_partitions: usize, +) -> datafusion_common::Result<()> { // v1 scope: single ORDER BY column, no PARTITION BY, RANGE frame. if !window.partition_keys().is_empty() { return Ok(()); @@ -85,21 +91,64 @@ fn probe_candidate(window: &BoundedWindowAggExec) -> datafusion_common::Result<( }; let col_idx = col.index(); let col_name = col.name().to_string(); - let n = child.output_partitioning().partition_count(); + let n_in = child.output_partitioning().partition_count(); info!( "ParallelWindow: candidate BoundedWindowAggExec on `{col_name}` (RANGE frame, no PARTITION BY); \ - child has {n} partitions" + child has {n_in} partitions, target_partitions={target_partitions}" ); - for i in 0..n { + let mut global_min: Precision = Precision::Absent; + let mut global_max: Precision = Precision::Absent; + for i in 0..n_in { let stats = child.partition_statistics(Some(i))?; let col_stats = &stats.column_statistics[col_idx]; + global_min = match global_min { + Precision::Absent => col_stats.min_value.clone(), + other => other.min(&col_stats.min_value), + }; + global_max = match global_max { + Precision::Absent => col_stats.max_value.clone(), + other => other.max(&col_stats.max_value), + }; + } + + let (Precision::Exact(min), Precision::Exact(max)) = (&global_min, &global_max) + else { + info!(" global bounds not Exact (min={global_min:?}, max={global_max:?}); skip"); + return Ok(()); + }; + + let Some(boundaries) = equal_width_boundaries(min, max, target_partitions) else { info!( - " partition {i}: min={:?} max={:?} rows={:?}", - col_stats.min_value, col_stats.max_value, stats.num_rows + " cannot split [{min:?}, {max:?}] into {target_partitions} buckets (type not yet supported)" ); - } + return Ok(()); + }; + info!(" global min={min:?} max={max:?}; interior boundaries: {boundaries:?}"); Ok(()) } + +/// Split the closed interval `[min, max]` into `n` equal-width buckets and +/// return the `n - 1` interior cut points. +/// v1: Int64 only — keeps the API small while we settle the optimizer shape. +fn equal_width_boundaries( + min: &ScalarValue, + max: &ScalarValue, + n: usize, +) -> Option> { + if n <= 1 { + return Some(vec![]); + } + let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = (min, max) else { + return None; + }; + let span = hi.checked_sub(*lo)?; + let n_i = i64::try_from(n).ok()?; + let mut cuts = Vec::with_capacity(n - 1); + for i in 1..n_i { + cuts.push(ScalarValue::Int64(Some(lo + span * i / n_i))); + } + Some(cuts) +} From 8bab6530d90c0b725276e81cabfbe5f5a96fd544 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 20:01:43 -0600 Subject: [PATCH 03/12] ParallelWindow: extract frame halo and log halo-expanded bucket ranges MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reads (halo_preceding, halo_following) from the window frame's start/end bounds (Int64 only, finite values only). For each output partition, logs its primary range [b_i, b_{i+1}) and the halo-expanded range [b_i - halo_p, b_{i+1} + halo_f) — the latter is what the routing layer will need to actually deliver per partition so the window frames at the seams compute correctly. Adds a TODO referencing a future HaloDropExec that sits above the window per partition and drops rows outside the primary range, so each input row surfaces in exactly one output partition. With the test fixture (min=0, max=99, target_partitions=4, RANGE BETWEEN 5 PRECEDING AND CURRENT ROW) we now log: bucket 0: primary [0, 24) expanded [-5, 24) bucket 1: primary [24, 49) expanded [19, 49) bucket 2: primary [49, 74) expanded [44, 74) bucket 3: primary [74, 100) expanded [69, 100) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-optimizer/src/parallel_window.rs | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index 88380d398d734..23d771e616e61 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -27,7 +27,7 @@ use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_expr::WindowFrameUnits; +use datafusion_expr::{WindowFrameBound, WindowFrameUnits}; use datafusion_physical_expr::expressions::Column; use datafusion_physical_plan::windows::BoundedWindowAggExec; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; @@ -84,6 +84,15 @@ fn probe_candidate( if frame.units != WindowFrameUnits::Range { return Ok(()); } + let Some((halo_preceding, halo_following)) = + i64_halo(&frame.start_bound, &frame.end_bound) + else { + info!( + " frame bounds not finite Int64 (start={:?}, end={:?}); skip", + frame.start_bound, frame.end_bound + ); + return Ok(()); + }; let child = window.input(); let Some(col) = order_by[0].expr.downcast_ref::() else { @@ -126,10 +135,55 @@ fn probe_candidate( return Ok(()); }; info!(" global min={min:?} max={max:?}; interior boundaries: {boundaries:?}"); + info!(" halo: {halo_preceding} preceding, {halo_following} following"); + + // Each output partition's primary range is half-open [b_i, b_{i+1}); the + // expanded (routed) range adds halo on each side so the per-partition + // window can compute correct frame values at the seam. + // TODO: a future `HaloDropExec` (or equivalent) sits above the window per + // partition and drops rows outside its primary range so each input row + // surfaces in exactly one output partition. + let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = (min, max) else { + return Ok(()); + }; + let mut edges: Vec = std::iter::once(*lo) + .chain(boundaries.iter().filter_map(|b| match b { + ScalarValue::Int64(Some(v)) => Some(*v), + _ => None, + })) + .chain(std::iter::once(*hi + 1)) + .collect(); + edges.dedup(); + for (i, win) in edges.windows(2).enumerate() { + let start = win[0] - halo_preceding; + let end = win[1] + halo_following; + info!( + " bucket {i}: primary [{}, {}) expanded [{start}, {end})", + win[0], win[1] + ); + } Ok(()) } +/// Extract `(halo_preceding, halo_following)` in order-key units from a RANGE +/// window frame. Returns `None` for UNBOUNDED bounds or non-`Int64` distances. +/// v1 scope: only `Preceding(Int64)` / `CurrentRow` for the start bound, and +/// `CurrentRow` / `Following(Int64)` for the end bound. +fn i64_halo(start: &WindowFrameBound, end: &WindowFrameBound) -> Option<(i64, i64)> { + let preceding = match start { + WindowFrameBound::Preceding(ScalarValue::Int64(Some(n))) => *n, + WindowFrameBound::CurrentRow => 0, + _ => return None, + }; + let following = match end { + WindowFrameBound::Following(ScalarValue::Int64(Some(n))) => *n, + WindowFrameBound::CurrentRow => 0, + _ => return None, + }; + Some((preceding, following)) +} + /// Split the closed interval `[min, max]` into `n` equal-width buckets and /// return the `n - 1` interior cut points. /// v1: Int64 only — keeps the API small while we settle the optimizer shape. From 7e73305b7139b2a10729a9d678fc6e3646d62d3d Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 20:40:28 -0600 Subject: [PATCH 04/12] Add runtime_statistics() trait method + log SortExec runtime min/max MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ExecutionPlan gains a default-impl runtime_statistics(partition) returning Pin>> + Send>>. Default resolves immediately to partition_statistics(Some(partition)); pipeline- breaking operators (e.g. SortExec) will later override to complete the future only after relevant work has run. For the in-memory ExternalSorter path in SortExec, wrap the sorted output stream with an observer that captures first/last value of the leading sort column and logs the per-partition min/max once the stream ends. This is side-effect logging only — the trait override is not wired yet — but confirms the values we need for the upcoming RangeRepartitionExec are computable at runtime and match what plan-time stats reported. Updates parallel_window.slt to actually execute the query (in addition to EXPLAIN) so the sort path runs and the new log fires. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-plan/src/execution_plan.rs | 23 ++++ datafusion/physical-plan/src/sorts/sort.rs | 121 ++++++++++++++++-- .../test_files/parallel_window.slt | 20 +++ 3 files changed, 150 insertions(+), 14 deletions(-) diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 8577e86f00514..bd333a12b82c7 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -39,6 +39,8 @@ pub use datafusion_physical_expr::{ use std::any::Any; use std::fmt::Debug; +use std::future::{Future, ready}; +use std::pin::Pin; use std::sync::{Arc, LazyLock}; use crate::coalesce_partitions::CoalescePartitionsExec; @@ -513,6 +515,27 @@ pub trait ExecutionPlan: Any + Debug + DisplayAs + Send + Sync { Ok(Arc::new(Statistics::new_unknown(&self.schema()))) } + /// Returns statistics for `partition` that are guaranteed to be accurate + /// only once that partition's input has been fully consumed. + /// + /// Default impl resolves immediately to [`Self::partition_statistics`] + /// (i.e. whatever plan-time stats are available). Pipeline-breaking + /// operators (e.g. `SortExec`) may override to expose runtime-derived + /// exact stats — for example, the min/max of the sorted buffer — by + /// completing the returned future only after the relevant work has run. + /// + /// Callers are expected to drive the input by polling `execute()` (or to + /// arrange equivalent forward progress) so the override can signal + /// completion; the default impl is always immediately ready and does not + /// depend on execution. + fn runtime_statistics( + &self, + partition: usize, + ) -> Pin>> + Send>> { + let stats = self.partition_statistics(Some(partition)); + Box::pin(ready(stats)) + } + /// Returns `true` if a limit can be safely pushed down through this /// `ExecutionPlan` node. /// diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 929ff4f7dfc85..86b59ef96a0db 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -70,8 +70,79 @@ use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; -use futures::{StreamExt, TryStreamExt}; -use log::{debug, trace}; +use crate::expressions::Column; +use datafusion_common::ScalarValue; +use datafusion_execution::RecordBatchStream; +use futures::{Stream, StreamExt, TryStreamExt}; +use log::{debug, info, trace}; +use std::pin::Pin; +use std::task::{Context, Poll}; + +/// Stream adapter that observes the first and last value of one column as +/// batches flow through, logging the resulting min/max once the upstream +/// stream ends. Intended to wrap a sorted stream so the observed values are +/// the exact partition-local extrema on the sort key. +struct ObserveSortedMinMax { + inner: SendableRecordBatchStream, + state: Option, +} + +struct ObserveState { + partition: usize, + col_idx: usize, + col_name: String, + descending: bool, + first: Option, + last: Option, + rows_seen: usize, +} + +impl Stream for ObserveSortedMinMax { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + match Pin::new(&mut self.inner).poll_next(cx) { + Poll::Ready(Some(Ok(batch))) => { + if let Some(state) = self.state.as_mut() + && batch.num_rows() > 0 + { + let col = batch.column(state.col_idx); + let last_idx = col.len() - 1; + if state.first.is_none() { + state.first = ScalarValue::try_from_array(col, 0).ok(); + } + state.last = ScalarValue::try_from_array(col, last_idx).ok(); + state.rows_seen += col.len(); + } + Poll::Ready(Some(Ok(batch))) + } + Poll::Ready(None) => { + if let Some(state) = self.state.take() { + let (min, max) = if state.descending { + (state.last, state.first) + } else { + (state.first, state.last) + }; + info!( + "SortExec runtime min/max partition {} `{}`: min={:?} max={:?} rows={}", + state.partition, state.col_name, min, max, state.rows_seen + ); + } + Poll::Ready(None) + } + other => other, + } + } +} + +impl RecordBatchStream for ObserveSortedMinMax { + fn schema(&self) -> SchemaRef { + self.inner.schema() + } +} struct ExternalSorterMetrics { /// metrics @@ -1263,18 +1334,40 @@ impl ExecutionPlan for SortExec { &self.metrics_set, context.runtime_env(), )?; - Ok(Box::pin(RecordBatchStreamAdapter::new( - self.schema(), - futures::stream::once(async move { - while let Some(batch) = input.next().await { - let batch = batch?; - sorter.insert_batch(batch).await?; - } - drop(input); - sorter.sort().await - }) - .try_flatten(), - ))) + let sort_expr = self.expr.first(); + let observe_state = + sort_expr + .expr + .downcast_ref::() + .map(|col| ObserveState { + partition, + col_idx: col.index(), + col_name: col.name().to_string(), + descending: sort_expr.options.descending, + first: None, + last: None, + rows_seen: 0, + }); + let sorted: SendableRecordBatchStream = + Box::pin(RecordBatchStreamAdapter::new( + self.schema(), + futures::stream::once(async move { + while let Some(batch) = input.next().await { + let batch = batch?; + sorter.insert_batch(batch).await?; + } + drop(input); + sorter.sort().await + }) + .try_flatten(), + )); + match observe_state { + Some(state) => Ok(Box::pin(ObserveSortedMinMax { + inner: sorted, + state: Some(state), + })), + None => Ok(sorted), + } } } } diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt index 4aeb1d9df510a..2eb408fcfa86c 100644 --- a/datafusion/sqllogictest/test_files/parallel_window.slt +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -98,6 +98,26 @@ physical_plan 05)--------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] 06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +# Actually execute the query — this drives the SortExec down the (false, None) +# external-sort branch in each input partition and lets the runtime min/max +# observer fire. Sorted-output truncated to 5 for a small reproducible row set. +query II +SELECT + seq, + SUM(amount) OVER ( + ORDER BY seq + RANGE BETWEEN 5 PRECEDING AND CURRENT ROW + ) AS rolling_sum +FROM events +ORDER BY seq +LIMIT 5; +---- +0 0 +1 1 +2 3 +3 6 +4 10 + # Reset session settings so this file doesn't leak config into the rest of the run. statement ok set datafusion.explain.show_statistics = false; From aad2e786beda8653cc9dcddf0ffb5e352b478034 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 21:57:23 -0600 Subject: [PATCH 05/12] before big refactor --- .../physical-optimizer/src/optimizer.rs | 12 +- .../physical-optimizer/src/parallel_window.rs | 54 +++++++- datafusion/physical-plan/src/lib.rs | 1 + .../physical-plan/src/range_repartition.rs | 128 ++++++++++++++++++ datafusion/physical-plan/src/sorts/sort.rs | 111 ++++++--------- .../test_files/parallel_window.slt | 13 +- 6 files changed, 238 insertions(+), 81 deletions(-) create mode 100644 datafusion/physical-plan/src/range_repartition.rs diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 3bb5495bbca87..762f8308379bb 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -188,12 +188,14 @@ impl PhysicalOptimizer { // [`EnsureRequirements`](crate::ensure_requirements) for the per-phase // breakdown, and // for the original failure mode. - // Probe candidate BoundedWindowAggExec nodes for per-partition stats - // on their ORDER BY column. Skeleton step toward range-repartitioning - // unpartitioned RANGE-frame windows. Must run before EnsureRequirements - // so the window's child is still the raw source (no inserted Sort/SPM). - Arc::new(ParallelWindow::new()), Arc::new(EnsureRequirements::new()), + // Wrap the per-partition SortExec under a no-PARTITION-BY RANGE-frame + // window with a (currently passthrough) RangeRepartitionExec. The + // wrapper will eventually drive range routing; for now it just awaits + // `runtime_statistics` on each input partition and logs them. Runs + // *after* EnsureRequirements so the SortExec that EnsureRequirements + // inserts is in place. + Arc::new(ParallelWindow::new()), // The CombinePartialFinalAggregate rule should be applied after distribution enforcement Arc::new(CombinePartialFinalAggregate::new()), // Run once after the local sorting requirement is changed diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index 23d771e616e61..6725df92bd7b4 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -29,6 +29,8 @@ use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_expr::{WindowFrameBound, WindowFrameUnits}; use datafusion_physical_expr::expressions::Column; +use datafusion_physical_plan::range_repartition::RangeRepartitionExec; +use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::windows::BoundedWindowAggExec; use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use log::info; @@ -51,10 +53,19 @@ impl PhysicalOptimizerRule for ParallelWindow { ) -> datafusion_common::Result> { let target_partitions = config.execution.target_partitions; let out = plan.transform_down(|node| { - if let Some(window) = node.downcast_ref::() { - probe_candidate(window, target_partitions)?; + let Some(window) = node.downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + probe_candidate(window, target_partitions)?; + if !is_eligible(window) { + return Ok(Transformed::no(node)); } - Ok(Transformed::no(node)) + // Descend through whatever EnsureRequirements stacked on top of the + // sort (typically SortPreservingMergeExec) until we find the + // SortExec and wrap it with RangeRepartitionExec. + let new_child = wrap_first_sort_descendant(Arc::clone(&node.children()[0]))?; + let new_window = Arc::clone(&node).with_new_children(vec![new_child])?; + Ok(Transformed::yes(new_window)) })?; Ok(out.data) } @@ -68,6 +79,43 @@ impl PhysicalOptimizerRule for ParallelWindow { } } +/// Same shape gate that `probe_candidate` enforces, returned as a bool so +/// the rule can decide whether to do the plan rewrite. +fn is_eligible(window: &BoundedWindowAggExec) -> bool { + if !window.partition_keys().is_empty() { + return false; + } + let order_by = window.window_expr()[0].order_by(); + if order_by.len() != 1 { + return false; + } + if order_by[0].expr.downcast_ref::().is_none() { + return false; + } + let frame = window.window_expr()[0].get_window_frame(); + if frame.units != WindowFrameUnits::Range { + return false; + } + i64_halo(&frame.start_bound, &frame.end_bound).is_some() +} + +/// Walk down through single-child operators until we hit a `SortExec`; +/// replace it in-place with `RangeRepartitionExec(SortExec)` and rebuild +/// the chain back up. If no `SortExec` is found, leave the subtree alone. +fn wrap_first_sort_descendant( + node: Arc, +) -> datafusion_common::Result> { + if node.downcast_ref::().is_some() { + return Ok(Arc::new(RangeRepartitionExec::new(node))); + } + let children = node.children(); + if children.len() != 1 { + return Ok(node); + } + let new_child = wrap_first_sort_descendant(Arc::clone(children[0]))?; + node.with_new_children(vec![new_child]) +} + fn probe_candidate( window: &BoundedWindowAggExec, target_partitions: usize, diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index c7b1d4729e21d..9e71ac61ec189 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -83,6 +83,7 @@ pub mod metrics; pub mod operator_statistics; pub mod placeholder_row; pub mod projection; +pub mod range_repartition; pub mod recursive_query; pub mod repartition; pub mod scalar_subquery; diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs new file mode 100644 index 0000000000000..2cb9da04947cb --- /dev/null +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Skeleton operator that will eventually range-partition its input on a +//! single order-key into N output partitions, with halo overlap for +//! bounded RANGE-frame window functions sitting above it. +//! +//! This commit is the bare scaffold: +//! - one instance, one cached `PlanProperties` inherited from the input +//! - on the first call to `execute()`, spawn one tokio task per input +//! partition that awaits `child.runtime_statistics(i)` and logs the +//! result. This is the connective tissue that the eventual operator +//! will sit on top of. +//! - `execute(i)` is a pass-through: it forwards to `child.execute(i)` +//! verbatim, so inserting this operator into a plan is a no-op for +//! correctness today. + +use std::sync::{Arc, Once}; + +use datafusion_common::Result; +use datafusion_execution::TaskContext; +use log::{info, warn}; + +use crate::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; + +#[derive(Debug)] +pub struct RangeRepartitionExec { + input: Arc, + cache: Arc, + init: Arc, +} + +impl RangeRepartitionExec { + pub fn new(input: Arc) -> Self { + let cache = Arc::clone(input.properties()); + Self { + input, + cache, + init: Arc::new(Once::new()), + } + } + + pub fn input(&self) -> &Arc { + &self.input + } +} + +impl DisplayAs for RangeRepartitionExec { + fn fmt_as( + &self, + _t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "RangeRepartitionExec") + } +} + +impl ExecutionPlan for RangeRepartitionExec { + fn name(&self) -> &'static str { + "RangeRepartitionExec" + } + + fn properties(&self) -> &Arc { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + mut children: Vec>, + ) -> Result> { + Ok(Arc::new(Self::new(children.swap_remove(0)))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let n_in = self.input.output_partitioning().partition_count(); + let input = Arc::clone(&self.input); + self.init.call_once(|| { + for i in 0..n_in { + let input_for_task = Arc::clone(&input); + tokio::spawn(async move { + match input_for_task.runtime_statistics(i).await { + Ok(stats) => { + let col_stats = &stats.column_statistics; + info!( + "RangeRepartitionExec: input partition {i} runtime stats: \ + rows={:?} cols={:?}", + stats.num_rows, + col_stats + .iter() + .map(|c| (c.min_value.clone(), c.max_value.clone())) + .collect::>() + ); + } + Err(e) => warn!( + "RangeRepartitionExec: input partition {i} runtime stats error: {e}" + ), + } + }); + } + }); + self.input.execute(partition, context) + } +} diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 86b59ef96a0db..9817569809253 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -72,76 +72,49 @@ use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; use crate::expressions::Column; use datafusion_common::ScalarValue; -use datafusion_execution::RecordBatchStream; -use futures::{Stream, StreamExt, TryStreamExt}; -use log::{debug, info, trace}; -use std::pin::Pin; -use std::task::{Context, Poll}; - -/// Stream adapter that observes the first and last value of one column as -/// batches flow through, logging the resulting min/max once the upstream -/// stream ends. Intended to wrap a sorted stream so the observed values are -/// the exact partition-local extrema on the sort key. -struct ObserveSortedMinMax { - inner: SendableRecordBatchStream, - state: Option, +use datafusion_common::stats::Precision; +use datafusion_physical_expr::PhysicalSortExpr; +use futures::{StreamExt, TryStreamExt}; +use log::{debug, trace}; +use std::sync::Mutex; + +/// One mutable slot per SortExec output partition. Populated by `execute()` +/// after its input loop drains — every row has been observed exactly once +/// by then (all three sort paths consume their entire input before emitting +/// the first sorted batch). Surfaced through +/// `ExecutionPlan::runtime_statistics`. +type RuntimeStatsSlots = Arc>>>>; + +fn make_runtime_stats_slots(n_partitions: usize) -> RuntimeStatsSlots { + Arc::new((0..n_partitions).map(|_| Mutex::new(None)).collect()) } -struct ObserveState { - partition: usize, - col_idx: usize, - col_name: String, - descending: bool, - first: Option, - last: Option, - rows_seen: usize, -} - -impl Stream for ObserveSortedMinMax { - type Item = Result; - - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - match Pin::new(&mut self.inner).poll_next(cx) { - Poll::Ready(Some(Ok(batch))) => { - if let Some(state) = self.state.as_mut() - && batch.num_rows() > 0 - { - let col = batch.column(state.col_idx); - let last_idx = col.len() - 1; - if state.first.is_none() { - state.first = ScalarValue::try_from_array(col, 0).ok(); - } - state.last = ScalarValue::try_from_array(col, last_idx).ok(); - state.rows_seen += col.len(); - } - Poll::Ready(Some(Ok(batch))) - } - Poll::Ready(None) => { - if let Some(state) = self.state.take() { - let (min, max) = if state.descending { - (state.last, state.first) - } else { - (state.first, state.last) - }; - info!( - "SortExec runtime min/max partition {} `{}`: min={:?} max={:?} rows={}", - state.partition, state.col_name, min, max, state.rows_seen - ); - } - Poll::Ready(None) - } - other => other, - } - } -} - -impl RecordBatchStream for ObserveSortedMinMax { - fn schema(&self) -> SchemaRef { - self.inner.schema() +/// Update `running_min`/`running_max` with values from one input batch, +/// evaluating the leading sort expression to a column-shaped array and +/// folding row-by-row. Type-agnostic: relies on `ScalarValue`'s ordering +/// rather than per-type arrow kernels. +fn update_running_min_max( + sort_expr: &PhysicalSortExpr, + batch: &RecordBatch, + running_min: &mut Option, + running_max: &mut Option, +) -> Result<()> { + let col = sort_expr.expr.evaluate(batch)?.into_array(batch.num_rows())?; + for i in 0..col.len() { + if col.is_null(i) { + continue; + } + let value = ScalarValue::try_from_array(&col, i)?; + *running_min = Some(match running_min.take() { + Some(current) if current <= value => current, + _ => value.clone(), + }); + *running_max = Some(match running_max.take() { + Some(current) if current >= value => current, + _ => value, + }); } + Ok(()) } struct ExternalSorterMetrics { @@ -937,6 +910,10 @@ pub struct SortExec { /// If `fetch` is `Some`, this will also be set and a TopK operator may be used. /// If `fetch` is `None`, this will be `None`. filter: Option>>, + /// Per-output-partition slot populated once that partition's input has + /// been fully consumed; exposes exact min/max on the leading sort key + /// via `ExecutionPlan::runtime_statistics`. + runtime_stats: RuntimeStatsSlots, } impl SortExec { diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt index 2eb408fcfa86c..ec90d68d4f8e4 100644 --- a/datafusion/sqllogictest/test_files/parallel_window.slt +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -91,12 +91,13 @@ logical_plan 03)----WindowAggr: windowExpr=[[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW]] 04)------TableScan: events projection=[seq, amount] physical_plan -01)ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Exact(5), Bytes=Exact(80), [(Col[0]: Min=Inexact(Int64(0)) Max=Inexact(Int64(99)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[1]:)]] -02)--GlobalLimitExec: skip=0, fetch=5, statistics=[Rows=Exact(5), Bytes=Absent, [(Col[0]: Min=Inexact(Int64(0)) Max=Inexact(Int64(99)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[1]: Min=Inexact(Int64(0)) Max=Inexact(Int64(6)) Null=Inexact(0) ScanBytes=Inexact(40)),(Col[2]:)]] -03)----BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Exact(100), Bytes=Absent, [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800)),(Col[2]:)]] -04)------SortPreservingMergeExec: [seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] -05)--------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] -06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +01)ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Inexact(5), Bytes=Inexact(80), [(Col[0]:),(Col[1]:)]] +02)--GlobalLimitExec: skip=0, fetch=5, statistics=[Rows=Inexact(5), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] +03)----BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] +04)------SortPreservingMergeExec: [seq@0 ASC NULLS LAST], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] +05)--------RangeRepartitionExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] +06)----------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +07)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] # Actually execute the query — this drives the SortExec down the (false, None) # external-sort branch in each input partition and lets the runtime min/max From cae9ac6ad80282f5789fae79f7f0f3c4bdd12044 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 22:18:52 -0600 Subject: [PATCH 06/12] Replace runtime_statistics with SortExtremes; populate inside the sort path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reshape the runtime hook to match the data: a `LexOrdering` (one or more sort exprs over arbitrary expressions, ASC/DESC, nulls first/last) doesn't align with `Statistics`'s per-column min/max. Replace the previous `runtime_statistics` trait method with `runtime_sort_extremes`, returning `Option` with `min`/`max: Vec` whose length matches the operator's output ordering. SortExec carries one `Arc>>` per output partition. The slot is populated by `ExternalSorter::sort_batch_stream` right after `sort_batch_chunked` produces sorted chunks — leading and trailing rows are the lex-smallest and lex-largest endpoints for that chunk, and `merge_chunk_into_slot` folds across chunks (path 3 spill case) using a small `SortOptions`-aware lex_compare. Zero double-sort, zero row iteration on our side. `runtime_sort_extremes` reads the slot synchronously: by the time downstream sees the first output batch, the slot is populated for that partition. Default impl on the trait returns `Ok(None)` so other operators opt in only when they have something to say. RangeRepartitionExec is rewritten as a pass-through that wraps each forwarded stream with a tiny one-shot observer: on the first batch yielded, call `child.runtime_sort_extremes(partition)` and log. Confirms end-to-end wiring; future commits replace the pass-through with real range routing. Adds a TODO in `evaluate_row` to switch from "evaluate over the whole batch, take one row" to a single-row slice/RowConverter once we care about the constant-factor cost. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-plan/src/execution_plan.rs | 54 ++-- datafusion/physical-plan/src/lib.rs | 6 +- .../physical-plan/src/range_repartition.rs | 124 +++++---- datafusion/physical-plan/src/sorts/sort.rs | 253 +++++++++++++----- 4 files changed, 297 insertions(+), 140 deletions(-) diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index bd333a12b82c7..1b08035e13ebc 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -29,7 +29,7 @@ use arrow_schema::Schema; pub use datafusion_common::hash_utils; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; pub use datafusion_common::utils::project_schema; -pub use datafusion_common::{ColumnStatistics, Statistics, internal_err}; +pub use datafusion_common::{ColumnStatistics, ScalarValue, Statistics, internal_err}; pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream}; pub use datafusion_expr::{Accumulator, ColumnarValue}; pub use datafusion_physical_expr::window::WindowExpr; @@ -39,8 +39,6 @@ pub use datafusion_physical_expr::{ use std::any::Any; use std::fmt::Debug; -use std::future::{Future, ready}; -use std::pin::Pin; use std::sync::{Arc, LazyLock}; use crate::coalesce_partitions::CoalescePartitionsExec; @@ -95,6 +93,26 @@ use futures::stream::{StreamExt, TryStreamExt}; /// /// [`datafusion-examples`]: https://github.com/apache/datafusion/tree/main/datafusion-examples /// [`memory_pool_execution_plan.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs + +/// Runtime-derived endpoints of a single partition's output, expressed in the +/// operator's declared output ordering (the `PhysicalSortExpr` list returned +/// by `equivalence_properties().output_ordering()`). +/// +/// `min` and `max` are tuples of values — one entry per sort key — taken at +/// the lex-smallest and lex-largest output rows respectively. For the leading +/// sort key these are exactly the natural min/max of that key (after honoring +/// ASC/DESC). For trailing sort keys the entries hold the value of that key +/// at the lex-extreme row, not that key's own natural extreme. +#[derive(Debug, Clone)] +pub struct SortExtremes { + /// Sort-key values at the lex-smallest row across the partition. + pub min: Vec, + /// Sort-key values at the lex-largest row across the partition. + pub max: Vec, + /// Total non-empty rows that contributed to `min`/`max`. + pub row_count: usize, +} + pub trait ExecutionPlan: Any + Debug + DisplayAs + Send + Sync { /// Short name for the ExecutionPlan, such as 'DataSourceExec'. /// @@ -515,25 +533,21 @@ pub trait ExecutionPlan: Any + Debug + DisplayAs + Send + Sync { Ok(Arc::new(Statistics::new_unknown(&self.schema()))) } - /// Returns statistics for `partition` that are guaranteed to be accurate - /// only once that partition's input has been fully consumed. + /// Runtime-derived endpoints of `partition`'s output along the operator's + /// declared output ordering (i.e. each `PhysicalSortExpr` in + /// `equivalence_properties().output_ordering()`). /// - /// Default impl resolves immediately to [`Self::partition_statistics`] - /// (i.e. whatever plan-time stats are available). Pipeline-breaking - /// operators (e.g. `SortExec`) may override to expose runtime-derived - /// exact stats — for example, the min/max of the sorted buffer — by - /// completing the returned future only after the relevant work has run. + /// Returns `Ok(None)` by default. Operators that are pipeline-breaking + /// along their output ordering (e.g. `SortExec`) may override to expose + /// the lex-min / lex-max tuple once the partition's input has been fully + /// consumed. Callers are responsible for ensuring the upstream has made + /// enough progress (typically by polling `execute()`) before relying on + /// the result; until then this returns `Ok(None)`. /// - /// Callers are expected to drive the input by polling `execute()` (or to - /// arrange equivalent forward progress) so the override can signal - /// completion; the default impl is always immediately ready and does not - /// depend on execution. - fn runtime_statistics( - &self, - partition: usize, - ) -> Pin>> + Send>> { - let stats = self.partition_statistics(Some(partition)); - Box::pin(ready(stats)) + /// See [`SortExtremes`] for the result shape. + fn runtime_sort_extremes(&self, partition: usize) -> Result> { + let _ = partition; + Ok(None) } /// Returns `true` if a limit can be safely pushed down through this diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 9e71ac61ec189..96ac6f604d5e9 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -42,9 +42,9 @@ pub use datafusion_physical_expr::{ pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; pub use crate::execution_plan::{ - ExecutionPlan, ExecutionPlanProperties, PlanProperties, collect, collect_partitioned, - displayable, execute_input_stream, execute_stream, execute_stream_partitioned, - get_plan_string, with_new_children_if_necessary, + ExecutionPlan, ExecutionPlanProperties, PlanProperties, SortExtremes, collect, + collect_partitioned, displayable, execute_input_stream, execute_stream, + execute_stream_partitioned, get_plan_string, with_new_children_if_necessary, }; pub use crate::metrics::Metric; pub use crate::ordering::InputOrderMode; diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index 2cb9da04947cb..55bfab52e1a11 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -19,42 +19,45 @@ //! single order-key into N output partitions, with halo overlap for //! bounded RANGE-frame window functions sitting above it. //! -//! This commit is the bare scaffold: -//! - one instance, one cached `PlanProperties` inherited from the input -//! - on the first call to `execute()`, spawn one tokio task per input -//! partition that awaits `child.runtime_statistics(i)` and logs the -//! result. This is the connective tissue that the eventual operator -//! will sit on top of. -//! - `execute(i)` is a pass-through: it forwards to `child.execute(i)` -//! verbatim, so inserting this operator into a plan is a no-op for -//! correctness today. - -use std::sync::{Arc, Once}; +//! Today this is a pass-through that wraps each forwarded stream with a +//! one-shot observer: when the first batch of partition `i` is yielded +//! (which guarantees the upstream sort has folded at least one chunk +//! into its `SortExtremes` slot), it calls +//! `child.runtime_sort_extremes(i)` and logs the result. Plan integration +//! is therefore a no-op for correctness today, but the log line confirms +//! the runtime-stats plumbing reaches the downstream consumer. +// +// TODO: replace pass-through with real range routing — gather all K +// per-input-partition `SortExtremes`, derive global boundaries, route +// rows by binary search on the leading sort key, emit halo-expanded +// per-output-partition streams. + +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; +use arrow::array::RecordBatch; use datafusion_common::Result; -use datafusion_execution::TaskContext; -use log::{info, warn}; +use datafusion_execution::{RecordBatchStream, TaskContext}; +use futures::Stream; +use log::info; use crate::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, SendableRecordBatchStream, }; +use arrow::datatypes::SchemaRef; #[derive(Debug)] pub struct RangeRepartitionExec { input: Arc, cache: Arc, - init: Arc, } impl RangeRepartitionExec { pub fn new(input: Arc) -> Self { let cache = Arc::clone(input.properties()); - Self { - input, - cache, - init: Arc::new(Once::new()), - } + Self { input, cache } } pub fn input(&self) -> &Arc { @@ -97,32 +100,61 @@ impl ExecutionPlan for RangeRepartitionExec { partition: usize, context: Arc, ) -> Result { - let n_in = self.input.output_partitioning().partition_count(); - let input = Arc::clone(&self.input); - self.init.call_once(|| { - for i in 0..n_in { - let input_for_task = Arc::clone(&input); - tokio::spawn(async move { - match input_for_task.runtime_statistics(i).await { - Ok(stats) => { - let col_stats = &stats.column_statistics; - info!( - "RangeRepartitionExec: input partition {i} runtime stats: \ - rows={:?} cols={:?}", - stats.num_rows, - col_stats - .iter() - .map(|c| (c.min_value.clone(), c.max_value.clone())) - .collect::>() - ); - } - Err(e) => warn!( - "RangeRepartitionExec: input partition {i} runtime stats error: {e}" - ), - } - }); + let inner = self.input.execute(partition, context)?; + Ok(Box::pin(LogOnFirstBatch { + inner, + partition, + input: Arc::clone(&self.input), + logged: false, + })) + } +} + +/// Pass-through stream that, on the first batch it yields, asks the child +/// operator for its `runtime_sort_extremes` and logs them. Confirms that +/// the upstream sort has populated its slot by the time downstream sees +/// data. +struct LogOnFirstBatch { + inner: SendableRecordBatchStream, + partition: usize, + input: Arc, + logged: bool, +} + +impl Stream for LogOnFirstBatch { + type Item = Result; + + fn poll_next( + mut self: Pin<&mut Self>, + cx: &mut Context<'_>, + ) -> Poll> { + let item = Pin::new(&mut self.inner).poll_next(cx); + if let Poll::Ready(Some(Ok(_))) = &item + && !self.logged + { + self.logged = true; + match self.input.runtime_sort_extremes(self.partition) { + Ok(Some(extremes)) => info!( + "RangeRepartitionExec: input partition {} runtime_sort_extremes: \ + min={:?} max={:?} rows={}", + self.partition, extremes.min, extremes.max, extremes.row_count + ), + Ok(None) => info!( + "RangeRepartitionExec: input partition {} runtime_sort_extremes: None", + self.partition + ), + Err(e) => info!( + "RangeRepartitionExec: input partition {} runtime_sort_extremes error: {e}", + self.partition + ), } - }); - self.input.execute(partition, context) + } + item + } +} + +impl RecordBatchStream for LogOnFirstBatch { + fn schema(&self) -> SchemaRef { + self.inner.schema() } } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 9817569809253..4c8dcc850a586 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -70,50 +70,138 @@ use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; -use crate::expressions::Column; +use crate::SortExtremes; +use arrow::array::ArrayRef; use datafusion_common::ScalarValue; -use datafusion_common::stats::Precision; -use datafusion_physical_expr::PhysicalSortExpr; use futures::{StreamExt, TryStreamExt}; use log::{debug, trace}; use std::sync::Mutex; -/// One mutable slot per SortExec output partition. Populated by `execute()` -/// after its input loop drains — every row has been observed exactly once -/// by then (all three sort paths consume their entire input before emitting -/// the first sorted batch). Surfaced through -/// `ExecutionPlan::runtime_statistics`. -type RuntimeStatsSlots = Arc>>>>; - -fn make_runtime_stats_slots(n_partitions: usize) -> RuntimeStatsSlots { - Arc::new((0..n_partitions).map(|_| Mutex::new(None)).collect()) +/// One slot per SortExec output partition. Populated inside the sort code +/// path each time `sort_batch_chunked` produces sorted batches, and +/// surfaced via `ExecutionPlan::runtime_sort_extremes`. By the time +/// downstream sees the first output batch the slot for that partition is +/// already populated. +type SortExtremesSlot = Arc>>; +type SortExtremesSlots = Arc>; + +fn make_sort_extremes_slots(n_partitions: usize) -> SortExtremesSlots { + Arc::new( + (0..n_partitions) + .map(|_| Arc::new(Mutex::new(None))) + .collect(), + ) } -/// Update `running_min`/`running_max` with values from one input batch, -/// evaluating the leading sort expression to a column-shaped array and -/// folding row-by-row. Type-agnostic: relies on `ScalarValue`'s ordering -/// rather than per-type arrow kernels. -fn update_running_min_max( - sort_expr: &PhysicalSortExpr, +/// Pull row values for each sort expression at `row` of an already-sorted +/// chunk. Used to read the lex-smallest (row 0) and lex-largest (row n-1) +/// candidate tuples from each `sort_batch_chunked` invocation. +// +// TODO: this evaluates each PhysicalExpr over the whole batch and then takes +// one row out of the resulting array. We could slice the batch down to a +// single-row view first (`batch.slice(row, 1)`) and evaluate against that, or +// use a `RowConverter` to pull a single row through the expression — either +// is O(1) per call instead of O(rows). +fn evaluate_row( + expressions: &LexOrdering, batch: &RecordBatch, - running_min: &mut Option, - running_max: &mut Option, + row: usize, +) -> Result> { + let n = batch.num_rows(); + expressions + .iter() + .map(|sort_expr| { + let arr: ArrayRef = sort_expr.expr.evaluate(batch)?.into_array(n)?; + ScalarValue::try_from_array(&arr, row) + }) + .collect() +} + +/// Compare two `ScalarValue` tuples using `SortOptions` per key +/// (descending and nulls_first), so callers can pick the lex-smaller or +/// lex-larger of two endpoint candidates without an arrow round-trip. +fn lex_compare( + a: &[ScalarValue], + b: &[ScalarValue], + expressions: &LexOrdering, +) -> std::cmp::Ordering { + use std::cmp::Ordering; + for ((va, vb), sort_expr) in a.iter().zip(b.iter()).zip(expressions.iter()) { + let cmp = match (va.is_null(), vb.is_null()) { + (true, true) => Ordering::Equal, + (true, false) => { + if sort_expr.options.nulls_first { + Ordering::Less + } else { + Ordering::Greater + } + } + (false, true) => { + if sort_expr.options.nulls_first { + Ordering::Greater + } else { + Ordering::Less + } + } + (false, false) => va.partial_cmp(vb).unwrap_or(Ordering::Equal), + }; + let cmp = if sort_expr.options.descending { + cmp.reverse() + } else { + cmp + }; + if cmp != Ordering::Equal { + return cmp; + } + } + Ordering::Equal +} + +/// Fold the endpoints of one already-sorted chunk produced by +/// `sort_batch_chunked` into the running [`SortExtremes`] for one partition. +fn merge_chunk_into_slot( + slot: &Mutex>, + expressions: &LexOrdering, + sorted_chunks: &[RecordBatch], ) -> Result<()> { - let col = sort_expr.expr.evaluate(batch)?.into_array(batch.num_rows())?; - for i in 0..col.len() { - if col.is_null(i) { - continue; - } - let value = ScalarValue::try_from_array(&col, i)?; - *running_min = Some(match running_min.take() { - Some(current) if current <= value => current, - _ => value.clone(), - }); - *running_max = Some(match running_max.take() { - Some(current) if current >= value => current, - _ => value, - }); + let total_rows: usize = sorted_chunks.iter().map(|b| b.num_rows()).sum(); + if total_rows == 0 { + return Ok(()); } + let Some(first_chunk) = sorted_chunks.iter().find(|b| b.num_rows() > 0) else { + return Ok(()); + }; + let Some(last_chunk) = sorted_chunks.iter().rev().find(|b| b.num_rows() > 0) else { + return Ok(()); + }; + let chunk_min = evaluate_row(expressions, first_chunk, 0)?; + let chunk_max = evaluate_row(expressions, last_chunk, last_chunk.num_rows() - 1)?; + + let mut guard = slot.lock().unwrap(); + *guard = Some(match guard.take() { + None => SortExtremes { + min: chunk_min, + max: chunk_max, + row_count: total_rows, + }, + Some(prev) => { + let min = if lex_compare(&chunk_min, &prev.min, expressions).is_lt() { + chunk_min + } else { + prev.min + }; + let max = if lex_compare(&chunk_max, &prev.max, expressions).is_gt() { + chunk_max + } else { + prev.max + }; + SortExtremes { + min, + max, + row_count: prev.row_count + total_rows, + } + } + }); Ok(()) } @@ -305,6 +393,12 @@ struct ExternalSorter { /// How much memory to reserve for performing in-memory sort/merges /// prior to spilling. sort_spill_reservation_bytes: usize, + + /// Optional slot that `sort_batch_stream` updates after each + /// `sort_batch_chunked` call with the leading-key endpoints of the + /// sorted output. `SortExec` provides this so consumers can fetch the + /// runtime sort extremes via `ExecutionPlan::runtime_sort_extremes`. + extremes_slot: Option>>>, } impl ExternalSorter { @@ -353,9 +447,18 @@ impl ExternalSorter { batch_size, sort_spill_reservation_bytes, sort_in_place_threshold_bytes, + extremes_slot: None, }) } + /// Provide a slot for `sort_batch_stream` to publish runtime endpoints + /// into. Used by `SortExec` so its `runtime_sort_extremes` override has + /// a value to return. + fn with_extremes_slot(mut self, slot: Arc>>) -> Self { + self.extremes_slot = Some(slot); + self + } + /// Appends an unsorted [`RecordBatch`] to `in_mem_batches` /// /// Updates memory usage metrics, and possibly triggers spilling to disk @@ -713,6 +816,7 @@ impl ExternalSorter { let expressions = self.expr.clone(); let batch_size = self.batch_size; let output_row_metrics = metrics.output_rows().clone(); + let extremes_slot = self.extremes_slot.clone(); let stream = futures::stream::once(async move { let schema = batch.schema(); @@ -720,6 +824,13 @@ impl ExternalSorter { // Sort the batch immediately and get all output batches let sorted_batches = sort_batch_chunked(&batch, &expressions, batch_size)?; + // Publish leading-key endpoints from this sorted chunk into the + // partition's extremes slot. Combining across multiple chunks + // (path 3) happens inside `merge_chunk_into_slot`. + if let Some(slot) = &extremes_slot { + merge_chunk_into_slot(slot, &expressions, &sorted_batches)?; + } + // Resize the reservation to match the actual sorted output size. // Using try_resize avoids a release-then-reacquire cycle, which // matters for MemoryPool implementations where grow/shrink have @@ -910,10 +1021,10 @@ pub struct SortExec { /// If `fetch` is `Some`, this will also be set and a TopK operator may be used. /// If `fetch` is `None`, this will be `None`. filter: Option>>, - /// Per-output-partition slot populated once that partition's input has - /// been fully consumed; exposes exact min/max on the leading sort key - /// via `ExecutionPlan::runtime_statistics`. - runtime_stats: RuntimeStatsSlots, + /// Per-output-partition slot populated by the sort code path. Surfaced + /// via `ExecutionPlan::runtime_sort_extremes` so range-partitioning + /// callers can derive global boundaries from runtime data. + runtime_extremes: SortExtremesSlots, } impl SortExec { @@ -924,6 +1035,8 @@ impl SortExec { let (cache, sort_prefix) = Self::compute_properties(&input, expr.clone(), preserve_partitioning) .unwrap(); + let runtime_extremes = + make_sort_extremes_slots(cache.partitioning.partition_count()); Self { expr, input, @@ -933,6 +1046,7 @@ impl SortExec { common_sort_prefix: sort_prefix, cache: Arc::new(cache), filter: None, + runtime_extremes, } } @@ -952,6 +1066,8 @@ impl SortExec { self.preserve_partitioning = preserve_partitioning; Arc::make_mut(&mut self.cache).partitioning = Self::output_partitioning_helper(&self.input, self.preserve_partitioning); + self.runtime_extremes = + make_sort_extremes_slots(self.cache.partitioning.partition_count()); self } @@ -977,6 +1093,7 @@ impl SortExec { fetch: self.fetch, cache: Arc::clone(&self.cache), filter: self.filter.clone(), + runtime_extremes: Arc::clone(&self.runtime_extremes), } } @@ -1220,6 +1337,8 @@ impl ExecutionPlan for SortExec { )?; new_sort.cache = Arc::new(cache); new_sort.common_sort_prefix = sort_prefix; + new_sort.runtime_extremes = + make_sort_extremes_slots(new_sort.cache.partitioning.partition_count()); } Ok(Arc::new(new_sort)) @@ -1311,40 +1430,21 @@ impl ExecutionPlan for SortExec { &self.metrics_set, context.runtime_env(), )?; - let sort_expr = self.expr.first(); - let observe_state = - sort_expr - .expr - .downcast_ref::() - .map(|col| ObserveState { - partition, - col_idx: col.index(), - col_name: col.name().to_string(), - descending: sort_expr.options.descending, - first: None, - last: None, - rows_seen: 0, - }); - let sorted: SendableRecordBatchStream = - Box::pin(RecordBatchStreamAdapter::new( - self.schema(), - futures::stream::once(async move { - while let Some(batch) = input.next().await { - let batch = batch?; - sorter.insert_batch(batch).await?; - } - drop(input); - sorter.sort().await - }) - .try_flatten(), - )); - match observe_state { - Some(state) => Ok(Box::pin(ObserveSortedMinMax { - inner: sorted, - state: Some(state), - })), - None => Ok(sorted), + if let Some(slot) = self.runtime_extremes.get(partition) { + sorter = sorter.with_extremes_slot(Arc::clone(slot)); } + Ok(Box::pin(RecordBatchStreamAdapter::new( + self.schema(), + futures::stream::once(async move { + while let Some(batch) = input.next().await { + let batch = batch?; + sorter.insert_batch(batch).await?; + } + drop(input); + sorter.sort().await + }) + .try_flatten(), + ))) } } } @@ -1363,6 +1463,17 @@ impl ExecutionPlan for SortExec { Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?)) } + /// Returns the runtime sort extremes for the requested output partition. + /// `None` while that partition's sort path has not yet folded any + /// chunk into its slot — the caller is expected to have driven enough of + /// `execute(partition)` to reach the first `sort_batch_chunked` call. + fn runtime_sort_extremes(&self, partition: usize) -> Result> { + Ok(self + .runtime_extremes + .get(partition) + .and_then(|slot| slot.lock().unwrap().clone())) + } + fn with_fetch(&self, limit: Option) -> Option> { Some(Arc::new(SortExec::with_fetch(self, limit))) } From b70a941879e2f91109a5343e5ab49a79653fbf0c Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 22:32:09 -0600 Subject: [PATCH 07/12] RangeRepartitionExec: K-way coordinator + global extremes log MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-stream LogOnFirstBatch wrapper with a single-instance coordinator task that: 1. opens child.execute(k) for every input partition k, 2. pulls the first batch from each — enough to drive a pipeline- breaking child sort to populate its SortExtremes slot, 3. reads child.runtime_sort_extremes(k) for each input, 4. lex-reduces the per-input extremes into one global SortExtremes using the input's declared output_ordering (so descending / nulls-first are honored), and logs it, 5. hands each input's (first_batch, remaining_stream) pair to the corresponding output partition via a tokio::oneshot channel. Output partition i returns a stream that awaits its handoff and emits the buffered first batch followed by the remainder — so semantically this is still a pass-through, but it now demonstrates the K-way fan-in machinery the real routing impl needs. Makes `lex_compare` pub in sort.rs so the reducer can reuse the small SortOptions-aware comparator instead of duplicating it. With the test fixture the log now reads: RangeRepartitionExec: coordinator gathered 4 input partitions; global extremes = Some(SortExtremes { min: [Int64(0)], max: [Int64(99)], row_count: 100 }) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-plan/src/range_repartition.rs | 260 +++++++++++++----- datafusion/physical-plan/src/sorts/sort.rs | 2 +- 2 files changed, 192 insertions(+), 70 deletions(-) diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index 55bfab52e1a11..ac539b1382c41 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -19,45 +19,87 @@ //! single order-key into N output partitions, with halo overlap for //! bounded RANGE-frame window functions sitting above it. //! -//! Today this is a pass-through that wraps each forwarded stream with a -//! one-shot observer: when the first batch of partition `i` is yielded -//! (which guarantees the upstream sort has folded at least one chunk -//! into its `SortExtremes` slot), it calls -//! `child.runtime_sort_extremes(i)` and logs the result. Plan integration -//! is therefore a no-op for correctness today, but the log line confirms -//! the runtime-stats plumbing reaches the downstream consumer. -// -// TODO: replace pass-through with real range routing — gather all K -// per-input-partition `SortExtremes`, derive global boundaries, route -// rows by binary search on the leading sort key, emit halo-expanded -// per-output-partition streams. +//! Today it is a pass-through *with a coordinator*: the first call to +//! `execute()` spawns a single task that +//! 1. opens `child.execute(k)` for every input partition `k`, +//! 2. drives each stream to its first batch (which is enough to make +//! pipeline-breaking sort children populate their `SortExtremes` +//! slot), +//! 3. reads `child.runtime_sort_extremes(k)` per input, +//! 4. lex-reduces the per-input results into a single global +//! [`SortExtremes`] and logs it, +//! 5. hands each input partition's `(first_batch, remaining_stream)` +//! pair off to the corresponding output partition through a +//! [`oneshot`] channel. +//! +//! Output partition `i` returns a stream that awaits its handoff and then +//! emits the buffered first batch followed by the remainder. So the +//! coordinator demonstrates the K-way fan-in machinery that real range +//! routing will need, without yet performing any actual routing. -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; +use std::sync::{Arc, Mutex}; use arrow::array::RecordBatch; -use datafusion_common::Result; -use datafusion_execution::{RecordBatchStream, TaskContext}; -use futures::Stream; +use arrow::datatypes::SchemaRef; +use datafusion_common::{Result, internal_datafusion_err}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::LexOrdering; +use futures::StreamExt; use log::info; +use tokio::sync::oneshot; +use crate::sorts::sort::lex_compare; +use crate::stream::RecordBatchStreamAdapter; use crate::{ - DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, - SendableRecordBatchStream, + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, SortExtremes, }; -use arrow::datatypes::SchemaRef; #[derive(Debug)] pub struct RangeRepartitionExec { input: Arc, cache: Arc, + state: Arc>, +} + +struct State { + initialized: bool, + /// One `oneshot::Receiver` per output partition, populated when the + /// coordinator hands off this partition's data. `take()`n by the + /// corresponding `execute(partition)` call. + handoffs: Vec>>>, +} + +/// Per-partition payload the coordinator publishes to its output stream. +/// `first_batch` is the batch we had to pull from the input stream to +/// drive the sort's pipeline-break; `rest` is the still-unconsumed +/// remainder of that input stream. +struct PartitionData { + first_batch: Option, + rest: SendableRecordBatchStream, +} + +impl std::fmt::Debug for State { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("State") + .field("initialized", &self.initialized) + .field("handoffs", &self.handoffs.len()) + .finish() + } } impl RangeRepartitionExec { pub fn new(input: Arc) -> Self { + let n = input.output_partitioning().partition_count(); let cache = Arc::clone(input.properties()); - Self { input, cache } + Self { + input, + cache, + state: Arc::new(Mutex::new(State { + initialized: false, + handoffs: (0..n).map(|_| None).collect(), + })), + } } pub fn input(&self) -> &Arc { @@ -100,61 +142,141 @@ impl ExecutionPlan for RangeRepartitionExec { partition: usize, context: Arc, ) -> Result { - let inner = self.input.execute(partition, context)?; - Ok(Box::pin(LogOnFirstBatch { - inner, - partition, - input: Arc::clone(&self.input), - logged: false, - })) + let mut state = self.state.lock().map_err(|_| { + internal_datafusion_err!("RangeRepartitionExec mutex poisoned") + })?; + if !state.initialized { + state.initialized = true; + let n = state.handoffs.len(); + let mut senders = Vec::with_capacity(n); + for slot in state.handoffs.iter_mut() { + let (tx, rx) = oneshot::channel(); + senders.push(tx); + *slot = Some(rx); + } + let child = Arc::clone(&self.input); + let ctx = Arc::clone(&context); + tokio::spawn(coordinator(child, ctx, senders)); + } + let rx = state + .handoffs + .get_mut(partition) + .and_then(Option::take) + .ok_or_else(|| { + internal_datafusion_err!("partition {partition} already taken") + })?; + drop(state); + + let schema = self.schema(); + Ok(Box::pin(RecordBatchStreamAdapter::new( + Arc::clone(&schema), + partition_stream(schema, rx), + ))) } } -/// Pass-through stream that, on the first batch it yields, asks the child -/// operator for its `runtime_sort_extremes` and logs them. Confirms that -/// the upstream sort has populated its slot by the time downstream sees -/// data. -struct LogOnFirstBatch { - inner: SendableRecordBatchStream, - partition: usize, - input: Arc, - logged: bool, +/// Stream that awaits the coordinator's handoff for one output partition, +/// then yields the buffered first batch followed by the remaining input +/// stream. If the handoff sender is dropped (coordinator failed) it +/// surfaces an error. +fn partition_stream( + schema: SchemaRef, + rx: oneshot::Receiver>, +) -> impl futures::Stream> + Send { + use futures::stream::{TryStreamExt, once}; + once(async move { + let data = rx + .await + .map_err(|_| internal_datafusion_err!("coordinator dropped"))??; + let head = futures::stream::iter(data.first_batch.into_iter().map(Ok)); + let merged: SendableRecordBatchStream = + Box::pin(RecordBatchStreamAdapter::new(schema, head.chain(data.rest))); + Ok::<_, datafusion_common::DataFusionError>(merged) + }) + .try_flatten() } -impl Stream for LogOnFirstBatch { - type Item = Result; - - fn poll_next( - mut self: Pin<&mut Self>, - cx: &mut Context<'_>, - ) -> Poll> { - let item = Pin::new(&mut self.inner).poll_next(cx); - if let Poll::Ready(Some(Ok(_))) = &item - && !self.logged - { - self.logged = true; - match self.input.runtime_sort_extremes(self.partition) { - Ok(Some(extremes)) => info!( - "RangeRepartitionExec: input partition {} runtime_sort_extremes: \ - min={:?} max={:?} rows={}", - self.partition, extremes.min, extremes.max, extremes.row_count - ), - Ok(None) => info!( - "RangeRepartitionExec: input partition {} runtime_sort_extremes: None", - self.partition - ), - Err(e) => info!( - "RangeRepartitionExec: input partition {} runtime_sort_extremes error: {e}", - self.partition - ), +/// Coordinator task: drive every input partition to first batch, gather +/// runtime extremes, log the lex-reduced global, then hand off per-input +/// payloads to their corresponding output partition. +async fn coordinator( + child: Arc, + ctx: Arc, + mut senders: Vec>>, +) { + let n = senders.len(); + + // Phase 1: open every input stream and pull the first batch from each. + let mut firsts: Vec<(Option, SendableRecordBatchStream)> = + Vec::with_capacity(n); + for k in 0..n { + let mut stream = match child.execute(k, Arc::clone(&ctx)) { + Ok(s) => s, + Err(e) => { + let msg = format!("input {k} open failed: {e}"); + for tx in senders.drain(..) { + let _ = tx.send(Err(internal_datafusion_err!("{msg}"))); + } + return; } - } - item + }; + let first = match stream.next().await { + Some(Ok(batch)) => Some(batch), + Some(Err(e)) => { + let msg = format!("first batch from input {k} failed: {e}"); + for tx in senders.drain(..) { + let _ = tx.send(Err(internal_datafusion_err!("{msg}"))); + } + return; + } + None => None, + }; + firsts.push((first, stream)); + } + + // Phase 2: collect per-input runtime extremes. + let per_input: Vec> = (0..n) + .map(|k| child.runtime_sort_extremes(k).ok().flatten()) + .collect(); + + // Phase 3: lex-reduce per-input → global, using the input's declared + // output ordering so direction and null ordering are honored. + let ordering: Option = child.output_ordering().cloned(); + let global = ordering + .as_ref() + .and_then(|o| reduce_global_extremes(&per_input, o)); + + info!( + "RangeRepartitionExec: coordinator gathered {} input partitions; \ + global extremes = {:?}", + n, global + ); + + // Phase 4: hand off each input's payload to its corresponding output + // partition. (Today: pass-through; future: per-output route streams.) + for (sender, (first_batch, rest)) in senders.into_iter().zip(firsts.into_iter()) { + let _ = sender.send(Ok(PartitionData { first_batch, rest })); } } -impl RecordBatchStream for LogOnFirstBatch { - fn schema(&self) -> SchemaRef { - self.inner.schema() +/// Lex-reduce per-input partition extremes into one global [`SortExtremes`] +/// honoring `ordering`'s direction / nulls-first per key. Returns `None` +/// when no input partition produced extremes (e.g. all inputs were empty, +/// or no upstream supports the trait method). +fn reduce_global_extremes( + per_input: &[Option], + ordering: &LexOrdering, +) -> Option { + let mut iter = per_input.iter().filter_map(Option::clone); + let mut global = iter.next()?; + for next in iter { + if lex_compare(&next.min, &global.min, ordering).is_lt() { + global.min = next.min; + } + if lex_compare(&next.max, &global.max, ordering).is_gt() { + global.max = next.max; + } + global.row_count += next.row_count; } + Some(global) } diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 4c8dcc850a586..2d5a52235ea23 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -120,7 +120,7 @@ fn evaluate_row( /// Compare two `ScalarValue` tuples using `SortOptions` per key /// (descending and nulls_first), so callers can pick the lex-smaller or /// lex-larger of two endpoint candidates without an arrow round-trip. -fn lex_compare( +pub fn lex_compare( a: &[ScalarValue], b: &[ScalarValue], expressions: &LexOrdering, From b1c62083bf5f9baad2ca6b1cf2c8afc287cf6334 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 22:38:55 -0600 Subject: [PATCH 08/12] Move boundary/halo computation into RangeRepartitionExec coordinator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Boundary math is now consumed at runtime against the global SortExtremes the coordinator gathers — no more plan-time partition_statistics math in the optimizer rule. RangeRepartitionExec::new takes the halo distances from the optimizer rule (extracted from the window frame at plan time) and the coordinator combines them with the runtime global to compute interior cut points and per-bucket primary / expanded ranges. ParallelWindow shrinks to shape detection + halo extraction + plan rewrite. The previous probe_candidate plan-time logging is gone — the identical log lines now come out of RangeRepartitionExec at execution time, where they're computed from actual data rather than from possibly- inexact stats. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-optimizer/src/parallel_window.rs | 208 +++++------------- .../physical-plan/src/range_repartition.rs | 106 ++++++++- 2 files changed, 158 insertions(+), 156 deletions(-) diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index 6725df92bd7b4..d0716934e7e96 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -15,24 +15,27 @@ // specific language governing permissions and limitations // under the License. -//! Parallelize bounded RANGE-frame window functions that have an ORDER BY but -//! no PARTITION BY by range-partitioning their input. +//! Parallelize bounded RANGE-frame window functions that have an ORDER BY +//! but no PARTITION BY by inserting a `RangeRepartitionExec` between the +//! window's `SortPreservingMergeExec`/`SortExec` chain and its child sort. //! -//! Skeleton: this pass finds candidate `BoundedWindowAggExec` nodes and probes -//! the child's `partition_statistics(Some(i))` for per-partition min/max on the -//! single ORDER BY column. It does not transform the plan yet. +//! This rule's responsibilities are now narrow: detect the eligible +//! window shape, pull the halo distances out of the window frame, and +//! wrap the per-partition `SortExec` with a `RangeRepartitionExec` +//! carrying those halo values. All boundary / global-extremes logic +//! lives in `RangeRepartitionExec`'s coordinator and runs against +//! runtime stats rather than plan-time `Statistics`. use crate::PhysicalOptimizerRule; use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; -use datafusion_common::stats::Precision; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_expr::{WindowFrameBound, WindowFrameUnits}; use datafusion_physical_expr::expressions::Column; +use datafusion_physical_plan::ExecutionPlan; use datafusion_physical_plan::range_repartition::RangeRepartitionExec; use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::windows::BoundedWindowAggExec; -use datafusion_physical_plan::{ExecutionPlan, ExecutionPlanProperties}; use log::info; use std::sync::Arc; @@ -49,21 +52,28 @@ impl PhysicalOptimizerRule for ParallelWindow { fn optimize( &self, plan: Arc, - config: &ConfigOptions, + _config: &ConfigOptions, ) -> datafusion_common::Result> { - let target_partitions = config.execution.target_partitions; let out = plan.transform_down(|node| { let Some(window) = node.downcast_ref::() else { return Ok(Transformed::no(node)); }; - probe_candidate(window, target_partitions)?; - if !is_eligible(window) { + let Some((halo_preceding, halo_following)) = candidate_halo(window) + else { return Ok(Transformed::no(node)); - } - // Descend through whatever EnsureRequirements stacked on top of the - // sort (typically SortPreservingMergeExec) until we find the - // SortExec and wrap it with RangeRepartitionExec. - let new_child = wrap_first_sort_descendant(Arc::clone(&node.children()[0]))?; + }; + info!( + "ParallelWindow: candidate BoundedWindowAggExec (RANGE frame, no PARTITION BY); \ + halo: {halo_preceding} preceding, {halo_following} following" + ); + // Descend through whatever EnsureRequirements stacked on top of + // the sort (typically SortPreservingMergeExec) until we find the + // SortExec, and wrap it with RangeRepartitionExec(halo…). + let new_child = wrap_first_sort_descendant( + Arc::clone(&node.children()[0]), + halo_preceding, + halo_following, + )?; let new_window = Arc::clone(&node).with_new_children(vec![new_child])?; Ok(Transformed::yes(new_window)) })?; @@ -79,145 +89,60 @@ impl PhysicalOptimizerRule for ParallelWindow { } } -/// Same shape gate that `probe_candidate` enforces, returned as a bool so -/// the rule can decide whether to do the plan rewrite. -fn is_eligible(window: &BoundedWindowAggExec) -> bool { +/// Returns `(halo_preceding, halo_following)` if the window matches the +/// v1 shape we know how to parallelize: no PARTITION BY, a single +/// `Column` sort key, RANGE frame, finite Int64 bounds (or CurrentRow). +/// Returns `None` otherwise so the rule leaves the plan alone. +fn candidate_halo(window: &BoundedWindowAggExec) -> Option<(i64, i64)> { if !window.partition_keys().is_empty() { - return false; + return None; } let order_by = window.window_expr()[0].order_by(); if order_by.len() != 1 { - return false; + return None; } if order_by[0].expr.downcast_ref::().is_none() { - return false; + return None; } let frame = window.window_expr()[0].get_window_frame(); if frame.units != WindowFrameUnits::Range { - return false; + return None; } - i64_halo(&frame.start_bound, &frame.end_bound).is_some() + i64_halo(&frame.start_bound, &frame.end_bound) } /// Walk down through single-child operators until we hit a `SortExec`; -/// replace it in-place with `RangeRepartitionExec(SortExec)` and rebuild -/// the chain back up. If no `SortExec` is found, leave the subtree alone. +/// replace it in place with `RangeRepartitionExec` wrapping the same +/// `SortExec`, and rebuild the chain back up. If no `SortExec` is found, +/// leave the subtree alone. fn wrap_first_sort_descendant( node: Arc, + halo_preceding: i64, + halo_following: i64, ) -> datafusion_common::Result> { if node.downcast_ref::().is_some() { - return Ok(Arc::new(RangeRepartitionExec::new(node))); + return Ok(Arc::new(RangeRepartitionExec::new( + node, + halo_preceding, + halo_following, + ))); } let children = node.children(); if children.len() != 1 { return Ok(node); } - let new_child = wrap_first_sort_descendant(Arc::clone(children[0]))?; + let new_child = wrap_first_sort_descendant( + Arc::clone(children[0]), + halo_preceding, + halo_following, + )?; node.with_new_children(vec![new_child]) } -fn probe_candidate( - window: &BoundedWindowAggExec, - target_partitions: usize, -) -> datafusion_common::Result<()> { - // v1 scope: single ORDER BY column, no PARTITION BY, RANGE frame. - if !window.partition_keys().is_empty() { - return Ok(()); - } - let order_by = window.window_expr()[0].order_by(); - if order_by.len() != 1 { - return Ok(()); - } - let frame = window.window_expr()[0].get_window_frame(); - if frame.units != WindowFrameUnits::Range { - return Ok(()); - } - let Some((halo_preceding, halo_following)) = - i64_halo(&frame.start_bound, &frame.end_bound) - else { - info!( - " frame bounds not finite Int64 (start={:?}, end={:?}); skip", - frame.start_bound, frame.end_bound - ); - return Ok(()); - }; - - let child = window.input(); - let Some(col) = order_by[0].expr.downcast_ref::() else { - return Ok(()); - }; - let col_idx = col.index(); - let col_name = col.name().to_string(); - let n_in = child.output_partitioning().partition_count(); - - info!( - "ParallelWindow: candidate BoundedWindowAggExec on `{col_name}` (RANGE frame, no PARTITION BY); \ - child has {n_in} partitions, target_partitions={target_partitions}" - ); - - let mut global_min: Precision = Precision::Absent; - let mut global_max: Precision = Precision::Absent; - for i in 0..n_in { - let stats = child.partition_statistics(Some(i))?; - let col_stats = &stats.column_statistics[col_idx]; - global_min = match global_min { - Precision::Absent => col_stats.min_value.clone(), - other => other.min(&col_stats.min_value), - }; - global_max = match global_max { - Precision::Absent => col_stats.max_value.clone(), - other => other.max(&col_stats.max_value), - }; - } - - let (Precision::Exact(min), Precision::Exact(max)) = (&global_min, &global_max) - else { - info!(" global bounds not Exact (min={global_min:?}, max={global_max:?}); skip"); - return Ok(()); - }; - - let Some(boundaries) = equal_width_boundaries(min, max, target_partitions) else { - info!( - " cannot split [{min:?}, {max:?}] into {target_partitions} buckets (type not yet supported)" - ); - return Ok(()); - }; - info!(" global min={min:?} max={max:?}; interior boundaries: {boundaries:?}"); - info!(" halo: {halo_preceding} preceding, {halo_following} following"); - - // Each output partition's primary range is half-open [b_i, b_{i+1}); the - // expanded (routed) range adds halo on each side so the per-partition - // window can compute correct frame values at the seam. - // TODO: a future `HaloDropExec` (or equivalent) sits above the window per - // partition and drops rows outside its primary range so each input row - // surfaces in exactly one output partition. - let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = (min, max) else { - return Ok(()); - }; - let mut edges: Vec = std::iter::once(*lo) - .chain(boundaries.iter().filter_map(|b| match b { - ScalarValue::Int64(Some(v)) => Some(*v), - _ => None, - })) - .chain(std::iter::once(*hi + 1)) - .collect(); - edges.dedup(); - for (i, win) in edges.windows(2).enumerate() { - let start = win[0] - halo_preceding; - let end = win[1] + halo_following; - info!( - " bucket {i}: primary [{}, {}) expanded [{start}, {end})", - win[0], win[1] - ); - } - - Ok(()) -} - -/// Extract `(halo_preceding, halo_following)` in order-key units from a RANGE -/// window frame. Returns `None` for UNBOUNDED bounds or non-`Int64` distances. -/// v1 scope: only `Preceding(Int64)` / `CurrentRow` for the start bound, and -/// `CurrentRow` / `Following(Int64)` for the end bound. +/// Extract `(halo_preceding, halo_following)` in order-key units from a +/// RANGE window frame. Returns `None` for UNBOUNDED bounds or non-`Int64` +/// distances. v1 scope: only `Preceding(Int64)` / `CurrentRow` for the +/// start bound, and `CurrentRow` / `Following(Int64)` for the end bound. fn i64_halo(start: &WindowFrameBound, end: &WindowFrameBound) -> Option<(i64, i64)> { let preceding = match start { WindowFrameBound::Preceding(ScalarValue::Int64(Some(n))) => *n, @@ -231,26 +156,3 @@ fn i64_halo(start: &WindowFrameBound, end: &WindowFrameBound) -> Option<(i64, i6 }; Some((preceding, following)) } - -/// Split the closed interval `[min, max]` into `n` equal-width buckets and -/// return the `n - 1` interior cut points. -/// v1: Int64 only — keeps the API small while we settle the optimizer shape. -fn equal_width_boundaries( - min: &ScalarValue, - max: &ScalarValue, - n: usize, -) -> Option> { - if n <= 1 { - return Some(vec![]); - } - let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = (min, max) else { - return None; - }; - let span = hi.checked_sub(*lo)?; - let n_i = i64::try_from(n).ok()?; - let mut cuts = Vec::with_capacity(n - 1); - for i in 1..n_i { - cuts.push(ScalarValue::Int64(Some(lo + span * i / n_i))); - } - Some(cuts) -} diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index ac539b1382c41..a578ae1f4c70a 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -48,6 +48,8 @@ use futures::StreamExt; use log::info; use tokio::sync::oneshot; +use datafusion_common::ScalarValue; + use crate::sorts::sort::lex_compare; use crate::stream::RecordBatchStreamAdapter; use crate::{ @@ -59,6 +61,12 @@ use crate::{ pub struct RangeRepartitionExec { input: Arc, cache: Arc, + /// Halo distance preceding each bucket's primary range, in + /// leading-sort-key units. Carried over from the window frame at plan + /// time so the coordinator can derive per-bucket expanded ranges. + halo_preceding: i64, + /// Halo distance following each bucket's primary range. + halo_following: i64, state: Arc>, } @@ -89,12 +97,18 @@ impl std::fmt::Debug for State { } impl RangeRepartitionExec { - pub fn new(input: Arc) -> Self { + pub fn new( + input: Arc, + halo_preceding: i64, + halo_following: i64, + ) -> Self { let n = input.output_partitioning().partition_count(); let cache = Arc::clone(input.properties()); Self { input, cache, + halo_preceding, + halo_following, state: Arc::new(Mutex::new(State { initialized: false, handoffs: (0..n).map(|_| None).collect(), @@ -134,7 +148,11 @@ impl ExecutionPlan for RangeRepartitionExec { self: Arc, mut children: Vec>, ) -> Result> { - Ok(Arc::new(Self::new(children.swap_remove(0)))) + Ok(Arc::new(Self::new( + children.swap_remove(0), + self.halo_preceding, + self.halo_following, + ))) } fn execute( @@ -156,7 +174,15 @@ impl ExecutionPlan for RangeRepartitionExec { } let child = Arc::clone(&self.input); let ctx = Arc::clone(&context); - tokio::spawn(coordinator(child, ctx, senders)); + let halo_preceding = self.halo_preceding; + let halo_following = self.halo_following; + tokio::spawn(coordinator( + child, + ctx, + senders, + halo_preceding, + halo_following, + )); } let rx = state .handoffs @@ -203,6 +229,8 @@ async fn coordinator( child: Arc, ctx: Arc, mut senders: Vec>>, + halo_preceding: i64, + halo_following: i64, ) { let n = senders.len(); @@ -252,6 +280,11 @@ async fn coordinator( n, global ); + // Derive boundaries and per-bucket primary/expanded ranges from the + // global extremes + the halo we were constructed with. Log only — the + // routing impl in a future commit will consume the same values. + log_buckets(global.as_ref(), n, halo_preceding, halo_following); + // Phase 4: hand off each input's payload to its corresponding output // partition. (Today: pass-through; future: per-output route streams.) for (sender, (first_batch, rest)) in senders.into_iter().zip(firsts.into_iter()) { @@ -259,6 +292,73 @@ async fn coordinator( } } +/// Compute and log this RangeRepartitionExec's bucket layout for a single +/// run: `n` output partitions split into equal-width buckets over the +/// global leading-sort-key range, each bucket's primary range and its +/// halo-expanded range. +/// +/// v1 boundary math only supports `Int64` leading sort keys (matches the +/// optimizer rule's eligibility gate). Logs a skip line for other types +/// or when the global extremes were unavailable. +fn log_buckets( + global: Option<&SortExtremes>, + n: usize, + halo_preceding: i64, + halo_following: i64, +) { + let Some(global) = global else { + info!("RangeRepartitionExec: no runtime extremes; skip bucket layout"); + return; + }; + let (Some(ScalarValue::Int64(Some(lo))), Some(ScalarValue::Int64(Some(hi)))) = + (global.min.first(), global.max.first()) + else { + info!( + "RangeRepartitionExec: leading sort key is not Int64 (min={:?}, max={:?}); \ + skip bucket layout", + global.min, global.max + ); + return; + }; + let Some(boundaries) = equal_width_int64_boundaries(*lo, *hi, n) else { + info!("RangeRepartitionExec: cannot split [{lo}, {hi}] into {n} buckets"); + return; + }; + info!( + "RangeRepartitionExec: global leading [{lo}, {hi}] split into {n} buckets; \ + interior boundaries: {boundaries:?}; halo: {halo_preceding} preceding, \ + {halo_following} following" + ); + let mut edges: Vec = std::iter::once(*lo) + .chain(boundaries.iter().copied()) + .chain(std::iter::once(*hi + 1)) + .collect(); + edges.dedup(); + for (i, win) in edges.windows(2).enumerate() { + let start = win[0] - halo_preceding; + let end = win[1] + halo_following; + info!( + "RangeRepartitionExec: bucket {i}: primary [{}, {}) expanded [{start}, {end})", + win[0], win[1] + ); + } +} + +/// Split the closed `Int64` interval `[lo, hi]` into `n` equal-width +/// buckets, returning the `n - 1` interior cut points. +fn equal_width_int64_boundaries(lo: i64, hi: i64, n: usize) -> Option> { + if n <= 1 { + return Some(vec![]); + } + let span = hi.checked_sub(lo)?; + let n_i = i64::try_from(n).ok()?; + let mut cuts = Vec::with_capacity(n - 1); + for i in 1..n_i { + cuts.push(lo + span * i / n_i); + } + Some(cuts) +} + /// Lex-reduce per-input partition extremes into one global [`SortExtremes`] /// honoring `ordering`'s direction / nulls-first per key. Returns `None` /// when no input partition produced extremes (e.g. all inputs were empty, From 9fbd877071b9465d6237808babb0c395af4d141b Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Wed, 17 Jun 2026 22:59:42 -0600 Subject: [PATCH 09/12] RangeRepartitionExec: actually route rows with halo duplication MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Coordinator's hand-off changes from "give output i my input partition i's remaining stream" to "give output i an mpsc receiver, route from every input into the per-bucket channel I own". For each input batch the router computes bucket b's expanded range `[boundaries[b-1] - halo_preceding, boundaries[b] + halo_following)` and sends the matching slice of the batch (via arrow::compute::take_arrays) to bucket b's channel. Bucket-driven loop — N takes per input batch. Routing is Int64-only by design (matches the optimizer rule's gate); non-Int64 leading keys propagate a clear error to every output stream instead of silently producing wrong data. SLT picks up a `count(rolling_sum)` assertion that exposes halo duplication: plain `count(*)` would be statistics-pruned from parquet's row count and never instantiate the window operator, so it wouldn't notice. With routing in place but no halo drop yet, the merged output has 115 rows = 100 + 15 halo duplicates (5 per boundary × 3 interior boundaries). HaloDropExec is the next commit and will bring this back to 100. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../physical-plan/src/range_repartition.rs | 317 ++++++++++++++---- .../test_files/parallel_window.slt | 21 ++ 2 files changed, 266 insertions(+), 72 deletions(-) diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index a578ae1f4c70a..411f18b5c94a0 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -15,38 +15,43 @@ // specific language governing permissions and limitations // under the License. -//! Skeleton operator that will eventually range-partition its input on a -//! single order-key into N output partitions, with halo overlap for -//! bounded RANGE-frame window functions sitting above it. +//! Range-partition an input stream on a single Int64 order-key into N +//! output partitions, with halo overlap for bounded RANGE-frame window +//! functions sitting above it. //! -//! Today it is a pass-through *with a coordinator*: the first call to -//! `execute()` spawns a single task that +//! `execute()`'s first call spawns a coordinator that: //! 1. opens `child.execute(k)` for every input partition `k`, -//! 2. drives each stream to its first batch (which is enough to make -//! pipeline-breaking sort children populate their `SortExtremes` -//! slot), +//! 2. drives each stream to its first batch (which makes the pipeline- +//! breaking sort child populate its `SortExtremes` slot), //! 3. reads `child.runtime_sort_extremes(k)` per input, -//! 4. lex-reduces the per-input results into a single global -//! [`SortExtremes`] and logs it, -//! 5. hands each input partition's `(first_batch, remaining_stream)` -//! pair off to the corresponding output partition through a -//! [`oneshot`] channel. +//! 4. lex-reduces those into a single global [`SortExtremes`], derives +//! `N` equal-width Int64 bucket boundaries from `[global.min, +//! global.max]`, and computes per-bucket expanded ranges by +//! extending each primary [b_i, b_{i+1}) outward by +//! `halo_preceding` / `halo_following`, +//! 5. then for every batch flowing out of every input stream, splits +//! the batch into per-bucket pieces (rows whose order key lies in +//! bucket `b`'s expanded range), and sends each piece into bucket +//! `b`'s output channel. //! -//! Output partition `i` returns a stream that awaits its handoff and then -//! emits the buffered first batch followed by the remainder. So the -//! coordinator demonstrates the K-way fan-in machinery that real range -//! routing will need, without yet performing any actual routing. +//! Halo rows therefore appear in *two* output partitions (their primary +//! bucket and the neighbor whose expanded range reaches them). That's +//! correct for letting the per-bucket window operator compute frame +//! values at the seams — but it also means rows are duplicated in the +//! merged output until a future `HaloDropExec` strips halo rows after +//! the window. use std::sync::{Arc, Mutex}; -use arrow::array::RecordBatch; +use arrow::array::{Array, Int64Array, RecordBatch, UInt32Array}; +use arrow::compute::take_arrays; use arrow::datatypes::SchemaRef; -use datafusion_common::{Result, internal_datafusion_err}; +use datafusion_common::{DataFusionError, Result, internal_datafusion_err}; use datafusion_execution::TaskContext; use datafusion_physical_expr::LexOrdering; use futures::StreamExt; use log::info; -use tokio::sync::oneshot; +use tokio::sync::{mpsc, oneshot}; use datafusion_common::ScalarValue; @@ -78,13 +83,12 @@ struct State { handoffs: Vec>>>, } -/// Per-partition payload the coordinator publishes to its output stream. -/// `first_batch` is the batch we had to pull from the input stream to -/// drive the sort's pipeline-break; `rest` is the still-unconsumed -/// remainder of that input stream. +/// Per-output-partition payload the coordinator hands to its stream. +/// Once the coordinator has computed boundaries it starts router tasks +/// that funnel routed batches into bucket-keyed mpsc channels. Each +/// output partition's stream drains its receiver. struct PartitionData { - first_batch: Option, - rest: SendableRecordBatchStream, + rx: mpsc::Receiver>, } impl std::fmt::Debug for State { @@ -202,11 +206,11 @@ impl ExecutionPlan for RangeRepartitionExec { } /// Stream that awaits the coordinator's handoff for one output partition, -/// then yields the buffered first batch followed by the remaining input -/// stream. If the handoff sender is dropped (coordinator failed) it -/// surfaces an error. +/// then drains the bucket-keyed mpsc receiver router tasks are pushing +/// into. If the coordinator drops the sender (e.g. setup failed) the +/// stream surfaces an error. fn partition_stream( - schema: SchemaRef, + _schema: SchemaRef, rx: oneshot::Receiver>, ) -> impl futures::Stream> + Send { use futures::stream::{TryStreamExt, once}; @@ -214,10 +218,9 @@ fn partition_stream( let data = rx .await .map_err(|_| internal_datafusion_err!("coordinator dropped"))??; - let head = futures::stream::iter(data.first_batch.into_iter().map(Ok)); - let merged: SendableRecordBatchStream = - Box::pin(RecordBatchStreamAdapter::new(schema, head.chain(data.rest))); - Ok::<_, datafusion_common::DataFusionError>(merged) + let mut bucket_rx = data.rx; + let inner = futures::stream::poll_fn(move |cx| bucket_rx.poll_recv(cx)); + Ok::<_, DataFusionError>(inner) }) .try_flatten() } @@ -280,58 +283,215 @@ async fn coordinator( n, global ); - // Derive boundaries and per-bucket primary/expanded ranges from the - // global extremes + the halo we were constructed with. Log only — the - // routing impl in a future commit will consume the same values. - log_buckets(global.as_ref(), n, halo_preceding, halo_following); + // Phase 4: derive bucket boundaries from the global extremes. v1 is + // Int64-only — if anything's missing we propagate an error to all + // output streams. + let (lo, hi) = match int64_range(global.as_ref()) { + Some(v) => v, + None => { + for tx in senders.drain(..) { + let _ = tx.send(Err(internal_datafusion_err!( + "RangeRepartitionExec: leading sort key must be Int64 \ + with a non-empty global range" + ))); + } + return; + } + }; + let Some(boundaries) = equal_width_int64_boundaries(lo, hi, n) else { + for tx in senders.drain(..) { + let _ = tx.send(Err(internal_datafusion_err!( + "RangeRepartitionExec: cannot split [{lo}, {hi}] into {n} buckets" + ))); + } + return; + }; + log_buckets(lo, hi, &boundaries, halo_preceding, halo_following); + + // Phase 5: figure out which column carries the leading sort key. + let col_idx = match ordering + .as_ref() + .and_then(|o| o.first().expr.downcast_ref::()) + .map(|c| c.index()) + { + Some(idx) => idx, + None => { + for tx in senders.drain(..) { + let _ = tx.send(Err(internal_datafusion_err!( + "RangeRepartitionExec: leading sort key must be a Column \ + (got {:?})", + ordering + ))); + } + return; + } + }; - // Phase 4: hand off each input's payload to its corresponding output - // partition. (Today: pass-through; future: per-output route streams.) - for (sender, (first_batch, rest)) in senders.into_iter().zip(firsts.into_iter()) { - let _ = sender.send(Ok(PartitionData { first_batch, rest })); + // Phase 6: build N output mpsc channels, one per output partition / + // bucket. Hand each receiver to the corresponding output stream. + let mut bucket_txs: Vec>> = Vec::with_capacity(n); + for sender in senders.drain(..) { + let (tx, rx) = mpsc::channel(4); + bucket_txs.push(tx); + let _ = sender.send(Ok(PartitionData { rx })); + } + + // Phase 7: for each input partition, route its batches into per-bucket + // pieces and push to the corresponding bucket_txs. Drop the local + // bucket_txs once all router tasks complete so receivers see EOS. + let bucket_txs = Arc::new(bucket_txs); + let mut routers = Vec::with_capacity(firsts.len()); + for (first_batch, rest) in firsts.into_iter() { + let txs = Arc::clone(&bucket_txs); + let boundaries = boundaries.clone(); + routers.push(tokio::spawn(run_router( + first_batch, + rest, + txs, + boundaries, + col_idx, + halo_preceding, + halo_following, + ))); } + // Wait for routers so we can drop the senders here (allowing receivers + // to observe EOS). Errors are propagated through the channels. + for handle in routers { + let _ = handle.await; + } + drop(bucket_txs); } -/// Compute and log this RangeRepartitionExec's bucket layout for a single -/// run: `n` output partitions split into equal-width buckets over the -/// global leading-sort-key range, each bucket's primary range and its -/// halo-expanded range. -/// -/// v1 boundary math only supports `Int64` leading sort keys (matches the -/// optimizer rule's eligibility gate). Logs a skip line for other types -/// or when the global extremes were unavailable. -fn log_buckets( - global: Option<&SortExtremes>, - n: usize, +/// Drain one input partition's stream and split each batch into pieces +/// keyed by the bucket whose expanded range contains the row's leading +/// sort value. Halo rows land in two buckets (their primary and the +/// neighbor that needs them as halo). +async fn run_router( + first_batch: Option, + mut rest: SendableRecordBatchStream, + bucket_txs: Arc>>>, + boundaries: Vec, + col_idx: usize, halo_preceding: i64, halo_following: i64, ) { - let Some(global) = global else { - info!("RangeRepartitionExec: no runtime extremes; skip bucket layout"); - return; - }; - let (Some(ScalarValue::Int64(Some(lo))), Some(ScalarValue::Int64(Some(hi)))) = - (global.min.first(), global.max.first()) - else { - info!( - "RangeRepartitionExec: leading sort key is not Int64 (min={:?}, max={:?}); \ - skip bucket layout", - global.min, global.max - ); - return; - }; - let Some(boundaries) = equal_width_int64_boundaries(*lo, *hi, n) else { - info!("RangeRepartitionExec: cannot split [{lo}, {hi}] into {n} buckets"); + if let Some(batch) = first_batch + && let Err(_) = route_batch( + &batch, + col_idx, + &boundaries, + halo_preceding, + halo_following, + &bucket_txs, + ) + .await + { return; - }; + } + while let Some(item) = rest.next().await { + let batch = match item { + Ok(b) => b, + Err(e) => { + // Best-effort propagation: try each bucket sender. The + // error is rendered to string because `DataFusionError` + // doesn't `Clone`. + let msg = e.to_string(); + for tx in bucket_txs.iter() { + let _ = tx + .send(Err(internal_datafusion_err!("input batch error: {msg}"))) + .await; + } + return; + } + }; + if route_batch( + &batch, + col_idx, + &boundaries, + halo_preceding, + halo_following, + &bucket_txs, + ) + .await + .is_err() + { + return; + } + } +} + +/// For each output bucket, take rows whose leading-sort-key value lands in +/// `[primary_low - halo_preceding, primary_high + halo_following)` and +/// push that piece into the bucket's channel. Bucket-driven loop so each +/// bucket's filter expression is in one place. +async fn route_batch( + batch: &RecordBatch, + col_idx: usize, + boundaries: &[i64], + halo_preceding: i64, + halo_following: i64, + bucket_txs: &[mpsc::Sender>], +) -> Result<()> { + let n_out = bucket_txs.len(); + let arr = batch.column(col_idx); + let col = arr.as_any().downcast_ref::().ok_or_else(|| { + internal_datafusion_err!("RangeRepartitionExec: leading sort key not Int64") + })?; + let n_rows = batch.num_rows(); + + for b in 0..n_out { + let low: i64 = if b == 0 { + i64::MIN + } else { + boundaries[b - 1].saturating_sub(halo_preceding) + }; + let high: i64 = if b == n_out - 1 { + i64::MAX + } else { + boundaries[b].saturating_add(halo_following) + }; + let mut indices: Vec = Vec::new(); + for r in 0..n_rows { + if col.is_null(r) { + continue; + } + let s = col.value(r); + if s >= low && s < high { + indices.push(r as u32); + } + } + if indices.is_empty() { + continue; + } + let take_idx = UInt32Array::from(indices); + let cols = take_arrays(batch.columns(), &take_idx, None)?; + let piece = RecordBatch::try_new(batch.schema(), cols)?; + if bucket_txs[b].send(Ok(piece)).await.is_err() { + return Ok(()); // receiver dropped; bail quietly + } + } + Ok(()) +} + +/// Log the bucket layout the coordinator will route into: the leading-key +/// global range, the interior boundaries, and each output bucket's +/// primary and halo-expanded ranges. +fn log_buckets( + lo: i64, + hi: i64, + boundaries: &[i64], + halo_preceding: i64, + halo_following: i64, +) { + let n = boundaries.len() + 1; info!( "RangeRepartitionExec: global leading [{lo}, {hi}] split into {n} buckets; \ interior boundaries: {boundaries:?}; halo: {halo_preceding} preceding, \ {halo_following} following" ); - let mut edges: Vec = std::iter::once(*lo) + let mut edges: Vec = std::iter::once(lo) .chain(boundaries.iter().copied()) - .chain(std::iter::once(*hi + 1)) + .chain(std::iter::once(hi + 1)) .collect(); edges.dedup(); for (i, win) in edges.windows(2).enumerate() { @@ -344,6 +504,19 @@ fn log_buckets( } } +/// Extract `(lo, hi)` from the leading slot of a global [`SortExtremes`]. +/// Returns `None` if the extremes are missing, the leading key isn't +/// Int64, or either endpoint is null. +fn int64_range(global: Option<&SortExtremes>) -> Option<(i64, i64)> { + let global = global?; + let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = + (global.min.first()?, global.max.first()?) + else { + return None; + }; + Some((*lo, *hi)) +} + /// Split the closed `Int64` interval `[lo, hi]` into `n` equal-width /// buckets, returning the `n - 1` interior cut points. fn equal_width_int64_boundaries(lo: i64, hi: i64, n: usize) -> Option> { diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt index ec90d68d4f8e4..57ee592c98486 100644 --- a/datafusion/sqllogictest/test_files/parallel_window.slt +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -119,6 +119,27 @@ LIMIT 5; 3 6 4 10 +# Halo duplication is observable as an inflated row count: each row whose +# seq is within `halo_preceding` of a bucket boundary appears in both its +# primary bucket and the neighbor that needs it as halo. For this fixture +# (100 rows, halo_preceding=5, 3 interior boundaries) that's 15 extra +# rows, so the merged output has 115. The eventual `HaloDropExec` will +# drop halo rows post-window and bring this back to 100. +# +# `count(rolling_sum)` forces the window to actually execute (a plain +# count(*) gets statistics-pruned from the parquet row count, never +# instantiates the window operator, and so wouldn't notice the bug). +query I +SELECT count(rolling_sum) AS n FROM ( + SELECT seq, SUM(amount) OVER ( + ORDER BY seq + RANGE BETWEEN 5 PRECEDING AND CURRENT ROW + ) AS rolling_sum + FROM events +) t; +---- +115 + # Reset session settings so this file doesn't leak config into the rest of the run. statement ok set datafusion.explain.show_statistics = false; From 05d8dd528acd0328e94a1948e493706d22d37a6c Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 18 Jun 2026 09:14:49 -0600 Subject: [PATCH 10/12] ParallelWindow: run pre-EnsureRequirements with parallel-aware BWAG Move ParallelWindow rule ahead of EnsureRequirements so it owns the distribution decision instead of surgically undoing what EnsureRequirements inserts. - BoundedWindowAggExec gains a `parallel_aware: bool` and a with_parallel_aware() builder. When set, required_input_distribution() returns UnspecifiedDistribution instead of SinglePartition, so EnsureRequirements stops wrapping us in an SPM and collapsing back to one partition. - RangeRepartitionExec now takes a LexOrdering, declares required_input_ordering (Hard) on it, and maintains_input_order=true. This anchors the pipeline-breaking SortExec beneath us instead of letting EnsureRequirements push it above. - ParallelWindow builds BWAG(parallel_aware) -> RangeRepartitionExec directly; EnsureRequirements plants the per-partition SortExec on its next pass. Removed the descend-and-wrap helper, no longer needed. - SLT EXPLAIN updated to match the new (correct) plan shape. Outer SPM in the EXPLAIN is just the user-visible `ORDER BY ... LIMIT`, not BWAG distribution. count(rolling_sum)=115 still holds; the halo duplication is now real per-partition output rather than corrupted single-partition slide. HaloDropExec follow-up brings it to 100. --- .../physical-optimizer/src/optimizer.rs | 14 +-- .../physical-optimizer/src/parallel_window.rs | 89 ++++++++++--------- .../physical-plan/src/range_repartition.rs | 16 ++++ .../src/windows/bounded_window_agg_exec.rs | 30 +++++-- .../test_files/parallel_window.slt | 11 ++- 5 files changed, 97 insertions(+), 63 deletions(-) diff --git a/datafusion/physical-optimizer/src/optimizer.rs b/datafusion/physical-optimizer/src/optimizer.rs index 762f8308379bb..a99fba4fa2b05 100644 --- a/datafusion/physical-optimizer/src/optimizer.rs +++ b/datafusion/physical-optimizer/src/optimizer.rs @@ -188,14 +188,14 @@ impl PhysicalOptimizer { // [`EnsureRequirements`](crate::ensure_requirements) for the per-phase // breakdown, and // for the original failure mode. - Arc::new(EnsureRequirements::new()), - // Wrap the per-partition SortExec under a no-PARTITION-BY RANGE-frame - // window with a (currently passthrough) RangeRepartitionExec. The - // wrapper will eventually drive range routing; for now it just awaits - // `runtime_statistics` on each input partition and logs them. Runs - // *after* EnsureRequirements so the SortExec that EnsureRequirements - // inserts is in place. + // Re-shape no-PARTITION-BY RANGE-frame windows into a parallel + // form: SortExec(preserve_partitioning) + RangeRepartitionExec + // + parallel-aware BWAG. Runs *before* EnsureRequirements so + // we own the distribution decision — otherwise EnsureRequirements + // would satisfy BWAG's SinglePartition requirement by inserting + // an SPM that collapses the parallelism we're trying to create. Arc::new(ParallelWindow::new()), + Arc::new(EnsureRequirements::new()), // The CombinePartialFinalAggregate rule should be applied after distribution enforcement Arc::new(CombinePartialFinalAggregate::new()), // Run once after the local sorting requirement is changed diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index d0716934e7e96..f66d42b1ced20 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -16,25 +16,35 @@ // under the License. //! Parallelize bounded RANGE-frame window functions that have an ORDER BY -//! but no PARTITION BY by inserting a `RangeRepartitionExec` between the -//! window's `SortPreservingMergeExec`/`SortExec` chain and its child sort. +//! but no PARTITION BY by re-shaping the plan above the window's input. //! -//! This rule's responsibilities are now narrow: detect the eligible -//! window shape, pull the halo distances out of the window frame, and -//! wrap the per-partition `SortExec` with a `RangeRepartitionExec` -//! carrying those halo values. All boundary / global-extremes logic -//! lives in `RangeRepartitionExec`'s coordinator and runs against -//! runtime stats rather than plan-time `Statistics`. +//! Runs *before* `EnsureRequirements`. For each eligible +//! `BoundedWindowAggExec`, this rule: +//! - inserts a `SortExec(preserve_partitioning=true)` on the ORDER BY +//! key above the window's existing input; +//! - inserts a `RangeRepartitionExec` carrying the halo distances above +//! that sort; +//! - rebuilds the `BoundedWindowAggExec` on top of the result with +//! `parallel_aware = true`, so its `required_input_distribution()` +//! returns `UnspecifiedDistribution` instead of `SinglePartition` — +//! which is what would otherwise force `EnsureRequirements` to insert +//! a `SortPreservingMergeExec` and collapse our parallelism. +//! +//! By owning the structural decisions before `EnsureRequirements` runs, +//! this rule avoids the post-hoc surgery of stripping an inserted +//! `SortPreservingMergeExec`. All boundary / global-extremes logic lives +//! in `RangeRepartitionExec`'s coordinator and runs against runtime +//! stats rather than plan-time `Statistics`. use crate::PhysicalOptimizerRule; use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_expr::{WindowFrameBound, WindowFrameUnits}; +use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::expressions::Column; use datafusion_physical_plan::ExecutionPlan; use datafusion_physical_plan::range_repartition::RangeRepartitionExec; -use datafusion_physical_plan::sorts::sort::SortExec; use datafusion_physical_plan::windows::BoundedWindowAggExec; use log::info; use std::sync::Arc; @@ -66,15 +76,34 @@ impl PhysicalOptimizerRule for ParallelWindow { "ParallelWindow: candidate BoundedWindowAggExec (RANGE frame, no PARTITION BY); \ halo: {halo_preceding} preceding, {halo_following} following" ); - // Descend through whatever EnsureRequirements stacked on top of - // the sort (typically SortPreservingMergeExec) until we find the - // SortExec, and wrap it with RangeRepartitionExec(halo…). - let new_child = wrap_first_sort_descendant( - Arc::clone(&node.children()[0]), + // `candidate_halo` already verified order_by.len()==1. + let sort_key = window.window_expr()[0].order_by()[0].clone(); + let lex = LexOrdering::new(vec![sort_key]) + .expect("candidate_halo guarantees one sort key"); + let original_input = Arc::clone(&node.children()[0]); + // Don't pre-insert a SortExec; RangeRepartitionExec now declares + // its required input ordering, so EnsureRequirements will plant + // the pipeline-breaking sort beneath us. Doing both would just + // produce a redundant SortExec that the optimizer collapses. + let range = Arc::new(RangeRepartitionExec::new( + original_input, + lex, halo_preceding, halo_following, - )?; - let new_window = Arc::clone(&node).with_new_children(vec![new_child])?; + )); + // `parallel_aware = true` flips BWAG's required_input_distribution + // to UnspecifiedDistribution, so EnsureRequirements won't wrap + // us in an SPM. `can_repartition` is vacuous because + // candidate_halo already required partition_keys empty. + let new_window = Arc::new( + BoundedWindowAggExec::try_new( + window.window_expr().to_vec(), + range, + window.input_order_mode.clone(), + true, + )? + .with_parallel_aware(true), + ) as Arc; Ok(Transformed::yes(new_window)) })?; Ok(out.data) @@ -111,34 +140,6 @@ fn candidate_halo(window: &BoundedWindowAggExec) -> Option<(i64, i64)> { i64_halo(&frame.start_bound, &frame.end_bound) } -/// Walk down through single-child operators until we hit a `SortExec`; -/// replace it in place with `RangeRepartitionExec` wrapping the same -/// `SortExec`, and rebuild the chain back up. If no `SortExec` is found, -/// leave the subtree alone. -fn wrap_first_sort_descendant( - node: Arc, - halo_preceding: i64, - halo_following: i64, -) -> datafusion_common::Result> { - if node.downcast_ref::().is_some() { - return Ok(Arc::new(RangeRepartitionExec::new( - node, - halo_preceding, - halo_following, - ))); - } - let children = node.children(); - if children.len() != 1 { - return Ok(node); - } - let new_child = wrap_first_sort_descendant( - Arc::clone(children[0]), - halo_preceding, - halo_following, - )?; - node.with_new_children(vec![new_child]) -} - /// Extract `(halo_preceding, halo_following)` in order-key units from a /// RANGE window frame. Returns `None` for UNBOUNDED bounds or non-`Int64` /// distances. v1 scope: only `Preceding(Int64)` / `CurrentRow` for the diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index 411f18b5c94a0..c138b18280b5f 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -49,6 +49,7 @@ use arrow::datatypes::SchemaRef; use datafusion_common::{DataFusionError, Result, internal_datafusion_err}; use datafusion_execution::TaskContext; use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr_common::sort_expr::OrderingRequirements; use futures::StreamExt; use log::info; use tokio::sync::{mpsc, oneshot}; @@ -66,6 +67,10 @@ use crate::{ pub struct RangeRepartitionExec { input: Arc, cache: Arc, + /// Required input ordering — passed down from the consumer (window + /// operator) so EnsureRequirements inserts the pipeline-breaking sort + /// *below* us, not above. Same key feeds the routing decision. + ordering: LexOrdering, /// Halo distance preceding each bucket's primary range, in /// leading-sort-key units. Carried over from the window frame at plan /// time so the coordinator can derive per-bucket expanded ranges. @@ -103,6 +108,7 @@ impl std::fmt::Debug for State { impl RangeRepartitionExec { pub fn new( input: Arc, + ordering: LexOrdering, halo_preceding: i64, halo_following: i64, ) -> Self { @@ -111,6 +117,7 @@ impl RangeRepartitionExec { Self { input, cache, + ordering, halo_preceding, halo_following, state: Arc::new(Mutex::new(State { @@ -154,11 +161,20 @@ impl ExecutionPlan for RangeRepartitionExec { ) -> Result> { Ok(Arc::new(Self::new( children.swap_remove(0), + self.ordering.clone(), self.halo_preceding, self.halo_following, ))) } + fn required_input_ordering(&self) -> Vec> { + vec![Some(OrderingRequirements::from(self.ordering.clone()))] + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + fn execute( &self, partition: usize, diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 6c6b26c9cf49f..157f593b0965f 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -97,6 +97,11 @@ pub struct BoundedWindowAggExec { cache: Arc, /// If `can_rerepartition` is false, partition_keys is always empty. can_repartition: bool, + /// When true, `required_input_distribution()` returns + /// `UnspecifiedDistribution` instead of `SinglePartition`, so the operator + /// can run per-partition over a range-partitioned input (with halo). + /// Set by `ParallelWindow` optimizer rule; never round-trips through proto. + parallel_aware: bool, } impl BoundedWindowAggExec { @@ -137,9 +142,16 @@ impl BoundedWindowAggExec { ordered_partition_by_indices, cache: Arc::new(cache), can_repartition, + parallel_aware: false, }) } + /// Opt this BWAG into parallel-aware mode. See `parallel_aware` field. + pub fn with_parallel_aware(mut self, value: bool) -> Self { + self.parallel_aware = value; + self + } + /// Window expressions pub fn window_expr(&self) -> &[Arc] { &self.window_expr @@ -331,6 +343,9 @@ impl ExecutionPlan for BoundedWindowAggExec { } fn required_input_distribution(&self) -> Vec { + if self.parallel_aware { + return vec![Distribution::UnspecifiedDistribution]; + } if self.partition_keys().is_empty() { debug!("No partition defined for BoundedWindowAggExec!!!"); vec![Distribution::SinglePartition] @@ -348,12 +363,15 @@ impl ExecutionPlan for BoundedWindowAggExec { children: Vec>, ) -> Result> { check_if_same_properties!(self, children); - Ok(Arc::new(BoundedWindowAggExec::try_new( - self.window_expr.clone(), - Arc::clone(&children[0]), - self.input_order_mode.clone(), - self.can_repartition, - )?)) + Ok(Arc::new( + BoundedWindowAggExec::try_new( + self.window_expr.clone(), + Arc::clone(&children[0]), + self.input_order_mode.clone(), + self.can_repartition, + )? + .with_parallel_aware(self.parallel_aware), + )) } fn execute( diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt index 57ee592c98486..0c29933fb5ab9 100644 --- a/datafusion/sqllogictest/test_files/parallel_window.slt +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -91,13 +91,12 @@ logical_plan 03)----WindowAggr: windowExpr=[[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW]] 04)------TableScan: events projection=[seq, amount] physical_plan -01)ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Inexact(5), Bytes=Inexact(80), [(Col[0]:),(Col[1]:)]] -02)--GlobalLimitExec: skip=0, fetch=5, statistics=[Rows=Inexact(5), Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] +01)SortPreservingMergeExec: [seq@0 ASC NULLS LAST], fetch=5, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] +02)--ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] 03)----BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] -04)------SortPreservingMergeExec: [seq@0 ASC NULLS LAST], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] -05)--------RangeRepartitionExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] -06)----------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] -07)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +04)------RangeRepartitionExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] +05)--------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] # Actually execute the query — this drives the SortExec down the (false, None) # external-sort branch in each input partition and lets the runtime min/max From 148493454d2fb51f41468384605116b1971d71d0 Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 18 Jun 2026 11:06:07 -0600 Subject: [PATCH 11/12] pass --- .../physical-optimizer/src/parallel_window.rs | 19 +- datafusion/physical-plan/src/halo_drop.rs | 230 ++++++++++++++++++ datafusion/physical-plan/src/lib.rs | 1 + .../physical-plan/src/range_repartition.rs | 70 +++++- .../src/windows/bounded_window_agg_exec.rs | 13 +- .../test_files/parallel_window.slt | 27 +- 6 files changed, 338 insertions(+), 22 deletions(-) create mode 100644 datafusion/physical-plan/src/halo_drop.rs diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index f66d42b1ced20..aeda46d378076 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -39,11 +39,12 @@ use crate::PhysicalOptimizerRule; use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; -use datafusion_common::tree_node::{Transformed, TreeNode}; +use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRecursion}; use datafusion_expr::{WindowFrameBound, WindowFrameUnits}; use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::expressions::Column; use datafusion_physical_plan::ExecutionPlan; +use datafusion_physical_plan::halo_drop::HaloDropExec; use datafusion_physical_plan::range_repartition::RangeRepartitionExec; use datafusion_physical_plan::windows::BoundedWindowAggExec; use log::info; @@ -87,7 +88,7 @@ impl PhysicalOptimizerRule for ParallelWindow { // produce a redundant SortExec that the optimizer collapses. let range = Arc::new(RangeRepartitionExec::new( original_input, - lex, + lex.clone(), halo_preceding, halo_following, )); @@ -95,7 +96,7 @@ impl PhysicalOptimizerRule for ParallelWindow { // to UnspecifiedDistribution, so EnsureRequirements won't wrap // us in an SPM. `can_repartition` is vacuous because // candidate_halo already required partition_keys empty. - let new_window = Arc::new( + let new_window: Arc = Arc::new( BoundedWindowAggExec::try_new( window.window_expr().to_vec(), range, @@ -103,8 +104,16 @@ impl PhysicalOptimizerRule for ParallelWindow { true, )? .with_parallel_aware(true), - ) as Arc; - Ok(Transformed::yes(new_window)) + ); + // Drop halo rows above the per-partition window. HaloDropExec + // reads its primary range from `input.runtime_sort_extremes`, + // which BWAG passes through and RangeRepartitionExec populates. + let drop_halo: Arc = + Arc::new(HaloDropExec::try_new(new_window, &lex)?); + // Jump past the result's children: the BWAG we just emitted is + // still a candidate by shape (RANGE frame, no PARTITION BY) and + // `transform_down` would otherwise re-wrap it forever. + Ok(Transformed::new(drop_halo, true, TreeNodeRecursion::Jump)) })?; Ok(out.data) } diff --git a/datafusion/physical-plan/src/halo_drop.rs b/datafusion/physical-plan/src/halo_drop.rs new file mode 100644 index 0000000000000..fe4ea41785c2e --- /dev/null +++ b/datafusion/physical-plan/src/halo_drop.rs @@ -0,0 +1,230 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Drop halo rows above a `BoundedWindowAggExec` running per-partition +//! over `RangeRepartitionExec`-routed input. +//! +//! Each partition reads its *intended primary range* from +//! `input.runtime_sort_extremes(partition)` — which `RangeRepartitionExec` +//! exposes as a "useful lie" — and filters rows whose leading sort key +//! falls outside that range. Halo rows (rows duplicated into this +//! partition for the window's frame context at boundaries) sit *outside* +//! the primary range by construction, so the filter drops them. +//! +//! Reads extremes lazily on the first batch, because +//! `RangeRepartitionExec`'s coordinator populates ranges before routing +//! any data — so any batch arriving at us implies ranges are ready. + +use std::sync::Arc; + +use arrow::array::{Array, BooleanArray, Int64Array, RecordBatch}; +use arrow::compute::filter_record_batch; +use datafusion_common::{Result, ScalarValue, internal_datafusion_err}; +use datafusion_execution::TaskContext; +use datafusion_physical_expr::LexOrdering; +use datafusion_physical_expr::expressions::Column; + +use crate::stream::RecordBatchStreamAdapter; +use crate::{ + DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, + SendableRecordBatchStream, +}; + +#[derive(Debug)] +pub struct HaloDropExec { + input: Arc, + /// Column index of the leading sort key in the input schema. We + /// resolve it at construction from a `LexOrdering`'s first key, + /// which must be a `Column` (the same constraint `ParallelWindow` + /// applies to candidate windows). + sort_col: usize, + cache: Arc, +} + +impl HaloDropExec { + pub fn try_new( + input: Arc, + ordering: &LexOrdering, + ) -> Result { + // PhysicalExpr: Any, so downcast_ref via Any works directly on + // the Arc through auto-deref. + let sort_col = ordering + .first() + .expr + .downcast_ref::() + .ok_or_else(|| { + internal_datafusion_err!( + "HaloDropExec: leading sort key must be a Column" + ) + })? + .index(); + let cache = Arc::clone(input.properties()); + Ok(Self { + input, + sort_col, + cache, + }) + } +} + +impl DisplayAs for HaloDropExec { + fn fmt_as( + &self, + _t: DisplayFormatType, + f: &mut std::fmt::Formatter, + ) -> std::fmt::Result { + write!(f, "HaloDropExec") + } +} + +impl ExecutionPlan for HaloDropExec { + fn name(&self) -> &'static str { + "HaloDropExec" + } + + fn properties(&self) -> &Arc { + &self.cache + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + mut children: Vec>, + ) -> Result> { + let input = children.swap_remove(0); + let cache = Arc::clone(input.properties()); + Ok(Arc::new(Self { + input, + sort_col: self.sort_col, + cache, + })) + } + + fn maintains_input_order(&self) -> Vec { + vec![true] + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let input = self.input.execute(partition, context)?; + let schema = self.schema(); + let extremes_provider = Arc::clone(&self.input); + let sort_col = self.sort_col; + let stream = filter_stream(input, extremes_provider, partition, sort_col); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) + } +} + +/// Drains `input`, lazy-initializing the `[min, max]` filter range on +/// the first batch by calling +/// `extremes_provider.runtime_sort_extremes(partition)`. Subsequent +/// batches reuse the cached range. +fn filter_stream( + input: SendableRecordBatchStream, + extremes_provider: Arc, + partition: usize, + sort_col: usize, +) -> impl futures::Stream> + Send { + struct St { + input: SendableRecordBatchStream, + range: Option<(i64, i64)>, + provider: Arc, + partition: usize, + sort_col: usize, + } + let st = St { + input, + range: None, + provider: extremes_provider, + partition, + sort_col, + }; + futures::stream::try_unfold(st, |mut st| async move { + use futures::StreamExt; + loop { + let batch = match st.input.next().await { + Some(Ok(b)) => b, + Some(Err(e)) => return Err(e), + None => return Ok(None), + }; + if st.range.is_none() { + let extremes = st + .provider + .runtime_sort_extremes(st.partition)? + .ok_or_else(|| { + internal_datafusion_err!( + "HaloDropExec: extremes unavailable on first batch \ + — RangeRepartitionExec coordinator should have \ + populated them before routing any rows" + ) + })?; + let lo = scalar_to_i64(extremes.min.first())?; + let hi = scalar_to_i64(extremes.max.first())?; + st.range = Some((lo, hi)); + } + let (lo, hi) = st.range.unwrap(); + let filtered = filter_batch(&batch, st.sort_col, lo, hi)?; + if filtered.num_rows() == 0 { + continue; // skip empty filtered batches + } + return Ok(Some((filtered, st))); + } + }) +} + +fn scalar_to_i64(s: Option<&ScalarValue>) -> Result { + match s { + Some(ScalarValue::Int64(Some(v))) => Ok(*v), + _ => Err(internal_datafusion_err!( + "HaloDropExec: leading extreme must be non-null Int64" + )), + } +} + +fn filter_batch( + batch: &RecordBatch, + sort_col: usize, + lo: i64, + hi: i64, +) -> Result { + let col = batch + .column(sort_col) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + internal_datafusion_err!( + "HaloDropExec: leading sort column must be Int64" + ) + })?; + let mask: BooleanArray = (0..col.len()) + .map(|i| { + if col.is_null(i) { + Some(false) + } else { + let v = col.value(i); + Some(v >= lo && v <= hi) + } + }) + .collect(); + Ok(filter_record_batch(batch, &mask)?) +} diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 96ac6f604d5e9..19c653d4e46a3 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -76,6 +76,7 @@ pub mod execution_plan; pub mod explain; pub mod filter; pub mod filter_pushdown; +pub mod halo_drop; pub mod joins; pub mod limit; pub mod memory; diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index c138b18280b5f..1d689459207cc 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -78,6 +78,12 @@ pub struct RangeRepartitionExec { /// Halo distance following each bucket's primary range. halo_following: i64, state: Arc>, + /// Per-output-partition primary `[lo, hi_exclusive)` ranges, filled + /// by the coordinator before any batch is routed. Surfaced through + /// `runtime_sort_extremes(partition)` so downstream operators + /// (e.g. HaloDropExec) can read each bucket's intended primary + /// range without needing the global extremes. + bucket_primary_ranges: Arc>>>, } struct State { @@ -124,6 +130,7 @@ impl RangeRepartitionExec { initialized: false, handoffs: (0..n).map(|_| None).collect(), })), + bucket_primary_ranges: Arc::new(Mutex::new(None)), } } @@ -175,6 +182,37 @@ impl ExecutionPlan for RangeRepartitionExec { vec![true] } + /// Returns each output partition's *intended primary range* as + /// inclusive `[min, max]` — not the actual range of routed data + /// (which is wider, by `halo_preceding`/`halo_following`). This is a + /// "useful lie" the downstream `HaloDropExec` consumes to filter + /// halo rows back out. + /// + /// Returns `Ok(None)` if the coordinator hasn't computed boundaries + /// yet — callers must drive the input stream to first batch before + /// reading, per the trait contract on `runtime_sort_extremes`. + fn runtime_sort_extremes( + &self, + partition: usize, + ) -> Result> { + let guard = self.bucket_primary_ranges.lock().map_err(|_| { + internal_datafusion_err!( + "RangeRepartitionExec bucket_primary_ranges mutex poisoned" + ) + })?; + let Some(ranges) = guard.as_ref() else { + return Ok(None); + }; + let &(lo, hi_excl) = &ranges[partition]; + // Convert [lo, hi_exclusive) → inclusive [min, max]. + let max = hi_excl.saturating_sub(1); + Ok(Some(SortExtremes { + min: vec![ScalarValue::Int64(Some(lo))], + max: vec![ScalarValue::Int64(Some(max))], + row_count: 0, // not tracked; consumers shouldn't rely on it + })) + } + fn execute( &self, partition: usize, @@ -196,12 +234,14 @@ impl ExecutionPlan for RangeRepartitionExec { let ctx = Arc::clone(&context); let halo_preceding = self.halo_preceding; let halo_following = self.halo_following; + let primaries = Arc::clone(&self.bucket_primary_ranges); tokio::spawn(coordinator( child, ctx, senders, halo_preceding, halo_following, + primaries, )); } let rx = state @@ -250,6 +290,7 @@ async fn coordinator( mut senders: Vec>>, halo_preceding: i64, halo_following: i64, + bucket_primary_ranges: Arc>>>, ) { let n = senders.len(); @@ -324,6 +365,13 @@ async fn coordinator( }; log_buckets(lo, hi, &boundaries, halo_preceding, halo_following); + // Stash per-bucket primary ranges where `runtime_sort_extremes` can + // see them. Done *before* any batch is routed so downstream operators + // that gate on first batch will read populated state. + if let Ok(mut guard) = bucket_primary_ranges.lock() { + *guard = Some(primary_ranges_from_boundaries(lo, hi, &boundaries)); + } + // Phase 5: figure out which column carries the leading sort key. let col_idx = match ordering .as_ref() @@ -533,9 +581,29 @@ fn int64_range(global: Option<&SortExtremes>) -> Option<(i64, i64)> { Some((*lo, *hi)) } +/// Per-bucket primary ranges as `[start, end_exclusive)` derived from +/// the global `[lo, hi]` (inclusive) and the interior cut points. Same +/// edge convention as `log_buckets`: edges = `[lo] ++ boundaries ++ [hi+1]`, +/// each bucket spans `[edges[i], edges[i+1])`. +pub fn primary_ranges_from_boundaries( + lo: i64, + hi: i64, + boundaries: &[i64], +) -> Vec<(i64, i64)> { + let mut edges: Vec = std::iter::once(lo) + .chain(boundaries.iter().copied()) + .chain(std::iter::once(hi.saturating_add(1))) + .collect(); + edges.dedup(); + edges + .windows(2) + .map(|w| (w[0], w[1])) + .collect() +} + /// Split the closed `Int64` interval `[lo, hi]` into `n` equal-width /// buckets, returning the `n - 1` interior cut points. -fn equal_width_int64_boundaries(lo: i64, hi: i64, n: usize) -> Option> { +pub fn equal_width_int64_boundaries(lo: i64, hi: i64, n: usize) -> Option> { if n <= 1 { return Some(vec![]); } diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 157f593b0965f..9a789e1ff5fd4 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -36,7 +36,8 @@ use crate::windows::{ use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, Statistics, WindowExpr, check_if_same_properties, + SendableRecordBatchStream, SortExtremes, Statistics, WindowExpr, + check_if_same_properties, }; use arrow::compute::take_record_batch; @@ -358,6 +359,16 @@ impl ExecutionPlan for BoundedWindowAggExec { vec![true] } + /// Passthrough: the window operator doesn't alter the leading sort + /// key's value range, so its `partition`-th output partition's + /// extremes are exactly its input partition's extremes. + fn runtime_sort_extremes( + &self, + partition: usize, + ) -> Result> { + self.input.runtime_sort_extremes(partition) + } + fn with_new_children( self: Arc, children: Vec>, diff --git a/datafusion/sqllogictest/test_files/parallel_window.slt b/datafusion/sqllogictest/test_files/parallel_window.slt index 0c29933fb5ab9..c47050367007c 100644 --- a/datafusion/sqllogictest/test_files/parallel_window.slt +++ b/datafusion/sqllogictest/test_files/parallel_window.slt @@ -93,10 +93,11 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [seq@0 ASC NULLS LAST], fetch=5, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] 02)--ProjectionExec: expr=[seq@0 as seq, sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW@2 as rolling_sum], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] -03)----BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] -04)------RangeRepartitionExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] -05)--------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] -06)----------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +03)----HaloDropExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] +04)------BoundedWindowAggExec: wdw=[sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW: Field { "sum(events.amount) ORDER BY [events.seq ASC NULLS LAST] RANGE BETWEEN 5 PRECEDING AND CURRENT ROW": nullable Int64 }, frame: RANGE BETWEEN 5 PRECEDING AND CURRENT ROW], mode=[Sorted], statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:),(Col[2]:)]] +05)--------RangeRepartitionExec, statistics=[Rows=Absent, Bytes=Absent, [(Col[0]:),(Col[1]:)]] +06)----------SortExec: expr=[seq@0 ASC NULLS LAST], preserve_partitioning=[true], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] +07)------------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/2.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parallel_window/events/3.parquet]]}, projection=[seq, amount], file_type=parquet, sort_order_for_reorder=[seq@0 ASC NULLS LAST], statistics=[Rows=Exact(100), Bytes=Exact(1600), [(Col[0]: Min=Exact(Int64(0)) Max=Exact(Int64(99)) Null=Exact(0) ScanBytes=Exact(800)),(Col[1]: Min=Exact(Int64(0)) Max=Exact(Int64(6)) Null=Exact(0) ScanBytes=Exact(800))]] # Actually execute the query — this drives the SortExec down the (false, None) # external-sort branch in each input partition and lets the runtime min/max @@ -118,16 +119,12 @@ LIMIT 5; 3 6 4 10 -# Halo duplication is observable as an inflated row count: each row whose -# seq is within `halo_preceding` of a bucket boundary appears in both its -# primary bucket and the neighbor that needs it as halo. For this fixture -# (100 rows, halo_preceding=5, 3 interior boundaries) that's 15 extra -# rows, so the merged output has 115. The eventual `HaloDropExec` will -# drop halo rows post-window and bring this back to 100. -# -# `count(rolling_sum)` forces the window to actually execute (a plain -# count(*) gets statistics-pruned from the parquet row count, never -# instantiates the window operator, and so wouldn't notice the bug). +# With `HaloDropExec` planted above the per-partition window, halo rows +# (rows duplicated into a neighbor's bucket so its window frame sees the +# boundary context) are filtered out before the merge — so the row count +# is exactly the input cardinality, 100. `count(rolling_sum)` forces the +# window to actually execute (a plain count(*) gets statistics-pruned +# from the parquet row count, never instantiates the window operator). query I SELECT count(rolling_sum) AS n FROM ( SELECT seq, SUM(amount) OVER ( @@ -137,7 +134,7 @@ SELECT count(rolling_sum) AS n FROM ( FROM events ) t; ---- -115 +100 # Reset session settings so this file doesn't leak config into the rest of the run. statement ok From be1c5259b422cbef542a3d0c2c573cd212c539ed Mon Sep 17 00:00:00 2001 From: Brent Gardner Date: Thu, 18 Jun 2026 11:17:33 -0600 Subject: [PATCH 12/12] =?UTF-8?q?Rename=20SortExtremes=20=E2=86=92=20Parti?= =?UTF-8?q?tionExtremes;=20document=20dual=20semantics?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `SortExtremes` reads as "data extremes observed by a sort", which doesn't cover what `RangeRepartitionExec` does when it implements the trait method: it returns each output partition's *intended primary range* (narrower than the data the partition will actually carry, by the halo distance), so `HaloDropExec` upstream can read each bucket's bound without threading a side-channel. That's a different interpretation of "extremes" — *intended* rather than *observed* — and the old name reads as a lie at that call site. Rename: - struct: SortExtremes → PartitionExtremes - trait method: runtime_sort_extremes → runtime_partition_extremes - internal slot aliases in sorts/sort.rs follow Type doc at `execution_plan.rs:97-127` documents both interpretations so future consumers don't assume observed-only. --- .../physical-optimizer/src/parallel_window.rs | 2 +- .../physical-plan/src/execution_plan.rs | 34 ++++++++++---- datafusion/physical-plan/src/halo_drop.rs | 10 ++-- datafusion/physical-plan/src/lib.rs | 2 +- .../physical-plan/src/range_repartition.rs | 41 ++++++++--------- datafusion/physical-plan/src/sorts/sort.rs | 46 ++++++++++--------- .../src/windows/bounded_window_agg_exec.rs | 10 ++-- 7 files changed, 81 insertions(+), 64 deletions(-) diff --git a/datafusion/physical-optimizer/src/parallel_window.rs b/datafusion/physical-optimizer/src/parallel_window.rs index aeda46d378076..4216a63987db1 100644 --- a/datafusion/physical-optimizer/src/parallel_window.rs +++ b/datafusion/physical-optimizer/src/parallel_window.rs @@ -106,7 +106,7 @@ impl PhysicalOptimizerRule for ParallelWindow { .with_parallel_aware(true), ); // Drop halo rows above the per-partition window. HaloDropExec - // reads its primary range from `input.runtime_sort_extremes`, + // reads its primary range from `input.runtime_partition_extremes`, // which BWAG passes through and RangeRepartitionExec populates. let drop_halo: Arc = Arc::new(HaloDropExec::try_new(new_window, &lex)?); diff --git a/datafusion/physical-plan/src/execution_plan.rs b/datafusion/physical-plan/src/execution_plan.rs index 1b08035e13ebc..a64a2adb19544 100644 --- a/datafusion/physical-plan/src/execution_plan.rs +++ b/datafusion/physical-plan/src/execution_plan.rs @@ -94,17 +94,29 @@ use futures::stream::{StreamExt, TryStreamExt}; /// [`datafusion-examples`]: https://github.com/apache/datafusion/tree/main/datafusion-examples /// [`memory_pool_execution_plan.rs`]: https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/execution_monitoring/memory_pool_execution_plan.rs -/// Runtime-derived endpoints of a single partition's output, expressed in the -/// operator's declared output ordering (the `PhysicalSortExpr` list returned -/// by `equivalence_properties().output_ordering()`). +/// Endpoints of a single partition's output expressed in the operator's +/// declared output ordering (the `PhysicalSortExpr` list returned by +/// `equivalence_properties().output_ordering()`). /// /// `min` and `max` are tuples of values — one entry per sort key — taken at -/// the lex-smallest and lex-largest output rows respectively. For the leading -/// sort key these are exactly the natural min/max of that key (after honoring +/// the lex-smallest and lex-largest output rows. For the leading sort key +/// these are exactly the natural min/max of that key (after honoring /// ASC/DESC). For trailing sort keys the entries hold the value of that key /// at the lex-extreme row, not that key's own natural extreme. +/// +/// Default semantics — *observed*: these are the actual min/max of data this +/// partition has produced (or will produce, by the time the upstream is fully +/// consumed). `SortExec` is the canonical observer-style override. +/// +/// Exception — *intended*: `RangeRepartitionExec` returns each output +/// partition's *intended primary range* (the range a downstream `HaloDropExec` +/// should keep, with halo rows excluded), which is **narrower** than the data +/// the partition actually carries when halo is non-zero. The "useful lie" +/// is what lets the operator above the window strip halo without threading a +/// boundaries side-channel. Consumers that need observed extremes must not +/// read through a `RangeRepartitionExec` boundary when halo > 0. #[derive(Debug, Clone)] -pub struct SortExtremes { +pub struct PartitionExtremes { /// Sort-key values at the lex-smallest row across the partition. pub min: Vec, /// Sort-key values at the lex-largest row across the partition. @@ -544,8 +556,14 @@ pub trait ExecutionPlan: Any + Debug + DisplayAs + Send + Sync { /// enough progress (typically by polling `execute()`) before relying on /// the result; until then this returns `Ok(None)`. /// - /// See [`SortExtremes`] for the result shape. - fn runtime_sort_extremes(&self, partition: usize) -> Result> { + /// See [`PartitionExtremes`] for the result shape — and read the type doc + /// before assuming "observed" semantics: a few operators (notably + /// `RangeRepartitionExec`) intentionally return the *intended* range, not + /// the data they will actually carry. + fn runtime_partition_extremes( + &self, + partition: usize, + ) -> Result> { let _ = partition; Ok(None) } diff --git a/datafusion/physical-plan/src/halo_drop.rs b/datafusion/physical-plan/src/halo_drop.rs index fe4ea41785c2e..1ea3e6822041e 100644 --- a/datafusion/physical-plan/src/halo_drop.rs +++ b/datafusion/physical-plan/src/halo_drop.rs @@ -19,7 +19,7 @@ //! over `RangeRepartitionExec`-routed input. //! //! Each partition reads its *intended primary range* from -//! `input.runtime_sort_extremes(partition)` — which `RangeRepartitionExec` +//! `input.runtime_partition_extremes(partition)` — which `RangeRepartitionExec` //! exposes as a "useful lie" — and filters rows whose leading sort key //! falls outside that range. Halo rows (rows duplicated into this //! partition for the window's frame context at boundaries) sit *outside* @@ -137,7 +137,7 @@ impl ExecutionPlan for HaloDropExec { /// Drains `input`, lazy-initializing the `[min, max]` filter range on /// the first batch by calling -/// `extremes_provider.runtime_sort_extremes(partition)`. Subsequent +/// `extremes_provider.runtime_partition_extremes(partition)`. Subsequent /// batches reuse the cached range. fn filter_stream( input: SendableRecordBatchStream, @@ -170,7 +170,7 @@ fn filter_stream( if st.range.is_none() { let extremes = st .provider - .runtime_sort_extremes(st.partition)? + .runtime_partition_extremes(st.partition)? .ok_or_else(|| { internal_datafusion_err!( "HaloDropExec: extremes unavailable on first batch \ @@ -212,9 +212,7 @@ fn filter_batch( .as_any() .downcast_ref::() .ok_or_else(|| { - internal_datafusion_err!( - "HaloDropExec: leading sort column must be Int64" - ) + internal_datafusion_err!("HaloDropExec: leading sort column must be Int64") })?; let mask: BooleanArray = (0..col.len()) .map(|i| { diff --git a/datafusion/physical-plan/src/lib.rs b/datafusion/physical-plan/src/lib.rs index 19c653d4e46a3..6f0aeda26cda7 100644 --- a/datafusion/physical-plan/src/lib.rs +++ b/datafusion/physical-plan/src/lib.rs @@ -42,7 +42,7 @@ pub use datafusion_physical_expr::{ pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay}; pub use crate::execution_plan::{ - ExecutionPlan, ExecutionPlanProperties, PlanProperties, SortExtremes, collect, + ExecutionPlan, ExecutionPlanProperties, PartitionExtremes, PlanProperties, collect, collect_partitioned, displayable, execute_input_stream, execute_stream, execute_stream_partitioned, get_plan_string, with_new_children_if_necessary, }; diff --git a/datafusion/physical-plan/src/range_repartition.rs b/datafusion/physical-plan/src/range_repartition.rs index 1d689459207cc..dec1763d54a9f 100644 --- a/datafusion/physical-plan/src/range_repartition.rs +++ b/datafusion/physical-plan/src/range_repartition.rs @@ -22,9 +22,9 @@ //! `execute()`'s first call spawns a coordinator that: //! 1. opens `child.execute(k)` for every input partition `k`, //! 2. drives each stream to its first batch (which makes the pipeline- -//! breaking sort child populate its `SortExtremes` slot), -//! 3. reads `child.runtime_sort_extremes(k)` per input, -//! 4. lex-reduces those into a single global [`SortExtremes`], derives +//! breaking sort child populate its `PartitionExtremes` slot), +//! 3. reads `child.runtime_partition_extremes(k)` per input, +//! 4. lex-reduces those into a single global [`PartitionExtremes`], derives //! `N` equal-width Int64 bucket boundaries from `[global.min, //! global.max]`, and computes per-bucket expanded ranges by //! extending each primary [b_i, b_{i+1}) outward by @@ -59,8 +59,8 @@ use datafusion_common::ScalarValue; use crate::sorts::sort::lex_compare; use crate::stream::RecordBatchStreamAdapter; use crate::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, SortExtremes, + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, + PartitionExtremes, PlanProperties, SendableRecordBatchStream, }; #[derive(Debug)] @@ -80,7 +80,7 @@ pub struct RangeRepartitionExec { state: Arc>, /// Per-output-partition primary `[lo, hi_exclusive)` ranges, filled /// by the coordinator before any batch is routed. Surfaced through - /// `runtime_sort_extremes(partition)` so downstream operators + /// `runtime_partition_extremes(partition)` so downstream operators /// (e.g. HaloDropExec) can read each bucket's intended primary /// range without needing the global extremes. bucket_primary_ranges: Arc>>>, @@ -190,11 +190,11 @@ impl ExecutionPlan for RangeRepartitionExec { /// /// Returns `Ok(None)` if the coordinator hasn't computed boundaries /// yet — callers must drive the input stream to first batch before - /// reading, per the trait contract on `runtime_sort_extremes`. - fn runtime_sort_extremes( + /// reading, per the trait contract on `runtime_partition_extremes`. + fn runtime_partition_extremes( &self, partition: usize, - ) -> Result> { + ) -> Result> { let guard = self.bucket_primary_ranges.lock().map_err(|_| { internal_datafusion_err!( "RangeRepartitionExec bucket_primary_ranges mutex poisoned" @@ -206,7 +206,7 @@ impl ExecutionPlan for RangeRepartitionExec { let &(lo, hi_excl) = &ranges[partition]; // Convert [lo, hi_exclusive) → inclusive [min, max]. let max = hi_excl.saturating_sub(1); - Ok(Some(SortExtremes { + Ok(Some(PartitionExtremes { min: vec![ScalarValue::Int64(Some(lo))], max: vec![ScalarValue::Int64(Some(max))], row_count: 0, // not tracked; consumers shouldn't rely on it @@ -323,8 +323,8 @@ async fn coordinator( } // Phase 2: collect per-input runtime extremes. - let per_input: Vec> = (0..n) - .map(|k| child.runtime_sort_extremes(k).ok().flatten()) + let per_input: Vec> = (0..n) + .map(|k| child.runtime_partition_extremes(k).ok().flatten()) .collect(); // Phase 3: lex-reduce per-input → global, using the input's declared @@ -365,7 +365,7 @@ async fn coordinator( }; log_buckets(lo, hi, &boundaries, halo_preceding, halo_following); - // Stash per-bucket primary ranges where `runtime_sort_extremes` can + // Stash per-bucket primary ranges where `runtime_partition_extremes` can // see them. Done *before* any batch is routed so downstream operators // that gate on first batch will read populated state. if let Ok(mut guard) = bucket_primary_ranges.lock() { @@ -568,10 +568,10 @@ fn log_buckets( } } -/// Extract `(lo, hi)` from the leading slot of a global [`SortExtremes`]. +/// Extract `(lo, hi)` from the leading slot of a global [`PartitionExtremes`]. /// Returns `None` if the extremes are missing, the leading key isn't /// Int64, or either endpoint is null. -fn int64_range(global: Option<&SortExtremes>) -> Option<(i64, i64)> { +fn int64_range(global: Option<&PartitionExtremes>) -> Option<(i64, i64)> { let global = global?; let (ScalarValue::Int64(Some(lo)), ScalarValue::Int64(Some(hi))) = (global.min.first()?, global.max.first()?) @@ -595,10 +595,7 @@ pub fn primary_ranges_from_boundaries( .chain(std::iter::once(hi.saturating_add(1))) .collect(); edges.dedup(); - edges - .windows(2) - .map(|w| (w[0], w[1])) - .collect() + edges.windows(2).map(|w| (w[0], w[1])).collect() } /// Split the closed `Int64` interval `[lo, hi]` into `n` equal-width @@ -616,14 +613,14 @@ pub fn equal_width_int64_boundaries(lo: i64, hi: i64, n: usize) -> Option], + per_input: &[Option], ordering: &LexOrdering, -) -> Option { +) -> Option { let mut iter = per_input.iter().filter_map(Option::clone); let mut global = iter.next()?; for next in iter { diff --git a/datafusion/physical-plan/src/sorts/sort.rs b/datafusion/physical-plan/src/sorts/sort.rs index 2d5a52235ea23..8ed5896d531c7 100644 --- a/datafusion/physical-plan/src/sorts/sort.rs +++ b/datafusion/physical-plan/src/sorts/sort.rs @@ -70,7 +70,7 @@ use datafusion_physical_expr::LexOrdering; use datafusion_physical_expr::PhysicalExpr; use datafusion_physical_expr::expressions::{DynamicFilterPhysicalExpr, lit}; -use crate::SortExtremes; +use crate::PartitionExtremes; use arrow::array::ArrayRef; use datafusion_common::ScalarValue; use futures::{StreamExt, TryStreamExt}; @@ -79,13 +79,13 @@ use std::sync::Mutex; /// One slot per SortExec output partition. Populated inside the sort code /// path each time `sort_batch_chunked` produces sorted batches, and -/// surfaced via `ExecutionPlan::runtime_sort_extremes`. By the time +/// surfaced via `ExecutionPlan::runtime_partition_extremes`. By the time /// downstream sees the first output batch the slot for that partition is /// already populated. -type SortExtremesSlot = Arc>>; -type SortExtremesSlots = Arc>; +type PartitionExtremesSlot = Arc>>; +type PartitionExtremesSlots = Arc>; -fn make_sort_extremes_slots(n_partitions: usize) -> SortExtremesSlots { +fn make_partition_extremes_slots(n_partitions: usize) -> PartitionExtremesSlots { Arc::new( (0..n_partitions) .map(|_| Arc::new(Mutex::new(None))) @@ -158,9 +158,9 @@ pub fn lex_compare( } /// Fold the endpoints of one already-sorted chunk produced by -/// `sort_batch_chunked` into the running [`SortExtremes`] for one partition. +/// `sort_batch_chunked` into the running [`PartitionExtremes`] for one partition. fn merge_chunk_into_slot( - slot: &Mutex>, + slot: &Mutex>, expressions: &LexOrdering, sorted_chunks: &[RecordBatch], ) -> Result<()> { @@ -179,7 +179,7 @@ fn merge_chunk_into_slot( let mut guard = slot.lock().unwrap(); *guard = Some(match guard.take() { - None => SortExtremes { + None => PartitionExtremes { min: chunk_min, max: chunk_max, row_count: total_rows, @@ -195,7 +195,7 @@ fn merge_chunk_into_slot( } else { prev.max }; - SortExtremes { + PartitionExtremes { min, max, row_count: prev.row_count + total_rows, @@ -397,8 +397,8 @@ struct ExternalSorter { /// Optional slot that `sort_batch_stream` updates after each /// `sort_batch_chunked` call with the leading-key endpoints of the /// sorted output. `SortExec` provides this so consumers can fetch the - /// runtime sort extremes via `ExecutionPlan::runtime_sort_extremes`. - extremes_slot: Option>>>, + /// observed extremes via `ExecutionPlan::runtime_partition_extremes`. + extremes_slot: Option>>>, } impl ExternalSorter { @@ -452,9 +452,9 @@ impl ExternalSorter { } /// Provide a slot for `sort_batch_stream` to publish runtime endpoints - /// into. Used by `SortExec` so its `runtime_sort_extremes` override has + /// into. Used by `SortExec` so its `runtime_partition_extremes` override has /// a value to return. - fn with_extremes_slot(mut self, slot: Arc>>) -> Self { + fn with_extremes_slot(mut self, slot: Arc>>) -> Self { self.extremes_slot = Some(slot); self } @@ -1022,9 +1022,9 @@ pub struct SortExec { /// If `fetch` is `None`, this will be `None`. filter: Option>>, /// Per-output-partition slot populated by the sort code path. Surfaced - /// via `ExecutionPlan::runtime_sort_extremes` so range-partitioning + /// via `ExecutionPlan::runtime_partition_extremes` so range-partitioning /// callers can derive global boundaries from runtime data. - runtime_extremes: SortExtremesSlots, + runtime_extremes: PartitionExtremesSlots, } impl SortExec { @@ -1036,7 +1036,7 @@ impl SortExec { Self::compute_properties(&input, expr.clone(), preserve_partitioning) .unwrap(); let runtime_extremes = - make_sort_extremes_slots(cache.partitioning.partition_count()); + make_partition_extremes_slots(cache.partitioning.partition_count()); Self { expr, input, @@ -1067,7 +1067,7 @@ impl SortExec { Arc::make_mut(&mut self.cache).partitioning = Self::output_partitioning_helper(&self.input, self.preserve_partitioning); self.runtime_extremes = - make_sort_extremes_slots(self.cache.partitioning.partition_count()); + make_partition_extremes_slots(self.cache.partitioning.partition_count()); self } @@ -1337,8 +1337,9 @@ impl ExecutionPlan for SortExec { )?; new_sort.cache = Arc::new(cache); new_sort.common_sort_prefix = sort_prefix; - new_sort.runtime_extremes = - make_sort_extremes_slots(new_sort.cache.partitioning.partition_count()); + new_sort.runtime_extremes = make_partition_extremes_slots( + new_sort.cache.partitioning.partition_count(), + ); } Ok(Arc::new(new_sort)) @@ -1463,11 +1464,14 @@ impl ExecutionPlan for SortExec { Ok(Arc::new(stats.with_fetch(self.fetch, 0, 1)?)) } - /// Returns the runtime sort extremes for the requested output partition. + /// Returns observed lex-min/lex-max for the requested output partition. /// `None` while that partition's sort path has not yet folded any /// chunk into its slot — the caller is expected to have driven enough of /// `execute(partition)` to reach the first `sort_batch_chunked` call. - fn runtime_sort_extremes(&self, partition: usize) -> Result> { + fn runtime_partition_extremes( + &self, + partition: usize, + ) -> Result> { Ok(self .runtime_extremes .get(partition) diff --git a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs index 9a789e1ff5fd4..d45edd3d91552 100644 --- a/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs +++ b/datafusion/physical-plan/src/windows/bounded_window_agg_exec.rs @@ -35,8 +35,8 @@ use crate::windows::{ }; use crate::{ ColumnStatistics, DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, - ExecutionPlanProperties, InputOrderMode, PlanProperties, RecordBatchStream, - SendableRecordBatchStream, SortExtremes, Statistics, WindowExpr, + ExecutionPlanProperties, InputOrderMode, PartitionExtremes, PlanProperties, + RecordBatchStream, SendableRecordBatchStream, Statistics, WindowExpr, check_if_same_properties, }; @@ -362,11 +362,11 @@ impl ExecutionPlan for BoundedWindowAggExec { /// Passthrough: the window operator doesn't alter the leading sort /// key's value range, so its `partition`-th output partition's /// extremes are exactly its input partition's extremes. - fn runtime_sort_extremes( + fn runtime_partition_extremes( &self, partition: usize, - ) -> Result> { - self.input.runtime_sort_extremes(partition) + ) -> Result> { + self.input.runtime_partition_extremes(partition) } fn with_new_children(