Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
212 changes: 146 additions & 66 deletions Cargo.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ async-trait = { workspace = true }
criterion = { workspace = true, features = ["async_tokio", "async_futures"] }
ctor = { workspace = true }
dashmap = "6.1.0"
dhat = "0.3.3"
datafusion-doc = { workspace = true }
datafusion-functions-window-common = { workspace = true }
datafusion-macros = { workspace = true }
Expand Down
72 changes: 72 additions & 0 deletions datafusion/core/tests/heap_profile_hash_aggregate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Heap profiling test for grouped hash aggregation with spilling.
//! Data has many distinct groups to force hash table growth beyond
//! the memory pool, triggering spilling.
// Route every heap allocation in the test process through dhat so that
// `dhat::HeapStats::get()` below reflects the true process-wide peak.
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

use std::sync::Arc;

use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_execution::memory_pool::FairSpillPool;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;

const MEMORY_LIMIT: usize = 10 * 1024 * 1024; // 10MB

#[tokio::test]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure whether these new IT tests are going to stay but if they do then maybe the dhat dev-dependency should be optional and loaded only when some new feature is enabled, e.g. memory-profiling, and all IT tests be gated behind this feature.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whether these tests stay or other tests are created, we need something to ensure that memory limits are honoured in DataFusion.

// Runs a grouped hash aggregation whose 5M distinct groups exceed the
// 10MB memory pool, forcing the aggregate to spill, and reports the
// observed heap peak relative to the configured pool limit.
async fn heap_profile_hash_aggregate() {
    let _profiler = dhat::Profiler::builder().testing().build();

    // Single partition over a memory-limited FairSpillPool.
    let pool = Arc::new(FairSpillPool::new(MEMORY_LIMIT));
    let runtime = RuntimeEnvBuilder::new()
        .with_memory_pool(pool)
        .build_arc()
        .unwrap();
    let ctx = SessionContext::new_with_config_rt(
        SessionConfig::new().with_target_partitions(1),
        runtime,
    );

    // Every value of v is its own group, so the hash table must grow
    // past the pool limit and spill.
    let query =
        "SELECT v, COUNT(*) FROM generate_series(1, 5000000) AS t(v) GROUP BY v";
    let batches = ctx.sql(query).await.unwrap().collect().await.unwrap();
    let total_rows: usize = batches.iter().map(|batch| batch.num_rows()).sum();
    assert_eq!(total_rows, 5_000_000);

    let stats = dhat::HeapStats::get();
    let limit = (MEMORY_LIMIT as f64 * 1.1) as usize;
    println!(
        "hash_aggregate: max_bytes={}, memory_limit={}, ratio={:.2}x",
        stats.max_bytes,
        MEMORY_LIMIT,
        stats.max_bytes as f64 / MEMORY_LIMIT as f64
    );
    // TODO: peak is ~122MB (12.2x pool) because:
    // 1. HashTable size() underreports (uses capacity * sizeof instead of allocation_size())
    // 2. Hash table doubles capacity atomically inside intern(), before the pool check
    // 3. generate_series input data is not tracked by the MemoryPool
    // dhat::assert!(stats.max_bytes < limit,
    //     "Peak heap {} exceeded {}", stats.max_bytes, limit);
    let _ = limit;
}
96 changes: 96 additions & 0 deletions datafusion/core/tests/heap_profile_hash_join.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Heap profiling test for HashJoinExec.
// Route every heap allocation in the test process through dhat so that
// `dhat::HeapStats::get()` below reflects the true process-wide peak.
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

use std::sync::Arc;

use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_execution::memory_pool::FairSpillPool;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;

const MEMORY_LIMIT: usize = 40 * 1024 * 1024; // 40MB

// Profiles the heap peak of an inner hash join between two 1M-row
// tables. The profiler starts only after table creation so that table
// materialization does not count against the measured peak.
#[tokio::test]
async fn heap_profile_hash_join() {
    // HashJoin does not spill, so the pool must fit the build side
    // hash table. 1M rows of (i64, i64) ~16MB plus hash table overhead.
    let runtime = RuntimeEnvBuilder::new()
        .with_memory_pool(Arc::new(FairSpillPool::new(MEMORY_LIMIT)))
        .build_arc()
        .unwrap();
    let config = SessionConfig::new().with_target_partitions(1);
    let ctx = SessionContext::new_with_config_rt(config, runtime);

    // Create tables before starting the profiler
    ctx.sql(
        "CREATE TABLE t1 AS \
         SELECT v AS id, v * 2 AS val \
         FROM generate_series(1, 1000000) AS t(v)",
    )
    .await
    .unwrap();

    ctx.sql(
        "CREATE TABLE t2 AS \
         SELECT v AS id, v * 3 AS val \
         FROM generate_series(1, 1000000) AS t(v)",
    )
    .await
    .unwrap();

    // Verify HashJoin is used (guards against the planner choosing a
    // different join operator, which would invalidate the measurement)
    let explain = ctx
        .sql("EXPLAIN SELECT t1.id, t1.val, t2.val FROM t1 JOIN t2 ON t1.id = t2.id")
        .await
        .unwrap()
        .collect()
        .await
        .unwrap();
    let plan_str = format!("{explain:?}");
    assert!(
        plan_str.contains("HashJoinExec"),
        "Expected HashJoinExec in plan but got: {plan_str}"
    );

    // Start profiling after table creation
    let _profiler = dhat::Profiler::builder().testing().build();

    let df = ctx
        .sql("SELECT t1.id, t1.val, t2.val FROM t1 JOIN t2 ON t1.id = t2.id")
        .await
        .unwrap();
    let batches = df.collect().await.unwrap();
    // t1 and t2 both contain ids 1..=1_000_000, so the inner join must
    // return exactly one row per id. Previously the result was
    // discarded unchecked, so a wrong join result would have passed.
    let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
    assert_eq!(row_count, 1_000_000);

    let stats = dhat::HeapStats::get();
    let limit = (MEMORY_LIMIT as f64 * 1.1) as usize;
    println!(
        "hash_join: max_bytes={}, memory_limit={}, ratio={:.2}x",
        stats.max_bytes,
        MEMORY_LIMIT,
        stats.max_bytes as f64 / MEMORY_LIMIT as f64
    );
    dhat::assert!(
        stats.max_bytes < limit,
        "Peak heap {} exceeded {}",
        stats.max_bytes,
        limit
    );
}
91 changes: 91 additions & 0 deletions datafusion/core/tests/heap_profile_parquet_sort.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Heap profiling test for reading parquet files and sorting.
//! This exercises the parquet reader's allocation path alongside
//! the sort operator. Data exceeds memory pool to force spilling.
// Route every heap allocation in the test process through dhat so that
// `dhat::HeapStats::get()` below reflects the true process-wide peak.
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

use std::sync::Arc;

use datafusion::dataframe::DataFrameWriteOptions;
use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_execution::memory_pool::FairSpillPool;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;

const MEMORY_LIMIT: usize = 20 * 1024 * 1024; // 20MB

// Profiles the heap peak of reading a 2M-row parquet file and sorting
// it under a 20MB pool, which forces the sort to spill. The parquet
// file is written with an unrestricted context before profiling starts.
#[tokio::test]
async fn heap_profile_parquet_sort() {
    // Write test data to a parquet file using a separate context
    let tmpdir = tempfile::tempdir().unwrap();
    let parquet_path = tmpdir.path().join("test_data.parquet");
    {
        let write_ctx = SessionContext::new();
        let df = write_ctx
            .sql(
                "SELECT v AS id, v * 2 AS val, \
                 CASE WHEN v % 3 = 0 THEN 'aaa' WHEN v % 3 = 1 THEN 'bbb' ELSE 'ccc' END AS category \
                 FROM generate_series(1, 2000000) AS t(v)",
            )
            .await
            .unwrap();
        df.write_parquet(
            parquet_path.to_str().unwrap(),
            DataFrameWriteOptions::new().with_single_file_output(true),
            None,
        )
        .await
        .unwrap();
    }

    // Set up the memory-limited context for reading
    let runtime = RuntimeEnvBuilder::new()
        .with_memory_pool(Arc::new(FairSpillPool::new(MEMORY_LIMIT)))
        .build_arc()
        .unwrap();
    let config = SessionConfig::new()
        .with_target_partitions(1)
        .with_sort_spill_reservation_bytes(5 * 1024 * 1024);
    let ctx = SessionContext::new_with_config_rt(config, runtime);

    ctx.register_parquet("t", parquet_path.to_str().unwrap(), Default::default())
        .await
        .unwrap();

    // Start profiling before planning
    let _profiler = dhat::Profiler::builder().testing().build();

    let df = ctx.sql("SELECT * FROM t ORDER BY id DESC").await.unwrap();
    let batches = df.collect().await.unwrap();
    // The sort must return every row that was written. Previously the
    // result was discarded unchecked, so a truncated or wrong result
    // would have passed silently.
    let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
    assert_eq!(row_count, 2_000_000);

    let stats = dhat::HeapStats::get();
    let limit = (MEMORY_LIMIT as f64 * 1.1) as usize;
    println!(
        "parquet_sort: max_bytes={}, memory_limit={}, ratio={:.2}x",
        stats.max_bytes,
        MEMORY_LIMIT,
        stats.max_bytes as f64 / MEMORY_LIMIT as f64
    );
    // TODO: peak is ~67MB (3.3x pool) because parquet decoded
    // batches and sort output arrays are not tracked by the MemoryPool.
    // dhat::assert!(stats.max_bytes < limit,
    //     "Peak heap {} exceeded {}", stats.max_bytes, limit);
    let _ = limit;
}
68 changes: 68 additions & 0 deletions datafusion/core/tests/heap_profile_repartition.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

//! Heap profiling test for RepartitionExec with multiple partitions.
//! Uses enough data with a GROUP BY to force repartition buffering
//! under memory pressure.

// Route every heap allocation in the test process through dhat so that
// `dhat::HeapStats::get()` below reflects the true process-wide peak.
#[global_allocator]
static ALLOC: dhat::Alloc = dhat::Alloc;

use std::sync::Arc;

use datafusion::prelude::{SessionConfig, SessionContext};
use datafusion_execution::memory_pool::FairSpillPool;
use datafusion_execution::runtime_env::RuntimeEnvBuilder;

const MEMORY_LIMIT: usize = 10 * 1024 * 1024; // 10MB

// Profiles the heap peak of a hash-repartitioned GROUP BY across 4
// partitions under a 10MB pool, exercising RepartitionExec buffering.
#[tokio::test]
async fn heap_profile_repartition() {
    let _profiler = dhat::Profiler::builder().testing().build();

    let runtime = RuntimeEnvBuilder::new()
        .with_memory_pool(Arc::new(FairSpillPool::new(MEMORY_LIMIT)))
        .build_arc()
        .unwrap();
    // Use multiple partitions to exercise RepartitionExec
    let config = SessionConfig::new().with_target_partitions(4);
    let ctx = SessionContext::new_with_config_rt(config, runtime);

    // GROUP BY forces repartition by hash + aggregate spilling
    let df = ctx
        .sql(
            "SELECT v % 100000, COUNT(*) \
             FROM generate_series(1, 5000000) AS t(v) \
             GROUP BY v % 100000",
        )
        .await
        .unwrap();
    let batches = df.collect().await.unwrap();
    // v % 100000 over 1..=5_000_000 produces every residue 0..=99_999,
    // so exactly 100_000 group rows must come back. Previously the
    // result was discarded unchecked, so a wrong aggregate result
    // would have passed silently.
    let row_count: usize = batches.iter().map(|b| b.num_rows()).sum();
    assert_eq!(row_count, 100_000);

    let stats = dhat::HeapStats::get();
    let limit = (MEMORY_LIMIT as f64 * 1.1) as usize;
    println!(
        "repartition: max_bytes={}, memory_limit={}, ratio={:.2}x",
        stats.max_bytes,
        MEMORY_LIMIT,
        stats.max_bytes as f64 / MEMORY_LIMIT as f64
    );
    // TODO: peak is ~20MB (1.97x pool)
    // dhat::assert!(stats.max_bytes < limit,
    //     "Peak heap {} exceeded {}", stats.max_bytes, limit);
    let _ = limit;
}
Loading