From ccb6d7eb777646de4914e64c02377dfc4141211b Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 4 Mar 2026 20:12:56 -0500 Subject: [PATCH 1/4] Add benchmark for comparing nested values --- Cargo.lock | 2 + datafusion/physical-expr-common/Cargo.toml | 8 ++ .../benches/compare_nested.rs | 106 ++++++++++++++++++ 3 files changed, 116 insertions(+) create mode 100644 datafusion/physical-expr-common/benches/compare_nested.rs diff --git a/Cargo.lock b/Cargo.lock index 38fa83dd12119..78150f712d25a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2431,12 +2431,14 @@ dependencies = [ "ahash", "arrow", "chrono", + "criterion", "datafusion-common", "datafusion-expr-common", "hashbrown 0.16.1", "indexmap 2.13.0", "itertools 0.14.0", "parking_lot", + "rand 0.9.2", ] [[package]] diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index a81eafe196959..453c8a0cb4889 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -50,3 +50,11 @@ hashbrown = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } parking_lot = { workspace = true } + +[dev-dependencies] +criterion = { workspace = true } +rand = { workspace = true } + +[[bench]] +harness = false +name = "compare_nested" diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs new file mode 100644 index 0000000000000..5814103eb333e --- /dev/null +++ b/datafusion/physical-expr-common/benches/compare_nested.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray}; +use arrow::datatypes::{DataType, Field, Fields}; +use criterion::{criterion_group, criterion_main, Criterion}; +use datafusion_expr_common::operator::Operator; +use datafusion_physical_expr_common::datum::compare_op_for_nested; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +/// Build a StructArray with fields {x: Int32, y: Utf8}. +fn make_struct_array(num_rows: usize, null_fraction: f64, rng: &mut StdRng) -> ArrayRef { + let ints: Int32Array = (0..num_rows) + .map(|_| { + if null_fraction > 0.0 && rng.random::() < null_fraction { + None + } else { + Some(rng.random::()) + } + }) + .collect(); + + let strings: StringArray = (0..num_rows) + .map(|_| { + if null_fraction > 0.0 && rng.random::() < null_fraction { + None + } else { + let s: String = (0..12).map(|_| rng.random_range(b'a'..=b'z') as char).collect(); + Some(s) + } + }) + .collect(); + + let fields = Fields::from(vec![ + Field::new("x", DataType::Int32, true), + Field::new("y", DataType::Utf8, true), + ]); + + Arc::new(StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None).unwrap()) +} + +fn criterion_benchmark(c: &mut Criterion) { + for num_rows in [4096, 8192] { + for null_pct in [0, 10] { + let null_fraction = null_pct as f64 / 100.0; + let mut rng = StdRng::seed_from_u64(42); + + let lhs = make_struct_array(num_rows, null_fraction, &mut rng); + let rhs_array = make_struct_array(num_rows, null_fraction, &mut rng); + let rhs_scalar_array = make_struct_array(1, 0.0, &mut rng); + let rhs_scalar = Scalar::new(rhs_scalar_array); + + c.bench_function( + &format!("compare_nested array_array rows={num_rows} nulls={null_pct}%"), + |b| { + b.iter(|| { + black_box( + compare_op_for_nested( + Operator::Eq, + &lhs, + &rhs_array, + ) + .unwrap(), + ) + }) + }, + ); + + c.bench_function( + &format!("compare_nested array_scalar rows={num_rows} nulls={null_pct}%"), + |b| { + b.iter(|| { + black_box( + compare_op_for_nested( + Operator::Eq, + &lhs, + &rhs_scalar, + ) + .unwrap(), + ) + }) + }, + ); + } + } +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); From 543d44c8d9242ebaf7bbfeb83af3afa7e5d95edc Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 4 Mar 2026 20:29:55 -0500 Subject: [PATCH 2/4] Simplify benchmark --- .../benches/compare_nested.rs | 89 +++++++------------ 1 file changed, 30 insertions(+), 59 deletions(-) diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs index 5814103eb333e..d8e62c10cfc8e 100644 --- a/datafusion/physical-expr-common/benches/compare_nested.rs +++ b/datafusion/physical-expr-common/benches/compare_nested.rs @@ -26,80 +26,51 @@ use std::hint::black_box; use std::sync::Arc; /// Build a StructArray with fields {x: Int32, y: Utf8}. -fn make_struct_array(num_rows: usize, null_fraction: f64, rng: &mut StdRng) -> ArrayRef { - let ints: Int32Array = (0..num_rows) - .map(|_| { - if null_fraction > 0.0 && rng.random::() < null_fraction { - None - } else { - Some(rng.random::()) - } - }) - .collect(); +fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef { + let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::())).collect(); let strings: StringArray = (0..num_rows) .map(|_| { - if null_fraction > 0.0 && rng.random::() < null_fraction { - None - } else { - let s: String = (0..12).map(|_| rng.random_range(b'a'..=b'z') as char).collect(); - Some(s) - } + let s: String = + (0..12).map(|_| rng.random_range(b'a'..=b'z') as char).collect(); + Some(s) }) .collect(); let fields = Fields::from(vec![ - Field::new("x", DataType::Int32, true), - Field::new("y", DataType::Utf8, true), + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), ]); - Arc::new(StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None).unwrap()) + Arc::new( + StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None) + .unwrap(), + ) } fn criterion_benchmark(c: &mut Criterion) { - for num_rows in [4096, 8192] { - for null_pct in [0, 10] { - let null_fraction = null_pct as f64 / 100.0; - let mut rng = StdRng::seed_from_u64(42); + let num_rows = 8192; + let mut rng = StdRng::seed_from_u64(42); - let lhs = make_struct_array(num_rows, null_fraction, &mut rng); - let rhs_array = make_struct_array(num_rows, null_fraction, &mut rng); - let rhs_scalar_array = make_struct_array(1, 0.0, &mut rng); - let rhs_scalar = Scalar::new(rhs_scalar_array); + let lhs = make_struct_array(num_rows, &mut rng); + let rhs_array = make_struct_array(num_rows, &mut rng); + let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng)); - c.bench_function( - &format!("compare_nested array_array rows={num_rows} nulls={null_pct}%"), - |b| { - b.iter(|| { - black_box( - compare_op_for_nested( - Operator::Eq, - &lhs, - &rhs_array, - ) - .unwrap(), - ) - }) - }, - ); + c.bench_function("compare_nested array_array", |b| { + b.iter(|| { + black_box( + compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap(), + ) + }) + }); - c.bench_function( - &format!("compare_nested array_scalar rows={num_rows} nulls={null_pct}%"), - |b| { - b.iter(|| { - black_box( - compare_op_for_nested( - Operator::Eq, - &lhs, - &rhs_scalar, - ) - .unwrap(), - ) - }) - }, - ); - } - } + c.bench_function("compare_nested array_scalar", |b| { + b.iter(|| { + black_box( + compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap(), + ) + }) + }); } criterion_group!(benches, criterion_benchmark); From 358ec91b097c8042a7daab23711e012c7451999f Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 4 Mar 2026 20:13:09 -0500 Subject: [PATCH 3/4] perf: Use BooleanBuffer::collect() in nested struct comparison --- datafusion/physical-expr-common/src/datum.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs index 9efaca0f6b6a2..bd5790507f662 100644 --- a/datafusion/physical-expr-common/src/datum.rs +++ b/datafusion/physical-expr-common/src/datum.rs @@ -17,7 +17,7 @@ use arrow::array::BooleanArray; use arrow::array::{ArrayRef, Datum, make_comparator}; -use arrow::buffer::NullBuffer; +use arrow::buffer::{BooleanBuffer, NullBuffer}; use arrow::compute::kernels::cmp::{ distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct, }; @@ -171,9 +171,9 @@ pub fn compare_op_for_nested( }; let values = match (is_l_scalar, is_r_scalar) { - (false, false) => (0..len).map(|i| cmp_with_op(i, i)).collect(), - (true, false) => (0..len).map(|i| cmp_with_op(0, i)).collect(), - (false, true) => (0..len).map(|i| cmp_with_op(i, 0)).collect(), + (false, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, i)), + (true, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(0, i)), + (false, true) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, 0)), (true, true) => std::iter::once(cmp_with_op(0, 0)).collect(), }; From 57f47b969aba0a0125c6bf05d733a7c2cbef9155 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Wed, 4 Mar 2026 21:54:39 -0500 Subject: [PATCH 4/4] cargo fmt --- .../benches/compare_nested.rs | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs index d8e62c10cfc8e..56c122fef9420 100644 --- a/datafusion/physical-expr-common/benches/compare_nested.rs +++ b/datafusion/physical-expr-common/benches/compare_nested.rs @@ -17,7 +17,7 @@ use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray}; use arrow::datatypes::{DataType, Field, Fields}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{Criterion, criterion_group, criterion_main}; use datafusion_expr_common::operator::Operator; use datafusion_physical_expr_common::datum::compare_op_for_nested; use rand::rngs::StdRng; @@ -31,8 +31,9 @@ fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef { let strings: StringArray = (0..num_rows) .map(|_| { - let s: String = - (0..12).map(|_| rng.random_range(b'a'..=b'z') as char).collect(); + let s: String = (0..12) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); Some(s) }) .collect(); @@ -58,17 +59,13 @@ fn criterion_benchmark(c: &mut Criterion) { c.bench_function("compare_nested array_array", |b| { b.iter(|| { - black_box( - compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap(), - ) + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap()) }) }); c.bench_function("compare_nested array_scalar", |b| { b.iter(|| { - black_box( - compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap(), - ) + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap()) }) }); }