diff --git a/Cargo.lock b/Cargo.lock index 38fa83dd12119..78150f712d25a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2431,12 +2431,14 @@ dependencies = [ "ahash", "arrow", "chrono", + "criterion", "datafusion-common", "datafusion-expr-common", "hashbrown 0.16.1", "indexmap 2.13.0", "itertools 0.14.0", "parking_lot", + "rand 0.9.2", ] [[package]] diff --git a/datafusion/physical-expr-common/Cargo.toml b/datafusion/physical-expr-common/Cargo.toml index a81eafe196959..453c8a0cb4889 100644 --- a/datafusion/physical-expr-common/Cargo.toml +++ b/datafusion/physical-expr-common/Cargo.toml @@ -50,3 +50,11 @@ hashbrown = { workspace = true } indexmap = { workspace = true } itertools = { workspace = true } parking_lot = { workspace = true } + +[dev-dependencies] +criterion = { workspace = true } +rand = { workspace = true } + +[[bench]] +harness = false +name = "compare_nested" diff --git a/datafusion/physical-expr-common/benches/compare_nested.rs b/datafusion/physical-expr-common/benches/compare_nested.rs new file mode 100644 index 0000000000000..56c122fef9420 --- /dev/null +++ b/datafusion/physical-expr-common/benches/compare_nested.rs @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, Int32Array, Scalar, StringArray, StructArray}; +use arrow::datatypes::{DataType, Field, Fields}; +use criterion::{Criterion, criterion_group, criterion_main}; +use datafusion_expr_common::operator::Operator; +use datafusion_physical_expr_common::datum::compare_op_for_nested; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use std::hint::black_box; +use std::sync::Arc; + +/// Build a StructArray with fields {x: Int32, y: Utf8}. +fn make_struct_array(num_rows: usize, rng: &mut StdRng) -> ArrayRef { + let ints: Int32Array = (0..num_rows).map(|_| Some(rng.random::())).collect(); + + let strings: StringArray = (0..num_rows) + .map(|_| { + let s: String = (0..12) + .map(|_| rng.random_range(b'a'..=b'z') as char) + .collect(); + Some(s) + }) + .collect(); + + let fields = Fields::from(vec![ + Field::new("x", DataType::Int32, false), + Field::new("y", DataType::Utf8, false), + ]); + + Arc::new( + StructArray::try_new(fields, vec![Arc::new(ints), Arc::new(strings)], None) + .unwrap(), + ) +} + +fn criterion_benchmark(c: &mut Criterion) { + let num_rows = 8192; + let mut rng = StdRng::seed_from_u64(42); + + let lhs = make_struct_array(num_rows, &mut rng); + let rhs_array = make_struct_array(num_rows, &mut rng); + let rhs_scalar = Scalar::new(make_struct_array(1, &mut rng)); + + c.bench_function("compare_nested array_array", |b| { + b.iter(|| { + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_array).unwrap()) + }) + }); + + c.bench_function("compare_nested array_scalar", |b| { + b.iter(|| { + black_box(compare_op_for_nested(Operator::Eq, &lhs, &rhs_scalar).unwrap()) + }) + }); +} + +criterion_group!(benches, criterion_benchmark); +criterion_main!(benches); diff --git a/datafusion/physical-expr-common/src/datum.rs b/datafusion/physical-expr-common/src/datum.rs index 9efaca0f6b6a2..bd5790507f662 100644 --- a/datafusion/physical-expr-common/src/datum.rs +++ b/datafusion/physical-expr-common/src/datum.rs @@ -17,7 +17,7 @@ use arrow::array::BooleanArray; use arrow::array::{ArrayRef, Datum, make_comparator}; -use arrow::buffer::NullBuffer; +use arrow::buffer::{BooleanBuffer, NullBuffer}; use arrow::compute::kernels::cmp::{ distinct, eq, gt, gt_eq, lt, lt_eq, neq, not_distinct, }; @@ -171,9 +171,9 @@ pub fn compare_op_for_nested( }; let values = match (is_l_scalar, is_r_scalar) { - (false, false) => (0..len).map(|i| cmp_with_op(i, i)).collect(), - (true, false) => (0..len).map(|i| cmp_with_op(0, i)).collect(), - (false, true) => (0..len).map(|i| cmp_with_op(i, 0)).collect(), + (false, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, i)), + (true, false) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(0, i)), + (false, true) => BooleanBuffer::collect_bool(len, |i| cmp_with_op(i, 0)), (true, true) => std::iter::once(cmp_with_op(0, 0)).collect(), };