diff --git a/datafusion/functions-nested/benches/array_position.rs b/datafusion/functions-nested/benches/array_position.rs index 08367648449d2..c718b2b725640 100644 --- a/datafusion/functions-nested/benches/array_position.rs +++ b/datafusion/functions-nested/benches/array_position.rs @@ -24,7 +24,7 @@ use criterion::{ use datafusion_common::ScalarValue; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl}; -use datafusion_functions_nested::position::ArrayPosition; +use datafusion_functions_nested::position::{ArrayPosition, ArrayPositions}; use rand::Rng; use rand::SeedableRng; use rand::rngs::StdRng; @@ -39,6 +39,7 @@ const SENTINEL_NEEDLE: i64 = -1; fn criterion_benchmark(c: &mut Criterion) { for size in [10, 100, 500] { bench_array_position(c, size); + bench_array_positions(c, size); } } @@ -146,6 +147,112 @@ fn bench_array_position(c: &mut Criterion, array_size: usize) { group.finish(); } +fn bench_array_positions(c: &mut Criterion, array_size: usize) { + let mut group = c.benchmark_group("array_positions_i64"); + let haystack_found_once = create_haystack_with_sentinel( + NUM_ROWS, + array_size, + NULL_DENSITY, + SENTINEL_NEEDLE, + 0, + ); + let haystack_found_many = create_haystack_with_sentinels( + NUM_ROWS, + array_size, + NULL_DENSITY, + SENTINEL_NEEDLE, + ); + let haystack_not_found = + create_haystack_without_sentinel(NUM_ROWS, array_size, NULL_DENSITY); + let num_rows = haystack_not_found.len(); + let arg_fields: Vec> = vec![ + Field::new("haystack", haystack_not_found.data_type().clone(), false).into(), + Field::new("needle", DataType::Int64, false).into(), + ]; + let return_field: Arc = Field::new( + "result", + DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, true))), + true, + ) + .into(); + let config_options = Arc::new(ConfigOptions::default()); + let needle = ScalarValue::Int64(Some(SENTINEL_NEEDLE)); + + let args_found_once = vec![ + ColumnarValue::Array(haystack_found_once.clone()), + ColumnarValue::Scalar(needle.clone()), + ]; + group.bench_with_input( + BenchmarkId::new("found_once", array_size), + &array_size, + |b, _| { + let udf = ArrayPositions::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found_once.clone(), + arg_fields: arg_fields.clone(), + number_rows: num_rows, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + + let args_found_many = vec![ + ColumnarValue::Array(haystack_found_many.clone()), + ColumnarValue::Scalar(needle.clone()), + ]; + group.bench_with_input( + BenchmarkId::new("found_many", array_size), + &array_size, + |b, _| { + let udf = ArrayPositions::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_found_many.clone(), + arg_fields: arg_fields.clone(), + number_rows: num_rows, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + + let args_not_found = vec![ + ColumnarValue::Array(haystack_not_found.clone()), + ColumnarValue::Scalar(needle.clone()), + ]; + group.bench_with_input( + BenchmarkId::new("not_found", array_size), + &array_size, + |b, _| { + let udf = ArrayPositions::new(); + b.iter(|| { + black_box( + udf.invoke_with_args(ScalarFunctionArgs { + args: args_not_found.clone(), + arg_fields: arg_fields.clone(), + number_rows: num_rows, + return_field: return_field.clone(), + config_options: config_options.clone(), + }) + .unwrap(), + ) + }) + }, + ); + + group.finish(); +} + fn create_haystack_without_sentinel( num_rows: usize, array_size: usize, diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs index 0214b1552bc9c..59d5cdfe7fa34 100644 --- a/datafusion/functions-nested/src/position.rs +++ b/datafusion/functions-nested/src/position.rs @@ -18,6 +18,7 @@ //! [`ScalarUDFImpl`] definitions for array_position and array_positions functions. use arrow::array::Scalar; +use arrow::buffer::OffsetBuffer; use arrow::datatypes::DataType; use arrow::datatypes::{ DataType::{LargeList, List, UInt64}, @@ -25,7 +26,8 @@ use arrow::datatypes::{ }; use datafusion_common::ScalarValue; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, + ColumnarValue, Documentation, ScalarFunctionArgs, ScalarUDFImpl, Signature, + Volatility, }; use datafusion_macros::user_doc; @@ -122,57 +124,10 @@ impl ScalarUDFImpl for ArrayPosition { Ok(UInt64) } - fn invoke_with_args( - &self, - args: datafusion_expr::ScalarFunctionArgs, - ) -> Result { - let [first_arg, second_arg, third_arg @ ..] = args.args.as_slice() else { - return exec_err!("array_position expects two or three arguments"); - }; - - match second_arg { - ColumnarValue::Scalar(scalar_element) => { - // Nested element types (List, Struct) can't use the fast path - // (because Arrow's `non_distinct` does not support them). - if scalar_element.data_type().is_nested() { - return make_scalar_function(array_position_inner)(&args.args); - } - - // Determine batch length from whichever argument is columnar; - // if all inputs are scalar, batch length is 1. - let (num_rows, all_inputs_scalar) = match (first_arg, third_arg.first()) { - (ColumnarValue::Array(a), _) => (a.len(), false), - (_, Some(ColumnarValue::Array(a))) => (a.len(), false), - _ => (1, true), - }; - - let element_arr = scalar_element.to_array_of_size(1)?; - let haystack = first_arg.to_array(num_rows)?; - let arr_from = resolve_start_from(third_arg.first(), num_rows)?; - - let result = match haystack.data_type() { - List(_) => { - let list = as_generic_list_array::(&haystack)?; - array_position_scalar::(list, &element_arr, &arr_from) - } - LargeList(_) => { - let list = as_generic_list_array::(&haystack)?; - array_position_scalar::(list, &element_arr, &arr_from) - } - t => exec_err!("array_position does not support type '{t}'."), - }?; - - if all_inputs_scalar { - Ok(ColumnarValue::Scalar(ScalarValue::try_from_array( - &result, 0, - )?)) - } else { - Ok(ColumnarValue::Array(result)) - } - } - ColumnarValue::Array(_) => { - make_scalar_function(array_position_inner)(&args.args) - } + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match try_array_position_scalar(&args.args)? { + Some(result) => Ok(result), + None => make_scalar_function(array_position_inner)(&args.args), } } @@ -185,6 +140,56 @@ impl ScalarUDFImpl for ArrayPosition { } } +/// Attempts the scalar-needle fast path for `array_position`. +fn try_array_position_scalar(args: &[ColumnarValue]) -> Result> { + if args.len() < 2 || args.len() > 3 { + return exec_err!("array_position expects two or three arguments"); + } + + let scalar_needle = match &args[1] { + ColumnarValue::Scalar(s) => s, + ColumnarValue::Array(_) => return Ok(None), + }; + + // `not_distinct` doesn't support nested types (List, Struct, etc.), + // so fall back to the per-row path for those. + if scalar_needle.data_type().is_nested() { + return Ok(None); + } + + // Determine batch length from whichever argument is columnar; + // if all inputs are scalar, batch length is 1. + let (num_rows, all_inputs_scalar) = match (&args[0], args.get(2)) { + (ColumnarValue::Array(a), _) => (a.len(), false), + (_, Some(ColumnarValue::Array(a))) => (a.len(), false), + _ => (1, true), + }; + + let needle = scalar_needle.to_array_of_size(1)?; + let haystack = args[0].to_array(num_rows)?; + let arr_from = resolve_start_from(args.get(2), num_rows)?; + + let result = match haystack.data_type() { + List(_) => { + let list = as_list_array(&haystack)?; + array_position_scalar::(list, &needle, &arr_from) + } + LargeList(_) => { + let list = as_large_list_array(&haystack)?; + array_position_scalar::(list, &needle, &arr_from) + } + t => exec_err!("array_position does not support type '{t}'"), + }?; + + if all_inputs_scalar { + Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array( + &result, 0, + )?))) + } else { + Ok(Some(ColumnarValue::Array(result))) + } +} + fn array_position_inner(args: &[ArrayRef]) -> Result { if args.len() < 2 || args.len() > 3 { return exec_err!("array_position expects two or three arguments"); @@ -192,7 +197,7 @@ fn array_position_inner(args: &[ArrayRef]) -> Result { match &args[0].data_type() { List(_) => general_position_dispatch::(args), LargeList(_) => general_position_dispatch::(args), - array_type => exec_err!("array_position does not support type '{array_type}'."), + t => exec_err!("array_position does not support type '{t}'"), } } @@ -216,48 +221,46 @@ fn resolve_start_from( } } -/// Fast path for `array_position` when the element is a scalar. +/// Fast path for `array_position` when the needle is scalar. /// -/// Performs a single bulk `not_distinct` comparison of the scalar element -/// against the entire flattened values buffer, then walks the result bitmap -/// using offsets to find per-row first-match positions. +/// Performs a single bulk `not_distinct` comparison of the needle against the +/// entire flat values buffer, then walks the result bitmap using offsets to +/// find per-row first-match positions. fn array_position_scalar( - list_array: &GenericListArray, - element_array: &ArrayRef, + haystack: &GenericListArray, + needle: &ArrayRef, arr_from: &[i64], // 0-indexed ) -> Result { - crate::utils::check_datatypes( - "array_position", - &[list_array.values(), element_array], - )?; + crate::utils::check_datatypes("array_position", &[haystack.values(), needle])?; - if list_array.len() == 0 { + if haystack.len() == 0 { return Ok(Arc::new(UInt64Array::new_null(0))); } - let element_datum = Scalar::new(Arc::clone(element_array)); - let validity = list_array.nulls(); + let needle_datum = Scalar::new(Arc::clone(needle)); + let validity = haystack.nulls(); - // Only compare the visible portion of the values buffer, which avoids - // wasted work for sliced ListArrays. - let offsets = list_array.offsets(); + // Only convert the visible portion of the values array. For sliced + // ListArrays, values() returns the full underlying array but only + // elements between the first and last offset are referenced. + let offsets = haystack.offsets(); let first_offset = offsets[0].as_usize(); - let last_offset = offsets[list_array.len()].as_usize(); - let visible_values = list_array + let last_offset = offsets[haystack.len()].as_usize(); + let visible_values = haystack .values() .slice(first_offset, last_offset - first_offset); // `not_distinct` treats NULL=NULL as true, matching the semantics of - // `array_position` - let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &element_datum)?; + // `array_position`. + let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &needle_datum)?; let eq_bits = eq_array.values(); - let mut result: Vec> = Vec::with_capacity(list_array.len()); + let mut result: Vec> = Vec::with_capacity(haystack.len()); let mut matches = eq_bits.set_indices().peekable(); // Match positions are relative to visible_values (0-based), so // subtract first_offset from each offset when comparing. - for i in 0..list_array.len() { + for i in 0..haystack.len() { let start = offsets[i].as_usize() - first_offset; let end = offsets[i + 1].as_usize() - first_offset; @@ -295,18 +298,15 @@ fn array_position_scalar( } } - debug_assert_eq!(result.len(), list_array.len()); + debug_assert_eq!(result.len(), haystack.len()); Ok(Arc::new(UInt64Array::from(result))) } fn general_position_dispatch(args: &[ArrayRef]) -> Result { - let list_array = as_generic_list_array::(&args[0])?; - let element_array = &args[1]; + let haystack = as_generic_list_array::(&args[0])?; + let needle = &args[1]; - crate::utils::check_datatypes( - "array_position", - &[list_array.values(), element_array], - )?; + crate::utils::check_datatypes("array_position", &[haystack.values(), needle])?; let arr_from = if args.len() == 3 { as_int64_array(&args[2])? @@ -315,34 +315,30 @@ fn general_position_dispatch(args: &[ArrayRef]) -> Result>() } else { - vec![0; list_array.len()] + vec![0; haystack.len()] }; - for (arr, &from) in list_array.iter().zip(arr_from.iter()) { - // If `arr` is `None`: we will get null if we got null in the array, so we don't need to check - if !arr.is_none_or(|arr| from >= 0 && (from as usize) <= arr.len()) { + for (row, &from) in haystack.iter().zip(arr_from.iter()) { + if !row.is_none_or(|row| from >= 0 && (from as usize) <= row.len()) { return exec_err!("start_from out of bounds: {}", from + 1); } } - generic_position::(list_array, element_array, &arr_from) + generic_position::(haystack, needle, &arr_from) } -fn generic_position( - list_array: &GenericListArray, - element_array: &ArrayRef, +fn generic_position( + haystack: &GenericListArray, + needle: &ArrayRef, arr_from: &[i64], // 0-indexed ) -> Result { - let mut data = Vec::with_capacity(list_array.len()); + let mut data = Vec::with_capacity(haystack.len()); - for (row_index, (list_array_row, &from)) in - list_array.iter().zip(arr_from.iter()).enumerate() - { + for (row_index, (row, &from)) in haystack.iter().zip(arr_from.iter()).enumerate() { let from = from as usize; - if let Some(list_array_row) = list_array_row { - let eq_array = - compare_element_to_list(&list_array_row, element_array, row_index, true)?; + if let Some(row) = row { + let eq_array = compare_element_to_list(&row, needle, row_index, true)?; // Collect `true`s in 1-indexed positions let index = eq_array @@ -384,17 +380,20 @@ make_udf_expr_and_func!( name = "array", description = "Array expression. Can be a constant, column, or function, and any combination of array operators." ), - argument( - name = "element", - description = "Element to search for position in the array." - ) + argument(name = "element", description = "Element to search for in the array.") )] #[derive(Debug, PartialEq, Eq, Hash)] -pub(super) struct ArrayPositions { +pub struct ArrayPositions { signature: Signature, aliases: Vec, } +impl Default for ArrayPositions { + fn default() -> Self { + Self::new() + } +} + impl ArrayPositions { pub fn new() -> Self { Self { @@ -420,11 +419,11 @@ impl ScalarUDFImpl for ArrayPositions { Ok(List(Arc::new(Field::new_list_field(UInt64, true)))) } - fn invoke_with_args( - &self, - args: datafusion_expr::ScalarFunctionArgs, - ) -> Result { - make_scalar_function(array_positions_inner)(&args.args) + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + match try_array_positions_scalar(&args.args)? { + Some(result) => Ok(result), + None => make_scalar_function(array_positions_inner)(&args.args), + } } fn aliases(&self) -> &[String] { @@ -436,36 +435,70 @@ impl ScalarUDFImpl for ArrayPositions { } } -fn array_positions_inner(args: &[ArrayRef]) -> Result { - let [array, element] = take_function_args("array_positions", args)?; +/// Attempts the scalar-needle fast path for `array_positions`. +fn try_array_positions_scalar(args: &[ColumnarValue]) -> Result> { + let [haystack_arg, needle_arg] = take_function_args("array_positions", args)?; + + let scalar_needle = match needle_arg { + ColumnarValue::Scalar(s) => s, + ColumnarValue::Array(_) => return Ok(None), + }; + + // `not_distinct` doesn't support nested types (List, Struct, etc.), + // so fall back to the per-row path for those. + if scalar_needle.data_type().is_nested() { + return Ok(None); + } + + let (num_rows, all_inputs_scalar) = match haystack_arg { + ColumnarValue::Array(a) => (a.len(), false), + ColumnarValue::Scalar(_) => (1, true), + }; - match &array.data_type() { + let needle = scalar_needle.to_array_of_size(1)?; + let haystack = haystack_arg.to_array(num_rows)?; + + let result = match haystack.data_type() { List(_) => { - let arr = as_list_array(&array)?; - crate::utils::check_datatypes("array_positions", &[arr.values(), element])?; - general_positions::(arr, element) + let list = as_list_array(&haystack)?; + array_positions_scalar::(list, &needle) } LargeList(_) => { - let arr = as_large_list_array(&array)?; - crate::utils::check_datatypes("array_positions", &[arr.values(), element])?; - general_positions::(arr, element) - } - array_type => { - exec_err!("array_positions does not support type '{array_type}'.") + let list = as_large_list_array(&haystack)?; + array_positions_scalar::(list, &needle) } + t => exec_err!("array_positions does not support type '{t}'"), + }?; + + if all_inputs_scalar { + Ok(Some(ColumnarValue::Scalar(ScalarValue::try_from_array( + &result, 0, + )?))) + } else { + Ok(Some(ColumnarValue::Array(result))) } } -fn general_positions( - list_array: &GenericListArray, - element_array: &ArrayRef, +fn array_positions_inner(args: &[ArrayRef]) -> Result { + let [haystack, needle] = take_function_args("array_positions", args)?; + + match &haystack.data_type() { + List(_) => general_positions::(as_list_array(&haystack)?, needle), + LargeList(_) => general_positions::(as_large_list_array(&haystack)?, needle), + t => exec_err!("array_positions does not support type '{t}'"), + } +} + +fn general_positions( + haystack: &GenericListArray, + needle: &ArrayRef, ) -> Result { - let mut data = Vec::with_capacity(list_array.len()); + crate::utils::check_datatypes("array_positions", &[haystack.values(), needle])?; + let mut data = Vec::with_capacity(haystack.len()); - for (row_index, list_array_row) in list_array.iter().enumerate() { - if let Some(list_array_row) = list_array_row { - let eq_array = - compare_element_to_list(&list_array_row, element_array, row_index, true)?; + for (row_index, row) in haystack.iter().enumerate() { + if let Some(row) = row { + let eq_array = compare_element_to_list(&row, needle, row_index, true)?; // Collect `true`s in 1-indexed positions let indexes = eq_array @@ -485,13 +518,88 @@ fn general_positions( )) } +/// Fast path for `array_positions` when the needle is scalar. +/// +/// Performs a single bulk `not_distinct` comparison of the needle against the +/// entire flat values buffer, then walks the result bitmap using offsets to +/// collect all per-row match positions. +fn array_positions_scalar( + haystack: &GenericListArray, + needle: &ArrayRef, +) -> Result { + crate::utils::check_datatypes("array_positions", &[haystack.values(), needle])?; + + let num_rows = haystack.len(); + if num_rows == 0 { + return Ok(Arc::new(ListArray::try_new( + Arc::new(Field::new_list_field(UInt64, true)), + OffsetBuffer::new_zeroed(1), + Arc::new(UInt64Array::from(Vec::::new())), + None, + )?)); + } + + let needle_datum = Scalar::new(Arc::clone(needle)); + let validity = haystack.nulls(); + + // Only convert the visible portion of the values array. For sliced + // ListArrays, values() returns the full underlying array but only + // elements between the first and last offset are referenced. + let offsets = haystack.offsets(); + let first_offset = offsets[0].as_usize(); + let last_offset = offsets[num_rows].as_usize(); + let visible_values = haystack + .values() + .slice(first_offset, last_offset - first_offset); + + // `not_distinct` treats NULL=NULL as true, matching the semantics of + // `array_positions`. + let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &needle_datum)?; + let eq_bits = eq_array.values(); + + let num_matches = eq_bits.count_set_bits(); + let mut positions: Vec = Vec::with_capacity(num_matches); + let mut result_offsets: Vec = Vec::with_capacity(num_rows + 1); + result_offsets.push(0); + let mut matches = eq_bits.set_indices().peekable(); + + // Match positions are relative to visible_values (0-based), so + // subtract first_offset from each offset when comparing. + for i in 0..num_rows { + let start = offsets[i].as_usize() - first_offset; + let end = offsets[i + 1].as_usize() - first_offset; + + if validity.is_some_and(|v| v.is_null(i)) { + // Null row -> null output; advance past matches in range. + while matches.peek().is_some_and(|&p| p < end) { + matches.next(); + } + result_offsets.push(positions.len() as i32); + continue; + } + + // Collect all matches in [start, end). + while let Some(pos) = matches.next_if(|&p| p < end) { + positions.push((pos - start + 1) as u64); + } + result_offsets.push(positions.len() as i32); + } + + debug_assert_eq!(result_offsets.len(), num_rows + 1); + Ok(Arc::new(ListArray::try_new( + Arc::new(Field::new_list_field(UInt64, true)), + OffsetBuffer::new(result_offsets.into()), + Arc::new(UInt64Array::from(positions)), + validity.cloned(), + )?)) +} + #[cfg(test)] mod tests { use super::*; use arrow::array::AsArray; use arrow::datatypes::Int32Type; use datafusion_common::config::ConfigOptions; - use datafusion_expr::ScalarFunctionArgs; #[test] fn test_array_position_sliced_list() -> Result<()> { @@ -540,4 +648,112 @@ mod tests { Ok(()) } + + #[test] + fn test_array_positions_sliced_list() -> Result<()> { + // [[10, 20, 30], [30, 40, 30], [50, 60, 30], [70, 80, 30]] + // → slice(1,2) → [[30, 40, 30], [50, 60, 30]] + let list = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(10), Some(20), Some(30)]), + Some(vec![Some(30), Some(40), Some(30)]), + Some(vec![Some(50), Some(60), Some(30)]), + Some(vec![Some(70), Some(80), Some(30)]), + ]); + let sliced = list.slice(1, 2); + let haystack_field = + Arc::new(Field::new("haystack", sliced.data_type().clone(), true)); + let needle_field = Arc::new(Field::new("needle", DataType::Int32, true)); + let return_field = Arc::new(Field::new( + "return", + List(Arc::new(Field::new_list_field(UInt64, true))), + true, + )); + + let invoke = |needle: i32| -> Result { + ArrayPositions::new() + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(sliced.clone())), + ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))), + ], + arg_fields: vec![ + Arc::clone(&haystack_field), + Arc::clone(&needle_field), + ], + number_rows: 2, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + })? + .into_array(2) + }; + + // Needle 30: appears at positions 1,3 in row 0 ([30,40,30]) + // and position 3 in row 1 ([50,60,30]). + let output = invoke(30)?; + let output = output.as_list::(); + let row0 = output.value(0); + let row0 = row0.as_primitive::(); + assert_eq!(row0.values().as_ref(), &[1, 3]); + let row1 = output.value(1); + let row1 = row1.as_primitive::(); + assert_eq!(row1.values().as_ref(), &[3]); + + // Needle 10: only in the sliced-away prefix row → empty lists. + let output = invoke(10)?; + let output = output.as_list::(); + assert!(output.value(0).is_empty()); + assert!(output.value(1).is_empty()); + + // Needle 70: only in the sliced-away suffix row → empty lists. + let output = invoke(70)?; + let output = output.as_list::(); + assert!(output.value(0).is_empty()); + assert!(output.value(1).is_empty()); + + Ok(()) + } + + #[test] + fn test_array_positions_sliced_list_with_nulls() -> Result<()> { + // [[1, 2], null, [3, 1], [4, 5]] → slice(1,2) → [null, [3, 1]] + let list = ListArray::from_iter_primitive::(vec![ + Some(vec![Some(1), Some(2)]), + None, + Some(vec![Some(3), Some(1)]), + Some(vec![Some(4), Some(5)]), + ]); + let sliced = list.slice(1, 2); + let haystack_field = + Arc::new(Field::new("haystack", sliced.data_type().clone(), true)); + let needle_field = Arc::new(Field::new("needle", DataType::Int32, true)); + let return_field = Arc::new(Field::new( + "return", + List(Arc::new(Field::new_list_field(UInt64, true))), + true, + )); + + let output = ArrayPositions::new() + .invoke_with_args(ScalarFunctionArgs { + args: vec![ + ColumnarValue::Array(Arc::new(sliced)), + ColumnarValue::Scalar(ScalarValue::Int32(Some(1))), + ], + arg_fields: vec![Arc::clone(&haystack_field), Arc::clone(&needle_field)], + number_rows: 2, + return_field: Arc::clone(&return_field), + config_options: Arc::new(ConfigOptions::default()), + })? + .into_array(2)?; + + let output = output.as_list::(); + // Row 0 is null (from the sliced null row). + assert!(output.is_null(0)); + // Row 1 is [3, 1] → needle 1 found at position 2. + assert!(!output.is_null(1)); + let row1 = output.value(1); + let row1 = row1.as_primitive::(); + assert_eq!(row1.values().as_ref(), &[2]); + + Ok(()) + } } diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 112351c5efa73..26cb9ffd0396e 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -4003,6 +4003,12 @@ select array_position(arrow_cast(['a', 'b', 'c', 'b'], 'FixedSizeList(4, Utf8)') ## array_positions (aliases: `list_positions`) +# array_positions with empty array +query ? +select array_positions(arrow_cast(make_array(), 'List(Int64)'), 1); +---- +[] + query ? select array_positions([1, 2, 3, 4, 5], null); ---- diff --git a/docs/source/user-guide/sql/scalar_functions.md b/docs/source/user-guide/sql/scalar_functions.md index 254151c2c20eb..15ce5878808ea 100644 --- a/docs/source/user-guide/sql/scalar_functions.md +++ b/docs/source/user-guide/sql/scalar_functions.md @@ -3823,7 +3823,7 @@ array_positions(array, element) #### Arguments - **array**: Array expression. Can be a constant, column, or function, and any combination of array operators. -- **element**: Element to search for position in the array. +- **element**: Element to search for in the array. #### Example