From 0a8ceedc21ce0c852db0525183f72557787cae45 Mon Sep 17 00:00:00 2001 From: Kumar Ujjawal Date: Sat, 28 Feb 2026 14:14:54 +0530 Subject: [PATCH] Make upper emit Utf8View for Utf8View input --- datafusion/functions/src/string/common.rs | 74 +++++++++++++++---- datafusion/functions/src/string/upper.rs | 28 ++++++- .../sqllogictest/test_files/functions.slt | 15 ++++ 3 files changed, 97 insertions(+), 20 deletions(-) diff --git a/datafusion/functions/src/string/common.rs b/datafusion/functions/src/string/common.rs index 77af82e25c483..fd996c37f7d88 100644 --- a/datafusion/functions/src/string/common.rs +++ b/datafusion/functions/src/string/common.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use crate::strings::make_and_append_view; use arrow::array::{ Array, ArrayRef, GenericStringArray, GenericStringBuilder, NullBufferBuilder, - OffsetSizeTrait, StringBuilder, StringViewArray, new_null_array, + OffsetSizeTrait, StringBuilder, StringViewArray, StringViewBuilder, new_null_array, }; use arrow::buffer::{Buffer, ScalarBuffer}; use arrow::datatypes::DataType; @@ -332,17 +332,34 @@ fn string_trim(args: &[ArrayRef]) -> Result Result { - case_conversion(args, |string| string.to_lowercase(), name) + case_conversion( + args, + |string| string.to_lowercase(), + name, + Utf8ViewOutput::Utf8, + ) } pub(crate) fn to_upper(args: &[ColumnarValue], name: &str) -> Result { - case_conversion(args, |string| string.to_uppercase(), name) + case_conversion( + args, + |string| string.to_uppercase(), + name, + Utf8ViewOutput::Utf8View, + ) +} + +#[derive(Debug, Clone, Copy)] +enum Utf8ViewOutput { + Utf8, + Utf8View, } fn case_conversion<'a, F>( args: &'a [ColumnarValue], op: F, name: &str, + utf8view_output: Utf8ViewOutput, ) -> Result where F: Fn(&'a str) -> String, @@ -358,20 +375,38 @@ where >(array, op)?)), DataType::Utf8View => { let string_array = as_string_view_array(array)?; - let mut string_builder = StringBuilder::with_capacity( - string_array.len(), - string_array.get_array_memory_size(), - ); - - for str in string_array.iter() { - if let Some(str) = str { - string_builder.append_value(op(str)); - } else { - string_builder.append_null(); + match utf8view_output { + Utf8ViewOutput::Utf8 => { + let mut string_builder = StringBuilder::with_capacity( + string_array.len(), + string_array.get_array_memory_size(), + ); + + for str in string_array.iter() { + if let Some(str) = str { + string_builder.append_value(op(str)); + } else { + string_builder.append_null(); + } + } + + Ok(ColumnarValue::Array(Arc::new(string_builder.finish()))) + } + Utf8ViewOutput::Utf8View => { + let mut string_builder = + StringViewBuilder::with_capacity(string_array.len()); + + for str in string_array.iter() { + if let Some(str) = str { + string_builder.append_value(op(str)); + } else { + string_builder.append_null(); + } + } + + Ok(ColumnarValue::Array(Arc::new(string_builder.finish()))) } } - - Ok(ColumnarValue::Array(Arc::new(string_builder.finish()))) } other => exec_err!("Unsupported data type {other:?} for function {name}"), }, @@ -386,7 +421,14 @@ where } ScalarValue::Utf8View(a) => { let result = a.as_ref().map(|x| op(x)); - Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) + match utf8view_output { + Utf8ViewOutput::Utf8 => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result))) + } + Utf8ViewOutput::Utf8View => { + Ok(ColumnarValue::Scalar(ScalarValue::Utf8View(result))) + } + } } other => exec_err!("Unsupported data type {other:?} for function {name}"), }, diff --git a/datafusion/functions/src/string/upper.rs b/datafusion/functions/src/string/upper.rs index a2a7db1848f59..c14b40ebdd6ae 100644 --- a/datafusion/functions/src/string/upper.rs +++ b/datafusion/functions/src/string/upper.rs @@ -81,7 +81,11 @@ impl ScalarUDFImpl for UpperFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - utf8_to_str_type(&arg_types[0], "upper") + if arg_types[0] == DataType::Utf8View { + Ok(DataType::Utf8View) + } else { + utf8_to_str_type(&arg_types[0], "upper") + } } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { @@ -96,8 +100,7 @@ impl ScalarUDFImpl for UpperFunc { #[cfg(test)] mod tests { use super::*; - use arrow::array::{Array, ArrayRef, StringArray}; - use arrow::datatypes::DataType::Utf8; + use arrow::array::{Array, ArrayRef, StringArray, StringViewArray}; use arrow::datatypes::Field; use datafusion_common::config::ConfigOptions; use std::sync::Arc; @@ -110,7 +113,7 @@ mod tests { number_rows: input.len(), args: vec![ColumnarValue::Array(input)], arg_fields: vec![arg_field], - return_field: Field::new("f", Utf8, true).into(), + return_field: Field::new("f", expected.data_type().clone(), true).into(), config_options: Arc::new(ConfigOptions::default()), }; @@ -196,4 +199,21 @@ mod tests { to_upper(input, expected) } + + #[test] + fn upper_utf8view() -> Result<()> { + let input = Arc::new(StringViewArray::from(vec![ + Some("arrow"), + None, + Some("tschüß"), + ])) as ArrayRef; + + let expected = Arc::new(StringViewArray::from(vec![ + Some("ARROW"), + None, + Some("TSCHÜSS"), + ])) as ArrayRef; + + to_upper(input, expected) + } } diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 5a43d18e23879..6d2ea4cf06fa2 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -445,6 +445,21 @@ SELECT upper(arrow_cast('árvore ação αβγ', 'Dictionary(Int32, Utf8)')) ---- ÁRVORE AÇÃO ΑΒΓ +query T +SELECT arrow_typeof(upper('foo')) +---- +Utf8 + +query T +SELECT arrow_typeof(upper(arrow_cast('foo', 'LargeUtf8'))) +---- +LargeUtf8 + +query T +SELECT arrow_typeof(upper(arrow_cast('foo', 'Utf8View'))) +---- +Utf8View + query T SELECT btrim(' foo ') ----