diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 8859beca77996..7bcdac5d85474 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod ilike; pub mod length; pub mod like; pub mod luhn_check; +pub mod soundex; pub mod space; pub mod substring; @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); make_udf_function!(base64::SparkUnBase64, unbase64); +make_udf_function!(soundex::SparkSoundex, soundex); pub mod expr_fn { use datafusion_functions::export_functions; @@ -110,6 +112,7 @@ pub mod expr_fn { "Decodes the input string `str` from a base64 string into binary data.", str )); + export_functions!((soundex, "Returns Soundex code of the string.", str)); } pub fn functions() -> Vec> { @@ -127,5 +130,6 @@ pub fn functions() -> Vec> { space(), substring(), unbase64(), + soundex(), ] } diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs new file mode 100644 index 0000000000000..332d7bb51df7d --- /dev/null +++ b/datafusion/spark/src/function/string/soundex.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion::logical_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `soundex` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSoundex { + signature: Signature, +} + +impl Default for SparkSoundex { + fn default() -> Self { + Self::new() + } +} + +impl SparkSoundex { + pub fn new() -> Self { + Self { + signature: Signature::string(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkSoundex { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "soundex" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_soundex_inner, vec![])(&args.args) + } +} + +fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("soundex", arg)?; + match &array.data_type() { + DataType::Utf8 => soundex::(array), + DataType::LargeUtf8 => soundex::(array), + other => { + exec_err!("unsupported data type {other:?} for function `soundex`") + } + } +} + +fn soundex(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + + let result = str_array + .iter() + .map(|s| s.map(compute_soundex)) + .collect::(); + + Ok(Arc::new(result)) +} + +fn compute_soundex(s: &str) -> String { + if s.chars().next().is_some_and(|c| c.is_ascii_digit()) { + return s.to_string(); + } + + let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); + + let first_ch = match chars.next() { + Some(c) => c.to_ascii_uppercase(), + None => return "".to_string(), + }; + + let mut result = String::with_capacity(4); + result.push(first_ch); + let mut last_code = classify_char(first_ch); + + for c in chars { + if result.len() >= 4 { + break; + } + let current = classify_char(c); + if let Some(digit) = current + && current != last_code + { + result.push(digit); + } + last_code = current; + } + while result.len() < 4 { + result.push('0'); + } + result +} + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'B' | 'F' | 'P' | 'V' => Some('1'), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'), + 'D' | 'T' => Some('3'), + 'L' => Some('4'), + 'M' | 'N' => Some('5'), + 'R' => Some('6'), + _ => None, // A, E, I, O, U, H, W, Y + } +} diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index f0c46e10fd1de..b1f98fe167d1e 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -15,13 +15,37 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT soundex('Miller'); -## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'} -#query -#SELECT soundex('Miller'::string); +query T +SELECT soundex('Miller'); +---- +M460 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('Apache Spark'); +---- +A122 + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('a123'); +---- +A000 + +query T +SELECT soundex('Datafusion'); +---- +D312