From f2328982d4baa4520091dafc7aa0576cfe24bd12 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 20:29:55 +0400 Subject: [PATCH 1/8] Spark soundex function implementation --- datafusion/spark/src/function/string/mod.rs | 4 + .../spark/src/function/string/soundex.rs | 135 ++++++++++++++++++ .../test_files/spark/string/soundex.slt | 17 ++- 3 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 datafusion/spark/src/function/string/soundex.rs diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 8859beca7799..7bcdac5d8547 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod ilike; pub mod length; pub mod like; pub mod luhn_check; +pub mod soundex; pub mod space; pub mod substring; @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); make_udf_function!(base64::SparkUnBase64, unbase64); +make_udf_function!(soundex::SparkSoundex, soundex); pub mod expr_fn { use datafusion_functions::export_functions; @@ -110,6 +112,7 @@ pub mod expr_fn { "Decodes the input string `str` from a base64 string into binary data.", str )); + export_functions!((soundex, "Returns Soundex code of the string.", str)); } pub fn functions() -> Vec> { @@ -127,5 +130,6 @@ pub fn functions() -> Vec> { space(), substring(), unbase64(), + soundex(), ] } diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs new file mode 100644 index 000000000000..3b47de6ada9c --- /dev/null +++ b/datafusion/spark/src/function/string/soundex.rs @@ -0,0 +1,135 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion::logical_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_common::cast::as_generic_string_array; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `soundex` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSoundex { + signature: Signature, +} + +impl Default for SparkSoundex { + fn default() -> Self { + Self::new() + } +} + +impl SparkSoundex { + pub fn new() -> Self { + Self { + signature: Signature::string(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkSoundex { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "soundex" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Utf8) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_soundex_inner, vec![])(&args.args) + } +} + +fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("soundex", arg)?; + match &array.data_type() { + DataType::Utf8 | DataType::Utf8View => soundex::(array), + DataType::LargeUtf8 => soundex::(array), + other => { + exec_err!("unsupported data type {other:?} for function `soundex`") + } + } +} + +fn soundex(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + + let result = str_array + .iter() + .map(|s| s.map(compute_soundex)) + .collect::(); + + Ok(Arc::new(result)) +} + +fn compute_soundex(s: &str) -> String { + let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); + + let first_ch = match chars.next() { + Some(c) => c.to_ascii_uppercase(), + None => return "".to_string(), + }; + + let mut result = String::with_capacity(4); + result.push(first_ch); + let mut last_code = classify_char(first_ch); + + for c in chars { + if result.len() >= 4 { + break; + } + let current = classify_char(c); + if let Some(digit) = current { + if current != last_code { + result.push(digit); + } + } + last_code = current; + } + + while result.len() < 4 { + result.push('0'); + } + result +} + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'B' | 'F' | 'P' | 'V' => Some('1'), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'), + 'D' | 'T' => Some('3'), + 'L' => Some('4'), + 'M' | 'N' => Some('5'), + 'R' => Some('6'), + _ => None, // A, E, I, O, U, H, W, Y + } +} diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index f0c46e10fd1d..5a109c294c71 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -15,13 +15,12 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 +query T +SELECT soundex('Miller'::string); +---- +M460 -## Original Query: SELECT soundex('Miller'); -## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'} -#query -#SELECT soundex('Miller'::string); +query T +SELECT soundex(NULL); +---- +NULL From e2aadb341c2aa528580b631a555ede37eefc262e Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 20:41:39 +0400 Subject: [PATCH 2/8] Add more tests --- .../test_files/spark/string/soundex.slt | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 5a109c294c71..c6d905bcfb26 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -24,3 +24,23 @@ query T SELECT soundex(NULL); ---- NULL + +query T +SELECT soundex(''::string); +---- +(empty) + +query T +SELECT soundex('Apache Spark'::string); +---- +A122 + +query T +SELECT soundex('123'::string); +---- +123 + +query T +SELECT soundex('Datafusion'::string); +---- +D312 From 37c339065ce0cd5c633ac4bae344dc6080ce731b Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 23:01:46 +0400 Subject: [PATCH 3/8] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 3b47de6ada9c..f8a97c3d4d5d 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -108,14 +108,11 @@ fn compute_soundex(s: &str) -> String { break; } let current = classify_char(c); - if let Some(digit) = current { - if current != last_code { - result.push(digit); - } + if let Some(digit) = current && current != last_code { + result.push(digit); } last_code = current; } - while result.len() < 4 { result.push('0'); } From 5058986e644c066d81fc788f3ecf330f61942cea Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Thu, 5 Mar 2026 23:07:30 +0400 Subject: [PATCH 4/8] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 6 ++++-- .../sqllogictest/test_files/spark/string/soundex.slt | 10 +++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index f8a97c3d4d5d..804436389368 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -72,7 +72,7 @@ impl ScalarUDFImpl for SparkSoundex { fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { let [array] = take_function_args("soundex", arg)?; match &array.data_type() { - DataType::Utf8 | DataType::Utf8View => soundex::(array), + DataType::Utf8 => soundex::(array), DataType::LargeUtf8 => soundex::(array), other => { exec_err!("unsupported data type {other:?} for function `soundex`") @@ -108,7 +108,9 @@ fn compute_soundex(s: &str) -> String { break; } let current = classify_char(c); - if let Some(digit) = current && current != last_code { + if let Some(digit) = current + && current != last_code + { result.push(digit); } last_code = current; diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index c6d905bcfb26..1f202f9c8b68 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -16,7 +16,7 @@ # under the License. query T -SELECT soundex('Miller'::string); +SELECT soundex('Miller'); ---- M460 @@ -26,21 +26,21 @@ SELECT soundex(NULL); NULL query T -SELECT soundex(''::string); +SELECT soundex(''); ---- (empty) query T -SELECT soundex('Apache Spark'::string); +SELECT soundex('Apache Spark'); ---- A122 query T -SELECT soundex('123'::string); +SELECT soundex('123'); ---- 123 query T -SELECT soundex('Datafusion'::string); +SELECT soundex('Datafusion'); ---- D312 From 5682c4f945dffb566d2a9c01799874c7f28ad77a Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 20:43:53 +0400 Subject: [PATCH 5/8] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 4 ++++ datafusion/sqllogictest/test_files/spark/string/soundex.slt | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 804436389368..045a42511136 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -99,6 +99,10 @@ fn compute_soundex(s: &str) -> String { None => return "".to_string(), }; + if first_ch.is_ascii_digit() { + return s.to_string() + } + let mut result = String::with_capacity(4); result.push(first_ch); let mut last_code = classify_char(first_ch); diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index 1f202f9c8b68..b1f98fe167d1 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -40,6 +40,11 @@ SELECT soundex('123'); ---- 123 +query T +SELECT soundex('a123'); +---- +A000 + query T SELECT soundex('Datafusion'); ---- From 9b014ec3d05ef77aa86859e90d6aa73ff7630c9d Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 20:58:15 +0400 Subject: [PATCH 6/8] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 045a42511136..c7046baa73c7 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -92,6 +92,10 @@ fn soundex(array: &ArrayRef) -> Result { } fn compute_soundex(s: &str) -> String { + if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { + return s.to_string() + } + let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); let first_ch = match chars.next() { @@ -99,10 +103,6 @@ fn compute_soundex(s: &str) -> String { None => return "".to_string(), }; - if first_ch.is_ascii_digit() { - return s.to_string() - } - let mut result = String::with_capacity(4); result.push(first_ch); let mut last_code = classify_char(first_ch); From 74569b44c2b7ecd6d92feaf272fac2ba08dca089 Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 21:00:38 +0400 Subject: [PATCH 7/8] Fix compute_soundex --- datafusion/spark/src/function/string/soundex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index c7046baa73c7..2ff9428d414c 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -93,7 +93,7 @@ fn soundex(array: &ArrayRef) -> Result { fn compute_soundex(s: &str) -> String { if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { - return s.to_string() + return s.to_string(); } let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); From b89175e16a47e6e5a26e16cfa165b72ea8502dff Mon Sep 17 00:00:00 2001 From: Kazantsev Maksim Date: Fri, 6 Mar 2026 21:07:43 +0400 Subject: [PATCH 8/8] Clippy fixing --- datafusion/spark/src/function/string/soundex.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs index 2ff9428d414c..332d7bb51df7 100644 --- a/datafusion/spark/src/function/string/soundex.rs +++ b/datafusion/spark/src/function/string/soundex.rs @@ -92,7 +92,7 @@ fn soundex(array: &ArrayRef) -> Result { } fn compute_soundex(s: &str) -> String { - if s.chars().next().map_or(false, |c| c.is_ascii_digit()) { + if s.chars().next().is_some_and(|c| c.is_ascii_digit()) { return s.to_string(); }