This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 8d47fc6034 Spark soundex function implementation (#20725)
8d47fc6034 is described below
commit 8d47fc6034ff8a2da72ecb3e9602a4357100cd54
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Wed Mar 25 00:47:00 2026 +0400
Spark soundex function implementation (#20725)
## Which issue does this PR close?
N/A
## Rationale for this change
Add a new Spark function:
https://spark.apache.org/docs/latest/api/sql/index.html#soundex
## What changes are included in this PR?
- Implementation
- SLT tests
## Are these changes tested?
Yes, tests added as part of this PR.
## Are there any user-facing changes?
No, this is a new function.
---------
Co-authored-by: Kazantsev Maksim <[email protected]>
---
datafusion/spark/src/function/string/mod.rs | 4 +
datafusion/spark/src/function/string/soundex.rs | 155 ++++++++++++++++
.../test_files/spark/string/soundex.slt | 194 +++++++++++++++++++--
3 files changed, 343 insertions(+), 10 deletions(-)
diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs
index 8859beca77..7bcdac5d85 100644
--- a/datafusion/spark/src/function/string/mod.rs
+++ b/datafusion/spark/src/function/string/mod.rs
@@ -25,6 +25,7 @@ pub mod ilike;
pub mod length;
pub mod like;
pub mod luhn_check;
+pub mod soundex;
pub mod space;
pub mod substring;
@@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string);
make_udf_function!(space::SparkSpace, space);
make_udf_function!(substring::SparkSubstring, substring);
make_udf_function!(base64::SparkUnBase64, unbase64);
+make_udf_function!(soundex::SparkSoundex, soundex);
pub mod expr_fn {
use datafusion_functions::export_functions;
@@ -110,6 +112,7 @@ pub mod expr_fn {
"Decodes the input string `str` from a base64 string into binary
data.",
str
));
+ export_functions!((soundex, "Returns Soundex code of the string.", str));
}
pub fn functions() -> Vec<Arc<ScalarUDF>> {
@@ -127,5 +130,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
space(),
substring(),
unbase64(),
+ soundex(),
]
}
diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs
new file mode 100644
index 0000000000..1d23ca7545
--- /dev/null
+++ b/datafusion/spark/src/function/string/soundex.rs
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{ColumnarValue, Signature, Volatility};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Scalar UDF providing a Spark-compatible `soundex` expression.
+///
+/// Reference: <https://spark.apache.org/docs/latest/api/sql/index.html#soundex>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSoundex {
+    /// Argument signature handed out by `ScalarUDFImpl::signature`.
+    signature: Signature,
+}
+
+impl SparkSoundex {
+    /// Creates the function with the signature
+    /// `Signature::string(1, Volatility::Immutable)`.
+    pub fn new() -> Self {
+        let signature = Signature::string(1, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl Default for SparkSoundex {
+    /// Equivalent to [`SparkSoundex::new`].
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl ScalarUDFImpl for SparkSoundex {
+    /// Exposes the implementation as `&dyn Any`.
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns the function name, `soundex`.
+    fn name(&self) -> &str {
+        "soundex"
+    }
+
+    /// Returns the signature stored at construction time.
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Reports `LargeUtf8` when the first argument type is `LargeUtf8`, and
+    /// `Utf8` for every other argument type.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let output = if matches!(arg_types[0], DataType::LargeUtf8) {
+            DataType::LargeUtf8
+        } else {
+            DataType::Utf8
+        };
+        Ok(output)
+    }
+
+    /// Evaluates the function by wrapping `spark_soundex_inner` with
+    /// `make_scalar_function` and applying it to the call arguments.
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let kernel = make_scalar_function(spark_soundex_inner, vec![]);
+        kernel(&args.args)
+    }
+}
+
+/// Picks the Soundex kernel that matches the data type of the single argument.
+///
+/// `Utf8` and `LargeUtf8` go through `soundex_array` with the corresponding
+/// offset type, and `Utf8View` goes through `soundex_view`. Any other data
+/// type yields an execution error.
+fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = take_function_args("soundex", arg)?;
+    let data_type = input.data_type();
+    match data_type {
+        DataType::Utf8View => soundex_view(input),
+        DataType::LargeUtf8 => soundex_array::<i64>(input),
+        DataType::Utf8 => soundex_array::<i32>(input),
+        other => {
+            exec_err!("unsupported data type {other:?} for function `soundex`")
+        }
+    }
+}
+
+/// Computes the Soundex code of every value in a `Utf8` (`T = i32`) or
+/// `LargeUtf8` (`T = i64`) string array.
+///
+/// Null entries stay null. The result uses the same offset type `T` as the
+/// input, so a `LargeUtf8` input produces a `LargeUtf8` output.
+///
+/// # Errors
+///
+/// Returns an error if `array` cannot be downcast to a string array with
+/// offset type `T`.
+fn soundex_array<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
+    let str_array = as_generic_string_array::<T>(array)?;
+    // `return_type` declares `LargeUtf8` for `LargeUtf8` input. Collecting into a
+    // plain `StringArray` would return `Utf8` data under that declaration, so the
+    // output array is built with the input's offset width instead.
+    let result = str_array
+        .iter()
+        .map(|s| s.map(compute_soundex))
+        .collect::<arrow::array::GenericStringArray<T>>();
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Computes the Soundex code of every value in a `Utf8View` array and returns
+/// the codes as a `Utf8` (`StringArray`) array. Null entries stay null.
+///
+/// # Errors
+///
+/// Returns an error if `str_view` cannot be downcast to a string view array.
+fn soundex_view(str_view: &ArrayRef) -> Result<ArrayRef> {
+    let views = as_string_view_array(str_view)?;
+    let codes: StringArray = views
+        .iter()
+        .map(|value| value.map(compute_soundex))
+        .collect();
+    let codes: ArrayRef = Arc::new(codes);
+    Ok(codes)
+}
+
+/// Maps a letter to its Soundex digit, ignoring ASCII case.
+///
+/// Returns `None` for every character outside the six coded consonant groups
+/// (vowels, `H`, `W`, `Y`, digits, punctuation, non-ASCII characters, ...).
+fn classify_char(c: char) -> Option<char> {
+    let digit = match c.to_ascii_uppercase() {
+        'B' | 'F' | 'P' | 'V' => '1',
+        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
+        'D' | 'T' => '3',
+        'L' => '4',
+        'M' | 'N' => '5',
+        'R' => '6',
+        _ => return None,
+    };
+    Some(digit)
+}
+
+/// Returns `true` for `H` and `W` in either ASCII case. `compute_soundex`
+/// skips these letters without resetting the previously seen code.
+fn is_ignored(c: char) -> bool {
+    c.eq_ignore_ascii_case(&'H') || c.eq_ignore_ascii_case(&'W')
+}
+
+/// Computes the four-character Soundex code of `s`.
+///
+/// If `s` is empty or its first character is not an ASCII letter, `s` is
+/// returned unchanged. Otherwise the code is the upper-cased first letter
+/// followed by up to three digits, padded on the right with `'0'`.
+///
+/// While scanning the remaining characters:
+/// * `H` and `W` are skipped and do not affect the previously seen code;
+/// * a coded consonant is appended only if its digit differs from the
+///   previously seen code;
+/// * any other character (vowel, digit, punctuation, ...) clears the
+///   previously seen code, so the same digit may be appended again after it.
+fn compute_soundex(s: &str) -> String {
+    let mut rest = s.chars();
+
+    let Some(initial) = rest.next().filter(|c| c.is_ascii_alphabetic()) else {
+        return s.to_string();
+    };
+    let initial = initial.to_ascii_uppercase();
+
+    let mut code = String::with_capacity(4);
+    code.push(initial);
+    // The first letter's own digit suppresses an immediately repeated code.
+    let mut previous = classify_char(initial);
+
+    for c in rest {
+        if code.len() >= 4 {
+            break;
+        }
+        if is_ignored(c) {
+            continue;
+        }
+
+        let current = classify_char(c);
+        if let Some(digit) = current {
+            if previous != current {
+                code.push(digit);
+            }
+        }
+        previous = current;
+    }
+
+    // Pad short codes on the right with zeros up to four characters.
+    while code.len() < 4 {
+        code.push('0');
+    }
+    code
+}
diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
index f0c46e10fd..ec85c4bd40 100644
--- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt
+++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
@@ -15,13 +15,187 @@
# specific language governing permissions and limitations
# under the License.
-# This file was originally created by a porting script from:
-#
https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function
library.
-# For more information, please see:
-# https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT soundex('Miller');
-## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460',
'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'}
-#query
-#SELECT soundex('Miller'::string);
+query T
+SELECT soundex('Miller');
+----
+M460
+
+query T
+SELECT soundex(NULL);
+----
+NULL
+
+query T
+SELECT soundex('');
+----
+(empty)
+
+query T
+SELECT soundex('Apache Spark');
+----
+A122
+
+query T
+SELECT soundex('123');
+----
+123
+
+query T
+SELECT soundex('a123');
+----
+A000
+
+query T
+SELECT soundex('Datafusion');
+----
+D312
+
+query T
+SELECT soundex('Ashcroft');
+----
+A261
+
+query T
+SELECT soundex('B1B');
+----
+B100
+
+query T
+SELECT soundex('B B');
+----
+B100
+
+query T
+SELECT soundex('BAB');
+----
+B100
+
+query T
+SELECT soundex('#hello');
+----
+#hello
+
+query T
+SELECT soundex(' hello');
+----
+ hello
+
+query T
+SELECT soundex('\thello');
+----
+\thello
+
+query T
+SELECT soundex('😀hello');
+----
+😀hello
+
+query T
+SELECT soundex('123');
+----
+123
+
+query T
+SELECT soundex('1abc');
+----
+1abc
+
+query T
+SELECT soundex('A');
+----
+A000
+
+query T
+SELECT soundex('BFPV');
+----
+B000
+
+query T
+SELECT soundex('Robert');
+----
+R163
+
+query T
+SELECT soundex('Rupert');
+----
+R163
+
+query T
+SELECT soundex(NULL);
+----
+NULL
+
+query T
+SELECT soundex('');
+----
+(empty)
+
+query T
+SELECT soundex('robert');
+----
+R163
+
+query T
+SELECT soundex('rObErT');
+----
+R163
+
+query T
+SELECT soundex('Müller');
+----
+M460
+
+query T
+SELECT soundex('Abcdefghijklmnop');
+----
+A123
+
+query T
+SELECT soundex('Lloyd');
+----
+L300
+
+query T
+SELECT soundex('BWB');
+----
+B000
+
+query T
+SELECT soundex('BHB');
+----
+B000
+
+query T
+SELECT soundex('Tymczak');
+----
+T522
+
+query T
+SELECT soundex('Aeiou');
+----
+A000
+
+query T
+SELECT soundex('1Robert');
+----
+1Robert
+
+query T
+SELECT soundex('Smith-Jones');
+----
+S532
+
+query T
+SELECT soundex('#');
+----
+#
+
+query T
+SELECT soundex('\nhello');
+----
+\nhello
+
+query T
+SELECT concat(soundex(' '), 'Spark')
+----
+ Spark
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]