This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 8d47fc6034 Spark soundex function implementation (#20725)
8d47fc6034 is described below

commit 8d47fc6034ff8a2da72ecb3e9602a4357100cd54
Author: Kazantsev Maksim <[email protected]>
AuthorDate: Wed Mar 25 00:47:00 2026 +0400

    Spark soundex function implementation (#20725)
    
    ## Which issue does this PR close?
    
    N/A
    
    ## Rationale for this change
    
    Add a new Spark function:
    https://spark.apache.org/docs/latest/api/sql/index.html#soundex
    
    ## What changes are included in this PR?
    
    - Implementation
    - SLT tests
    
    ## Are these changes tested?
    
    Yes, tests added as part of this PR.
    
    ## Are there any user-facing changes?
    
    No, this is a new function.
    
    ---------
    
    Co-authored-by: Kazantsev Maksim <[email protected]>
---
 datafusion/spark/src/function/string/mod.rs        |   4 +
 datafusion/spark/src/function/string/soundex.rs    | 155 ++++++++++++++++
 .../test_files/spark/string/soundex.slt            | 194 +++++++++++++++++++--
 3 files changed, 343 insertions(+), 10 deletions(-)

diff --git a/datafusion/spark/src/function/string/mod.rs 
b/datafusion/spark/src/function/string/mod.rs
index 8859beca77..7bcdac5d85 100644
--- a/datafusion/spark/src/function/string/mod.rs
+++ b/datafusion/spark/src/function/string/mod.rs
@@ -25,6 +25,7 @@ pub mod ilike;
 pub mod length;
 pub mod like;
 pub mod luhn_check;
+pub mod soundex;
 pub mod space;
 pub mod substring;
 
@@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, 
format_string);
 make_udf_function!(space::SparkSpace, space);
 make_udf_function!(substring::SparkSubstring, substring);
 make_udf_function!(base64::SparkUnBase64, unbase64);
+make_udf_function!(soundex::SparkSoundex, soundex);
 
 pub mod expr_fn {
     use datafusion_functions::export_functions;
@@ -110,6 +112,7 @@ pub mod expr_fn {
         "Decodes the input string `str` from a base64 string into binary 
data.",
         str
     ));
+    export_functions!((soundex, "Returns Soundex code of the string.", str));
 }
 
 pub fn functions() -> Vec<Arc<ScalarUDF>> {
@@ -127,5 +130,6 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
         space(),
         substring(),
         unbase64(),
+        soundex(),
     ]
 }
diff --git a/datafusion/spark/src/function/string/soundex.rs 
b/datafusion/spark/src/function/string/soundex.rs
new file mode 100644
index 0000000000..1d23ca7545
--- /dev/null
+++ b/datafusion/spark/src/function/string/soundex.rs
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray};
+use arrow::datatypes::DataType;
+use datafusion_common::cast::{as_generic_string_array, as_string_view_array};
+use datafusion_common::utils::take_function_args;
+use datafusion_common::{Result, exec_err};
+use datafusion_expr::{ColumnarValue, Signature, Volatility};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl};
+use datafusion_functions::utils::make_scalar_function;
+use std::any::Any;
+use std::sync::Arc;
+
+/// Scalar UDF providing a Spark-compatible `soundex` expression.
+///
+/// Reference: <https://spark.apache.org/docs/latest/api/sql/index.html#soundex>
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub struct SparkSoundex {
+    /// Signature handed out by `ScalarUDFImpl::signature`.
+    signature: Signature,
+}
+
+impl SparkSoundex {
+    /// Builds the UDF with `Signature::string(1, Volatility::Immutable)`.
+    pub fn new() -> Self {
+        let signature = Signature::string(1, Volatility::Immutable);
+        Self { signature }
+    }
+}
+
+impl Default for SparkSoundex {
+    /// Same as [`SparkSoundex::new`].
+    fn default() -> Self {
+        SparkSoundex::new()
+    }
+}
+
+impl ScalarUDFImpl for SparkSoundex {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    /// Returns the function name, `soundex`.
+    fn name(&self) -> &str {
+        "soundex"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    /// Reports `LargeUtf8` when the first argument type is `LargeUtf8` and
+    /// `Utf8` for every other argument type.
+    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
+        let output = if matches!(arg_types[0], DataType::LargeUtf8) {
+            DataType::LargeUtf8
+        } else {
+            DataType::Utf8
+        };
+        Ok(output)
+    }
+
+    /// Evaluates the function by passing the arguments to
+    /// `spark_soundex_inner` through `make_scalar_function`.
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        let evaluate = make_scalar_function(spark_soundex_inner, vec![]);
+        evaluate(&args.args)
+    }
+}
+
+/// Dispatches `soundex` over its single argument array by data type.
+///
+/// `Utf8` and `LargeUtf8` are handled by `soundex_array`, `Utf8View` by
+/// `soundex_view`; any other data type yields an execution error.
+fn spark_soundex_inner(arg: &[ArrayRef]) -> Result<ArrayRef> {
+    let [input] = take_function_args("soundex", arg)?;
+    let input_type = input.data_type();
+    match input_type {
+        DataType::Utf8 => soundex_array::<i32>(input),
+        DataType::LargeUtf8 => soundex_array::<i64>(input),
+        DataType::Utf8View => soundex_view(input),
+        other => exec_err!("unsupported data type {other:?} for function `soundex`"),
+    }
+}
+
+/// Computes the Soundex code for every value of a `Utf8` / `LargeUtf8` array.
+///
+/// `T` is the offset width of the input (`i32` for `Utf8`, `i64` for
+/// `LargeUtf8`). The output uses the same offset width, so a `LargeUtf8`
+/// input yields a `LargeUtf8` array, which is what
+/// `SparkSoundex::return_type` declares for that input. Null entries stay
+/// null.
+///
+/// Returns an error when `array` cannot be downcast to a string array with
+/// offset type `T`.
+fn soundex_array<T: OffsetSizeTrait>(array: &ArrayRef) -> Result<ArrayRef> {
+    let str_array = as_generic_string_array::<T>(array)?;
+    // Collect into `GenericStringArray<T>` rather than `StringArray`: the
+    // latter always produces `Utf8`, whatever `T` is, and would therefore
+    // disagree with the declared return type for `LargeUtf8` input.
+    let result = str_array
+        .iter()
+        .map(|s| s.map(compute_soundex))
+        .collect::<arrow::array::GenericStringArray<T>>();
+    Ok(Arc::new(result))
+}
+
+/// Computes the Soundex code for every value of a `Utf8View` array and
+/// returns the results as a `StringArray`; null entries stay null.
+///
+/// Returns an error when `str_view` cannot be downcast to a string view array.
+fn soundex_view(str_view: &ArrayRef) -> Result<ArrayRef> {
+    let codes: StringArray = as_string_view_array(str_view)?
+        .iter()
+        .map(|value| value.map(compute_soundex))
+        .collect();
+    let output: ArrayRef = Arc::new(codes);
+    Ok(output)
+}
+
+/// Maps a letter to its Soundex digit, ignoring ASCII case.
+///
+/// Returns `None` for every character that is not in one of the six coded
+/// consonant groups (vowels, `H`, `W`, `Y`, digits, punctuation, non-ASCII).
+fn classify_char(c: char) -> Option<char> {
+    // Each consonant group paired with the digit it encodes to.
+    const GROUPS: [(&str, char); 6] = [
+        ("BFPV", '1'),
+        ("CGJKQSXZ", '2'),
+        ("DT", '3'),
+        ("L", '4'),
+        ("MN", '5'),
+        ("R", '6'),
+    ];
+
+    let upper = c.to_ascii_uppercase();
+    GROUPS
+        .iter()
+        .find(|(letters, _)| letters.contains(upper))
+        .map(|&(_, digit)| digit)
+}
+
+/// Reports whether `c` is `H` or `W`, in either ASCII case.
+fn is_ignored(c: char) -> bool {
+    ['H', 'W'].contains(&c.to_ascii_uppercase())
+}
+
+/// Computes the Soundex code of `s`.
+///
+/// When `s` is empty or does not start with an ASCII letter, `s` is returned
+/// unchanged. Otherwise the result is the upper-cased first letter followed
+/// by up to three digits, right-padded with `'0'` to four characters.
+fn compute_soundex(s: &str) -> String {
+    let mut rest = s.chars();
+    let Some(initial) = rest.next().filter(char::is_ascii_alphabetic) else {
+        return s.to_owned();
+    };
+    let initial = initial.to_ascii_uppercase();
+
+    let mut code = String::with_capacity(4);
+    code.push(initial);
+    // Digit of the most recently processed character; `None` once a
+    // character without a digit (other than `H`/`W`) has been seen.
+    let mut previous = classify_char(initial);
+
+    for ch in rest {
+        if code.len() >= 4 {
+            break;
+        }
+        // `H` and `W` are skipped without touching `previous`, so equal
+        // digits on either side of them are still treated as repeats.
+        if is_ignored(ch) {
+            continue;
+        }
+
+        let digit = classify_char(ch);
+        match digit {
+            // Emit a digit only when it differs from the one before it.
+            Some(d) if digit != previous => code.push(d),
+            _ => {}
+        }
+        previous = digit;
+    }
+
+    // Right-pad with zeros up to the fixed width of four.
+    while code.len() < 4 {
+        code.push('0');
+    }
+    code
+}
diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt 
b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
index f0c46e10fd..ec85c4bd40 100644
--- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt
+++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt
@@ -15,13 +15,187 @@
 # specific language governing permissions and limitations
 # under the License.
 
-# This file was originally created by a porting script from:
-#   
https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function
-# This file is part of the implementation of the datafusion-spark function 
library.
-# For more information, please see:
-#   https://github.com/apache/datafusion/issues/15914
-
-## Original Query: SELECT soundex('Miller');
-## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 
'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'}
-#query
-#SELECT soundex('Miller'::string);
+query T
+SELECT soundex('Miller');
+----
+M460
+
+query T
+SELECT soundex(NULL);
+----
+NULL
+
+query T
+SELECT soundex('');
+----
+(empty)
+
+query T
+SELECT soundex('Apache Spark');
+----
+A122
+
+query T
+SELECT soundex('123');
+----
+123
+
+query T
+SELECT soundex('a123');
+----
+A000
+
+query T
+SELECT soundex('Datafusion');
+----
+D312
+
+query T
+SELECT soundex('Ashcroft');
+----
+A261
+
+query T
+SELECT soundex('B1B');
+----
+B100
+
+query T
+SELECT soundex('B B');
+----
+B100
+
+query T
+SELECT soundex('BAB');
+----
+B100
+
+query T
+SELECT soundex('#hello');
+----
+#hello
+
+query T
+SELECT soundex(' hello');
+----
+ hello
+
+query T
+SELECT soundex('\thello');
+----
+\thello
+
+query T
+SELECT soundex('😀hello');
+----
+😀hello
+
+query T
+SELECT soundex('123');
+----
+123
+
+query T
+SELECT soundex('1abc');
+----
+1abc
+
+query T
+SELECT soundex('A');
+----
+A000
+
+query T
+SELECT soundex('BFPV');
+----
+B000
+
+query T
+SELECT soundex('Robert');
+----
+R163
+
+query T
+SELECT soundex('Rupert');
+----
+R163
+
+query T
+SELECT soundex(NULL);
+----
+NULL
+
+query T
+SELECT soundex('');
+----
+(empty)
+
+query T
+SELECT soundex('robert');
+----
+R163
+
+query T
+SELECT soundex('rObErT');
+----
+R163
+
+query T
+SELECT soundex('Müller');
+----
+M460
+
+query T
+SELECT soundex('Abcdefghijklmnop');
+----
+A123
+
+query T
+SELECT soundex('Lloyd');
+----
+L300
+
+query T
+SELECT soundex('BWB');
+----
+B000
+
+query T
+SELECT soundex('BHB');
+----
+B000
+
+query T
+SELECT soundex('Tymczak');
+----
+T522
+
+query T
+SELECT soundex('Aeiou');
+----
+A000
+
+query T
+SELECT soundex('1Robert');
+----
+1Robert
+
+query T
+SELECT soundex('Smith-Jones');
+----
+S532
+
+query T
+SELECT soundex('#');
+----
+#
+
+query T
+SELECT soundex('\nhello');
+----
+\nhello
+
+query T
+SELECT concat(soundex('   '), 'Spark')
+----
+   Spark


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to