Re: [PR] Update SPLIT_PART scalar function to support Utf8View [datafusion]

via GitHub Thu, 15 Aug 2024 06:52:59 -0700


alamb commented on code in PR #11975:
URL: https://github.com/apache/datafusion/pull/11975#discussion_r1718433217



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -75,50 +82,101 @@ impl ScalarUDFImpl for SplitPartFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(split_part::<i32>, 
vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(split_part::<i64>, 
vec![])(args),
-            other => {
-                exec_err!("Unsupported data type {other:?} for function 
split_part")
+        match (args[0].data_type(), args[1].data_type()) {
+            (
+                DataType::Utf8 | DataType::Utf8View,
+                DataType::Utf8 | DataType::Utf8View,
+            ) => make_scalar_function(split_part::<i32, i32>, vec![])(args),
+            (DataType::LargeUtf8, DataType::LargeUtf8) => {
+                make_scalar_function(split_part::<i64, i64>, vec![])(args)
             }
+            (_, DataType::LargeUtf8) => {
+                make_scalar_function(split_part::<i32, i64>, vec![])(args)
+            }
+            (DataType::LargeUtf8, _) => {
+                make_scalar_function(split_part::<i64, i32>, vec![])(args)
+            }
+            (first_type, second_type) => exec_err!(
+                "unsupported first type {} and second type {} for split_part 
function",
+                first_type,
+                second_type
+            ),
         }
     }
 }
 
+macro_rules! process_split_part {
+    ($string_array: expr, $delimiter_array: expr, $n_array: expr) => {{
+        let result = $string_array
+            .iter()
+            .zip($delimiter_array.iter())
+            .zip($n_array.iter())
+            .map(|((string, delimiter), n)| match (string, delimiter, n) {
+                (Some(string), Some(delimiter), Some(n)) => {
+                    let split_string: Vec<&str> = 
string.split(delimiter).collect();
+                    let len = split_string.len();
+
+                    let index = match n.cmp(&0) {
+                        std::cmp::Ordering::Less => len as i64 + n,
+                        std::cmp::Ordering::Equal => {
+                            return exec_err!("field position must not be 
zero");
+                        }
+                        std::cmp::Ordering::Greater => n - 1,
+                    } as usize;
+
+                    if index < len {
+                        Ok(Some(split_string[index]))
+                    } else {
+                        Ok(Some(""))
+                    }
+                }
+                _ => Ok(None),
+            })
+            .collect::<Result<GenericStringArray<StringLen>>>()?;
+        Ok(Arc::new(result) as ArrayRef)
+    }};
+}
+
 /// Splits string at occurrences of delimiter and returns the n'th field 
(counting from one).
 /// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def'
-fn split_part<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
-    let string_array = as_generic_string_array::<T>(&args[0])?;
-    let delimiter_array = as_generic_string_array::<T>(&args[1])?;
+fn split_part<StringLen: OffsetSizeTrait, DelimiterLen: OffsetSizeTrait>(
+    args: &[ArrayRef],
+) -> Result<ArrayRef> {
     let n_array = as_int64_array(&args[2])?;
-    let result = string_array
-        .iter()
-        .zip(delimiter_array.iter())
-        .zip(n_array.iter())
-        .map(|((string, delimiter), n)| match (string, delimiter, n) {
-            (Some(string), Some(delimiter), Some(n)) => {
-                let split_string: Vec<&str> = 
string.split(delimiter).collect();
-                let len = split_string.len();
-
-                let index = match n.cmp(&0) {
-                    std::cmp::Ordering::Less => len as i64 + n,
-                    std::cmp::Ordering::Equal => {
-                        return exec_err!("field position must not be zero");
-                    }
-                    std::cmp::Ordering::Greater => n - 1,
-                } as usize;
-
-                if index < len {
-                    Ok(Some(split_string[index]))
-                } else {
-                    Ok(Some(""))
+    match (args[0].data_type(), args[1].data_type()) {

Review Comment:
   `split_part` is one of the functions where we could actually use 
`StringViewArray` to speed up the implementation (by avoiding a string copy). 
As written, this function copies each extracted part into the result array.
   
   It might be a fun optimization project to try as a follow-on.



##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -75,50 +82,101 @@ impl ScalarUDFImpl for SplitPartFunc {
     }
 
     fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
-        match args[0].data_type() {
-            DataType::Utf8 => make_scalar_function(split_part::<i32>, 
vec![])(args),
-            DataType::LargeUtf8 => make_scalar_function(split_part::<i64>, 
vec![])(args),
-            other => {
-                exec_err!("Unsupported data type {other:?} for function 
split_part")
+        match (args[0].data_type(), args[1].data_type()) {
+            (
+                DataType::Utf8 | DataType::Utf8View,
+                DataType::Utf8 | DataType::Utf8View,
+            ) => make_scalar_function(split_part::<i32, i32>, vec![])(args),
+            (DataType::LargeUtf8, DataType::LargeUtf8) => {
+                make_scalar_function(split_part::<i64, i64>, vec![])(args)
             }
+            (_, DataType::LargeUtf8) => {
+                make_scalar_function(split_part::<i32, i64>, vec![])(args)
+            }
+            (DataType::LargeUtf8, _) => {
+                make_scalar_function(split_part::<i64, i32>, vec![])(args)
+            }
+            (first_type, second_type) => exec_err!(
+                "unsupported first type {} and second type {} for split_part 
function",
+                first_type,
+                second_type
+            ),
         }
     }
 }
 
+macro_rules! process_split_part {

Review Comment:
   FWIW, I think we could turn this macro into a generic function using 
`ArrayAccessor`, as in this PR from @PsiACE: 
https://github.com/apache/datafusion/pull/11974



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] Update SPLIT_PART scalar function to support Utf8View [datafusion]

Reply via email to