alamb commented on code in PR #11975:
URL: https://github.com/apache/datafusion/pull/11975#discussion_r1718433217
##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -75,50 +82,101 @@ impl ScalarUDFImpl for SplitPartFunc {
}
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
- match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(split_part::<i32>,
vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(split_part::<i64>,
vec![])(args),
- other => {
- exec_err!("Unsupported data type {other:?} for function
split_part")
+ match (args[0].data_type(), args[1].data_type()) {
+ (
+ DataType::Utf8 | DataType::Utf8View,
+ DataType::Utf8 | DataType::Utf8View,
+ ) => make_scalar_function(split_part::<i32, i32>, vec![])(args),
+ (DataType::LargeUtf8, DataType::LargeUtf8) => {
+ make_scalar_function(split_part::<i64, i64>, vec![])(args)
}
+ (_, DataType::LargeUtf8) => {
+ make_scalar_function(split_part::<i32, i64>, vec![])(args)
+ }
+ (DataType::LargeUtf8, _) => {
+ make_scalar_function(split_part::<i64, i32>, vec![])(args)
+ }
+ (first_type, second_type) => exec_err!(
+ "unsupported first type {} and second type {} for split_part
function",
+ first_type,
+ second_type
+ ),
}
}
}
+macro_rules! process_split_part {
+ ($string_array: expr, $delimiter_array: expr, $n_array: expr) => {{
+ let result = $string_array
+ .iter()
+ .zip($delimiter_array.iter())
+ .zip($n_array.iter())
+ .map(|((string, delimiter), n)| match (string, delimiter, n) {
+ (Some(string), Some(delimiter), Some(n)) => {
+ let split_string: Vec<&str> =
string.split(delimiter).collect();
+ let len = split_string.len();
+
+ let index = match n.cmp(&0) {
+ std::cmp::Ordering::Less => len as i64 + n,
+ std::cmp::Ordering::Equal => {
+ return exec_err!("field position must not be
zero");
+ }
+ std::cmp::Ordering::Greater => n - 1,
+ } as usize;
+
+ if index < len {
+ Ok(Some(split_string[index]))
+ } else {
+ Ok(Some(""))
+ }
+ }
+ _ => Ok(None),
+ })
+ .collect::<Result<GenericStringArray<StringLen>>>()?;
+ Ok(Arc::new(result) as ArrayRef)
+ }};
+}
+
/// Splits string at occurrences of delimiter and returns the n'th field
(counting from one).
/// split_part('abc~@~def~@~ghi', '~@~', 2) = 'def'
-fn split_part<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let delimiter_array = as_generic_string_array::<T>(&args[1])?;
+fn split_part<StringLen: OffsetSizeTrait, DelimiterLen: OffsetSizeTrait>(
+ args: &[ArrayRef],
+) -> Result<ArrayRef> {
let n_array = as_int64_array(&args[2])?;
- let result = string_array
- .iter()
- .zip(delimiter_array.iter())
- .zip(n_array.iter())
- .map(|((string, delimiter), n)| match (string, delimiter, n) {
- (Some(string), Some(delimiter), Some(n)) => {
- let split_string: Vec<&str> =
string.split(delimiter).collect();
- let len = split_string.len();
-
- let index = match n.cmp(&0) {
- std::cmp::Ordering::Less => len as i64 + n,
- std::cmp::Ordering::Equal => {
- return exec_err!("field position must not be zero");
- }
- std::cmp::Ordering::Greater => n - 1,
- } as usize;
-
- if index < len {
- Ok(Some(split_string[index]))
- } else {
- Ok(Some(""))
+ match (args[0].data_type(), args[1].data_type()) {
Review Comment:
split_part is one of the functions where we could actually use
StringViewArray to speed up its implementation (by avoiding a string copy).
This function as written will copy the parts along
It might be a fun optimization project to try as a follow on
##########
datafusion/functions/src/string/split_part.rs:
##########
@@ -75,50 +82,101 @@ impl ScalarUDFImpl for SplitPartFunc {
}
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
- match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(split_part::<i32>,
vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(split_part::<i64>,
vec![])(args),
- other => {
- exec_err!("Unsupported data type {other:?} for function
split_part")
+ match (args[0].data_type(), args[1].data_type()) {
+ (
+ DataType::Utf8 | DataType::Utf8View,
+ DataType::Utf8 | DataType::Utf8View,
+ ) => make_scalar_function(split_part::<i32, i32>, vec![])(args),
+ (DataType::LargeUtf8, DataType::LargeUtf8) => {
+ make_scalar_function(split_part::<i64, i64>, vec![])(args)
}
+ (_, DataType::LargeUtf8) => {
+ make_scalar_function(split_part::<i32, i64>, vec![])(args)
+ }
+ (DataType::LargeUtf8, _) => {
+ make_scalar_function(split_part::<i64, i32>, vec![])(args)
+ }
+ (first_type, second_type) => exec_err!(
+ "unsupported first type {} and second type {} for split_part
function",
+ first_type,
+ second_type
+ ),
}
}
}
+macro_rules! process_split_part {
Review Comment:
FWIW I think we could make this macro a generic function with
`ArrayAccessor` like in this pR from @PsiACE
https://github.com/apache/datafusion/pull/11974
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]