alamb commented on code in PR #13403: URL: https://github.com/apache/datafusion/pull/13403#discussion_r1851052692
########## datafusion/functions-nested/src/string.rs: ########## @@ -524,63 +687,90 @@ pub fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<Ar } (Some(string), None) => { string.chars().map(|c| c.to_string()).for_each(|c| { - list_builder.values().append_value(c); + list_builder.values().append_value(c.as_str()); }); list_builder.append(true); } _ => list_builder.append(false), // null value } }, - ); + ) } - - 3 => { - let null_value_array = as_generic_string_array::<T>(&args[2])?; - string_array - .iter() - .zip(delimiter_array.iter()) - .zip(null_value_array.iter()) - .for_each(|((string, delimiter), null_value)| { - match (string, delimiter) { - (Some(string), Some("")) => { - if Some(string) == null_value { + Some(null_value_array) => string_array + .iter() + .zip(delimiter_array.iter()) + .zip(null_value_array.iter()) + .for_each(|((string, delimiter), null_value)| { + match (string, delimiter) { + (Some(string), Some("")) => { + if Some(string) == null_value { + list_builder.values().append_null(); + } else { + list_builder.values().append_value(string); + } + list_builder.append(true); + } + (Some(string), Some(delimiter)) => { + string.split(delimiter).for_each(|s| { + if Some(s) == null_value { list_builder.values().append_null(); } else { - list_builder.values().append_value(string); + list_builder.values().append_value(s); } - list_builder.append(true); - } - (Some(string), Some(delimiter)) => { - string.split(delimiter).for_each(|s| { - if Some(s) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(s); - } - }); - list_builder.append(true); - } - (Some(string), None) => { - string.chars().map(|c| c.to_string()).for_each(|c| { - if Some(c.as_str()) == null_value { - list_builder.values().append_null(); - } else { - list_builder.values().append_value(c); - } - }); - list_builder.append(true); - } - _ => list_builder.append(false), // null value + }); + list_builder.append(true); } - }); - } - _ => { - return exec_err!( - "Expect string_to_array function to take two or three parameters" - ) - } - } + (Some(string), None) => { + string.chars().map(|c| c.to_string()).for_each(|c| { + if Some(c.as_str()) == null_value { + list_builder.values().append_null(); + } else { + list_builder.values().append_value(c.as_str()); + } + }); + list_builder.append(true); + } + _ => list_builder.append(false), // null value + } + }), + }; let list_array = list_builder.finish(); Ok(Arc::new(list_array) as ArrayRef) } + +trait StringArrayBuilderType: ArrayBuilder { Review Comment: this is an interesting idea -- maybe something worth looking into making more visible somehow / reusing to reduce duplication ########## datafusion/functions-nested/src/string.rs: ########## @@ -495,20 +505,173 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> { /// String_to_array SQL function /// Splits string at occurrences of delimiter and returns an array of parts /// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' -pub fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { +fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { if args.len() < 2 || args.len() > 3 { return exec_err!("string_to_array expects two or three arguments"); } - let string_array = as_generic_string_array::<T>(&args[0])?; - let delimiter_array = as_generic_string_array::<T>(&args[1])?; - let mut list_builder = ListBuilder::new(StringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - )); + match args[0].data_type() { + Utf8 => { + let string_array = args[0].as_string::<T>(); + let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size()); + string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder) + } + Utf8View => { + let string_array = args[0].as_string_view(); + let builder = StringViewBuilder::with_capacity(string_array.len()); + string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder) + } + LargeUtf8 => { + let string_array = args[0].as_string::<T>(); + let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size()); + string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder) + } + other => exec_err!("unsupported type for first argument to string_to_array function as {other:?}") + } +} + +fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>( + args: &'a [ArrayRef], + string_array: StringArrType, + string_builder: StringBuilderType, +) -> Result<ArrayRef> +where + StringArrType: StringArrayType<'a>, + StringBuilderType: StringArrayBuilderType, +{ + match args[1].data_type() { + Utf8 => { + let delimiter_array = args[1].as_string::<i32>(); + if args.len() == 2 { + string_to_array_impl::< + StringArrType, + &GenericStringArray<i32>, + &StringViewArray, + StringBuilderType, + >(string_array, delimiter_array, None, string_builder) + } else { + string_to_array_inner_3::<StringArrType, + &GenericStringArray<i32>, + StringBuilderType>(args, string_array, delimiter_array, string_builder) + } + } + Utf8View => { + let delimiter_array = args[1].as_string_view(); + + if args.len() == 2 { + string_to_array_impl::< + StringArrType, + &StringViewArray, + &StringViewArray, + StringBuilderType, + >(string_array, delimiter_array, None, string_builder) + } else { + string_to_array_inner_3::<StringArrType, + &StringViewArray, + StringBuilderType>(args, string_array, delimiter_array, string_builder) + } + } + LargeUtf8 => { + let delimiter_array = args[1].as_string::<i64>(); + if args.len() == 2 { + string_to_array_impl::< + StringArrType, + &GenericStringArray<i64>, + &StringViewArray, + StringBuilderType, + >(string_array, delimiter_array, None, string_builder) + } else { + string_to_array_inner_3::<StringArrType, + &GenericStringArray<i64>, + StringBuilderType>(args, string_array, delimiter_array, string_builder) + } + } + other => exec_err!("unsupported type for second argument to string_to_array function as {other:?}") + } +} - match args.len() { - 2 => { +fn string_to_array_inner_3<'a, StringArrType, DelimiterArrType, StringBuilderType>( Review Comment: If I am reading this correctly, this function effectively handles all possible combinations of `Utf8`, `Utf8View` and `LargeUtf8` by generating specialized implementations for each combination -- so I think that means 3 * 3 = 9 copies of the function I wonder if it is common to mix all the types 🤔 ########## datafusion/sqllogictest/test_files/array.slt: ########## @@ -6916,6 +6940,79 @@ select string_to_array(e, ',') from values; [adipiscing] NULL +# karge string tests for string_to_array Review Comment: ```suggestion # large string tests for string_to_array ``` ########## datafusion/functions-nested/src/string.rs: ########## @@ -495,20 +505,173 @@ pub(super) fn array_to_string_inner(args: &[ArrayRef]) -> Result<ArrayRef> { /// String_to_array SQL function /// Splits string at occurrences of delimiter and returns an array of parts /// string_to_array('abc~@~def~@~ghi', '~@~') = '["abc", "def", "ghi"]' -pub fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { +fn string_to_array_inner<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { if args.len() < 2 || args.len() > 3 { return exec_err!("string_to_array expects two or three arguments"); } - let string_array = as_generic_string_array::<T>(&args[0])?; - let delimiter_array = as_generic_string_array::<T>(&args[1])?; - let mut list_builder = ListBuilder::new(StringBuilder::with_capacity( - string_array.len(), - string_array.get_buffer_memory_size(), - )); + match args[0].data_type() { + Utf8 => { + let string_array = args[0].as_string::<T>(); + let builder = StringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size()); + string_to_array_inner_2::<&GenericStringArray<T>, StringBuilder>(args, string_array, builder) + } + Utf8View => { + let string_array = args[0].as_string_view(); + let builder = StringViewBuilder::with_capacity(string_array.len()); + string_to_array_inner_2::<&StringViewArray, StringViewBuilder>(args, string_array, builder) + } + LargeUtf8 => { + let string_array = args[0].as_string::<T>(); + let builder = LargeStringBuilder::with_capacity(string_array.len(), string_array.get_buffer_memory_size()); + string_to_array_inner_2::<&GenericStringArray<T>, LargeStringBuilder>(args, string_array, builder) + } + other => exec_err!("unsupported type for first argument to string_to_array function as {other:?}") + } +} + +fn string_to_array_inner_2<'a, StringArrType, StringBuilderType>( Review Comment: inner_2 😆 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org