alamb commented on code in PR #11556:
URL: https://github.com/apache/datafusion/pull/11556#discussion_r1685391814
##########
datafusion/functions/src/regex/regexpreplace.rs:
##########
@@ -358,32 +366,61 @@ fn _regexp_replace_static_pattern_replace<T:
OffsetSizeTrait>(
// with rust ones.
let replacement = regex_replace_posix_groups(replacement);
- // We are going to create the underlying string buffer from its parts
- // to be able to re-use the existing null buffer for sparse arrays.
- let mut vals = BufferBuilder::<u8>::new({
- let offsets = string_array.value_offsets();
- (offsets[string_array.len()] - offsets[0])
- .to_usize()
- .expect("Failed to convert usize")
- });
- let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
- new_offsets.append(T::zero());
-
- string_array.iter().for_each(|val| {
- if let Some(val) = val {
- let result = re.replacen(val, limit, replacement.as_str());
- vals.append_slice(result.as_bytes());
+ let string_array_type = args[0].data_type();
+ match string_array_type {
+ DataType::Utf8 | DataType::LargeUtf8 => {
+ let string_array = as_generic_string_array::<T>(&args[0])?;
+
+ // We are going to create the underlying string buffer from its
parts
+ // to be able to re-use the existing null buffer for sparse arrays.
+ let mut vals = BufferBuilder::<u8>::new({
+ let offsets = string_array.value_offsets();
+ (offsets[string_array.len()] - offsets[0])
+ .to_usize()
+ .unwrap()
+ });
+ let mut new_offsets = BufferBuilder::<T>::new(string_array.len() +
1);
+ new_offsets.append(T::zero());
+
+ string_array.iter().for_each(|val| {
+ if let Some(val) = val {
+ let result = re.replacen(val, limit, replacement.as_str());
+ vals.append_slice(result.as_bytes());
+ }
+ new_offsets.append(T::from_usize(vals.len()).unwrap());
+ });
+
+ let data =
ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
+ .len(string_array.len())
+ .nulls(string_array.nulls().cloned())
+ .buffers(vec![new_offsets.finish(), vals.finish()])
+ .build()?;
+ let result_array = GenericStringArray::<T>::from(data);
+ Ok(Arc::new(result_array) as ArrayRef)
}
- new_offsets.append(T::from_usize(vals.len()).unwrap());
- });
-
- let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
- .len(string_array.len())
- .nulls(string_array.nulls().cloned())
- .buffers(vec![new_offsets.finish(), vals.finish()])
- .build()?;
- let result_array = GenericStringArray::<T>::from(data);
- Ok(Arc::new(result_array) as ArrayRef)
+ DataType::Utf8View => {
+ let string_view_array = as_string_view_array(&args[0])?;
Review Comment:
is the idea that we could specialize this implementation to potentially do
inplace updates or something if the string replacements allowed (e.g. replacing
larger strings with smaller)?
##########
datafusion/functions/src/regex/regexpreplace.rs:
##########
@@ -313,25 +322,24 @@ macro_rules! fetch_string_arg {
fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
args: &[ArrayRef],
) -> Result<ArrayRef> {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let array_size = string_array.len();
+ let array_size = args[0].len();
let pattern = fetch_string_arg!(
&args[1],
"pattern",
- T,
+ i32,
Review Comment:
I don't understand why thiw was changed from `T` to `i32` 🤔
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]