Jefffrey commented on code in PR #19976:
URL: https://github.com/apache/datafusion/pull/19976#discussion_r2724751090


##########
datafusion/functions/src/string/repeat.rs:
##########
@@ -181,37 +237,52 @@ where
     // Reusable buffer to avoid allocations in string.repeat()
     let mut buffer = Vec::<u8>::with_capacity(max_item_capacity);
 
-    string_array
-        .iter()
-        .zip(number_array.iter())
-        .for_each(|(string, number)| {
+    // Helper function to repeat a string into a buffer using doubling strategy
+    #[inline]
+    fn repeat_to_buffer(buffer: &mut Vec<u8>, string: &str, count: i64) {
+        buffer.clear();
+        if count > 0 && !string.is_empty() {
+            let count = count as usize;
+            let src = string.as_bytes();
+            buffer.extend_from_slice(src);
+            while buffer.len() < src.len() * count {
+                let copy_len = buffer.len().min(src.len() * count - 
buffer.len());
+                buffer.extend_from_within(..copy_len);
+            }
+        }
+    }
+
+    // no nulls in either array
+    if string_array.null_count() == 0 && number_array.null_count() == 0 {
+        for i in 0..string_array.len() {
+            // SAFETY: null_count() == 0 guarantees no nulls
+            let string = unsafe { string_array.value_unchecked(i) };
+            let count = number_array.value(i);
+            if count >= 0 {
+                repeat_to_buffer(&mut buffer, string, count);

Review Comment:
   We check count twice here; once for `count >= 0` and then again inside 
`repeat_to_buffer` for `count > 0`
   
   Might want to consider folding these checks if we're looking at optimizing



##########
datafusion/functions/src/string/repeat.rs:
##########
@@ -99,7 +100,61 @@ impl ScalarUDFImpl for RepeatFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
-        make_scalar_function(repeat, vec![])(&args.args)
+        let [string_arg, count_arg] = take_function_args(self.name(), 
args.args)?;
+
+        match (&string_arg, &count_arg) {
+            (
+                ColumnarValue::Scalar(string_scalar),
+                ColumnarValue::Scalar(count_scalar),
+            ) => {
+                if string_scalar.is_null() || count_scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));

Review Comment:
   Return type depends on input string type, it isn't blanket `Utf8`



##########
datafusion/functions/src/string/repeat.rs:
##########
@@ -99,7 +100,61 @@ impl ScalarUDFImpl for RepeatFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
-        make_scalar_function(repeat, vec![])(&args.args)
+        let [string_arg, count_arg] = take_function_args(self.name(), 
args.args)?;
+
+        match (&string_arg, &count_arg) {
+            (
+                ColumnarValue::Scalar(string_scalar),
+                ColumnarValue::Scalar(count_scalar),
+            ) => {
+                if string_scalar.is_null() || count_scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));

Review Comment:
   We can also return null early if either argument is a scalar null, e.g. if 
we have array + scalar null input (currently this code only does early return 
check if both are scalar and if either is null)



##########
datafusion/functions/src/string/repeat.rs:
##########
@@ -181,37 +237,52 @@ where
     // Reusable buffer to avoid allocations in string.repeat()
     let mut buffer = Vec::<u8>::with_capacity(max_item_capacity);
 
-    string_array
-        .iter()
-        .zip(number_array.iter())
-        .for_each(|(string, number)| {
+    // Helper function to repeat a string into a buffer using doubling strategy
+    #[inline]
+    fn repeat_to_buffer(buffer: &mut Vec<u8>, string: &str, count: i64) {
+        buffer.clear();
+        if count > 0 && !string.is_empty() {
+            let count = count as usize;
+            let src = string.as_bytes();
+            buffer.extend_from_slice(src);
+            while buffer.len() < src.len() * count {
+                let copy_len = buffer.len().min(src.len() * count - 
buffer.len());
+                buffer.extend_from_within(..copy_len);
+            }
+        }
+    }
+
+    // no nulls in either array
+    if string_array.null_count() == 0 && number_array.null_count() == 0 {
+        for i in 0..string_array.len() {
+            // SAFETY: null_count() == 0 guarantees no nulls

Review Comment:
   Safety is actually related to bounds checking here, not about nulls



##########
datafusion/functions/src/string/repeat.rs:
##########
@@ -99,7 +100,61 @@ impl ScalarUDFImpl for RepeatFunc {
     }
 
     fn invoke_with_args(&self, args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
-        make_scalar_function(repeat, vec![])(&args.args)
+        let [string_arg, count_arg] = take_function_args(self.name(), 
args.args)?;
+
+        match (&string_arg, &count_arg) {
+            (
+                ColumnarValue::Scalar(string_scalar),
+                ColumnarValue::Scalar(count_scalar),
+            ) => {
+                if string_scalar.is_null() || count_scalar.is_null() {
+                    return Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)));
+                }
+
+                let count = match count_scalar {
+                    ScalarValue::Int64(Some(n)) => *n,
+                    _ => {
+                        return internal_err!(
+                            "Unexpected data type {:?} for repeat count",
+                            count_scalar.data_type()
+                        );
+                    }
+                };
+
+                let repeated = match string_scalar {
+                    ScalarValue::Utf8(Some(s))
+                    | ScalarValue::LargeUtf8(Some(s))
+                    | ScalarValue::Utf8View(Some(s)) => {
+                        if count <= 0 {
+                            String::new()
+                        } else {
+                            let result_len = s.len().saturating_mul(count as 
usize);
+                            if result_len > i32::MAX as usize {
+                                return exec_err!(
+                                    "string size overflow on repeat, max size 
is {}, but got {}",
+                                    i32::MAX,
+                                    result_len
+                                );
+                            }
+                            s.repeat(count as usize)
+                        }
+                    }
+                    _ => {
+                        return internal_err!(
+                            "Unexpected data type {:?} for function repeat",
+                            string_scalar.data_type()
+                        );
+                    }
+                };
+
+                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(repeated))))
+            }
+            _ => {
+                let string_array = string_arg.to_array(args.number_rows)?;
+                let count_array = count_arg.to_array(args.number_rows)?;
+                Ok(ColumnarValue::Array(repeat(&[string_array, count_array])?))

Review Comment:
   We probably should change `repeat` to accept two distinct arguments instead 
of passing as a slice



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to