Jefffrey commented on code in PR #19980:
URL: https://github.com/apache/datafusion/pull/19980#discussion_r2726289808


##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +121,174 @@ impl ScalarUDFImpl for LeftFunc {
 
 /// Returns first n characters in the string, or when n is negative, returns 
all but last |n| characters.
 /// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
 /// The implementation uses UTF-8 code points as characters
-fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
+fn left(args: &[ArrayRef]) -> Result<ArrayRef> {
     let n_array = as_int64_array(&args[1])?;
 
-    if args[0].data_type() == &DataType::Utf8View {
-        let string_array = as_string_view_array(&args[0])?;
-        left_impl::<T, _>(string_array, n_array)
-    } else {
-        let string_array = as_generic_string_array::<T>(&args[0])?;
-        left_impl::<T, _>(string_array, n_array)
+    match args[0].data_type() {
+        DataType::Utf8 => {
+            let string_array = as_generic_string_array::<i32>(&args[0])?;
+            left_impl::<i32, _>(string_array, n_array)
+        }
+        DataType::LargeUtf8 => {
+            let string_array = as_generic_string_array::<i64>(&args[0])?;
+            left_impl::<i64, _>(string_array, n_array)
+        }
+        DataType::Utf8View => {
+            let string_view_array = as_string_view_array(&args[0])?;
+            left_impl_view(string_view_array, n_array)
+        }
+        _ => exec_err!("Not supported"),
     }
 }
 
+/// `left` implementation for strings
 fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
     string_array: V,
     n_array: &Int64Array,
 ) -> Result<ArrayRef> {
     let iter = ArrayIter::new(string_array);
-    let mut chars_buf = Vec::new();
     let result = iter
         .zip(n_array.iter())
         .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => match n.cmp(&0) {
-                Ordering::Less => {
-                    // Collect chars once and reuse for both count and take
-                    chars_buf.clear();
-                    chars_buf.extend(string.chars());
-                    let len = chars_buf.len() as i64;
-
-                    // For negative n, take (len + n) chars if n > -len 
(avoiding abs() which panics on i64::MIN)
-                    Some(if n > -len {
-                        chars_buf
-                            .iter()
-                            .take((len + n) as usize)
-                            .collect::<String>()
-                    } else {
-                        "".to_string()
-                    })
-                }
-                Ordering::Equal => Some("".to_string()),
-                Ordering::Greater => {
-                    Some(string.chars().take(n as usize).collect::<String>())
-                }
-            },
+            (Some(string), Some(n)) => {
+                let byte_length = left_byte_length(string, n);
+                // Extract first `byte_length` bytes from a byte-indexed slice
+                Some(&string[0..byte_length])
+            }
             _ => None,
         })
         .collect::<GenericStringArray<T>>();
 
     Ok(Arc::new(result) as ArrayRef)
 }
 
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+    string_view_array: &StringViewArray,
+    n_array: &Int64Array,
+) -> Result<ArrayRef> {
+    let len = n_array.len();
+
+    let views = string_view_array.views();
+    // Every string in StringViewArray has one corresponding view in `views`
+    debug_assert!(views.len() == string_view_array.len());
+
+    // Compose null buffer at once
+    let string_nulls = string_view_array.nulls();
+    let n_nulls = n_array.nulls();
+    let new_nulls = NullBuffer::union(string_nulls, n_nulls);
+
+    let new_views = (0..len)
+        .map(|idx| {
+            let view = views[idx];
+
+            let is_valid = match &new_nulls {
+                Some(nulls_buf) => nulls_buf.is_valid(idx),
+                None => true,
+            };
+
+            if is_valid {
+                let string: &str = string_view_array.value(idx);
+                let n = n_array.value(idx);
+
+                // Input string comes from StringViewArray, so it should fit 
in 32-bit length
+                let new_length: u32 = left_byte_length(string, n) as u32;
+                let byte_view = ByteView::from(view);
+                // Construct a new view
+                let new_view =
+                    shrink_string_view_array_view(string, new_length, 
byte_view)?;
+                Ok(new_view)
+            } else {
+                // For nulls, keep the original view
+                Ok(view)
+            }
+        })
+        .collect::<Result<Vec<u128>>>()?;
+
+    // Buffers are unchanged
+    let result = StringViewArray::try_new(
+        ScalarBuffer::from(new_views),
+        Vec::from(string_view_array.data_buffers()),
+        new_nulls,
+    )?;
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string 
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+    match n.cmp(&0) {
+        Ordering::Less => {
+            let mut indices = string.char_indices();
+            // Find the first byte of a character past last `-n` characters
+            let mut end_idx: usize = usize::MAX;
+            for _ in n..0 {
+                if let Some((i, _)) = indices.next_back() {
+                    end_idx = i;
+                } else {
+                    end_idx = 0;
+                    break;
+                }
+            }
+            end_idx
+        }
+        Ordering::Equal => 0,
+        Ordering::Greater => {
+            if let Some((end_idx, end_char)) =
+                string.char_indices().take(n as usize).last()
+            {
+                // Include length of the last character
+                end_idx + end_char.len_utf8()
+            } else {
+                // String is empty
+                0
+            }
+        }
+    }
+}
+
+/// Construct a new StringViewArray view from existing view `byte_view` and 
new length `len`.
+/// Prefix is taken from the original string `string`.
+/// Handles both inline and non-inline views, referencing the same buffers.
+fn shrink_string_view_array_view(
+    string: &str,
+    len: u32,
+    byte_view: ByteView,
+) -> Result<u128> {
+    if len > byte_view.length {

Review Comment:
   We could change this to a debug_assert as well since len comes from 
`left_byte_length` which should guard this for us? Might help in simplifying 
the return signature and potentially improve things 🤔 



##########
datafusion/functions/src/unicode/left.rs:
##########
@@ -121,61 +125,175 @@ impl ScalarUDFImpl for LeftFunc {
 
 /// Returns first n characters in the string, or when n is negative, returns 
all but last |n| characters.
 /// left('abcde', 2) = 'ab'
+/// left('abcde', -2) = 'ab'
 /// The implementation uses UTF-8 code points as characters
 fn left<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
     let n_array = as_int64_array(&args[1])?;
 
     if args[0].data_type() == &DataType::Utf8View {
-        let string_array = as_string_view_array(&args[0])?;
-        left_impl::<T, _>(string_array, n_array)
+        let string_view_array = as_string_view_array(&args[0])?;
+        left_impl_view(string_view_array, n_array)
     } else {
         let string_array = as_generic_string_array::<T>(&args[0])?;
         left_impl::<T, _>(string_array, n_array)
     }
 }
 
+/// `left` implementation for strings
 fn left_impl<'a, T: OffsetSizeTrait, V: ArrayAccessor<Item = &'a str>>(
     string_array: V,
     n_array: &Int64Array,
 ) -> Result<ArrayRef> {
     let iter = ArrayIter::new(string_array);
-    let mut chars_buf = Vec::new();
     let result = iter
         .zip(n_array.iter())
         .map(|(string, n)| match (string, n) {
-            (Some(string), Some(n)) => match n.cmp(&0) {
-                Ordering::Less => {
-                    // Collect chars once and reuse for both count and take
-                    chars_buf.clear();
-                    chars_buf.extend(string.chars());
-                    let len = chars_buf.len() as i64;
-
-                    // For negative n, take (len + n) chars if n > -len 
(avoiding abs() which panics on i64::MIN)
-                    Some(if n > -len {
-                        chars_buf
-                            .iter()
-                            .take((len + n) as usize)
-                            .collect::<String>()
-                    } else {
-                        "".to_string()
-                    })
-                }
-                Ordering::Equal => Some("".to_string()),
-                Ordering::Greater => {
-                    Some(string.chars().take(n as usize).collect::<String>())
-                }
-            },
+            (Some(string), Some(n)) => {
+                let byte_length = left_byte_length(string, n);
+                // Extract first `byte_length` bytes from a byte-indexed slice
+                Some(&string[0..byte_length])
+            }
             _ => None,
         })
         .collect::<GenericStringArray<T>>();
 
     Ok(Arc::new(result) as ArrayRef)
 }
 
+/// `left` implementation for StringViewArray
+fn left_impl_view(
+    string_view_array: &StringViewArray,
+    n_array: &Int64Array,
+) -> Result<ArrayRef> {
+    let len = n_array.len();
+
+    let (views, buffers, _nulls) = string_view_array.clone().into_parts();
+
+    if string_view_array.len() != n_array.len() {
+        return exec_err!(
+            "Expected same shape of arrays, given {} and {}",
+            string_view_array.len(),
+            n_array.len()
+        );
+    }
+
+    // Every string in StringViewArray has one corresponding view in `views`
+    if views.len() != string_view_array.len() {
+        return exec_err!(
+            "StringViewArray views length {} does not match array length {}",
+            views.len(),
+            string_view_array.len()
+        );
+    }
+    let mut null_buffer_builder = NullBufferBuilder::new(len);
+    let mut new_views = Vec::with_capacity(len);
+
+    for idx in 0..len {
+        let view = views[idx];
+
+        if string_view_array.is_valid(idx) && n_array.is_valid(idx) {
+            let string: &str = string_view_array.value(idx);
+            let n = n_array.value(idx);
+
+            let byte_length = left_byte_length(string, n);
+            let new_length: u32 = byte_length.try_into().map_err(|_| {
+                exec_datafusion_err!("String is larger than 32-bit limit at 
index {idx}")
+            })?;
+            let byte_view = ByteView::from(view);
+            // Construct a new view
+            let new_view = shrink_string_view_array_view(string, new_length, 
byte_view)?;
+            new_views.push(new_view);
+            null_buffer_builder.append_non_null();
+        } else {
+            new_views.push(view);
+            // Emit null
+            null_buffer_builder.append_null();
+        }
+    }
+    // Buffers are unchanged
+    // Nulls are rebuilt from scratch
+    let nulls = null_buffer_builder.finish();
+    let result = StringViewArray::try_new(ScalarBuffer::from(new_views), 
buffers, nulls)?;
+    Ok(Arc::new(result) as ArrayRef)
+}
+
+/// Calculate the byte length of the substring of `n` chars from string 
`string`
+fn left_byte_length(string: &str, n: i64) -> usize {
+    match n.cmp(&0) {
+        Ordering::Less => {
+            let mut indices = string.char_indices();
+            // Find the first byte of a character past last `-n` characters
+            let mut end_idx: usize = usize::MAX;
+            for _ in n..0 {
+                if let Some((i, _)) = indices.next_back() {
+                    end_idx = i;
+                } else {
+                    end_idx = 0;
+                    break;
+                }
+            }
+            end_idx
+        }
+        Ordering::Equal => 0,
+        Ordering::Greater => {
+            if let Some((end_idx, end_char)) =
+                string.char_indices().take(n as usize).last()

Review Comment:
   What about something like this?
   
   ```rust
       match n.cmp(&0) {
           Ordering::Less => string
               .char_indices()
               .nth_back(n.unsigned_abs() as usize - 1)
               .map(|(index, _)| index)
               .unwrap_or(0),
           Ordering::Equal => 0,
           Ordering::Greater => string
               .char_indices()
               .nth(n as usize)
               .map(|(index, _)| index)
               .unwrap_or(string.len()),
       }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to