Kev1n8 commented on code in PR #12044:
URL: https://github.com/apache/datafusion/pull/12044#discussion_r1727098779
##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -89,29 +94,193 @@ impl ScalarUDFImpl for SubstrFunc {
}
}
+/// Extracts the substring of string starting at the start'th character, and
extending for count characters if that is specified. (Same as substring(string
from start for count).)
+/// substr('alphabet', 3) = 'phabet'
+/// substr('alphabet', 3, 2) = 'ph'
+/// The implementation uses UTF-8 code points as characters
pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
match args[0].data_type() {
DataType::Utf8 => {
let string_array = args[0].as_string::<i32>();
- calculate_substr::<_, i32>(string_array, &args[1..])
+ string_substr::<_, i32>(string_array, &args[1..])
}
DataType::LargeUtf8 => {
let string_array = args[0].as_string::<i64>();
- calculate_substr::<_, i64>(string_array, &args[1..])
+ string_substr::<_, i64>(string_array, &args[1..])
}
DataType::Utf8View => {
let string_array = args[0].as_string_view();
- calculate_substr::<_, i32>(string_array, &args[1..])
+ string_view_substr(string_array, &args[1..])
}
- other => exec_err!("Unsupported data type {other:?} for function
substr"),
+ other => exec_err!(
+ "Unsupported data type {other:?} for function substr,\
+ expected Utf8View, Utf8 or LargeUtf8."
+ ),
}
}
-/// Extracts the substring of string starting at the start'th character, and
extending for count characters if that is specified. (Same as substring(string
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) ->
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize,
usize) {
+ let (mut st, mut ed) = (input.len(), input.len());
+ let mut start_counting = false;
+ let mut cnt = 0;
+ for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+ if char_cnt == start {
+ st = byte_cnt;
+ if count != -1 {
+ start_counting = true;
+ } else {
+ break;
+ }
+ }
+ if start_counting {
+ if cnt == count {
+ ed = byte_cnt;
+ break;
+ }
+ cnt += 1;
+ }
+ }
+ (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn string_view_substr(
+ string_view_array: &StringViewArray,
+ args: &[ArrayRef],
+) -> Result<ArrayRef> {
+ let mut builder = StringViewBuilder::new();
+ // Copy all blocks from input
+ for block in string_view_array.data_buffers() {
+ builder.append_block(block.clone());
+ }
+
+ let start_array = as_int64_array(&args[0])?;
+
+ match args.len() {
+ 1 => {
+ for (idx, (raw, start)) in string_view_array
+ .views()
+ .iter()
+ .zip(start_array.iter())
+ .enumerate()
+ {
+ if let Some(start) = start {
+ let length = *raw as u32;
+ let start = (start - 1).max(0);
+
+ // Operate according to the length of bytes
+ if length == 0 {
+ builder.append_null();
+ } else if length > 12 {
+ let view = ByteView::from(*raw);
+
+ // Safety:
+ // 1. idx < string_array.views.size()
+ // 2. builder is guaranteed to have corresponding
blocks
+ unsafe {
+ let str = string_view_array.value_unchecked(idx);
+ let (start, end) =
+ get_true_start_count(str, start as usize, -1);
+ builder.append_view_unchecked(
+ view.buffer_index,
+ view.offset + start as u32,
+ // guarantee that end-offset >= 0 for end <=
str.len()
+ (end - start) as u32,
+ );
+ }
+ } else {
+ // Safety:
+ // (1) original bytes are valid utf-8,
+ // (2) we do not slice on utf-8 codepoint
+ unsafe {
+ let bytes =
+ StringViewArray::inline_value(raw, length as
usize);
Review Comment:
Maybe we could add a function like `append_view_u128_unchecked(view: u128)`
in arrow/src/builder/generic_bytes_view_builder.rs to simply add a view with a
given `u128`. Then the whole process would be:
1. Get the str of the view by `value_unchecked`, then get the [start, end)
2. sub_view = ~~if end-start>12 substr_large_view() else
substr_small_view()~~ make_view()
3. call `appned_view_u128_unchecked(sub_view)` on the builder
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]