XiangpengHao commented on code in PR #12044:
URL: https://github.com/apache/datafusion/pull/12044#discussion_r1720861699
##########
datafusion/functions/src/unicode/substr.rs:
##########
@@ -107,11 +112,170 @@ pub fn substr(args: &[ArrayRef]) -> Result<ArrayRef> {
}
}
-/// Extracts the substring of string starting at the start'th character, and
extending for count characters if that is specified. (Same as substring(string
from start for count).)
-/// substr('alphabet', 3) = 'phabet'
-/// substr('alphabet', 3, 2) = 'ph'
-/// The implementation uses UTF-8 code points as characters
-fn calculate_substr<'a, V, T>(string_array: V, args: &[ArrayRef]) ->
Result<ArrayRef>
+// Return the exact byte index for [start, end), set count to -1 to ignore
count
+fn get_true_start_count(input: &str, start: usize, count: i64) -> (usize,
usize) {
+ let (mut st, mut ed) = (input.len(), input.len());
+ let mut start_counting = false;
+ let mut cnt = 0;
+ for (char_cnt, (byte_cnt, _)) in input.char_indices().enumerate() {
+ if char_cnt == start {
+ st = byte_cnt;
+ if count != -1 {
+ start_counting = true;
+ } else {
+ break;
+ }
+ }
+ if start_counting {
+ if cnt == count {
+ ed = byte_cnt;
+ break;
+ }
+ cnt += 1;
+ }
+ }
+ (st, ed)
+}
+
+// The decoding process refs the trait at: arrow/arrow-data/src/byte_view.rs:44
+// From<u128> for ByteView
+fn calculate_string_view(
+ string_array: &StringViewArray,
+ args: &[ArrayRef],
+) -> Result<ArrayRef> {
+ let mut builder = StringViewBuilder::new();
+ // Copy all blocks from input
+ for block in string_array.data_buffers() {
+ builder.append_block(block.clone());
+ }
+
+ let start_array = as_int64_array(&args[0])?;
+
+ match args.len() {
+ 1 => {
+ for (idx, (raw, start)) in string_array
+ .views()
+ .iter()
+ .zip(start_array.iter())
+ .enumerate()
+ {
+ if let Some(start) = start {
+ let length = *raw as u32;
+ let start = (start - 1).max(0);
+
+ // Operate according to the length of bytes
+ if length == 0 {
+ builder.append_null();
+ } else if length > 12 {
+ let buffer_index = (*raw >> 64) as u32;
Review Comment:
We should use `ByteView` from arrow-rs:
https://github.com/apache/arrow-rs/blob/27789d7c9abb50796a4042e7e193703efe3c95b3/arrow-data/src/byte_view.rs#L44-L54
However, `ByteView` lives in `arrow-data`, which DataFusion does not
explicitly depend on. What's your opinion, @alamb?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]