uzqw commented on code in PR #19590:
URL: https://github.com/apache/datafusion/pull/19590#discussion_r2656328467


##########
datafusion/functions/benches/substr_index.rs:
##########
@@ -83,38 +106,66 @@ fn data() -> (StringArray, StringArray, Int64Array) {
     )
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("substr_index_array_array_1000", |b| {
-        let (strings, delimiters, counts) = data();
-        let batch_len = counts.len();
-        let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
-        let delimiters = ColumnarValue::Array(Arc::new(delimiters) as 
ArrayRef);
-        let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
-
-        let args = vec![strings, delimiters, counts];
-        let arg_fields = args
-            .iter()
-            .enumerate()
-            .map(|(idx, arg)| {
-                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
-            })
-            .collect::<Vec<_>>();
-        let config_options = Arc::new(ConfigOptions::default());
-
-        b.iter(|| {
-            black_box(
-                substr_index()
-                    .invoke_with_args(ScalarFunctionArgs {
-                        args: args.clone(),
-                        arg_fields: arg_fields.clone(),
-                        number_rows: batch_len,
-                        return_field: Field::new("f", DataType::Utf8, 
true).into(),
-                        config_options: Arc::clone(&config_options),
-                    })
-                    .expect("substr_index should work on valid values"),
-            )
+fn run_benchmark(
+    b: &mut criterion::Bencher,
+    strings: StringArray,
+    delimiters: StringArray,
+    counts: Int64Array,
+    batch_size: usize,
+) {
+    let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
+    let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
+    let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
+
+    let args = vec![strings, delimiters, counts];
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| {
+            Field::new(format!("arg_{idx}"), arg.data_type().clone(), 
true).into()
         })
-    });
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    b.iter(|| {
+        black_box(
+            substr_index()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: args.clone(),
+                    arg_fields: arg_fields.clone(),
+                    number_rows: batch_size,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                })
+                .expect("substr_index should work on valid values"),
+        )
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("substr_index");
+
+    let batch_sizes = [100, 1000, 10_000];
+
+    for batch_size in batch_sizes {
+        group.bench_function(
+            &format!("substr_index_{}_single_delimiter", batch_size),
+            |b| {
+                let (strings, delimiters, counts) = data(batch_size, true);
+                run_benchmark(b, strings, delimiters, counts, batch_size);
+            },
+        );
+
+        group.bench_function(
+            &format!("substr_index_{}_long_delimiter", batch_size),

Review Comment:
   variables can be used directly in the `format!` string



##########
datafusion/functions/src/unicode/substrindex.rs:
##########
@@ -198,31 +198,44 @@ where
                 }
 
                 let occurrences = 
usize::try_from(n.unsigned_abs()).unwrap_or(usize::MAX);
-                let length = if n > 0 {
-                    let split = string.split(delimiter);
-                    split
-                        .take(occurrences)
-                        .map(|s| s.len() + delimiter.len())
-                        .sum::<usize>()
-                        - delimiter.len()
-                } else {
-                    let split = string.rsplit(delimiter);
-                    split
-                        .take(occurrences)
-                        .map(|s| s.len() + delimiter.len())
-                        .sum::<usize>()
-                        - delimiter.len()
-                };
-                if n > 0 {
-                    match string.get(..length) {
-                        Some(substring) => builder.append_value(substring),
-                        None => builder.append_null(),
+                let result_idx = if delimiter.len() == 1 {
+                    let d_byte = delimiter.as_bytes()[0];
+                    let bytes = string.as_bytes();
+
+                    if n > 0 {
+                        bytes.iter()
+                            .enumerate()
+                            .filter(|&(_, &b)| b == d_byte)
+                            .nth(occurrences - 1)
+                            .map(|(idx, _)| idx)
+                    } else {
+                        bytes.iter()
+                            .enumerate()
+                            .rev()
+                            .filter(|&(_, &b)| b == d_byte)
+                            .nth(occurrences - 1)
+                            .map(|(idx, _)| idx + 1)
                     }
                 } else {
-                    match string.get(string.len().saturating_sub(length)..) {
-                        Some(substring) => builder.append_value(substring),
-                        None => builder.append_null(),
+                    if n > 0 {

Review Comment:
   this `else { if .. }` block can be collapsed



##########
datafusion/functions/benches/substr_index.rs:
##########
@@ -83,38 +106,66 @@ fn data() -> (StringArray, StringArray, Int64Array) {
     )
 }
 
-fn criterion_benchmark(c: &mut Criterion) {
-    c.bench_function("substr_index_array_array_1000", |b| {
-        let (strings, delimiters, counts) = data();
-        let batch_len = counts.len();
-        let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
-        let delimiters = ColumnarValue::Array(Arc::new(delimiters) as 
ArrayRef);
-        let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
-
-        let args = vec![strings, delimiters, counts];
-        let arg_fields = args
-            .iter()
-            .enumerate()
-            .map(|(idx, arg)| {
-                Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
-            })
-            .collect::<Vec<_>>();
-        let config_options = Arc::new(ConfigOptions::default());
-
-        b.iter(|| {
-            black_box(
-                substr_index()
-                    .invoke_with_args(ScalarFunctionArgs {
-                        args: args.clone(),
-                        arg_fields: arg_fields.clone(),
-                        number_rows: batch_len,
-                        return_field: Field::new("f", DataType::Utf8, 
true).into(),
-                        config_options: Arc::clone(&config_options),
-                    })
-                    .expect("substr_index should work on valid values"),
-            )
+fn run_benchmark(
+    b: &mut criterion::Bencher,
+    strings: StringArray,
+    delimiters: StringArray,
+    counts: Int64Array,
+    batch_size: usize,
+) {
+    let strings = ColumnarValue::Array(Arc::new(strings) as ArrayRef);
+    let delimiters = ColumnarValue::Array(Arc::new(delimiters) as ArrayRef);
+    let counts = ColumnarValue::Array(Arc::new(counts) as ArrayRef);
+
+    let args = vec![strings, delimiters, counts];
+    let arg_fields = args
+        .iter()
+        .enumerate()
+        .map(|(idx, arg)| {
+            Field::new(format!("arg_{idx}"), arg.data_type().clone(), 
true).into()
         })
-    });
+        .collect::<Vec<_>>();
+    let config_options = Arc::new(ConfigOptions::default());
+
+    b.iter(|| {
+        black_box(
+            substr_index()
+                .invoke_with_args(ScalarFunctionArgs {
+                    args: args.clone(),
+                    arg_fields: arg_fields.clone(),
+                    number_rows: batch_size,
+                    return_field: Field::new("f", DataType::Utf8, true).into(),
+                    config_options: Arc::clone(&config_options),
+                })
+                .expect("substr_index should work on valid values"),
+        )
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    let mut group = c.benchmark_group("substr_index");
+
+    let batch_sizes = [100, 1000, 10_000];
+
+    for batch_size in batch_sizes {
+        group.bench_function(
+            &format!("substr_index_{}_single_delimiter", batch_size),

Review Comment:
   variables can be used directly in the `format!` string



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to