alamb commented on code in PR #12513:
URL: https://github.com/apache/datafusion/pull/12513#discussion_r1764871405
##########
datafusion/functions/benches/ltrim.rs:
##########
@@ -17,32 +17,216 @@
extern crate criterion;
-use arrow::array::{ArrayRef, StringArray};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use criterion::{
+ black_box, criterion_group, criterion_main, measurement::Measurement,
BenchmarkGroup,
+ Criterion, SamplingMode,
+};
use datafusion_common::ScalarValue;
-use datafusion_expr::ColumnarValue;
+use datafusion_expr::{ColumnarValue, ScalarUDF};
use datafusion_functions::string;
-use std::sync::Arc;
+use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
+use std::{fmt, sync::Arc};
-fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
- let iter =
- std::iter::repeat(format!("{}datafusion{}", characters,
characters)).take(size);
- let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+pub fn seedable_rng() -> StdRng {
+ StdRng::seed_from_u64(42)
+}
+
+#[derive(Clone, Copy)]
+pub enum StringArrayType {
+ Utf8View,
+ Utf8,
+ LargeUtf8,
+}
+
+impl fmt::Display for StringArrayType {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ StringArrayType::Utf8View => f.write_str("string_view"),
+ StringArrayType::Utf8 => f.write_str("string"),
+ StringArrayType::LargeUtf8 => f.write_str("large_string"),
+ }
+ }
+}
+
+pub fn create_string_array_and_characters(
Review Comment:
```suggestion
/// returns an array of strings, and `characters` as a ScalarValue
pub fn create_string_array_and_characters(
```
##########
datafusion/functions/benches/ltrim.rs:
##########
@@ -17,32 +17,216 @@
extern crate criterion;
-use arrow::array::{ArrayRef, StringArray};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use criterion::{
+ black_box, criterion_group, criterion_main, measurement::Measurement,
BenchmarkGroup,
+ Criterion, SamplingMode,
+};
use datafusion_common::ScalarValue;
-use datafusion_expr::ColumnarValue;
+use datafusion_expr::{ColumnarValue, ScalarUDF};
use datafusion_functions::string;
-use std::sync::Arc;
+use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
+use std::{fmt, sync::Arc};
-fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
- let iter =
- std::iter::repeat(format!("{}datafusion{}", characters,
characters)).take(size);
- let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+pub fn seedable_rng() -> StdRng {
+ StdRng::seed_from_u64(42)
+}
+
+#[derive(Clone, Copy)]
+pub enum StringArrayType {
+ Utf8View,
+ Utf8,
+ LargeUtf8,
+}
+
+impl fmt::Display for StringArrayType {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ StringArrayType::Utf8View => f.write_str("string_view"),
+ StringArrayType::Utf8 => f.write_str("string"),
+ StringArrayType::LargeUtf8 => f.write_str("large_string"),
+ }
+ }
+}
+
+pub fn create_string_array_and_characters(
+ size: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+ string_array_type: StringArrayType,
+) -> (ArrayRef, ScalarValue) {
+ let rng = &mut seedable_rng();
+
+ let lens = vec![remaining_len; size];
+ let string_iter = lens.into_iter().map(|len| {
+ if rng.gen::<f32>() < 0.1 {
+ None
+ } else {
+ let mut value = trimmed.as_bytes().to_vec();
+ let generated = rng.sample_iter(&Alphanumeric).take(len);
+ value.extend(generated);
+ Some(String::from_utf8(value).unwrap())
+ }
+ });
+
+ match string_array_type {
+ StringArrayType::Utf8View => (
+ Arc::new(string_iter.collect::<StringViewArray>()),
+ ScalarValue::Utf8View(Some(characters.to_string())),
+ ),
+ StringArrayType::Utf8 => (
+ Arc::new(string_iter.collect::<StringArray>()),
+ ScalarValue::Utf8(Some(characters.to_string())),
+ ),
+ StringArrayType::LargeUtf8 => (
+ Arc::new(string_iter.collect::<LargeStringArray>()),
+ ScalarValue::LargeUtf8(Some(characters.to_string())),
+ ),
+ }
+}
+
+/// Create args for the ltrim benchmark
Review Comment:
❤️
##########
datafusion/functions/benches/ltrim.rs:
##########
@@ -17,32 +17,216 @@
extern crate criterion;
-use arrow::array::{ArrayRef, StringArray};
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use arrow::array::{ArrayRef, LargeStringArray, StringArray, StringViewArray};
+use criterion::{
+ black_box, criterion_group, criterion_main, measurement::Measurement,
BenchmarkGroup,
+ Criterion, SamplingMode,
+};
use datafusion_common::ScalarValue;
-use datafusion_expr::ColumnarValue;
+use datafusion_expr::{ColumnarValue, ScalarUDF};
use datafusion_functions::string;
-use std::sync::Arc;
+use rand::{distributions::Alphanumeric, rngs::StdRng, Rng, SeedableRng};
+use std::{fmt, sync::Arc};
-fn create_args(size: usize, characters: &str) -> Vec<ColumnarValue> {
- let iter =
- std::iter::repeat(format!("{}datafusion{}", characters,
characters)).take(size);
- let array = Arc::new(StringArray::from_iter_values(iter)) as ArrayRef;
+pub fn seedable_rng() -> StdRng {
+ StdRng::seed_from_u64(42)
+}
+
+#[derive(Clone, Copy)]
+pub enum StringArrayType {
+ Utf8View,
+ Utf8,
+ LargeUtf8,
+}
+
+impl fmt::Display for StringArrayType {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ StringArrayType::Utf8View => f.write_str("string_view"),
+ StringArrayType::Utf8 => f.write_str("string"),
+ StringArrayType::LargeUtf8 => f.write_str("large_string"),
+ }
+ }
+}
+
+pub fn create_string_array_and_characters(
+ size: usize,
+ characters: &str,
+ trimmed: &str,
+ remaining_len: usize,
+ string_array_type: StringArrayType,
+) -> (ArrayRef, ScalarValue) {
+ let rng = &mut seedable_rng();
+
+ let lens = vec![remaining_len; size];
+ let string_iter = lens.into_iter().map(|len| {
Review Comment:
I don't understand the use of `vec!` here - isn't this the same as creating
`size` strings of length up to `remaining_len`?
Maybe this could be something like
```rust
let string_iter = (0..size).map(|len|{
```
?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]