Omega359 commented on code in PR #11941:
URL: https://github.com/apache/datafusion/pull/11941#discussion_r1714624067
##########
datafusion/functions/src/unicode/lpad.rs:
##########
@@ -76,300 +87,450 @@ impl ScalarUDFImpl for LPadFunc {
}
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
- match args[0].data_type() {
- DataType::Utf8 => make_scalar_function(lpad::<i32>, vec![])(args),
- DataType::LargeUtf8 => make_scalar_function(lpad::<i64>,
vec![])(args),
- other => exec_err!("Unsupported data type {other:?} for function
lpad"),
- }
+ make_scalar_function(lpad, vec![])(args)
}
}
-/// Extends the string to length 'length' by prepending the characters fill (a
space by default). If the string is already longer than length then it is
truncated (on the right).
+/// Extends the string to length 'length' by prepending the characters fill (a
space by default).
+/// If the string is already longer than length then it is truncated (on the
right).
/// lpad('hi', 5, 'xy') = 'xyxhi'
-pub fn lpad<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
- match args.len() {
- 2 => {
- let string_array = as_generic_string_array::<T>(&args[0])?;
- let length_array = as_int64_array(&args[1])?;
-
- let result = string_array
- .iter()
- .zip(length_array.iter())
- .map(|(string, length)| match (string, length) {
- (Some(string), Some(length)) => {
- if length > i32::MAX as i64 {
- return exec_err!(
- "lpad requested length {length} too large"
- );
- }
+pub fn lpad(args: &[ArrayRef]) -> Result<ArrayRef> {
+ if args.len() <= 1 || args.len() > 3 {
+ return exec_err!(
+ "lpad was called with {} arguments. It requires at least 2 and at
most 3.",
+ args.len()
+ );
+ }
+
+ let length_array = as_int64_array(&args[1])?;
+
+ match args[0].data_type() {
+ Utf8 => match args.len() {
+ 2 => lpad_impl::<&GenericStringArray<i32>,
&GenericStringArray<i32>, i32>(
+ args[0].as_string::<i32>(),
+ length_array,
+ None,
+ ),
+ 3 => lpad_with_replace::<&GenericStringArray<i32>, i32>(
+ args[0].as_string::<i32>(),
+ length_array,
+ &args[2],
+ ),
+ _ => unreachable!(),
+ },
+ LargeUtf8 => match args.len() {
+ 2 => lpad_impl::<&GenericStringArray<i64>,
&GenericStringArray<i64>, i64>(
+ args[0].as_string::<i64>(),
+ length_array,
+ None,
+ ),
+ 3 => lpad_with_replace::<&GenericStringArray<i64>, i64>(
+ args[0].as_string::<i64>(),
+ length_array,
+ &args[2],
+ ),
+ _ => unreachable!(),
+ },
+ Utf8View => match args.len() {
+ 2 => lpad_impl::<&StringViewArray, &GenericStringArray<i32>, i32>(
+ args[0].as_string_view(),
+ length_array,
+ None,
+ ),
+ 3 => lpad_with_replace::<&StringViewArray, i32>(
+ args[0].as_string_view(),
+ length_array,
+ &args[2],
+ ),
+ _ => unreachable!(),
+ },
+ other => {
+ exec_err!("Unsupported data type {other:?} for function lpad")
+ }
+ }
+}
- let length = if length < 0 { 0 } else { length as
usize };
- if length == 0 {
- Ok(Some("".to_string()))
+fn lpad_with_replace<'a, V, T: OffsetSizeTrait>(
+ string_array: V,
+ length_array: &Int64Array,
+ fill_array: &'a ArrayRef,
+) -> Result<ArrayRef>
+where
+ V: StringArrayType<'a>,
+{
+ match fill_array.data_type() {
+ Utf8 => lpad_impl::<V, &GenericStringArray<i32>, T>(
+ string_array,
+ length_array,
+ Some(fill_array.as_string::<i32>()),
+ ),
+ LargeUtf8 => lpad_impl::<V, &GenericStringArray<i64>, T>(
+ string_array,
+ length_array,
+ Some(fill_array.as_string::<i64>()),
+ ),
+ Utf8View => lpad_impl::<V, &StringViewArray, T>(
+ string_array,
+ length_array,
+ Some(fill_array.as_string_view()),
+ ),
+ other => {
+ exec_err!("Unsupported data type {other:?} for function lpad")
+ }
+ }
+}
+
+fn lpad_impl<'a, V, V2, T>(
+ string_array: V,
+ length_array: &Int64Array,
+ fill_array: Option<V2>,
+) -> Result<ArrayRef>
+where
+ V: StringArrayType<'a>,
+ V2: StringArrayType<'a>,
+ T: OffsetSizeTrait,
+{
+ if fill_array.is_none() {
+ let result = string_array
+ .iter()
+ .zip(length_array.iter())
+ .map(|(string, length)| match (string, length) {
+ (Some(string), Some(length)) => {
+ if length > i32::MAX as i64 {
+ return exec_err!("lpad requested length {length} too
large");
+ }
+
+ let length = if length < 0 { 0 } else { length as usize };
+ if length == 0 {
+ Ok(Some("".to_string()))
+ } else {
+ let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
+ if length < graphemes.len() {
+ Ok(Some(graphemes[..length].concat()))
} else {
- let graphemes =
string.graphemes(true).collect::<Vec<&str>>();
- if length < graphemes.len() {
- Ok(Some(graphemes[..length].concat()))
- } else {
- let mut s: String = " ".repeat(length -
graphemes.len());
- s.push_str(string);
- Ok(Some(s))
- }
+ let mut s: String = " ".repeat(length -
graphemes.len());
Review Comment:
I did a test locally with using `GenericStringBuilder` vs what is in this PR
and the differences are within +/- 10% for 1024 and 2048 element arrays for
each of the 3 string types. Worth noting though is that timing between runs on
my laptop is not really stable at all (+/- 10% or so between runs of the exact
same code).
Interestingly the utf8view seems to be consistently slightly slower than
just plain utf8 at least in this function.
```
lpad function/utf8 type/2048
time: [715.78 µs 718.43 µs 721.89 µs]
lpad function/largeutf8 type/2048
time: [745.44 µs 759.62 µs 779.84 µs]
lpad function/stringview type/2048
time: [734.25 µs 751.13 µs 773.67 µs]
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]