This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 340ef606cf perf: Optimize `to_char` to allocate less, fix NULL
handling (#20635)
340ef606cf is described below
commit 340ef606cf19f63a6b8e01160a43a394bbfd46f4
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 4 17:59:27 2026 -0500
perf: Optimize `to_char` to allocate less, fix NULL handling (#20635)
## Which issue does this PR close?
- Closes #20634.
## Rationale for this change
The current `to_char` implementation (both scalar and array paths)
allocates a new string for every input row to hold the result of the
`ArrayFormatter`. To produce the results, it uses `StringArray::from`,
which copies.
We can do better by using a reusable buffer to store the result of
`ArrayFormatter`, and then append to `StringBuilder`. We can then
construct the final result values from the `StringBuilder` very
efficiently. This yields a 10-22% improvement on the `to_char`
microbenchmarks.
In the scalar path, we can do a bit better by having the
`ArrayFormatter` write directly into the `StringBuilder`'s buffer, which
saves a copy.
In the array path, we can't easily do this, because `to_char` has some
(dubious) logic to retry errors on `Date32` inputs by casting to
`Date64` and retrying the ArrayFormatter. Since the `StringBuilder` is
shared between all rows and there's no easy way to remove the partial
write that might happen in the case of an error, we use the intermediate
buffer here instead.
This PR also cleans up various code in `to_char`, and fixes a bug in
`NULL` handling: in the array case, if the current data value is NULL
but the format string is non-NULL, we incorrectly returned an empty
string instead of NULL.
## What changes are included in this PR?
* Optimize `to_char` (scalar and array paths) as described above
* Fix bug in NULL handling
* Add SLT test case for NULL handling bug
* Simplify and refactor various parts of `to_char`, particularly around
error handling
* Revise `to_char` benchmarks: the previous version tested `to_char` for
`Date32` inputs where the format string specifies a time value, which is
an odd corner case. Also get rid of benchmarking the scalar-scalar case;
this is very fast and occurs rarely in practice (will usually be
constant-folded before query execution).
## Are these changes tested?
Yes. Benchmarked and added new test.
## Are there any user-facing changes?
No.
---
datafusion/functions/benches/to_char.rs | 61 +++---
datafusion/functions/src/datetime/to_char.rs | 214 +++++++++------------
.../test_files/datetime/timestamps.slt | 19 +-
3 files changed, 130 insertions(+), 164 deletions(-)
diff --git a/datafusion/functions/benches/to_char.rs
b/datafusion/functions/benches/to_char.rs
index 65f4999d23..4d866570b7 100644
--- a/datafusion/functions/benches/to_char.rs
+++ b/datafusion/functions/benches/to_char.rs
@@ -18,13 +18,12 @@
use std::hint::black_box;
use std::sync::Arc;
-use arrow::array::{ArrayRef, Date32Array, StringArray};
+use arrow::array::{ArrayRef, Date32Array, Date64Array, StringArray};
use arrow::datatypes::{DataType, Field};
use chrono::TimeDelta;
use chrono::prelude::*;
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
-use datafusion_common::ScalarValue::TimestampNanosecond;
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::datetime::to_char;
@@ -63,6 +62,26 @@ fn generate_date32_array(rng: &mut ThreadRng) -> Date32Array
{
Date32Array::from(data)
}
+fn generate_date64_array(rng: &mut ThreadRng) -> Date64Array {
+ let start_date = "1970-01-01"
+ .parse::<NaiveDate>()
+ .expect("Date should parse");
+ let end_date = "2050-12-31"
+ .parse::<NaiveDate>()
+ .expect("Date should parse");
+ let mut data: Vec<i64> = Vec::with_capacity(1000);
+ for _ in 0..1000 {
+ let date = pick_date_in_range(rng, start_date, end_date);
+ let millis = date
+ .and_hms_opt(0, 0, 0)
+ .unwrap()
+ .and_utc()
+ .timestamp_millis();
+ data.push(millis);
+ }
+ Date64Array::from(data)
+}
+
const DATE_PATTERNS: [&str; 5] =
["%Y:%m:%d", "%d-%m-%Y", "%d%m%Y", "%Y%m%d", "%Y...%m...%d"];
@@ -155,7 +174,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_array_datetime_patterns_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns =
ColumnarValue::Array(Arc::new(generate_datetime_pattern_array(
@@ -182,7 +201,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_array_mixed_patterns_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns =
ColumnarValue::Array(Arc::new(generate_mixed_pattern_array(
@@ -235,7 +254,7 @@ fn criterion_benchmark(c: &mut Criterion) {
c.bench_function("to_char_scalar_datetime_pattern_1000", |b| {
let mut rng = rand::rng();
- let data_arr = generate_date32_array(&mut rng);
+ let data_arr = generate_date64_array(&mut rng);
let batch_len = data_arr.len();
let data = ColumnarValue::Array(Arc::new(data_arr) as ArrayRef);
let patterns = ColumnarValue::Scalar(ScalarValue::Utf8(Some(
@@ -259,38 +278,6 @@ fn criterion_benchmark(c: &mut Criterion) {
)
})
});
-
- c.bench_function("to_char_scalar_1000", |b| {
- let mut rng = rand::rng();
- let timestamp = "2026-07-08T09:10:11"
- .parse::<NaiveDateTime>()
- .unwrap()
- .with_nanosecond(56789)
- .unwrap()
- .and_utc()
- .timestamp_nanos_opt()
- .unwrap();
- let data = ColumnarValue::Scalar(TimestampNanosecond(Some(timestamp),
None));
- let pattern =
-
ColumnarValue::Scalar(ScalarValue::Utf8(Some(pick_date_pattern(&mut rng))));
-
- b.iter(|| {
- black_box(
- to_char()
- .invoke_with_args(ScalarFunctionArgs {
- args: vec![data.clone(), pattern.clone()],
- arg_fields: vec![
- Field::new("a", data.data_type(), true).into(),
- Field::new("b", pattern.data_type(), true).into(),
- ],
- number_rows: 1,
- return_field: Field::new("f", DataType::Utf8,
true).into(),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_char should work on valid values"),
- )
- })
- });
}
criterion_group!(benches, criterion_benchmark);
diff --git a/datafusion/functions/src/datetime/to_char.rs
b/datafusion/functions/src/datetime/to_char.rs
index 2c6f823545..4ceaac1cc8 100644
--- a/datafusion/functions/src/datetime/to_char.rs
+++ b/datafusion/functions/src/datetime/to_char.rs
@@ -18,15 +18,15 @@
use std::any::Any;
use std::sync::Arc;
+use arrow::array::builder::StringBuilder;
use arrow::array::cast::AsArray;
-use arrow::array::{Array, ArrayRef, StringArray, new_null_array};
+use arrow::array::{Array, ArrayRef};
use arrow::compute::cast;
use arrow::datatypes::DataType;
use arrow::datatypes::DataType::{
Date32, Date64, Duration, Time32, Time64, Timestamp, Utf8,
};
use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second};
-use arrow::error::ArrowError;
use arrow::util::display::{ArrayFormatter, DurationFormat, FormatOptions};
use datafusion_common::{Result, ScalarValue, exec_err,
utils::take_function_args};
use datafusion_expr::TypeSignature::Exact;
@@ -143,20 +143,17 @@ impl ScalarUDFImpl for ToCharFunc {
let [date_time, format] = take_function_args(self.name(), &args)?;
match format {
- ColumnarValue::Scalar(ScalarValue::Utf8(None))
- | ColumnarValue::Scalar(ScalarValue::Null) =>
to_char_scalar(date_time, None),
- // constant format
- ColumnarValue::Scalar(ScalarValue::Utf8(Some(format))) => {
- // invoke to_char_scalar with the known string, without
converting to array
- to_char_scalar(date_time, Some(format))
+ ColumnarValue::Scalar(ScalarValue::Null | ScalarValue::Utf8(None))
=> {
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
}
- ColumnarValue::Array(_) => to_char_array(&args),
- _ => {
- exec_err!(
- "Format for `to_char` must be non-null Utf8, received {}",
- format.data_type()
- )
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(fmt))) => {
+ to_char_scalar(date_time, fmt)
}
+ ColumnarValue::Array(_) => to_char_array(&args),
+ _ => exec_err!(
+ "Format for `to_char` must be non-null Utf8, received {}",
+ format.data_type()
+ ),
}
}
@@ -171,11 +168,8 @@ impl ScalarUDFImpl for ToCharFunc {
fn build_format_options<'a>(
data_type: &DataType,
- format: Option<&'a str>,
-) -> Result<FormatOptions<'a>, Result<ColumnarValue>> {
- let Some(format) = format else {
- return Ok(FormatOptions::new());
- };
+ format: &'a str,
+) -> Result<FormatOptions<'a>> {
let format_options = match data_type {
Date32 => FormatOptions::new()
.with_date_format(Some(format))
@@ -194,144 +188,114 @@ fn build_format_options<'a>(
},
),
other => {
- return Err(exec_err!(
+ return exec_err!(
"to_char only supports date, time, timestamp and duration data
types, received {other:?}"
- ));
+ );
}
};
Ok(format_options)
}
-/// Special version when arg\[1] is a scalar
-fn to_char_scalar(
- expression: &ColumnarValue,
- format: Option<&str>,
-) -> Result<ColumnarValue> {
- // it's possible that the expression is a scalar however because
- // of the implementation in arrow-rs we need to convert it to an array
+/// Formats `expression` using a constant `format` string.
+fn to_char_scalar(expression: &ColumnarValue, format: &str) ->
Result<ColumnarValue> {
+ // ArrayFormatter requires an array, so scalar expressions must be
+ // converted to a 1-element array first.
let data_type = &expression.data_type();
let is_scalar_expression = matches!(&expression, ColumnarValue::Scalar(_));
- let array = expression.clone().into_array(1)?;
+ let array = expression.to_array(1)?;
- if format.is_none() {
- return if is_scalar_expression {
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None)))
- } else {
- Ok(ColumnarValue::Array(new_null_array(&Utf8, array.len())))
- };
- }
+ let format_options = build_format_options(data_type, format)?;
+ let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
- let format_options = match build_format_options(data_type, format) {
- Ok(value) => value,
- Err(value) => return value,
- };
+ // Pad the preallocated capacity a bit because format specifiers often
+ // expand the string (e.g., %Y -> "2026")
+ let fmt_len = format.len() + 10;
+ let mut builder = StringBuilder::with_capacity(array.len(), array.len() *
fmt_len);
- let formatter = ArrayFormatter::try_new(array.as_ref(), &format_options)?;
- let formatted: Result<Vec<Option<String>>, ArrowError> = (0..array.len())
- .map(|i| {
- if array.is_null(i) {
- Ok(None)
- } else {
- formatter.value(i).try_to_string().map(Some)
- }
- })
- .collect();
-
- if let Ok(formatted) = formatted {
- if is_scalar_expression {
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(
- formatted.first().unwrap().clone(),
- )))
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ builder.append_null();
} else {
- Ok(ColumnarValue::Array(
- Arc::new(StringArray::from(formatted)) as ArrayRef
- ))
- }
- } else {
- // if the data type was a Date32, formatting could have failed because
the format string
- // contained datetime specifiers, so we'll retry by casting the date
array as a timestamp array
- if data_type == &Date32 {
- return to_char_scalar(&expression.cast_to(&Date64, None)?, format);
+ // Write directly into the builder's internal buffer, then
+ // commit the value with append_value("").
+ match formatter.value(i).write(&mut builder) {
+ Ok(()) => builder.append_value(""),
+ // Arrow's Date32 formatter only handles date specifiers
+ // (%Y, %m, %d, ...). Format strings with time specifiers
+ // (%H, %M, %S, ...) cause it to fail. When this happens,
+ // we retry by casting to Date64, whose datetime formatter
+ // handles both date and time specifiers (with zero for
+ // the time components).
+ Err(_) if data_type == &Date32 => {
+ return to_char_scalar(&expression.cast_to(&Date64, None)?,
format);
+ }
+ Err(e) => return Err(e.into()),
+ }
}
+ }
- exec_err!("{}", formatted.unwrap_err())
+ let result = builder.finish();
+ if is_scalar_expression {
+ let val = result.is_valid(0).then(|| result.value(0).to_string());
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+ } else {
+ Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef))
}
}
fn to_char_array(args: &[ColumnarValue]) -> Result<ColumnarValue> {
let arrays = ColumnarValue::values_to_arrays(args)?;
- let mut results: Vec<Option<String>> = vec![];
+ let data_array = &arrays[0];
let format_array = arrays[1].as_string::<i32>();
- let data_type = arrays[0].data_type();
+ let data_type = data_array.data_type();
- for idx in 0..arrays[0].len() {
- let format = if format_array.is_null(idx) {
- None
- } else {
- Some(format_array.value(idx))
- };
- if format.is_none() {
- results.push(None);
+ // Arbitrary guess for the length of a typical formatted datetime string
+ let fmt_len = 30;
+ let mut builder =
+ StringBuilder::with_capacity(data_array.len(), data_array.len() *
fmt_len);
+ let mut buffer = String::with_capacity(fmt_len);
+
+ for idx in 0..data_array.len() {
+ if format_array.is_null(idx) || data_array.is_null(idx) {
+ builder.append_null();
continue;
}
- let format_options = match build_format_options(data_type, format) {
- Ok(value) => value,
- Err(value) => return value,
- };
- // this isn't ideal but this can't use ValueFormatter as it isn't
independent
- // from ArrayFormatter
- let formatter = ArrayFormatter::try_new(arrays[0].as_ref(),
&format_options)?;
- let result = formatter.value(idx).try_to_string();
- match result {
- Ok(value) => results.push(Some(value)),
- Err(e) => {
- // if the data type was a Date32, formatting could have failed
because the format string
- // contained datetime specifiers, so we'll treat this specific
date element as a timestamp
- if data_type == &Date32 {
- let failed_date_value = arrays[0].slice(idx, 1);
-
- match retry_date_as_timestamp(&failed_date_value,
&format_options) {
- Ok(value) => {
- results.push(Some(value));
- continue;
- }
- Err(e) => {
- return exec_err!("{}", e);
- }
- }
- }
- return exec_err!("{}", e);
+ let format = format_array.value(idx);
+ let format_options = build_format_options(data_type, format)?;
+ let formatter = ArrayFormatter::try_new(data_array.as_ref(),
&format_options)?;
+
+ buffer.clear();
+
+ // We'd prefer to write directly to the StringBuilder's internal
buffer,
+ // but the write might fail, and there's no easy way to ensure a
partial
+ // write is removed from the buffer. So instead we write to a temporary
+ // buffer and `append_value` on success.
+ match formatter.value(idx).write(&mut buffer) {
+ Ok(()) => builder.append_value(&buffer),
+ // Retry with Date64 (see comment in to_char_scalar).
+ Err(_) if data_type == &Date32 => {
+ buffer.clear();
+ let date64_value = cast(&data_array.slice(idx, 1), &Date64)?;
+ let retry_fmt =
+ ArrayFormatter::try_new(date64_value.as_ref(),
&format_options)?;
+ retry_fmt.value(0).write(&mut buffer)?;
+ builder.append_value(&buffer);
}
+ Err(e) => return Err(e.into()),
}
}
+ let result = builder.finish();
match args[0] {
- ColumnarValue::Array(_) =>
Ok(ColumnarValue::Array(Arc::new(StringArray::from(
- results,
- )) as ArrayRef)),
- ColumnarValue::Scalar(_) => match results.first().unwrap() {
- Some(value) => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(Some(
- value.to_string(),
- )))),
- None => Ok(ColumnarValue::Scalar(ScalarValue::Utf8(None))),
- },
+ ColumnarValue::Scalar(_) => {
+ let val = result.is_valid(0).then(|| result.value(0).to_string());
+ Ok(ColumnarValue::Scalar(ScalarValue::Utf8(val)))
+ }
+ ColumnarValue::Array(_) => Ok(ColumnarValue::Array(Arc::new(result) as
ArrayRef)),
}
}
-fn retry_date_as_timestamp(
- array_ref: &ArrayRef,
- format_options: &FormatOptions,
-) -> Result<String> {
- let target_data_type = Date64;
-
- let date_value = cast(&array_ref, &target_data_type)?;
- let formatter = ArrayFormatter::try_new(date_value.as_ref(),
format_options)?;
- let result = formatter.value(0).try_to_string()?;
-
- Ok(result)
-}
-
#[cfg(test)]
mod tests {
use crate::datetime::to_char::ToCharFunc;
diff --git a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
index 78889230fd..40d72c01ce 100644
--- a/datafusion/sqllogictest/test_files/datetime/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/datetime/timestamps.slt
@@ -3676,10 +3676,10 @@ select to_char(arrow_cast(123456, 'Duration(Second)'),
null);
----
NULL
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
SELECT to_char(timestamps, '%X%K') from formats;
-query error DataFusion error: Execution error: Cast error: Format error
+query error DataFusion error: Arrow error: Cast error: Format error
SELECT to_char('2000-02-03'::date, '%X%K');
query T
@@ -3726,6 +3726,21 @@ select to_char('2020-01-01 00:10:20.123'::timestamp at
time zone 'America/New_Yo
----
2020-01-01 00:10:20.123
+# Null values with array format
+query T
+SELECT to_char(column1, column2)
+FROM (VALUES
+ (DATE '2020-09-01', '%Y-%m-%d'),
+ (NULL, '%Y-%m-%d'),
+ (DATE '2020-09-02', NULL),
+ (NULL, NULL)
+);
+----
+2020-09-01
+NULL
+NULL
+NULL
+
statement ok
drop table formats;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]