martin-g commented on code in PR #19755:
URL: https://github.com/apache/datafusion/pull/19755#discussion_r2852897599
##########
datafusion/functions/src/datetime/mod.rs:
##########
@@ -267,7 +269,8 @@ pub mod expr_fn {
/// # }
/// ```
pub fn to_date(args: Vec<Expr>) -> Expr {
- super::to_date().call(args)
+ let config = ConfigOptions::default();
Review Comment:
This will always use the "chrono" parser, even if the user application
prefers the jiff one.
Is this intentional?
##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -803,10 +827,12 @@ impl ScalarUDFImpl for ToTimestampNanosFunc {
}
}
+#[expect(clippy::borrowed_box)]
Review Comment:
```suggestion
```
##########
datafusion/functions/src/datetime/to_timestamp.rs:
##########
@@ -803,10 +827,12 @@ impl ScalarUDFImpl for ToTimestampNanosFunc {
}
}
+#[expect(clippy::borrowed_box)]
fn to_timestamp_impl<T: ArrowTimestampType + ScalarType<i64>>(
args: &[ColumnarValue],
name: &str,
timezone: &Option<Arc<str>>,
+ parser: &Box<dyn DateTimeParser>,
Review Comment:
```suggestion
parser: &dyn DateTimeParser,
```
and update the callers to pass `parser.as_ref()` instead of `&parser`
##########
datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt:
##########
@@ -0,0 +1,290 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+SET datafusion.execution.date_time_parser = 'jiff';
+
+statement ok
+create table ts_data(ts bigint, value int) as values
+ (1599572549190855123, 1),
+ (1599568949190855123, 2),
+ (1599565349190855123, 3);
+
+statement ok
+create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(Nanosecond,
None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_micros as select arrow_cast(ts / 1000,
'Timestamp(Microsecond, None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_millis as select arrow_cast(ts / 1000000,
'Timestamp(Millisecond, None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_secs as select arrow_cast(ts / 1000000000,
'Timestamp(Second, None)') as ts, value from ts_data;
+
+# verify chrono formats don't work
+
+query error strptime parsing failed
+SELECT COUNT(*) FROM ts_data_nanos where ts >
to_timestamp('2020-09-08T12:00:00+00:00', '%Y-%m-%dT%H:%M:%S%Z')
+
+# to_timestamp with formatting
+query I
+SELECT COUNT(*) FROM ts_data_nanos where ts >
to_timestamp('2020-09-08T12:00:00+00:00', '2020-09-08 12/00/00+00:00', '%+',
'%Y-%m-%d %H/%M/%s%#z')
+----
+2
+
+# to_timestamp_nanos with formatting
+query I
+SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp_nanos('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_millis with formatting
+query I
+SELECT COUNT(*) FROM ts_data_millis where ts > to_timestamp_millis('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_micros with formatting
+query I
+SELECT COUNT(*) FROM ts_data_micros where ts > to_timestamp_micros('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_seconds with formatting
+query I
+SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# verify timestamp data with formatting options
+query PPPPPP
+SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'),
to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1,
'%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s')
+----
+NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01
1969-12-31T23:59:59 1969-12-31T23:59:59
+
+# verify timestamp data with formatting options
+query PPPPPP
+SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'),
to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1,
'%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s')
+----
+NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01
1969-12-31T23:59:59 1969-12-31T23:59:59
+
Review Comment:
```suggestion
```
This test duplicates the one at line 76; please remove one of them.
##########
datafusion/common/src/config.rs:
##########
@@ -505,6 +505,11 @@ config_namespace! {
/// Defaults to the number of CPU cores on the system
pub target_partitions: usize, transform =
ExecutionOptions::normalized_parallelism, default = get_available_parallelism()
+ /// The date time parser to use when parsing date time values.
+ ///
+ /// Defaults to 'chrono'. 'jiff' is supported when the 'jiff' feature
is enabled.
+ pub date_time_parser: Option<String>, transform = str::to_lowercase,
default = Some("chrono".to_string())
Review Comment:
Consider using an enum + `FromStr` impl instead, like `Dialect` above.
This way invalid values will fail at config parsing time instead of at
runtime when `get_date_time_parser()` is called.
E.g.:
```rust
#[derive(Debug, Default)]
pub enum DateTimeParserType {
#[default]
Chrono,
#[cfg(feature="jiff")]
Jiff
}
```
##########
datafusion/functions/benches/to_timestamp.rs:
##########
@@ -44,261 +48,566 @@ fn data() -> StringArray {
StringArray::from(data)
}
-fn data_with_formats() -> (StringArray, StringArray, StringArray, StringArray)
{
+fn data_with_formats() -> (
+ StringArray,
+ StringArray,
+ StringArray,
+ StringArray,
+ StringArray,
+ StringArray,
+ StringArray,
+) {
let mut inputs = StringBuilder::new();
let mut format1_builder = StringBuilder::with_capacity(2, 10);
let mut format2_builder = StringBuilder::with_capacity(2, 10);
let mut format3_builder = StringBuilder::with_capacity(2, 10);
+ let mut jiff_format1_builder = StringBuilder::with_capacity(2, 10);
+ let mut jiff_format2_builder = StringBuilder::with_capacity(2, 10);
+ let mut jiff_format3_builder = StringBuilder::with_capacity(2, 10);
- inputs.append_value("1997-01-31T09:26:56.123Z");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
- format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%Z");
+ inputs.append_value("1997-01-31T09:26:56UTC");
+
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
+ format3_builder.append_value("%+");
+
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%Q");
inputs.append_value("1997-01-31T09:26:56.123-05:00");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
- format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%z");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
+ format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z");
+
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f%:z");
inputs.append_value("1997-01-31 09:26:56.123-05:00");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
- format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%Z");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
+ format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z");
+
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f%:z");
- inputs.append_value("2023-01-01 04:05:06.789 -08");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ inputs.append_value("2023-01-01 04:05:06.789 -0800");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %#z");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f %z");
+
inputs.append_value("1997-01-31T09:26:56.123");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%dT%H:%M:%S%.f");
+
inputs.append_value("1997-01-31 09:26:56.123");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S%.f");
+
inputs.append_value("1997-01-31 09:26:56");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%d %H:%M:%S");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H:%M:%S");
+
inputs.append_value("1997-01-31 092656");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%d %H%M%S");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H%M%S");
+
inputs.append_value("1997-01-31 092656+04:00");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
format3_builder.append_value("%Y-%m-%d %H%M%S%:z");
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%Y-%m-%d %H%M%S%:z");
+
inputs.append_value("Sun Jul 8 00:34:60 2001");
- format1_builder.append_value("%+");
- format2_builder.append_value("%c");
- format3_builder.append_value("%Y-%m-%d 00:00:00");
+ format1_builder.append_value("%m");
+ format2_builder.append_value("%H");
+ format3_builder.append_value("%c");
+
+ jiff_format1_builder.append_value("%m");
+ jiff_format2_builder.append_value("%H");
+ jiff_format3_builder.append_value("%a %b %e %H:%M:%S %Y");
(
inputs.finish(),
format1_builder.finish(),
format2_builder.finish(),
format3_builder.finish(),
+ jiff_format1_builder.finish(),
+ jiff_format2_builder.finish(),
+ jiff_format3_builder.finish(),
)
}
+
fn criterion_benchmark(c: &mut Criterion) {
- let return_field =
- Field::new("f", DataType::Timestamp(TimeUnit::Nanosecond, None),
true).into();
- let arg_field = Field::new("a", DataType::Utf8, false).into();
- let arg_fields = vec![arg_field];
- let mut options = ConfigOptions::default();
- options.execution.time_zone = Some("UTC".into());
- let config_options = Arc::new(options);
-
- let to_timestamp_udf = to_timestamp(config_options.as_ref());
-
- c.bench_function("to_timestamp_no_formats_utf8", |b| {
- let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
- let arr_data = data();
- let batch_len = arr_data.len();
- let string_array = ColumnarValue::Array(Arc::new(arr_data) as
ArrayRef);
-
- b.iter(|| {
- black_box(
- to_timestamp_udf
- .invoke_with_args(ScalarFunctionArgs {
- args: vec![string_array.clone()],
- arg_fields: arg_fields.clone(),
- number_rows: batch_len,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_timestamp should work on valid values"),
- )
- })
- });
-
- c.bench_function("to_timestamp_no_formats_largeutf8", |b| {
- let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
- let data = cast(&data(), &DataType::LargeUtf8).unwrap();
- let batch_len = data.len();
- let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
-
- b.iter(|| {
- black_box(
- to_timestamp_udf
- .invoke_with_args(ScalarFunctionArgs {
- args: vec![string_array.clone()],
- arg_fields: arg_fields.clone(),
- number_rows: batch_len,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_timestamp should work on valid values"),
- )
- })
- });
-
- c.bench_function("to_timestamp_no_formats_utf8view", |b| {
- let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
- let data = cast(&data(), &DataType::Utf8View).unwrap();
- let batch_len = data.len();
- let string_array = ColumnarValue::Array(Arc::new(data) as ArrayRef);
-
- b.iter(|| {
- black_box(
- to_timestamp_udf
- .invoke_with_args(ScalarFunctionArgs {
- args: vec![string_array.clone()],
- arg_fields: arg_fields.clone(),
- number_rows: batch_len,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_timestamp should work on valid values"),
- )
- })
- });
-
- c.bench_function("to_timestamp_with_formats_utf8", |b| {
- let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
- let (inputs, format1, format2, format3) = data_with_formats();
- let batch_len = inputs.len();
-
- let args = vec![
- ColumnarValue::Array(Arc::new(inputs) as ArrayRef),
- ColumnarValue::Array(Arc::new(format1) as ArrayRef),
- ColumnarValue::Array(Arc::new(format2) as ArrayRef),
- ColumnarValue::Array(Arc::new(format3) as ArrayRef),
- ];
- let arg_fields = args
- .iter()
- .enumerate()
- .map(|(idx, arg)| {
- Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
- })
- .collect::<Vec<_>>();
-
- b.iter(|| {
- black_box(
- to_timestamp_udf
- .invoke_with_args(ScalarFunctionArgs {
- args: args.clone(),
- arg_fields: arg_fields.clone(),
- number_rows: batch_len,
- return_field: Arc::clone(&return_field),
- config_options: Arc::clone(&config_options),
- })
- .expect("to_timestamp should work on valid values"),
- )
- })
- });
-
- c.bench_function("to_timestamp_with_formats_largeutf8", |b| {
- let to_timestamp_udf = Arc::clone(&to_timestamp_udf);
- let (inputs, format1, format2, format3) = data_with_formats();
- let batch_len = inputs.len();
-
- let args = vec![
- ColumnarValue::Array(
- Arc::new(cast(&inputs, &DataType::LargeUtf8).unwrap()) as
ArrayRef
- ),
- ColumnarValue::Array(
- Arc::new(cast(&format1, &DataType::LargeUtf8).unwrap()) as
ArrayRef
- ),
- ColumnarValue::Array(
- Arc::new(cast(&format2, &DataType::LargeUtf8).unwrap()) as
ArrayRef
+ for &parser in ["jiff", "chrono"].iter() {
+ c.bench_function(
+
&format!("string_to_timestamp_nanos_formatted_single_format_utf8_{parser}"),
+ |b| {
+ let datetime_parser = match parser {
+ "chrono" => {
+ Box::new(ChronoDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ "jiff" => {
+ Box::new(JiffDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ _ => unreachable!(),
+ };
+
+ let (inputs, _, _, format3, _, _, jiffformat3) =
data_with_formats();
+
+ b.iter(|| {
+ for (input, format3, jiff_format3) in
+ izip!(inputs.iter(), format3.iter(),
jiffformat3.iter())
+ {
+ let _ = black_box(match parser {
+ "chrono" => {
+ let t = datetime_parser
+ .string_to_timestamp_nanos_formatted(
+ "UTC",
+ input.unwrap(),
+ &[format3.unwrap()],
+ );
+ if t.is_err() {
+ println!("Error: {t:?}");
Review Comment:
Is this a debug leftover?
I would not expect printing to stdout/stderr in a benchmark.
Maybe change it to `panic!()` if you want to make sure that all inputs are
valid.
##########
datafusion/functions/src/datetime/to_date.rs:
##########
@@ -67,21 +69,39 @@ Additional examples can be found
[here](https://github.com/apache/datafusion/blo
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct ToDateFunc {
signature: Signature,
+ parser: Box<dyn DateTimeParser>,
}
impl Default for ToDateFunc {
fn default() -> Self {
- Self::new()
+ Self::new_with_config(&ConfigOptions::default())
}
}
impl ToDateFunc {
+ #[deprecated(since = "52.0.0", note = "use `new_with_config` instead")]
+ /// Deprecated constructor retained for backwards compatibility.
+ ///
+ /// Prefer [`ToDateFunc::new_with_config`] which allows specifying the
+ /// date time parser via [`ConfigOptions`].
pub fn new() -> Self {
+ Self::new_with_config(&ConfigOptions::default())
+ }
+
+ pub fn new_with_config(config: &ConfigOptions) -> Self {
Review Comment:
Consider also implementing `with_updated_config()` to be able to update an
existing UDF instance.
See
https://github.com/Omega359/arrow-datafusion/blob/9bb7705fcdeeaf2f713a3397fd354a46ddcde60d/datafusion/functions/src/datetime/now.rs#L98
##########
datafusion/functions/src/datetime/parser.rs:
##########
@@ -0,0 +1,724 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::config::ConfigOptions;
+use dyn_eq::DynEq;
+use dyn_hash::DynHash;
+use std::fmt::Debug;
+
+/// A trait for parsing timestamps from strings. Two implementations are
provided:
+/// `ChronoDateTimeParser` (the default) which uses [Chrono] to parse
timestamps, and
+/// `JiffDateTimeParser` (via the `jiff` feature flag) which uses [Jiff] to
parse
+/// timestamps.
+///
+/// While both implementations are capable of parsing timestamps, the
`ChronoDateTimeParser`
+/// is a bit more lenient wrt formats at the cost of being slightly slower
than Jiff.
+/// Jiff, on the other hand, has slightly less support for some format options
than Chrono
+/// but is measurably faster than Chrono in some situations.
+///
+/// [Chrono]: https://docs.rs/chrono/latest/chrono/
+/// [Jiff]: https://docs.rs/jiff/latest/jiff/
+pub trait DateTimeParser: Debug + DynEq + DynHash + Send + Sync {
+ fn string_to_timestamp_nanos(
+ &self,
+ tz: &str,
+ s: &str,
+ ) -> datafusion_common::Result<i64>;
+
+ fn string_to_timestamp_nanos_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> datafusion_common::Result<i64>;
+
+ fn string_to_timestamp_millis_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> datafusion_common::Result<i64>;
+}
+
+// impl Eq and PartialEq for dyn DateTimeParser
+dyn_eq::eq_trait_object!(DateTimeParser);
+
+// Implement std::hash::Hash for dyn DateTimeParser
+dyn_hash::hash_trait_object!(DateTimeParser);
+
+pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box<dyn
DateTimeParser> {
+ let parser_cfg = config_options.execution.date_time_parser.as_ref();
+
+ match parser_cfg {
+ Some(p) => match p.as_str() {
+ "chrono" => {
+ Box::new(chrono::ChronoDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ #[cfg(feature = "jiff")]
+ "jiff" => {
+ Box::new(jiff::JiffDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ #[cfg(not(feature = "jiff"))]
+ "jiff" => {
+ panic!(
+ "jiff parser requested but 'jiff' feature is not enabled.
Enable with --features jiff"
Review Comment:
I wonder whether these panics should be just log::warn!() + fallback to
chrono.
Or maybe return DataFusionError, so it is treated as a session/query error.
Currently, if a user issues `SET datafusion.execution.date_time_parser=boom`,
it will crash the application.
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -372,18 +175,23 @@ where
};
if let Some(s) = x {
- match op(a, s.as_str()) {
- Ok(r) => {
- let result = op2(r).to_i64();
- let s = scalar_value(dt, result)?;
- ret = Some(Ok(ColumnarValue::Scalar(s)));
- break;
- }
- Err(e) => ret = Some(Err(e)),
- }
+ vals.push(s.as_str());
}
}
+ let a = a.as_ref();
+ // ASK: Why do we trust `a` to be non-null at this point?
Review Comment:
I think it is valid for `a` to be `None` here, i.e. SQL NULL.
IMO you should return `Some(Ok(ColumnarValue::Scalar(ScalarValue::Null)))`
instead of an error
##########
datafusion/functions/src/datetime/parser.rs:
##########
@@ -0,0 +1,724 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use datafusion_common::config::ConfigOptions;
+use dyn_eq::DynEq;
+use dyn_hash::DynHash;
+use std::fmt::Debug;
+
+/// A trait for parsing timestamps from strings. Two implementations are
provided:
+/// `ChronoDateTimeParser` (the default) which uses [Chrono] to parse
timestamps, and
+/// `JiffDateTimeParser` (via the `jiff` feature flag) which uses [Jiff] to
parse
+/// timestamps.
+///
+/// While both implementations are capable of parsing timestamps, the
`ChronoDateTimeParser`
+/// is a bit more lenient wrt formats at the cost of being slightly slower
than Jiff.
+/// Jiff, on the other hand, has slightly less support for some format options
than Chrono
+/// but is measurably faster than Chrono in some situations.
+///
+/// [Chrono]: https://docs.rs/chrono/latest/chrono/
+/// [Jiff]: https://docs.rs/jiff/latest/jiff/
+pub trait DateTimeParser: Debug + DynEq + DynHash + Send + Sync {
+ fn string_to_timestamp_nanos(
+ &self,
+ tz: &str,
+ s: &str,
+ ) -> datafusion_common::Result<i64>;
+
+ fn string_to_timestamp_nanos_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> datafusion_common::Result<i64>;
+
+ fn string_to_timestamp_millis_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> datafusion_common::Result<i64>;
+}
+
+// impl Eq and PartialEq for dyn DateTimeParser
+dyn_eq::eq_trait_object!(DateTimeParser);
+
+// Implement std::hash::Hash for dyn DateTimeParser
+dyn_hash::hash_trait_object!(DateTimeParser);
+
+pub fn get_date_time_parser(config_options: &ConfigOptions) -> Box<dyn
DateTimeParser> {
+ let parser_cfg = config_options.execution.date_time_parser.as_ref();
+
+ match parser_cfg {
+ Some(p) => match p.as_str() {
+ "chrono" => {
+ Box::new(chrono::ChronoDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ #[cfg(feature = "jiff")]
+ "jiff" => {
+ Box::new(jiff::JiffDateTimeParser::new()) as Box<dyn
DateTimeParser>
+ }
+ #[cfg(not(feature = "jiff"))]
+ "jiff" => {
+ panic!(
+ "jiff parser requested but 'jiff' feature is not enabled.
Enable with --features jiff"
+ );
+ }
+ _ => panic!("Unknown/unsupported date time parser: {p}"),
+ },
+ None => Box::new(chrono::ChronoDateTimeParser::new()) as Box<dyn
DateTimeParser>,
+ }
+}
+
+pub mod chrono {
+ use crate::datetime::parser::DateTimeParser;
+ use arrow::array::timezone::Tz;
+ use arrow::compute::kernels::cast_utils::string_to_datetime;
+ use chrono::LocalResult;
+ use chrono::format::{ParseErrorKind, Parsed, StrftimeItems, parse};
+ use chrono_tz::ParseError;
+ use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+
+ const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be
represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
+
+ #[derive(Debug, Default, PartialEq, Eq, Hash)]
+ pub struct ChronoDateTimeParser {}
+
+ impl ChronoDateTimeParser {
+ pub fn new() -> Self {
+ Self {}
+ }
+
+ /// Accepts a string and parses it using the
[`chrono::format::strftime`] specifiers
+ /// relative to the provided `timezone`.
+ ///
+ /// If a timestamp is ambiguous, for example, as a result of
daylight-savings time, an error
+ /// will be returned.
+ ///
+ /// [`chrono::format::strftime`]:
https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+ fn string_to_datetime_formatted<T: chrono::TimeZone>(
+ &self,
+ timezone: &T,
+ s: &str,
+ formats: &[&str],
+ ) -> Result<chrono::DateTime<T>, DataFusionError> {
+ let mut err: Option<String> = None;
+
+ for &format in formats.iter() {
+ let mut format = format;
+ let mut datetime_str = s;
+
+ // we manually handle the most common case of a named timezone
at the end of the timestamp
+ // since chrono doesn't support that directly. Note, however,
that %+ does handle 'Z' at the
+ // end of the string.
+ //
+ // This code doesn't handle named timezones with no preceding
space since that would require
+ // writing a custom parser.
+ let tz: Option<chrono_tz::Tz> = if format.ends_with(" %Z")
+ && datetime_str.contains(' ')
+ {
+ // grab the string after the last space as the named
timezone
+ let parts: Vec<&str> = datetime_str.rsplitn(2, '
').collect();
+ let timezone_name = parts[0];
+ datetime_str = parts[1];
+
+ // attempt to parse the timezone name
+ let result: Result<chrono_tz::Tz, ParseError> =
timezone_name.parse();
+ let Ok(tz) = result else {
+ err = Some(result.unwrap_err().to_string());
+ continue;
+ };
+
+ // successfully parsed the timezone name, remove the ' %Z'
from the format
+ format = &format[..format.len() - 3];
+
+ Some(tz)
+ } else if format.contains("%Z") {
+ err = Some("'%Z' is only supported at the end of the
format string preceded by a space".into());
+ continue;
+ } else {
+ None
+ };
+
+ let mut parsed = Parsed::new();
+ let items = StrftimeItems::new(format);
+ let result = parse(&mut parsed, datetime_str, items);
+
+ if let Err(e) = &result {
+ err = Some(e.to_string());
+ continue;
+ }
+
+ let dt = match tz {
+ Some(tz) => {
+ // A timezone was manually parsed out, convert it to a
fixed offset
+ match parsed.to_datetime_with_timezone(&tz) {
+ Ok(dt) => Ok(dt.fixed_offset()),
+ Err(e) => Err(e),
+ }
+ }
+ // default to parse the string assuming it has a timezone
+ None => parsed.to_datetime(),
+ };
+
+ match dt {
+ Ok(dt) => return Ok(dt.with_timezone(timezone)),
+ Err(e) if e.kind() == ParseErrorKind::Impossible => {
+ err = Some(format!(
+ "Unable to parse timestamp {datetime_str} in
timezone {tz:?}: datetime was impossible"
+ ));
+ continue;
+ }
+ Err(e) if e.kind() == ParseErrorKind::OutOfRange => {
+ err = Some(format!(
+ "Unable to parse timestamp {datetime_str} in
timezone {tz:?}: datetime was out of range"
+ ));
+ continue;
+ }
+ _ => {
+ // no timezone or other failure, try without a timezone
+ let ndt = parsed
+ .to_naive_datetime_with_offset(0)
+ .or_else(|_| parsed.to_naive_date().map(|nd|
nd.into()));
+ if let Err(e) = &ndt {
+ err = Some(e.to_string());
+ continue;
+ }
+
+ let result =
&timezone.from_local_datetime(&ndt.unwrap());
+
+ match result {
+ LocalResult::Single(e) => return Ok(e.to_owned()),
+ LocalResult::None => {
+ err = Some(format!(
+ "Unable to parse timestamp {datetime_str}
in timezone {tz:?}: no valid local time found"
+ ));
+ continue;
+ }
+ LocalResult::Ambiguous(earliest, latest) => {
+ err = Some(format!(
+ "Unable to parse timestamp {datetime_str}
in timezone {tz:?}: ambiguous timestamps found {earliest:?} and {latest:?}"
+ ));
+ continue;
+ }
+ }
+ }
+ }
+ }
+
+ match err {
+ Some(e) => exec_err!(
+ "Error parsing timestamp from '{s}' using formats:
{formats:?}: {e}"
+ ),
+ None => exec_err!(
+ "Error parsing timestamp from '{s}' using formats:
{formats:?}"
+ ),
+ }
+ }
+ }
+
+ impl DateTimeParser for ChronoDateTimeParser {
+ /// Accepts a string and parses it relative to the provided `timezone`
+ ///
+ /// In addition to RFC3339 / ISO8601 standard timestamps, it also
+ /// accepts strings that use a space ` ` to separate the date and time
+ /// as well as strings that have no explicit timezone offset.
+ ///
+ /// Examples of accepted inputs:
+ /// * `1997-01-31T09:26:56.123Z` # RCF3339
+ /// * `1997-01-31T09:26:56.123-05:00` # RCF3339
+ /// * `1997-01-31 09:26:56.123-05:00` # close to RCF3339 but with a
space rather than T
+ /// * `2023-01-01 04:05:06.789 -08` # close to RCF3339, no
fractional seconds or time separator
+ /// * `1997-01-31T09:26:56.123` # close to RCF3339 but no
timezone offset specified
+ /// * `1997-01-31 09:26:56.123` # close to RCF3339 but uses a
space and no timezone offset
+ /// * `1997-01-31 09:26:56` # close to RCF3339, no
fractional seconds
+ /// * `1997-01-31 092656` # close to RCF3339, no
fractional seconds
+ /// * `1997-01-31 092656+04:00` # close to RCF3339, no
fractional seconds or time separator
+ /// * `1997-01-31` # close to RCF3339, only date
no time
+ ///
+ /// [IANA timezones] are only supported if the `arrow-array/chrono-tz`
feature is enabled
+ ///
+ /// * `2023-01-01 040506 America/Los_Angeles`
+ ///
+ /// If a timestamp is ambiguous, for example as a result of
daylight-savings time, an error
+ /// will be returned
+ ///
+ /// Some formats supported by PostgresSql
<https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-DATETIME-TIME-TABLE>
+ /// are not supported, like
+ ///
+ /// * "2023-01-01 04:05:06.789 +07:30:00",
+ /// * "2023-01-01 040506 +07:30:00",
+ /// * "2023-01-01 04:05:06.789 PST",
+ #[inline]
+ fn string_to_timestamp_nanos(
+ &self,
+ tz: &str,
+ s: &str,
+ ) -> Result<i64, DataFusionError> {
+ let tz: Tz = match tz.parse() {
+ Ok(tz) => tz,
+ Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"),
+ };
+
+ let dt = string_to_datetime(&tz, s)?;
+ let parsed = dt
+ .timestamp_nanos_opt()
+ .ok_or_else(||
exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))?;
+
+ Ok(parsed)
+ }
+
+ /// Accepts a string with an array of `chrono` formats and converts it
to a
+ /// nanosecond precision timestamp according to the rules
+ /// defined by `chrono`.
+ ///
+ /// See [`chrono::format::strftime`] for the full set of supported
formats.
+ ///
+ /// ## Timestamp Precision
+ ///
+ /// This function uses the maximum precision timestamps supported by
+ /// Arrow (nanoseconds stored as a 64-bit integer) timestamps. This
+ /// means the range of dates that timestamps can represent is ~1677 AD
+ /// to 2262 AM.
+ ///
+ /// ## Timezone / Offset Handling
+ ///
+ /// Numerical values of timestamps are stored compared to offset UTC.
+ ///
+ /// Note that parsing named [IANA timezones] is not supported yet in
chrono
+ /// <https://github.com/chronotope/chrono/issues/38> and this
implementation
+ /// only supports named timezones at the end of the string preceded by
a space.
+ ///
+ /// [`chrono::format::strftime`]:
https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+ /// [IANA timezones]: https://www.iana.org/time-zones
+ #[inline]
+ fn string_to_timestamp_nanos_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> Result<i64, DataFusionError> {
+ let tz: Tz = match tz.parse() {
+ Ok(tz) => tz,
+ Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"),
+ };
+
+ let dt = self.string_to_datetime_formatted(&tz, s, formats)?;
+ dt.naive_utc()
+ .and_utc()
+ .timestamp_nanos_opt()
+ .ok_or_else(||
exec_datafusion_err!("{ERR_NANOSECONDS_NOT_SUPPORTED}"))
+ }
+
+ /// Accepts a string with a `chrono` format and converts it to a
+ /// millisecond precision timestamp.
+ ///
+ /// See [`chrono::format::strftime`] for the full set of supported
formats.
+ ///
+ /// Internally, this function uses the `chrono` library for the
+ /// datetime parsing
+ ///
+ /// ## Timezone / Offset Handling
+ ///
+ /// Numerical values of timestamps are stored compared to offset UTC.
+ ///
+ /// Any timestamp in the formatting string is handled according to the
rules
+ /// defined by `chrono`.
+ ///
+ /// [`chrono::format::strftime`]:
https://docs.rs/chrono/latest/chrono/format/strftime/index.html
+ #[inline]
+ fn string_to_timestamp_millis_formatted(
+ &self,
+ tz: &str,
+ s: &str,
+ formats: &[&str],
+ ) -> Result<i64, DataFusionError> {
+ let tz: Tz = match tz.parse() {
+ Ok(tz) => tz,
+ Err(e) => return exec_err!("Invalid timezone '{tz}': {e}"),
+ };
+
+ Ok(self
+ .string_to_datetime_formatted(&tz, s, formats)?
+ .naive_utc()
+ .and_utc()
+ .timestamp_millis())
+ }
+ }
+}
+
+#[cfg(feature = "jiff")]
+pub mod jiff {
+ use crate::datetime::parser::DateTimeParser;
+ use arrow::array::timezone::Tz;
+ use arrow::error::ArrowError;
+ use chrono::format::{Parsed, StrftimeItems, parse};
+ use datafusion_common::{DataFusionError, exec_datafusion_err, exec_err};
+ use jiff::civil::Time;
+ use jiff::fmt::temporal::Pieces;
+ use jiff::tz::{Disambiguation, TimeZone};
+ use jiff::{Timestamp, Zoned};
+ use num_traits::ToPrimitive;
+
+ const ERR_NANOSECONDS_NOT_SUPPORTED: &str = "The dates that can be
represented as nanoseconds have to be between 1677-09-21T00:12:44.0 and
2262-04-11T23:47:16.854775804";
+
+ #[derive(Debug, Default, PartialEq, Eq, Hash)]
+ pub struct JiffDateTimeParser {}
+
+ impl JiffDateTimeParser {
+ pub fn new() -> Self {
+ Self {}
+ }
+
+ /// Attempts to parse a given string representation of a timestamp
into a `Timestamp` object.
+ /// The function also adjusts the datetime for the specified timezone.
+ ///
+ /// # Parameters
+ /// - `timezone`: A reference to the `TimeZone` object used to adjust
the parsed datetime to the desired timezone.
+ /// - `s`: A string slice holding the timestamp to be parsed.
+ ///
+ /// # Returns
+ /// - `Ok(Timestamp)`: Contains the parsed timestamp in seconds since
the Unix epoch.
+ /// - `Err(ArrowError)`: Returned in the event of errors in parsing
+ /// the timestamp string or computing the timezone offset.
+ ///
+ /// # Errors
+ /// This function will return an `ArrowError` if:
+ ///
+ /// - The string `s` is shorter than 10 characters.
+ /// - The format of the string does not match expected timestamp
patterns.
+ /// - An invalid/unknown time zone or offset is provided during
parsing.
+ /// - Errors occur while converting the datetime to a specific time
zone.
+ fn string_to_datetime(
+ &self,
+ timezone: &TimeZone,
+ s: &str,
+ ) -> datafusion_common::Result<Timestamp, ArrowError> {
+ let err = |ctx: &str| {
+ ArrowError::ParseError(format!(
+ "Error parsing timestamp from '{s}': {ctx}"
+ ))
+ };
+
+ let bytes = s.as_bytes();
+ if bytes.len() < 10 {
+ return Err(err("timestamp must contain at least 10
characters"));
+ }
+
+ let pieces = Pieces::parse(bytes).map_err(|e|
err(&format!("{e:?}")))?;
+ let time = pieces.time().unwrap_or_else(Time::midnight);
+ let dt = pieces.date().to_datetime(time);
+ let tz = match pieces
+ .to_time_zone()
+ .map_err(|e| err(&format!("unknown time zone: {e:?}")))?
+ {
+ Some(tz) => tz,
+ None => match pieces.to_numeric_offset() {
+ Some(offset) => TimeZone::fixed(offset),
+ None => timezone.to_owned(),
+ },
+ };
+ let zdt = tz
+ .to_zoned(dt)
+ .map_err(|e| err(&format!("error computing timezone offset:
{e:?}")))?;
+
+ Ok(zdt.timestamp())
+ }
+
+ /// Attempts to parse a given string representation of a timestamp
into a `Zoned` datetime object
+ /// using a list of provided formats. The function also adjusts the
datetime for the specified timezone.
+ ///
+ /// # Parameters
+ /// - `timezone`: A reference to the `TimeZone` object used to adjust
the parsed datetime to the desired timezone.
+ /// - `s`: A string slice holding the timestamp to be parsed.
+ /// - `formats`: A slice of string slices representing the accepted
datetime formats to use when parsing.
+ ///
+ /// # Returns
+ /// - `Ok(Zoned)` if the string is successfully parsed into a `Zoned`
datetime object using one of the formats provided.
+ /// - `Err(DataFusionError)` if the string cannot be parsed or if
there are issues related to the timezone or format handling.
+ ///
+ /// # Behavior
+ /// 1. Iterates through the list of provided formats.
+ /// 2. Handles special cases such as `%Z` (timezone) or `%c`
(locale-aware datetime representation)
+ /// by either modifying the format string or switching to different
parsing logic.
+ /// 3. Attempts to resolve timezone-related issues:
+ /// - Calculates a fixed offset for the given `timezone`.
+ /// - Handles attempts to parse timezones using IANA names or
offsets.
+ /// 4. If parsing fails for the current format, the function moves to
the next format and aggregates any parsing errors.
+ /// 5. If no formats succeed, an error is returned summarizing the
failure.
+ ///
+ /// # Errors
+ /// - Returns a descriptive error wrapped in `DataFusionError` if:
+ /// - The input string is not compatible with any of the provided
formats.
+ /// - There are issues parsing the timezone from the input string or
adjusting to the provided `TimeZone`.
+ /// - There are issues resolving ambiguous datetime representations.
+ fn string_to_datetime_formatted(
+ &self,
+ timezone: &TimeZone,
+ s: &str,
+ formats: &[&str],
+ ) -> datafusion_common::Result<Zoned, DataFusionError> {
+ let mut err: Option<String> = None;
+
+ for &format in formats.iter() {
+ let mut format = if format.contains("%Z") {
+ &format.replace("%Z", "%Q")
+ } else {
+ format
+ };
+
+ if format == "%c" {
+ // switch to chrono, jiff doesn't support parsing with %c
+ let mut parsed = Parsed::new();
+ let result = parse(&mut parsed, s,
StrftimeItems::new(format));
+
+ if result.is_err() {
+ err = Some(format!("error parsing timestamp:
{result:?}"));
+ continue;
+ }
+
+ let offset = timezone.to_fixed_offset();
+ let tz: &str = if let Ok(offset) = offset {
+ &offset.to_string()
+ } else {
+ let result = timezone.iana_name();
+ match result {
+ Some(tz) => tz,
+ None => {
+ err =
+ Some(format!("error parsing timezone:
{timezone:?}"));
+ continue;
+ }
+ }
+ };
+ let tz: Tz =
tz.parse::<Tz>().ok().unwrap_or("UTC".parse::<Tz>()?);
+
+ match parsed.to_datetime_with_timezone(&tz) {
+ Ok(dt) => {
+ let dt = dt.fixed_offset();
+ let nanos_opt = dt.timestamp_nanos_opt();
+
+ let Some(nanos) = nanos_opt else {
+ err =
Some(ERR_NANOSECONDS_NOT_SUPPORTED.to_owned());
+ continue;
+ };
+
+ let result = Timestamp::from_nanosecond(nanos as
i128);
+ match result {
+ Ok(result) => {
+ return
Ok(result.to_zoned(timezone.to_owned()));
+ }
+ Err(e) => {
+ err = Some(e.to_string());
+ continue;
+ }
+ }
+ }
+ Err(e) => {
+ err = Some(e.to_string());
+ continue;
+ }
+ };
+ }
+
+ let result = if format == "%+" {
+ // jiff doesn't support parsing with %+, but the
equivalent of %+ is
somewhat RFC 3339 which is what is used by the parse
method
+ let result: Result<Zoned, _> = s.parse();
+
+ if let Ok(r) = result {
+ return Ok(r);
+ } else {
+ // however, the above only works with timezone names,
not offsets
+ let s = if s.ends_with("Z") {
+ &(s.trim_end_matches("Z").to_owned() + "+00:00")
+ } else {
+ s
+ };
+
+ // try again with fractional seconds
+ format = "%Y-%m-%dT%H:%M:%S%.f%:z";
+
+ jiff::fmt::strtime::parse(format, s)
+ }
+ } else {
+ jiff::fmt::strtime::parse(format, s)
+ };
+
+ match result {
+ Ok(bdt) => {
+ if (bdt.iana_time_zone().is_some() ||
bdt.offset().is_some())
+ && let Ok(zoned) = bdt.to_zoned()
+ {
+ return
Ok(zoned.with_time_zone(timezone.to_owned()));
+ }
+
+ // %s just sets the timestamp and the .to_datetime()
call will error
+ // since the other fields like year are not set.
Handle this case
+ // directly.
+ let datetime = if let Some(ts) = bdt.timestamp() {
+ timezone.to_datetime(ts)
+ } else {
+ let result = bdt.to_datetime();
+ match result {
+ Ok(datetime) => datetime,
+ Err(e) => {
+ err = Some(e.to_string());
+ continue;
+ }
+ }
+ };
+
+ let zoned = timezone
+ .to_owned()
+ .into_ambiguous_zoned(datetime)
+ .disambiguate(Disambiguation::Compatible);
Review Comment:
https://github.com/apache/datafusion/pull/19755/changes#diff-8ee000ae56959a59980f69bab8e85f406ba4674df3b87428d8d1fa090f23a326R468
says that an error is returned when there is `ambiguous datetime
representation`.
I believe Chrono also returns an error.
Should it reject instead ?
```suggestion
.disambiguate(Disambiguation::Reject);
```
##########
datafusion/functions/src/datetime/to_date.rs:
##########
@@ -394,10 +414,13 @@ mod tests {
tc.name, tc.formatted_date, tc.format_str
);
}
- _ => panic!(
- "Could not convert '{}' with format string '{}'to Date",
- tc.date_str, tc.format_str
- ),
+ e => {
+ println!("e was {e:?}");
Review Comment:
```suggestion
```
##########
datafusion/functions/src/datetime/common.rs:
##########
@@ -481,25 +289,38 @@ where
.map(|(pos, x)| {
let mut val = None;
if let Some(x) = x {
+ let mut v = vec![];
+
for arg in args {
- let v = match arg {
+ match arg {
ColumnarValue::Array(a) => match a.data_type() {
- DataType::Utf8View =>
Ok(a.as_string_view().value(pos)),
- DataType::LargeUtf8 =>
Ok(a.as_string::<i64>().value(pos)),
- DataType::Utf8 =>
Ok(a.as_string::<i32>().value(pos)),
- other => exec_err!("Unexpected type encountered
'{other}'"),
+ DataType::Utf8View =>
v.push(a.as_string_view().value(pos)),
Review Comment:
For Scalar "null string" is omitted (line 310) but here there is no check
whether `value(pos)` is null or not.
Is this intentional ?
##########
datafusion/sqllogictest/test_files/datetime/timestamps_jiff.slt:
##########
@@ -0,0 +1,290 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+statement ok
+SET datafusion.execution.date_time_parser = 'jiff';
+
+statement ok
+create table ts_data(ts bigint, value int) as values
+ (1599572549190855123, 1),
+ (1599568949190855123, 2),
+ (1599565349190855123, 3);
+
+statement ok
+create table ts_data_nanos as select arrow_cast(ts, 'Timestamp(Nanosecond,
None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_micros as select arrow_cast(ts / 1000,
'Timestamp(Microsecond, None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_millis as select arrow_cast(ts / 1000000,
'Timestamp(Millisecond, None)') as ts, value from ts_data;
+
+statement ok
+create table ts_data_secs as select arrow_cast(ts / 1000000000,
'Timestamp(Second, None)') as ts, value from ts_data;
+
+# verify chrono formats don't work
+
+query error strptime parsing failed
+SELECT COUNT(*) FROM ts_data_nanos where ts >
to_timestamp('2020-09-08T12:00:00+00:00', '%Y-%m-%dT%H:%M:%S%Z')
+
+# to_timestamp with formatting
+query I
+SELECT COUNT(*) FROM ts_data_nanos where ts >
to_timestamp('2020-09-08T12:00:00+00:00', '2020-09-08 12/00/00+00:00', '%+',
'%Y-%m-%d %H/%M/%s%#z')
+----
+2
+
+# to_timestamp_nanos with formatting
+query I
+SELECT COUNT(*) FROM ts_data_nanos where ts > to_timestamp_nanos('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_millis with formatting
+query I
+SELECT COUNT(*) FROM ts_data_millis where ts > to_timestamp_millis('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_micros with formatting
+query I
+SELECT COUNT(*) FROM ts_data_micros where ts > to_timestamp_micros('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# to_timestamp_seconds with formatting
+query I
+SELECT COUNT(*) FROM ts_data_secs where ts > to_timestamp_seconds('2020-09-08
12/00/00+00:00', '%c', '%+', '%Y-%m-%d %H/%M/%S%#:z')
+----
+2
+
+# verify timestamp data with formatting options
+query PPPPPP
+SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'),
to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1,
'%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s')
+----
+NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01
1969-12-31T23:59:59 1969-12-31T23:59:59
+
+# verify timestamp data with formatting options
+query PPPPPP
+SELECT to_timestamp(null, '%+'), to_timestamp(0, '%s'),
to_timestamp(1926632005, '%s'), to_timestamp(1, '%+', '%s'), to_timestamp(-1,
'%c', '%+', '%s'), to_timestamp(0-1, '%c', '%+', '%s')
+----
+NULL 1970-01-01T00:00:00 2031-01-19T23:33:25 1970-01-01T00:00:01
1969-12-31T23:59:59 1969-12-31T23:59:59
+
+# verify timestamp output types with formatting options
+query TTT
+SELECT arrow_typeof(to_timestamp(1, '%c', '%s')),
arrow_typeof(to_timestamp(null, '%+', '%s')),
arrow_typeof(to_timestamp('2023-01-10 12:34:56.000', '%Y-%m-%d %H:%M:%S%.f'))
+----
+Timestamp(ns) Timestamp(ns) Timestamp(ns)
+
+# to_timestamp with invalid formatting
+query error strptime parsing failed: expected to match literal byte `T` from
format string, but found byte ` ` in input
+SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%c', '%+')
+
+# to_timestamp_nanos with invalid formatting
+query error strptime parsing failed: expected to match literal byte `T` from
format string, but found byte ` ` in input
+SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%c', '%+')
+
+# to_timestamp_millis with invalid formatting
+query error strptime parsing failed: expected to match literal byte `T` from
format string, but found byte ` ` in input
+SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%c', '%+')
+
+# to_timestamp_micros with invalid formatting
+query error strptime parsing failed: expected to match literal byte `T` from
format string, but found byte ` ` in input
+SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%c', '%+')
+
+# to_timestamp_seconds with invalid formatting
+query error strptime parsing failed: expected to match literal byte `T` from
format string, but found byte ` ` in input
+SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%c', '%+')
+
+# to_timestamp with broken formatting
+query error DataFusion error: Execution error: Error parsing timestamp from
'2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]:
+SELECT to_timestamp('2020-09-08 12/00/00+00:00', '%q')
+
+# to_timestamp_nanos with broken formatting
+query error DataFusion error: Execution error: Error parsing timestamp from
'2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]:
+SELECT to_timestamp_nanos('2020-09-08 12/00/00+00:00', '%q')
+
+# to_timestamp_millis with broken formatting
+query error DataFusion error: Execution error: Error parsing timestamp from
'2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]:
+SELECT to_timestamp_millis('2020-09-08 12/00/00+00:00', '%q')
+
+# to_timestamp_micros with broken formatting
+query error DataFusion error: Execution error: Error parsing timestamp from
'2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]:
+SELECT to_timestamp_micros('2020-09-08 12/00/00+00:00', '%q')
+
+# to_timestamp_seconds with broken formatting
+query error DataFusion error: Execution error: Error parsing timestamp from
'2020\-09\-08 12/00/00\+00:00' using formats: \["%q"\]:
+SELECT to_timestamp_seconds('2020-09-08 12/00/00+00:00', '%q')
+
+# Create string timestamp table with different formats
+# including a few very non-standard formats
+
+statement ok
+create table ts_utf8_data(ts varchar(100), format varchar(100)) as values
+ ('2020-09-08 12/00/00+00:00', '%Y-%m-%d %H/%M/%S%#:z'),
+ ('2031-01-19T23:33:25+05:00', '%+'),
+ ('08-09-2020 12:00:00+00:00', '%d-%m-%Y %H:%M:%S%#:z'),
+ ('1926632005', '%s'),
+ ('2000-01-01T01:01:01+07:00', '%+');
+
+statement ok
+create table ts_largeutf8_data as
+select arrow_cast(ts, 'LargeUtf8') as ts, arrow_cast(format, 'LargeUtf8') as
format from ts_utf8_data;
+
+statement ok
+create table ts_utf8view_data as
+select arrow_cast(ts, 'Utf8View') as ts, arrow_cast(format, 'Utf8View') as
format from ts_utf8_data;
+
+# verify timestamp data using tables with formatting options
+query P
+SELECT to_timestamp(t.ts, t.format) from ts_utf8_data as t
+----
+2020-09-08T12:00:00
+2031-01-19T18:33:25
+2020-09-08T12:00:00
+2031-01-19T23:33:25
+1999-12-31T18:01:01
+
+query PPPPP
+SELECT to_timestamp(t.ts, t.format),
+ to_timestamp_seconds(t.ts, t.format),
+ to_timestamp_millis(t.ts, t.format),
+ to_timestamp_micros(t.ts, t.format),
+ to_timestamp_nanos(t.ts, t.format)
+ from ts_largeutf8_data as t
+----
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25
2031-01-19T18:33:25 2031-01-19T18:33:25
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25
2031-01-19T23:33:25 2031-01-19T23:33:25
+1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01
1999-12-31T18:01:01 1999-12-31T18:01:01
+
+query PPPPP
+SELECT to_timestamp(t.ts, t.format),
+ to_timestamp_seconds(t.ts, t.format),
+ to_timestamp_millis(t.ts, t.format),
+ to_timestamp_micros(t.ts, t.format),
+ to_timestamp_nanos(t.ts, t.format)
+ from ts_utf8view_data as t
+----
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25
2031-01-19T18:33:25 2031-01-19T18:33:25
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25
2031-01-19T23:33:25 2031-01-19T23:33:25
+1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01
1999-12-31T18:01:01 1999-12-31T18:01:01
+
+# verify timestamp data using tables with formatting options
+query PPPPP
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z'),
+ to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z')
+ from ts_utf8_data as t
+----
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25
2031-01-19T18:33:25 2031-01-19T18:33:25
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25
2031-01-19T23:33:25 2031-01-19T23:33:25
+1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01
1999-12-31T18:01:01 1999-12-31T18:01:01
+
+query PPPPP
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z'),
+ to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z')
+ from ts_largeutf8_data as t
+----
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25
2031-01-19T18:33:25 2031-01-19T18:33:25
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25
2031-01-19T23:33:25 2031-01-19T23:33:25
+1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01
1999-12-31T18:01:01 1999-12-31T18:01:01
+
+query PPPPP
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z'),
+ to_timestamp_seconds(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_millis(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_micros(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s',
'%d-%m-%Y %H:%M:%S%#:z'),
+ to_timestamp_nanos(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%+', '%s', '%d-%m-%Y
%H:%M:%S%#:z')
+ from ts_utf8view_data as t
+----
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T18:33:25 2031-01-19T18:33:25 2031-01-19T18:33:25
2031-01-19T18:33:25 2031-01-19T18:33:25
+2020-09-08T12:00:00 2020-09-08T12:00:00 2020-09-08T12:00:00
2020-09-08T12:00:00 2020-09-08T12:00:00
+2031-01-19T23:33:25 2031-01-19T23:33:25 2031-01-19T23:33:25
2031-01-19T23:33:25 2031-01-19T23:33:25
+1999-12-31T18:01:01 1999-12-31T18:01:01 1999-12-31T18:01:01
1999-12-31T18:01:01 1999-12-31T18:01:01
+
+# verify timestamp data using tables with formatting options where at least
one column cannot be parsed
+query error
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#z', '%+', '%d-%m-%Y
%H:%M:%S%#:z') from ts_utf8_data as t
+----
+DataFusion error: Execution error: Error parsing timestamp from '2020-09-08
12/00/00+00:00' using formats: ["%Y-%m-%d %H/%M/%S%#z", "%+", "%d-%m-%Y
%H:%M:%S%#:z"]: strptime parsing failed: expected to match literal byte `-`
from format string, but found byte `2` in input
+
+
+# verify timestamp data using tables with formatting options where one of the
formats is invalid
+query P
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y
%H:%M:%S%#:z', '%+') from ts_utf8_data as t
+----
+2020-09-08T12:00:00
+2031-01-19T18:33:25
+2020-09-08T12:00:00
+2031-01-19T23:33:25
+1999-12-31T18:01:01
+
+query P
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y
%H:%M:%S%#:z', '%+') from ts_largeutf8_data as t
+----
+2020-09-08T12:00:00
+2031-01-19T18:33:25
+2020-09-08T12:00:00
+2031-01-19T23:33:25
+1999-12-31T18:01:01
+
+query P
+SELECT to_timestamp(t.ts, '%Y-%m-%d %H/%M/%S%#:z', '%s', '%q', '%d-%m-%Y
%H:%M:%S%#:z', '%+') from ts_utf8view_data as t
+----
+2020-09-08T12:00:00
+2031-01-19T18:33:25
+2020-09-08T12:00:00
+2031-01-19T23:33:25
+1999-12-31T18:01:01
+
+# timestamp data using tables with formatting options in an array is not
supported at this time
+query error function unsupported data type at index 1:
+SELECT to_timestamp(t.ts, make_array('%Y-%m-%d %H/%M/%S%#:z', '%s', '%q',
'%d-%m-%Y %H:%M:%S%#:z', '%+')) from ts_utf8_data as t
+
+statement ok
+drop table ts_utf8_data
+
+statement ok
+drop table ts_data
+
Review Comment:
```suggestion
statement ok
drop table ts_largeutf8_data
statement ok
drop table ts_utf8view_data
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]