martin-g commented on code in PR #19829: URL: https://github.com/apache/datafusion/pull/19829#discussion_r2694342755
########## datafusion/spark/src/function/datetime/trunc.rs: ########## @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use datafusion_common::types::{NativeType, logical_date, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, ScalarValue, internal_err}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs, + ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, +}; + +/// Spark trunc supports date inputs only and extra format aliases. +/// Also spark trunc's argument order is (date, format). +/// <https://spark.apache.org/docs/latest/api/sql/index.html#trunc> +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTrunc { + signature: Signature, +} + +impl Default for SparkTrunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkTrunc { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Date, + ), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkTrunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + self.name(), + args.arg_fields[0].data_type().clone(), + nullable, + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + internal_err!( + "spark date_trunc should have been simplified to standard date_trunc" + ) + } + + fn simplify( + &self, + args: Vec<Expr>, + info: &SimplifyContext, + ) -> Result<ExprSimplifyResult> { + let [dt_expr, fmt_expr] = take_function_args(self.name(), args)?; + + let fmt = match fmt_expr.as_literal() { + Some(ScalarValue::Utf8(Some(v))) + | Some(ScalarValue::Utf8View(Some(v))) + | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(), + _ => { + return internal_err!( + "First argument of `TRUNC` must be non-null scalar Utf8" Review Comment: ```suggestion "Second argument of `TRUNC` must be non-null scalar Utf8" ``` `fmt` is the second ########## datafusion/sqllogictest/test_files/spark/datetime/trunc.slt: ########## @@ -15,28 +15,79 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT trunc('2009-02-12', 'MM'); -## PySpark 3.5.5 Result: {'trunc(2009-02-12, MM)': datetime.date(2009, 2, 1), 'typeof(trunc(2009-02-12, MM))': 'date', 'typeof(2009-02-12)': 'string', 'typeof(MM)': 'string'} -#query -#SELECT trunc('2009-02-12'::string, 'MM'::string); - -## Original Query: SELECT trunc('2015-10-27', 'YEAR'); -## PySpark 3.5.5 Result: {'trunc(2015-10-27, YEAR)': datetime.date(2015, 1, 1), 'typeof(trunc(2015-10-27, YEAR))': 'date', 'typeof(2015-10-27)': 'string', 'typeof(YEAR)': 'string'} -#query -#SELECT trunc('2015-10-27'::string, 'YEAR'::string); - -## Original Query: SELECT trunc('2019-08-04', 'quarter'); -## PySpark 3.5.5 Result: {'trunc(2019-08-04, quarter)': datetime.date(2019, 7, 1), 'typeof(trunc(2019-08-04, quarter))': 'date', 'typeof(2019-08-04)': 'string', 'typeof(quarter)': 'string'} -#query -#SELECT trunc('2019-08-04'::string, 'quarter'::string); - -## Original Query: SELECT trunc('2019-08-04', 'week'); -## PySpark 3.5.5 Result: {'trunc(2019-08-04, week)': datetime.date(2019, 7, 29), 'typeof(trunc(2019-08-04, week))': 'date', 'typeof(2019-08-04)': 'string', 'typeof(week)': 'string'} -#query -#SELECT trunc('2019-08-04'::string, 'week'::string); +# YEAR - truncate to first date of year +query D +SELECT trunc('2009-02-12'::date, 'YEAR'::string); +---- +2009-01-01 + +query D +SELECT trunc('2009-02-12'::date, 'YYYY'::string); +---- +2009-01-01 + +query D +SELECT trunc('2009-02-12'::date, 'YY'::string); +---- +2009-01-01 + +# QUARTER - truncate to first date of quarter +query D +SELECT trunc('2009-02-12'::date, 'QUARTER'::string); +---- +2009-01-01 + +# MONTH - truncate to first date of month +query D +SELECT trunc('2009-02-12'::date, 'MONTH'::string); +---- +2009-02-01 + +query D +SELECT trunc('2009-02-12'::date, 'MM'::string); +---- +2009-02-01 + +query D +SELECT trunc('2009-02-12'::date, 'MON'::string); +---- +2009-02-01 + +# WEEK - truncate to Monday of the week +query D +SELECT trunc('2009-02-12'::date, 'WEEK'::string); +---- +2009-02-09 + +# string input +query D +SELECT trunc('2009-02-12'::string, 'YEAR'::string); +---- +2009-01-01 + +query D +SELECT trunc(column1, 'YEAR'::string) +FROM VALUES +('2009-02-12'::date), +('2000-02-12'::date), +('2042-02-12'::date), +(NULL::date); +---- +2009-01-01 +2000-01-01 +2042-01-01 +NULL + +# Null handling +query D +SELECT trunc(NULL::date, 'YEAR'::string); +---- +NULL + +query error Internal error: First argument of `TRUNC` must be non-null scalar Utf8. Review Comment: ```suggestion query error Internal error: Second argument of `TRUNC` must be non-null scalar Utf8. ``` ########## datafusion/spark/src/function/datetime/time_trunc.rs: ########## @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::types::logical_string; +use datafusion_common::{Result, ScalarValue, internal_err}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, + Signature, TypeSignatureClass, Volatility, +}; + +/// Spark time_trunc function only handles time inputs. +/// <https://spark.apache.org/docs/latest/api/sql/index.html#time_trunc> +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTimeTrunc { + signature: Signature, +} + +impl Default for SparkTimeTrunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkTimeTrunc { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Time), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkTimeTrunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "time_trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + self.name(), + args.arg_fields[1].data_type().clone(), + nullable, + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + internal_err!( + "spark time_trunc should have been simplified to standard date_trunc" + ) + } + + fn simplify( + &self, + args: Vec<Expr>, + _info: &SimplifyContext, + ) -> Result<ExprSimplifyResult> { + let fmt_expr = &args[0]; + + let fmt = match fmt_expr.as_literal() { + Some(ScalarValue::Utf8(Some(v))) + | Some(ScalarValue::Utf8View(Some(v))) + | Some(ScalarValue::LargeUtf8(Some(v))) => v.to_lowercase(), + _ => { + return internal_err!( + "First argument of `TIME_TRUNC` must be non-null scalar Utf8" + ); + } + }; + + if !matches!( + fmt.as_str(), + "hour" | "minute" | "second" | "millisecond" | "microsecond" + ) { + return internal_err!( + "The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond" + ); + } + + Ok(ExprSimplifyResult::Simplified(Expr::ScalarFunction( + ScalarFunction::new_udf(datafusion_functions::datetime::date_trunc(), args), Review Comment: `fmt` is normalized (lowercased) above and validated but here you pass the original `args` (non-normalized). Maybe it will be better to pass the `fmt`: ```rust let fmt_expr = Expr::Literal(ScalarValue::new_utf8(fmt.as_str()), None); ... ScalarFunction::new_udf( datafusion_functions::datetime::date_trunc(), vec![fmt_expr, time_expr], ), ``` ########## datafusion/sqllogictest/test_files/spark/datetime/time_trunc.slt: ########## @@ -0,0 +1,74 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# HOUR - zero out minute and second with fraction +query D +SELECT time_trunc('HOUR', '09:32:05.123456'::time); +---- +09:00:00 + +# MINUTE - zero out second with fraction +query D +SELECT time_trunc('MINUTE', '09:32:05.123456'::time); +---- +09:32:00 + +# SECOND - zero out fraction +query D +SELECT time_trunc('SECOND', '09:32:05.123456'::time); +---- +09:32:05 + +# MILLISECOND - zero out microseconds +query D +SELECT time_trunc('MILLISECOND', '09:32:05.123456'::time); +---- +09:32:05.123 + +# MICROSECOND - everything remains +query D +SELECT time_trunc('MICROSECOND', '09:32:05.123456'::time); +---- +09:32:05.123456 + +query D +SELECT time_trunc('HOUR', column1) +FROM VALUES +('09:32:05.123456'::time), +('22:45:30.654321'::time), +('14:20:10.000001'::time), +(NULL::time); +---- +09:00:00 +22:00:00 +14:00:00 +NULL + + +# Null handling +query error Internal error: First argument of `TIME_TRUNC` must be non-null scalar Utf8. +SELECT time_trunc(NULL, '09:32:05.123456'::time); + +query D +SELECT time_trunc('HOUR', NULL::time); +---- +NULL + +# incorrect format +query error nternal error: The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond. Review Comment: ```suggestion query error Internal error: The format argument of `TIME_TRUNC` must be one of: hour, minute, second, millisecond, microsecond. ``` ########## datafusion/spark/src/function/datetime/trunc.rs: ########## @@ -0,0 +1,145 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef, TimeUnit}; +use datafusion_common::types::{NativeType, logical_date, logical_string}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, ScalarValue, internal_err}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ExprSchemable, ReturnFieldArgs, ScalarFunctionArgs, + ScalarUDFImpl, Signature, TypeSignatureClass, Volatility, +}; + +/// Spark trunc supports date inputs only and extra format aliases. +/// Also spark trunc's argument order is (date, format). +/// <https://spark.apache.org/docs/latest/api/sql/index.html#trunc> +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTrunc { + signature: Signature, +} + +impl Default for SparkTrunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkTrunc { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_implicit( + TypeSignatureClass::Native(logical_date()), + vec![TypeSignatureClass::Native(logical_string())], + NativeType::Date, + ), + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkTrunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + self.name(), + args.arg_fields[0].data_type().clone(), + nullable, + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + internal_err!( + "spark date_trunc should have been simplified to standard date_trunc" Review Comment: ```suggestion "spark trunc should have been simplified to standard date_trunc" ``` ########## datafusion/spark/src/function/datetime/time_trunc.rs: ########## @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::any::Any; +use std::sync::Arc; + +use arrow::datatypes::{DataType, Field, FieldRef}; +use datafusion_common::types::logical_string; +use datafusion_common::{Result, ScalarValue, internal_err}; +use datafusion_expr::expr::ScalarFunction; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; +use datafusion_expr::{ + Coercion, ColumnarValue, Expr, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, + Signature, TypeSignatureClass, Volatility, +}; + +/// Spark time_trunc function only handles time inputs. +/// <https://spark.apache.org/docs/latest/api/sql/index.html#time_trunc> +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTimeTrunc { + signature: Signature, +} + +impl Default for SparkTimeTrunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkTimeTrunc { + pub fn new() -> Self { + Self { + signature: Signature::coercible( + vec![ + Coercion::new_exact(TypeSignatureClass::Native(logical_string())), + Coercion::new_exact(TypeSignatureClass::Time), + ], + Volatility::Immutable, + ), + } + } +} + +impl ScalarUDFImpl for SparkTimeTrunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "time_trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> { + internal_err!("return_field_from_args should be used instead") + } + + fn return_field_from_args(&self, args: ReturnFieldArgs) -> Result<FieldRef> { + let nullable = args.arg_fields.iter().any(|f| f.is_nullable()); + + Ok(Arc::new(Field::new( + self.name(), + args.arg_fields[1].data_type().clone(), + nullable, + ))) + } + + fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> Result<ColumnarValue> { + internal_err!( + "spark time_trunc should have been simplified to standard date_trunc" + ) + } + + fn simplify( + &self, + args: Vec<Expr>, + _info: &SimplifyContext, + ) -> Result<ExprSimplifyResult> { + let fmt_expr = &args[0]; Review Comment: ```suggestion let [fmt_expr, time_expr] = take_function_args(self.name(), args)?; ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
