alamb commented on code in PR #15168: URL: https://github.com/apache/datafusion/pull/15168#discussion_r1997557995
########## datafusion/sqllogictest/test_files/spark/string/ascii.slt: ##########
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT spark_ascii('234');
+----
+50
+
+query I
+SELECT spark_ascii('');
+----
+0
+
+query I
+SELECT spark_ascii('222');
+----
+50
+
+query I
+SELECT spark_ascii(2::INT);
+----
+50
+
+query I
+SELECT spark_ascii(a) FROM (VALUES ('Spark'), ('PySpark'), ('Pandas API')) AS t(a);
+----
+83
+80
+80

Review Comment:
There appears to be a (likely unintended) change to the parquet-testing submodule in this PR too:

########## datafusion/spark/src/function/string/ascii.rs: ##########
@@ -0,0 +1,208 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::{internal_err, plan_err, Result};
+use datafusion_expr::{ColumnarValue, Documentation};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
+use datafusion_functions::utils::make_scalar_function;
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark String Functions"),
+    description = "Returns the ASCII code point of the first character of string.",
+    syntax_example = "ascii(str)",
+    sql_example = r#"```sql
+> select ascii('abc');
++--------------------+
+| ascii(abc)         |
++--------------------+
+| 97                 |
++--------------------+
+> select ascii('🚀');
++-------------------+
+| ascii(🚀)         |
++-------------------+
+| 128640            |
++-------------------+
+> select ascii(2);
++----------------+
+| ascii(2)       |
++----------------+
+| 50             |
++----------------+
+> select ascii(X'44617461467573696F6E');
++--------------------------------------+
+| ascii(X'44617461467573696F6E')       |
++--------------------------------------+
+| 68                                   |
++--------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String")
+)]
+#[derive(Debug)]
+pub struct SparkAscii {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkAscii {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkAscii {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkAscii {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "spark_ascii"

Review Comment:
see above

########## datafusion/spark/src/function/math/expm1.rs: ##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::function::error_utils::{
+    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+};
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark Math Functions"),
+    description = "Returns exp(expr) - 1 as a Float64.",
+    syntax_example = "expm1(expr)",
+    sql_example = r#"```sql
+> select expm1(0);
++--------------------+
+| expm1(0)           |
++--------------------+
+| 0.0                |
++--------------------+
+> select expm1(1);
++---------------------+
+| expm1(1)            |
++---------------------+
+| 1.718281828459045   |
++---------------------+
+```"#,
+    argument(
+        name = "expr",
+        description = "An expression that evaluates to a numeric."
+    )
+)]
+#[derive(Debug)]
+pub struct SparkExpm1 {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkExpm1 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkExpm1 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec!["spark_expm1".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkExpm1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "expm1"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.len() != 1 {
+            return Err(invalid_arg_count_exec_err("expm1", (1, 1), args.args.len()));
+        }
+        match &args.args[0] {
+            ColumnarValue::Scalar(ScalarValue::Float64(value)) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Float64(value.map(|x| x.exp_m1()))),
+            ),
+            ColumnarValue::Array(array) => match array.data_type() {
+                DataType::Float64 => Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .as_primitive::<Float64Type>()
+                        .unary::<_, Float64Type>(|x| x.exp_m1()),
+                )
+                    as ArrayRef)),
+                other => Err(unsupported_data_type_exec_err(
+                    "expm1",
+                    format!("{}", DataType::Float64).as_str(),
+                    other,
+                )),
+            },
+            other => Err(unsupported_data_type_exec_err(
+                "expm1",
+                format!("{}", DataType::Float64).as_str(),
+                &other.data_type(),
+            )),
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.len() != 1 {
+            return Err(invalid_arg_count_exec_err("expm1", (1, 1), arg_types.len()));
+        }
+        if arg_types[0].is_numeric() {
+            Ok(vec![DataType::Float64])
+        } else {
+            Err(unsupported_data_type_exec_err(
+                "expm1",
+                "Numeric Type",
+                &arg_types[0],
+            ))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::function::math::expm1::SparkExpm1;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Float64Array};
+    use arrow::datatypes::DataType::Float64;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    macro_rules! test_expm1_float64_invoke {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkExpm1::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Float64($INPUT))],
+                $EXPECTED,
+                f64,
+                Float64,
+                Float64Array
+            );
+        };
+    }
+
+    #[test]

Review Comment:
Because sqllogictests are so much faster to write and update, I suggest we point people towards using sqllogictests to test the functions unless there is something that cannot be tested using `.slt` files.

########## datafusion/spark/src/function/math/expm1.rs: ##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::function::error_utils::{
+    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+};
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark Math Functions"),
+    description = "Returns exp(expr) - 1 as a Float64.",
+    syntax_example = "expm1(expr)",
+    sql_example = r#"```sql
+> select expm1(0);
++--------------------+
+| expm1(0)           |
++--------------------+
+| 0.0                |
++--------------------+
+> select expm1(1);
++---------------------+
+| expm1(1)            |
++---------------------+
+| 1.718281828459045   |
++---------------------+
+```"#,
+    argument(
+        name = "expr",
+        description = "An expression that evaluates to a numeric."
+    )
+)]
+#[derive(Debug)]
+pub struct SparkExpm1 {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkExpm1 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkExpm1 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkExpm1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "spark_expm1"

Review Comment:
What I would recommend is

1. keep the original `expm1` name (that seems the most useful to people who are trying to get spark compatible behavior)
2. Change to use a function to register spark compatible functions (see above)
3. Change our sqllogictest driver so it registers spark functions for any test that starts with `spark_*.slt` (similar to `pg_...`)

That way most sqllogictest stuff stays the same, and we can write spark tests in `spark/spark_math.slt`, `spark/spark_string.slt`, etc.

Here is the code that customizes the context for the individual test files:
https://github.com/apache/datafusion/blob/e4bf9512616c8fbfdf785e5dcfe509f3bd1524a4/datafusion/sqllogictest/src/test_context.rs#L77-L81

########## datafusion/sqllogictest/test_files/spark/math/expm1.slt: ##########
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+
+# http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+

Review Comment:
> The expm1 probably works identically across all Spark versions and is not affected by different configuration settings, but many expressions are affected by settings such as ANSI mode and different date/time formats and timezones.

I believe @Omega359 has a proposal here of how to thread the config options into the arguments of the functions:

- https://github.com/apache/datafusion/pull/13527

########## datafusion/core/src/execution/session_state_defaults.rs: ##########
@@ -104,28 +104,55 @@ impl SessionStateDefaults {
     /// returns the list of default [`ScalarUDF']'s
     pub fn default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
-        #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))]
+        #[cfg_attr(
+            not(any(feature = "nested_expressions", feature = "spark")),
+            allow(unused_mut)
+        )]
         let mut functions: Vec<Arc<ScalarUDF>> = functions::all_default_functions();
 
         #[cfg(feature = "nested_expressions")]
         functions.append(&mut functions_nested::all_default_nested_functions());
 
+        #[cfg(feature = "spark")]
+        functions.append(&mut datafusion_spark::all_default_scalar_functions());
+

Review Comment:
I think this approach is non-ideal because:

1. It increases the dependency load (and number of config flags)
2. It makes it hard to dynamically change between spark/non spark mode in sqllogictests (see below)

I would recommend making no change to the `datafusion` core crate and adding no new dependencies there.

Instead, I suggest adding a function to `datafusion-functions-spark` that can register all the required functions with a [FunctionRegistry](https://docs.rs/datafusion/latest/datafusion/execution/trait.FunctionRegistry.html) (`SessionContext` [implements](https://docs.rs/datafusion/latest/datafusion/execution/trait.FunctionRegistry.html#implementors) this trait).

For example,

```rust
/// Registers spark compatible functions with the `registry`
fn register_spark_functions(registry: &mut dyn FunctionRegistry) -> Result<()> {
    ...
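    // NOTE (editor's sketch, not part of the original review): one possible
    // body, assuming the PR's `datafusion_spark::all_default_scalar_functions()`
    // returns `Vec<Arc<ScalarUDF>>`; `register_udf` is the `FunctionRegistry`
    // trait method for adding a `ScalarUDF`:
    //
    //     for udf in datafusion_spark::all_default_scalar_functions() {
    //         registry.register_udf(udf)?;
    //     }
    //     Ok(())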
}
```

And then to use these functions someone would do

```rust
// Create default session context
let mut ctx = SessionContext::new();

// setup for spark execution
register_spark_functions(&mut ctx)?;

// run queries, profit!
...
```

########## datafusion/spark/README.md: ##########
@@ -0,0 +1,38 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# datafusion-spark: Spark-compatible Expressions
+
+This crate provides Apache Spark-compatible expressions for use with DataFusion.
+
+## Testing Guide
+
+When testing functions by directly invoking them (e.g., `test_scalar_function!()`), input coercion (from the `signature`
+or `coerce_types`) is not applied.
+
+Therefore, direct invocation tests should only be used to verify that the function is correctly implemented.
+
+Please be sure to add additional tests beyond direct invocation.
+For more detailed testing guidelines, refer to
+the [Spark SQLLogicTest README](../sqllogictest/test_files/spark/README.md).
+
+## Implementation References
+
+When implementing Spark-compatible functions, you can check if there are existing implementations in
+the [Sail](https://github.com/lakehq/sail) or [Comet](https://github.com/apache/datafusion-comet) projects first.

Review Comment:
100%. Maybe we can also point out that tests should be ported over too.
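To illustrate the sqllogictest-first suggestion above, a Spark `.slt` test can be as small as the following sketch (the `spark_math.slt` file name, the `query R` result type, and the exact printed value are assumptions, not content from this PR):

```
# expm1(1) returns e - 1
query R
SELECT expm1(1);
----
1.718281828459045
```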