alamb commented on code in PR #15168:
URL: https://github.com/apache/datafusion/pull/15168#discussion_r1997557995


##########
datafusion/sqllogictest/test_files/spark/string/ascii.slt:
##########
@@ -0,0 +1,43 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+query I
+SELECT spark_ascii('234');
+----
+50
+
+query I
+SELECT spark_ascii('');
+----
+0
+
+query I
+SELECT spark_ascii('222');
+----
+50
+
+query I
+SELECT spark_ascii(2::INT);
+----
+50
+
+query I
+SELECT spark_ascii(a) FROM (VALUES ('Spark'), ('PySpark'), ('Pandas API')) AS t(a);
+----
+83
+80
+80

Review Comment:
   
   There appears to be a (likely unintended) change to the parquet-testing submodule in this PR too:
   
   ![Screenshot 2025-03-16 at 6 52 10 AM](https://github.com/user-attachments/assets/ba6cda29-fc58-482e-8819-4cad0f7ae967)
   



##########
datafusion/spark/src/function/string/ascii.rs:
##########
@@ -0,0 +1,208 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::array::{ArrayAccessor, ArrayIter, ArrayRef, AsArray, Int32Array};
+use arrow::datatypes::DataType;
+use arrow::error::ArrowError;
+use datafusion_common::{internal_err, plan_err, Result};
+use datafusion_expr::{ColumnarValue, Documentation};
+use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
+use datafusion_functions::utils::make_scalar_function;
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark String Functions"),
+    description = "Returns the ASCII code point of the first character of 
string.",
+    syntax_example = "ascii(str)",
+    sql_example = r#"```sql
+> select ascii('abc');
++--------------------+
+| ascii(abc)         |
++--------------------+
+| 97                 |
++--------------------+
+> select ascii('🚀');
++-------------------+
+| ascii(🚀)         |
++-------------------+
+| 128640            |
++-------------------+
+> select ascii(2);
++----------------+
+| ascii(2)       |
++----------------+
+| 50             |
++----------------+
+> select ascii(X'44617461467573696F6E');
++--------------------------------------+
+| ascii(X'44617461467573696F6E')       |
++--------------------------------------+
+| 68                                   |
++--------------------------------------+
+```"#,
+    standard_argument(name = "str", prefix = "String")
+)]
+#[derive(Debug)]
+pub struct SparkAscii {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkAscii {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkAscii {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkAscii {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "spark_ascii"

Review Comment:
   see above



##########
datafusion/spark/src/function/math/expm1.rs:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::function::error_utils::{
+    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+};
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark Math Functions"),
+    description = "Returns exp(expr) - 1 as a Float64.",
+    syntax_example = "expm1(expr)",
+    sql_example = r#"```sql
+> select expm1(0);
++--------------------+
+| expm1(0)           |
++--------------------+
+| 0.0                |
++--------------------+
+> select expm1(1);
++-------------------+
+| expm1(1)          |
++-------------------+
+| 1.718281828459045 |
++-------------------+
+```"#,
+    argument(
+        name = "expr",
+        description = "An expression that evaluates to a numeric."
+    )
+)]
+#[derive(Debug)]
+pub struct SparkExpm1 {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkExpm1 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkExpm1 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec!["spark_expm1".to_string()],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkExpm1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "expm1"
+    }
+
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Float64)
+    }
+
+    fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result<ColumnarValue> {
+        if args.args.len() != 1 {
+            return Err(invalid_arg_count_exec_err("expm1", (1, 1), args.args.len()));
+        }
+        match &args.args[0] {
+            ColumnarValue::Scalar(ScalarValue::Float64(value)) => Ok(
+                ColumnarValue::Scalar(ScalarValue::Float64(value.map(|x| x.exp_m1()))),
+            ),
+            ColumnarValue::Array(array) => match array.data_type() {
+                DataType::Float64 => Ok(ColumnarValue::Array(Arc::new(
+                    array
+                        .as_primitive::<Float64Type>()
+                        .unary::<_, Float64Type>(|x| x.exp_m1()),
+                )
+                    as ArrayRef)),
+                other => Err(unsupported_data_type_exec_err(
+                    "expm1",
+                    format!("{}", DataType::Float64).as_str(),
+                    other,
+                )),
+            },
+            other => Err(unsupported_data_type_exec_err(
+                "expm1",
+                format!("{}", DataType::Float64).as_str(),
+                &other.data_type(),
+            )),
+        }
+    }
+
+    fn aliases(&self) -> &[String] {
+        &self.aliases
+    }
+
+    fn coerce_types(&self, arg_types: &[DataType]) -> Result<Vec<DataType>> {
+        if arg_types.len() != 1 {
+            return Err(invalid_arg_count_exec_err("expm1", (1, 1), arg_types.len()));
+        }
+        if arg_types[0].is_numeric() {
+            Ok(vec![DataType::Float64])
+        } else {
+            Err(unsupported_data_type_exec_err(
+                "expm1",
+                "Numeric Type",
+                &arg_types[0],
+            ))
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::function::math::expm1::SparkExpm1;
+    use crate::function::utils::test::test_scalar_function;
+    use arrow::array::{Array, Float64Array};
+    use arrow::datatypes::DataType::Float64;
+    use datafusion_common::{Result, ScalarValue};
+    use datafusion_expr::{ColumnarValue, ScalarUDFImpl};
+
+    macro_rules! test_expm1_float64_invoke {
+        ($INPUT:expr, $EXPECTED:expr) => {
+            test_scalar_function!(
+                SparkExpm1::new(),
+                vec![ColumnarValue::Scalar(ScalarValue::Float64($INPUT))],
+                $EXPECTED,
+                f64,
+                Float64,
+                Float64Array
+            );
+        };
+    }
+
+    #[test]

Review Comment:
   Because sqllogictests are so much faster to write and update, I suggest we point people towards using sqllogictests to test the functions unless there is something that cannot be tested using `.slt` files.
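   
   For example, the `expm1` coverage could live in an `.slt` file; a minimal hedged sketch (the exact function name and float formatting depend on how the function ends up being registered):
   
   ```
   query R
   SELECT expm1(0);
   ----
   0
   
   query R
   SELECT expm1(1);
   ----
   1.718281828459045
   ```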
   
   



##########
datafusion/spark/src/function/math/expm1.rs:
##########
@@ -0,0 +1,169 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::function::error_utils::{
+    invalid_arg_count_exec_err, unsupported_data_type_exec_err,
+};
+use arrow::array::{ArrayRef, AsArray};
+use arrow::datatypes::{DataType, Float64Type};
+use datafusion_common::{Result, ScalarValue};
+use datafusion_expr::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility,
+};
+use datafusion_macros::user_doc;
+use std::any::Any;
+use std::sync::Arc;
+
+#[user_doc(
+    doc_section(label = "Spark Math Functions"),
+    description = "Returns exp(expr) - 1 as a Float64.",
+    syntax_example = "expm1(expr)",
+    sql_example = r#"```sql
+> select expm1(0);
++--------------------+
+| expm1(0)           |
++--------------------+
+| 0.0                |
++--------------------+
+> select expm1(1);
++-------------------+
+| expm1(1)          |
++-------------------+
+| 1.718281828459045 |
++-------------------+
+```"#,
+    argument(
+        name = "expr",
+        description = "An expression that evaluates to a numeric."
+    )
+)]
+#[derive(Debug)]
+pub struct SparkExpm1 {
+    signature: Signature,
+    aliases: Vec<String>,
+}
+
+impl Default for SparkExpm1 {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl SparkExpm1 {
+    pub fn new() -> Self {
+        Self {
+            signature: Signature::user_defined(Volatility::Immutable),
+            aliases: vec![],
+        }
+    }
+}
+
+impl ScalarUDFImpl for SparkExpm1 {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn name(&self) -> &str {
+        "spark_expm1"

Review Comment:
   What I would recommend is:
   1. Keep the original `expm1` name (that seems the most useful to people who are trying to get spark-compatible behavior)
   2. Change to use a function to register spark-compatible functions (see above)
   3. Change our sqllogictest driver so it registers spark functions for any test file whose name starts with `spark_` (similar to `pg_...`)
   
   That way most sqllogictest stuff stays the same, and we can write spark tests in `spark/spark_math.slt`, `spark/spark_string.slt`, etc.
   
   Here is the code that customizes the context for the individual test files:
   
   https://github.com/apache/datafusion/blob/e4bf9512616c8fbfdf785e5dcfe509f3bd1524a4/datafusion/sqllogictest/src/test_context.rs#L77-L81
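   
   A hedged sketch of what that dispatch could look like (`register_spark_functions` is the hypothetical registration helper suggested below, and the hook name is illustrative):
   
   ```rust
   use datafusion::prelude::SessionContext;
   use datafusion_common::Result;
   
   /// Illustrative driver hook: customize the SessionContext per test file,
   /// similar to the existing `pg_compat` special-casing.
   fn customize_context_for_file(file_name: &str, ctx: &mut SessionContext) -> Result<()> {
       if file_name.starts_with("spark_") {
           // Only spark_*.slt files get the Spark-compatible function set
           register_spark_functions(ctx)?;
       }
       Ok(())
   }
   ```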
   



##########
datafusion/sqllogictest/test_files/spark/math/expm1.slt:
##########
@@ -0,0 +1,32 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+
+#   http://www.apache.org/licenses/LICENSE-2.0
+
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+

Review Comment:
   > The expm1 probably works identically across all Spark versions and is not affected by different configuration settings, but many expressions are affected by settings such as ANSI mode and different date/time formats and timezones.
   
   I believe @Omega359 has a proposal for how to thread the config options into the arguments of the functions:
   - https://github.com/apache/datafusion/pull/13527



##########
datafusion/core/src/execution/session_state_defaults.rs:
##########
@@ -104,28 +104,55 @@ impl SessionStateDefaults {
 
     /// returns the list of default [`ScalarUDF']'s
     pub fn default_scalar_functions() -> Vec<Arc<ScalarUDF>> {
-        #[cfg_attr(not(feature = "nested_expressions"), allow(unused_mut))]
+        #[cfg_attr(
+            not(any(feature = "nested_expressions", feature = "spark")),
+            allow(unused_mut)
+        )]
         let mut functions: Vec<Arc<ScalarUDF>> = functions::all_default_functions();
 
         #[cfg(feature = "nested_expressions")]
         functions.append(&mut functions_nested::all_default_nested_functions());
 
+        #[cfg(feature = "spark")]
+        functions.append(&mut datafusion_spark::all_default_scalar_functions());
+

Review Comment:
   I think this approach is non-ideal because:
   1. It increases the dependency load (and number of config flags)
   2. It makes it hard to dynamically change between spark/non-spark mode in sqllogictests (see below)
   
   I would recommend making no change to the `datafusion` core crate and adding no new dependencies there.
   
   Instead, I suggest adding a function to `datafusion-functions-spark` that can register all the required functions with a [FunctionRegistry](https://docs.rs/datafusion/latest/datafusion/execution/trait.FunctionRegistry.html) (`SessionContext` [implements](https://docs.rs/datafusion/latest/datafusion/execution/trait.FunctionRegistry.html#implementors) this trait)
   
   For example,
   
   ```rust
   /// Registers spark compatible functions with the `registry`
   fn register_spark_functions(registry: &mut dyn FunctionRegistry) -> Result<()> {
       ...
   }
   ```
   
   And then to use these functions someone would do
   ```rust
   // Create default session context
   let mut ctx = SessionContext::new();
   // setup for spark execution
   register_spark_functions(&mut ctx)?;
   // run queries, profit!
   ...
   ```
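   
   For completeness, the body of that function could just iterate over the crate's function list (hedged sketch, reusing the `all_default_scalar_functions` accessor from the diff; `register_udf` comes from the `FunctionRegistry` trait):
   
   ```rust
   use datafusion::execution::FunctionRegistry;
   use datafusion_common::Result;
   
   /// Registers spark compatible functions with the `registry`
   pub fn register_spark_functions(registry: &mut dyn FunctionRegistry) -> Result<()> {
       for udf in datafusion_spark::all_default_scalar_functions() {
           // register_udf returns any previously-registered UDF of the same name; ignore it
           registry.register_udf(udf)?;
       }
       Ok(())
   }
   ```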
   



##########
datafusion/spark/README.md:
##########
@@ -0,0 +1,38 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# datafusion-spark: Spark-compatible Expressions
+
+This crate provides Apache Spark-compatible expressions for use with DataFusion.
+
+## Testing Guide
+
+When testing functions by directly invoking them (e.g., `test_scalar_function!()`), input coercion (from the `signature`
+or `coerce_types`) is not applied.
+
+Therefore, direct invocation tests should only be used to verify that the function is correctly implemented.
+
+Please be sure to add additional tests beyond direct invocation.
+For more detailed testing guidelines, refer to
+the [Spark SQLLogicTest README](../sqllogictest/test_files/spark/README.md).
+
+## Implementation References
+
+When implementing Spark-compatible functions, you can check if there are existing implementations in
+the [Sail](https://github.com/lakehq/sail) or [Comet](https://github.com/apache/datafusion-comet) projects first.

Review Comment:
   100%
   
   Maybe we can also point out that tests should be ported over too



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
