findepi commented on code in PR #16681:
URL: https://github.com/apache/datafusion/pull/16681#discussion_r2185166539


##########
datafusion/expr/src/test/udf_equals.rs:
##########
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, 
Volatility,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::{not_impl_err, Result};
+use std::{
+    any::Any,
+    hash::{Hash, Hasher},
+};
+#[derive(Debug)]
+#[allow(dead_code)]
+struct ParamUdf {
+    param: i32,
+    signature: Signature,
+}
+
+impl ParamUdf {
+    fn new(param: i32) -> Self {
+        Self {
+            param,
+            signature: Signature::exact(vec![DataType::Int32], 
Volatility::Immutable),
+        }
+    }
+}
+
+impl ScalarUDFImpl for ParamUdf {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+    fn name(&self) -> &str {
+        "param_udf"
+    }
+    fn signature(&self) -> &Signature {
+        &self.signature
+    }
+    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+        Ok(DataType::Int32)
+    }
+    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+        not_impl_err!("not used")
+    }
+    fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
+        if let Some(other) = other.as_any().downcast_ref::<ParamUdf>() {
+            self.param == other.param && self.type_id() == other.type_id()

Review Comment:
   Is it intentional not to compare signature? if ues, let's add a code comment
   if no, the ParamUdf could use derive(PartialEq) and then `self == other` here



##########
docs/source/library-user-guide/upgrading.md:
##########
@@ -62,6 +62,36 @@ DataFusionError::SchemaError(
 
 [#16652]: https://github.com/apache/datafusion/issues/16652
 
+#### `ScalarUDFImpl::equals` Default Implementation
+
+The default implementation of the `equals` method in the `ScalarUDFImpl` trait 
has been updated. Previously, it compared only the type IDs, names, and 
signatures of UDFs. Now, it assumes UDFs are not equal unless their pointers 
are the same.

Review Comment:
   This is safe default. Executing this upgrade suggestion is necessary for 
common expression elimination to work.
   Is it worth considering doing the change more as a breaking change and 
require explicit upgrade step?
   
   It's more work for sure, but also -- the result is much more predictible.
   
   For example, we could require ScalarUDFImpl to implement PartialEq and 
delegate to it.



##########
datafusion/expr/src/udf.rs:
##########
@@ -696,16 +696,81 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
 
     /// Return true if this scalar UDF is equal to the other.
     ///
-    /// Allows customizing the equality of scalar UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules 
as [`Eq`]:
+    /// This method allows customizing the equality of scalar UDFs. It must 
adhere to the rules of equivalence:
     ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
+    /// - Reflexive: `a.equals(a)` must return true.
+    /// - Symmetric: `a.equals(b)` implies `b.equals(a)`.
+    /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
     ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
+    /// # Default Behavior
+    /// By default, this method compares the type IDs, names, and signatures 
of the two UDFs. If these match,
+    /// the method assumes the UDFs are not equal unless their pointers are 
the same. This conservative approach
+    /// ensures that different instances of the same function type are not 
mistakenly considered equal.
+    ///
+    /// # Custom Implementation
+    /// If a UDF has internal state or additional properties that should be 
considered for equality, this method
+    /// should be overridden. For example, a UDF with parameters might compare 
those parameters in addition to
+    /// the default checks.
+    ///
+    /// # Example
+    /// ```rust
+    /// use std::any::Any;
+    /// use std::hash::{DefaultHasher, Hash, Hasher};
+    /// use arrow::datatypes::DataType;
+    /// use datafusion_common::{not_impl_err, Result};
+    /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, 
ScalarUDFImpl, Signature};
+    /// #[derive(Debug)]
+    /// struct MyUdf {
+    ///  param: i32,
+    ///  signature: Signature,
+    /// }
+    ///
+    /// impl ScalarUDFImpl for MyUdf {
+    ///    fn as_any(&self) -> &dyn Any {
+    ///        self
+    ///    }
+    ///    fn name(&self) -> &str {
+    ///        "my_udf"
+    ///    }
+    ///    fn signature(&self) -> &Signature {
+    ///        &self.signature
+    ///    }
+    ///    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+    ///        Ok(DataType::Int32)
+    ///    }
+    ///    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+    ///        not_impl_err!("not used")
+    ///    }
+    ///     fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
+    ///         if let Some(other) = other.as_any().downcast_ref::<Self>() {
+    ///             self.param == other.param && self.name() == other.name()
+    ///         } else {
+    ///             false
+    ///         }
+    ///     }
+    ///     fn hash_value(&self) -> u64 {
+    ///         let mut hasher = DefaultHasher::new();
+    ///         self.param.hash(&mut hasher);
+    ///         self.name().hash(&mut hasher);
+    ///         hasher.finish()
+    ///     }
+    /// }
+    /// ```
+    ///
+    /// # Notes
+    /// - This method must be consistent with [`Self::hash_value`]. If 
`equals` returns true for two UDFs,
+    ///   their hash values must also be the same.
+    /// - Ensure that the implementation does not panic or cause undefined 
behavior for any input.
     fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
-        self.name() == other.name() && self.signature() == other.signature()
+        // if the pointers are the same, the UDFs are the same
+        if std::ptr::eq(self.as_any(), other.as_any()) {
+            return true;
+        }
+
+        // if the types, names and signatures are the same, we can't know if 
they are the same so we
+        // assume they are not.
+        // If a UDF has internal state that should be compared, it should 
implement this method
+        false

Review Comment:
   Could we do bit-by-bit comparison of the self & other, if they are the same 
type?



##########
datafusion/expr/src/test/udf_equals.rs:
##########
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, 
Volatility,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::{not_impl_err, Result};
+use std::{
+    any::Any,
+    hash::{Hash, Hasher},
+};
+#[derive(Debug)]
+#[allow(dead_code)]

Review Comment:
   can this be removed?



##########
datafusion/expr/src/test/udf_equals.rs:
##########
@@ -0,0 +1,187 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::{
+    ColumnarValue, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, 
Volatility,
+};
+use arrow::datatypes::DataType;
+use datafusion_common::{not_impl_err, Result};
+use std::{
+    any::Any,
+    hash::{Hash, Hasher},
+};

Review Comment:
   ```suggestion
   };
   
   ```



##########
datafusion/expr/src/udf.rs:
##########
@@ -696,16 +696,81 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
 
     /// Return true if this scalar UDF is equal to the other.
     ///
-    /// Allows customizing the equality of scalar UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules 
as [`Eq`]:
+    /// This method allows customizing the equality of scalar UDFs. It must 
adhere to the rules of equivalence:
     ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
+    /// - Reflexive: `a.equals(a)` must return true.
+    /// - Symmetric: `a.equals(b)` implies `b.equals(a)`.
+    /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
     ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
+    /// # Default Behavior
+    /// By default, this method compares the type IDs, names, and signatures 
of the two UDFs. If these match,
+    /// the method assumes the UDFs are not equal unless their pointers are 
the same. This conservative approach
+    /// ensures that different instances of the same function type are not 
mistakenly considered equal.
+    ///
+    /// # Custom Implementation
+    /// If a UDF has internal state or additional properties that should be 
considered for equality, this method
+    /// should be overridden. For example, a UDF with parameters might compare 
those parameters in addition to
+    /// the default checks.
+    ///
+    /// # Example
+    /// ```rust
+    /// use std::any::Any;
+    /// use std::hash::{DefaultHasher, Hash, Hasher};
+    /// use arrow::datatypes::DataType;
+    /// use datafusion_common::{not_impl_err, Result};
+    /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, 
ScalarUDFImpl, Signature};

Review Comment:
   ```suggestion
       /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, 
ScalarUDFImpl, Signature};
       ///
   ```



##########
datafusion/expr/src/udf.rs:
##########
@@ -696,16 +696,81 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
 
     /// Return true if this scalar UDF is equal to the other.
     ///
-    /// Allows customizing the equality of scalar UDFs.
-    /// Must be consistent with [`Self::hash_value`] and follow the same rules 
as [`Eq`]:
+    /// This method allows customizing the equality of scalar UDFs. It must 
adhere to the rules of equivalence:
     ///
-    /// - reflexive: `a.equals(a)`;
-    /// - symmetric: `a.equals(b)` implies `b.equals(a)`;
-    /// - transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
+    /// - Reflexive: `a.equals(a)` must return true.
+    /// - Symmetric: `a.equals(b)` implies `b.equals(a)`.
+    /// - Transitive: `a.equals(b)` and `b.equals(c)` implies `a.equals(c)`.
     ///
-    /// By default, compares [`Self::name`] and [`Self::signature`].
+    /// # Default Behavior
+    /// By default, this method compares the type IDs, names, and signatures 
of the two UDFs. If these match,
+    /// the method assumes the UDFs are not equal unless their pointers are 
the same. This conservative approach
+    /// ensures that different instances of the same function type are not 
mistakenly considered equal.
+    ///
+    /// # Custom Implementation
+    /// If a UDF has internal state or additional properties that should be 
considered for equality, this method
+    /// should be overridden. For example, a UDF with parameters might compare 
those parameters in addition to
+    /// the default checks.
+    ///
+    /// # Example
+    /// ```rust
+    /// use std::any::Any;
+    /// use std::hash::{DefaultHasher, Hash, Hasher};
+    /// use arrow::datatypes::DataType;
+    /// use datafusion_common::{not_impl_err, Result};
+    /// use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, 
ScalarUDFImpl, Signature};
+    /// #[derive(Debug)]
+    /// struct MyUdf {
+    ///  param: i32,
+    ///  signature: Signature,
+    /// }
+    ///
+    /// impl ScalarUDFImpl for MyUdf {
+    ///    fn as_any(&self) -> &dyn Any {
+    ///        self
+    ///    }
+    ///    fn name(&self) -> &str {
+    ///        "my_udf"
+    ///    }
+    ///    fn signature(&self) -> &Signature {
+    ///        &self.signature
+    ///    }
+    ///    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+    ///        Ok(DataType::Int32)
+    ///    }
+    ///    fn invoke_with_args(&self, _args: ScalarFunctionArgs) -> 
Result<ColumnarValue> {
+    ///        not_impl_err!("not used")
+    ///    }
+    ///     fn equals(&self, other: &dyn ScalarUDFImpl) -> bool {
+    ///         if let Some(other) = other.as_any().downcast_ref::<Self>() {
+    ///             self.param == other.param && self.name() == other.name()

Review Comment:
   Why not include signature in the equality check?
   Worth code comment



##########
datafusion/expr/src/test/mod.rs:
##########
@@ -16,3 +16,5 @@
 // under the License.
 
 pub mod function_stub;
+#[cfg(test)]
+pub mod udf_equals;

Review Comment:
   shouldn't udf_equals.rs simply go into tests/ folder 
(`datafusion/expr/tests`)?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to