findepi commented on code in PR #13001:
URL: https://github.com/apache/datafusion/pull/13001#discussion_r1810485891
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
+struct AddIndexToStringScalarUDF {
+ name: String,
+ signature: Signature,
+ return_type: DataType,
+}
+
+impl AddIndexToStringScalarUDF {
+ fn new() -> Self {
+ Self {
+ name: "add_index_to_string".to_string(),
+ signature: Signature::exact(vec![DataType::Utf8],
Volatility::Volatile),
+ return_type: DataType::Utf8,
+ }
+ }
+}
+
+impl std::fmt::Debug for AddIndexToStringScalarUDF {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("ScalarUDF")
+ .field("name", &self.name)
+ .field("signature", &self.signature)
+ .field("fun", &"<FUNC>")
+ .finish()
+ }
+}
+
+impl ScalarUDFImpl for AddIndexToStringScalarUDF {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ &self.name
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ Ok(self.return_type.clone())
+ }
+
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ not_impl_err!("index_with_offset function does not accept arguments")
+ }
+
+ fn invoke_batch(
+ &self,
+ _args: &[ColumnarValue],
+ _number_rows: usize,
+ ) -> Result<ColumnarValue> {
+ let answer = match &_args[0] {
+ // When called with static arguments, the result is returned as an
array.
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(value))) => {
+ let mut answer = vec![];
+ for index in 1..=_number_rows {
+ // When calling a function with immutable arguments, the
result is returned with ")".
+ // Example: SELECT add_index_to_string('const_value') FROM
table;
+ answer.push(index.to_string() + ") " + value);
+ }
+ answer
+ }
+ // The result is returned as an array when called with dynamic
arguments.
+ ColumnarValue::Array(array) => {
+ let string_array = as_string_array(array);
+ let mut counter = HashMap::<&str, u64>::new();
+ string_array
+ .iter()
+ .map(|value| {
+ let value = value.expect("Unexpected null");
+ let index = counter.get(value).unwrap_or(&0) + 1;
+ counter.insert(value, index);
+
+ // When calling a function with mutable arguments, the
result is returned with ".".
+ // Example: SELECT add_index_to_string(table.value)
FROM table;
+ index.to_string() + ". " + value
+ })
+ .collect()
+ }
+ _ => unimplemented!(),
+ };
+ Ok(ColumnarValue::Array(
+ Arc::new(StringArray::from(answer)) as ArrayRef
+ ))
+ }
+}
+
+#[tokio::test]
+async fn volatile_scalar_udf_with_params() -> Result<()> {
+ {
+ let schema = Schema::new(vec![Field::new("a", DataType::Utf8, false)]);
+
+ let batch = RecordBatch::try_new(
+ Arc::new(schema.clone()),
+ vec![Arc::new(StringArray::from(vec![
+ "test_1", "test_1", "test_1", "test_2", "test_2", "test_1",
"test_2",
+ ]))],
+ )?;
+ let ctx = SessionContext::new();
+
+ ctx.register_batch("t", batch)?;
+
+ let get_new_str_udf = AddIndexToStringScalarUDF::new();
+
+ ctx.register_udf(ScalarUDF::from(get_new_str_udf));
+
+ let result =
+ plan_and_collect(&ctx, "select add_index_to_string(t.a) AS str
from t") // with dynamic function parameters
+ .await?;
+ let expected = [
+ "+-----------+",
+ "| str |",
+ "+-----------+",
+ "| 1. test_1 |",
+ "| 2. test_1 |",
+ "| 3. test_1 |",
+ "| 1. test_2 |",
+ "| 2. test_2 |",
+ "| 4. test_1 |",
+ "| 3. test_2 |",
+ "+-----------+",
+ ];
+ assert_batches_eq!(expected, &result);
+
+ let result =
+ plan_and_collect(&ctx, "select add_index_to_string('test') AS str
from t") // with fixed function parameters
+ .await?;
+ let expected = [
+ "+---------+",
+ "| str |",
+ "+---------+",
+ "| 1) test |",
+ "| 2) test |",
+ "| 3) test |",
+ "| 4) test |",
+ "| 5) test |",
+ "| 6) test |",
+ "| 7) test |",
+ "+---------+",
+ ];
+ assert_batches_eq!(expected, &result);
+
+ let result =
+ plan_and_collect(&ctx, "select add_index_to_string('test_value')
as str") // with fixed function parameters
+ .await?;
+ let expected = [
+ "+---------------+",
+ "| str |",
+ "+---------------+",
+ "| 1) test_value |",
+ "+---------------+",
+ ];
+ assert_batches_eq!(expected, &result);
+ }
+ {
Review Comment:
Nit: consider separating the test cases into separate test functions; this would
give them descriptive names.
##########
datafusion/expr/src/udf.rs:
##########
@@ -467,7 +478,28 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
/// to arrays, which will likely be simpler code, but be slower.
///
/// [invoke_no_args]: ScalarUDFImpl::invoke_no_args
- fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue>;
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ not_impl_err!(
+ "Function {} does not implement invoke but called",
+ self.name()
+ )
+ }
+
+ /// Invoke the function with `args` and the number of rows,
+ /// returning the appropriate result.
+ ///
+ /// The function should be used for signatures with
[`datafusion_expr_common::signature::Volatility::Volatile`]
+ /// and with arguments.
+ fn invoke_batch(
+ &self,
+ _args: &[ColumnarValue],
+ _number_rows: usize,
Review Comment:
These are not unused, let's remove leading `_` from arg names
##########
datafusion/expr/src/udf.rs:
##########
@@ -467,7 +478,28 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
/// to arrays, which will likely be simpler code, but be slower.
///
/// [invoke_no_args]: ScalarUDFImpl::invoke_no_args
- fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue>;
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
Review Comment:
> as a follow on PR I think we should deprecate the other two functions
> (`invoke_no_args` and `invoke`) telling people to use `invoke` instead

did you mean `invoke_batch`?
yes, it would be great to have only one invoke entry-point
##########
datafusion/expr/src/udf.rs:
##########
@@ -467,7 +478,28 @@ pub trait ScalarUDFImpl: Debug + Send + Sync {
/// to arrays, which will likely be simpler code, but be slower.
///
/// [invoke_no_args]: ScalarUDFImpl::invoke_no_args
- fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue>;
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ not_impl_err!(
+ "Function {} does not implement invoke but called",
+ self.name()
+ )
+ }
+
+ /// Invoke the function with `args` and the number of rows,
+ /// returning the appropriate result.
+ ///
+ /// The function should be used for signatures with
[`datafusion_expr_common::signature::Volatility::Volatile`]
+ /// and with arguments.
Review Comment:
Only for these?
if yes => the function should be named appropriately (`invoke_volatile`)
if no => remove the comment
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
Review Comment:
```suggestion
/// Volatile UDF that should append a different value to each row
#[derive(Debug)]
```
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
+struct AddIndexToStringScalarUDF {
+ name: String,
+ signature: Signature,
+ return_type: DataType,
+}
+
+impl AddIndexToStringScalarUDF {
+ fn new() -> Self {
+ Self {
+ name: "add_index_to_string".to_string(),
+ signature: Signature::exact(vec![DataType::Utf8],
Volatility::Volatile),
+ return_type: DataType::Utf8,
+ }
+ }
+}
+
+impl std::fmt::Debug for AddIndexToStringScalarUDF {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("ScalarUDF")
+ .field("name", &self.name)
+ .field("signature", &self.signature)
+ .field("fun", &"<FUNC>")
+ .finish()
+ }
+}
+
+impl ScalarUDFImpl for AddIndexToStringScalarUDF {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ &self.name
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ Ok(self.return_type.clone())
+ }
+
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ not_impl_err!("index_with_offset function does not accept arguments")
+ }
+
+ fn invoke_batch(
+ &self,
+ _args: &[ColumnarValue],
+ _number_rows: usize,
+ ) -> Result<ColumnarValue> {
+ let answer = match &_args[0] {
+ // When called with static arguments, the result is returned as an
array.
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(value))) => {
+ let mut answer = vec![];
+ for index in 1..=_number_rows {
+ // When calling a function with immutable arguments, the
result is returned with ")".
+ // Example: SELECT add_index_to_string('const_value') FROM
table;
+ answer.push(index.to_string() + ") " + value);
+ }
+ answer
+ }
+ // The result is returned as an array when called with dynamic
arguments.
+ ColumnarValue::Array(array) => {
+ let string_array = as_string_array(array);
+ let mut counter = HashMap::<&str, u64>::new();
+ string_array
+ .iter()
+ .map(|value| {
+ let value = value.expect("Unexpected null");
+ let index = counter.get(value).unwrap_or(&0) + 1;
+ counter.insert(value, index);
+
+ // When calling a function with mutable arguments, the
result is returned with ".".
+ // Example: SELECT add_index_to_string(table.value)
FROM table;
+ index.to_string() + ". " + value
+ })
+ .collect()
+ }
+ _ => unimplemented!(),
+ };
+ Ok(ColumnarValue::Array(
Review Comment:
Is it OK for this function to return an array even when it's invoked with
`ColumnarValue::Scalar` only?
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
+struct AddIndexToStringScalarUDF {
Review Comment:
The volatility is important, let's reflect it in the function name -- it's
this function's main purpose, not an attribute it happens to have.
`AddIndexToStringVolatileScalarUDF`
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
+struct AddIndexToStringScalarUDF {
+ name: String,
+ signature: Signature,
+ return_type: DataType,
+}
+
+impl AddIndexToStringScalarUDF {
+ fn new() -> Self {
+ Self {
+ name: "add_index_to_string".to_string(),
+ signature: Signature::exact(vec![DataType::Utf8],
Volatility::Volatile),
+ return_type: DataType::Utf8,
+ }
+ }
+}
+
+impl std::fmt::Debug for AddIndexToStringScalarUDF {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("ScalarUDF")
+ .field("name", &self.name)
+ .field("signature", &self.signature)
+ .field("fun", &"<FUNC>")
+ .finish()
+ }
+}
+
Review Comment:
Leverage `#[derive(Debug)]`, to ensure all fields are part of debug.
```suggestion
```
##########
datafusion/core/tests/user_defined/user_defined_scalar_functions.rs:
##########
@@ -483,6 +485,196 @@ async fn test_user_defined_functions_with_alias() ->
Result<()> {
Ok(())
}
+/// Volatile UDF that should be append a different value to each row
+struct AddIndexToStringScalarUDF {
+ name: String,
+ signature: Signature,
+ return_type: DataType,
+}
+
+impl AddIndexToStringScalarUDF {
+ fn new() -> Self {
+ Self {
+ name: "add_index_to_string".to_string(),
+ signature: Signature::exact(vec![DataType::Utf8],
Volatility::Volatile),
+ return_type: DataType::Utf8,
+ }
+ }
+}
+
+impl std::fmt::Debug for AddIndexToStringScalarUDF {
+ fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+ f.debug_struct("ScalarUDF")
+ .field("name", &self.name)
+ .field("signature", &self.signature)
+ .field("fun", &"<FUNC>")
+ .finish()
+ }
+}
+
+impl ScalarUDFImpl for AddIndexToStringScalarUDF {
+ fn as_any(&self) -> &dyn Any {
+ self
+ }
+
+ fn name(&self) -> &str {
+ &self.name
+ }
+
+ fn signature(&self) -> &Signature {
+ &self.signature
+ }
+
+ fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
+ Ok(self.return_type.clone())
+ }
+
+ fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ not_impl_err!("index_with_offset function does not accept arguments")
+ }
+
+ fn invoke_batch(
+ &self,
+ _args: &[ColumnarValue],
+ _number_rows: usize,
+ ) -> Result<ColumnarValue> {
+ let answer = match &_args[0] {
+ // When called with static arguments, the result is returned as an
array.
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(value))) => {
+ let mut answer = vec![];
+ for index in 1..=_number_rows {
+ // When calling a function with immutable arguments, the
result is returned with ")".
+ // Example: SELECT add_index_to_string('const_value') FROM
table;
+ answer.push(index.to_string() + ") " + value);
+ }
+ answer
+ }
+ // The result is returned as an array when called with dynamic
arguments.
+ ColumnarValue::Array(array) => {
+ let string_array = as_string_array(array);
+ let mut counter = HashMap::<&str, u64>::new();
+ string_array
+ .iter()
+ .map(|value| {
+ let value = value.expect("Unexpected null");
+ let index = counter.get(value).unwrap_or(&0) + 1;
+ counter.insert(value, index);
+
+ // When calling a function with mutable arguments, the
result is returned with ".".
+ // Example: SELECT add_index_to_string(table.value)
FROM table;
+ index.to_string() + ". " + value
+ })
+ .collect()
+ }
+ _ => unimplemented!(),
+ };
+ Ok(ColumnarValue::Array(
+ Arc::new(StringArray::from(answer)) as ArrayRef
Review Comment:
```suggestion
Arc::new(StringArray::from(answer))
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]