This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 7f29cb050a Add `arrow_try_cast` UDF (#21130)
7f29cb050a is described below
commit 7f29cb050acd055c5cb52d620282fe9cb8ac2af0
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Tue Mar 24 08:38:16 2026 -0500
Add `arrow_try_cast` UDF (#21130)
## Which issue does this PR close?
N/A - new feature
## Rationale for this change
`arrow_cast(expr, 'DataType')` casts to Arrow data types specified as
strings but errors on failure. `try_cast(expr AS type)` returns NULL on
failure but only works with SQL types. There's currently no way to
attempt a cast to a specific Arrow type and get NULL on failure instead
of an error.
## What changes are included in this PR?
Adds a new `arrow_try_cast(expression, datatype)` scalar function that
combines the behavior of `arrow_cast` and `try_cast`:
- Accepts Arrow data type strings (like `arrow_cast`)
- Returns NULL on cast failure instead of erroring (like `try_cast`)
Implementation details:
- Reuses `arrow_cast`'s `data_type_from_args` helper (made `pub(crate)` and
given a `name` parameter so its error messages name the calling function)
- Simplifies to `Expr::TryCast` during optimization (vs `Expr::Cast` for
`arrow_cast`)
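- Registered alongside existing core functions; the `expr_fn` description of
`arrow_cast` is also corrected (it previously described NULL-replacement
behavior rather than casting)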
## Are these changes tested?
Yes — new sqllogictest file `arrow_try_cast.slt` covering:
- Successful casts (Int64, Float64, LargeUtf8, Dictionary)
- Failed cast returning NULL
- Same-type passthrough
- NULL input
- Invalid type string errors
- Multiple casts in one query
## Are there any user-facing changes?
New `arrow_try_cast` SQL function available.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
---
datafusion/functions/src/core/arrow_cast.rs | 8 +-
.../src/core/{arrow_cast.rs => arrow_try_cast.rs} | 110 ++++++---------------
datafusion/functions/src/core/mod.rs | 9 +-
.../sqllogictest/test_files/arrow_try_cast.slt | 109 ++++++++++++++++++++
docs/source/user-guide/sql/scalar_functions.md | 27 +++++
5 files changed, 178 insertions(+), 85 deletions(-)
diff --git a/datafusion/functions/src/core/arrow_cast.rs
b/datafusion/functions/src/core/arrow_cast.rs
index e555081e41..3e0a23f1ad 100644
--- a/datafusion/functions/src/core/arrow_cast.rs
+++ b/datafusion/functions/src/core/arrow_cast.rs
@@ -163,7 +163,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
info: &SimplifyContext,
) -> Result<ExprSimplifyResult> {
// convert this into a real cast
- let target_type = data_type_from_args(&args)?;
+ let target_type = data_type_from_args(self.name(), &args)?;
// remove second (type) argument
args.pop().unwrap();
let arg = args.pop().unwrap();
@@ -189,12 +189,12 @@ impl ScalarUDFImpl for ArrowCastFunc {
}
/// Returns the requested type from the arguments
-fn data_type_from_args(args: &[Expr]) -> Result<DataType> {
- let [_, type_arg] = take_function_args("arrow_cast", args)?;
+pub(crate) fn data_type_from_args(name: &str, args: &[Expr]) ->
Result<DataType> {
+ let [_, type_arg] = take_function_args(name, args)?;
let Expr::Literal(ScalarValue::Utf8(Some(val)), _) = type_arg else {
return exec_err!(
- "arrow_cast requires its second argument to be a constant string,
got {:?}",
+ "{name} requires its second argument to be a constant string, got
{:?}",
type_arg
);
};
diff --git a/datafusion/functions/src/core/arrow_cast.rs
b/datafusion/functions/src/core/arrow_try_cast.rs
similarity index 57%
copy from datafusion/functions/src/core/arrow_cast.rs
copy to datafusion/functions/src/core/arrow_try_cast.rs
index e555081e41..a221c81e07 100644
--- a/datafusion/functions/src/core/arrow_cast.rs
+++ b/datafusion/functions/src/core/arrow_try_cast.rs
@@ -15,14 +15,13 @@
// specific language governing permissions and limitations
// under the License.
-//! [`ArrowCastFunc`]: Implementation of the `arrow_cast`
+//! [`ArrowTryCastFunc`]: Implementation of the `arrow_try_cast`
use arrow::datatypes::{DataType, Field, FieldRef};
use arrow::error::ArrowError;
use datafusion_common::{
- Result, ScalarValue, arrow_datafusion_err, datatype::DataTypeExt,
- exec_datafusion_err, exec_err, internal_err, types::logical_string,
- utils::take_function_args,
+ Result, arrow_datafusion_err, datatype::DataTypeExt, exec_datafusion_err,
exec_err,
+ internal_err, types::logical_string, utils::take_function_args,
};
use std::any::Any;
@@ -33,52 +32,25 @@ use datafusion_expr::{
};
use datafusion_macros::user_doc;
-/// Implements casting to arbitrary arrow types (rather than SQL types)
-///
-/// Note that the `arrow_cast` function is somewhat special in that its
-/// return depends only on the *value* of its second argument (not its type)
-///
-/// It is implemented by calling the same underlying arrow `cast` kernel as
-/// normal SQL casts.
-///
-/// For example to cast to `int` using SQL (which is then mapped to the arrow
-/// type `Int32`)
-///
-/// ```sql
-/// select cast(column_x as int) ...
-/// ```
-///
-/// Use the `arrow_cast` function to cast to a specific arrow type
+use super::arrow_cast::data_type_from_args;
+
+/// Like [`arrow_cast`](super::arrow_cast::ArrowCastFunc) but returns NULL on
cast failure instead of erroring.
///
-/// For example
-/// ```sql
-/// select arrow_cast(column_x, 'Float64')
-/// ```
+/// This is implemented by simplifying `arrow_try_cast(expr, 'Type')` into
+/// `Expr::TryCast` during optimization.
#[user_doc(
doc_section(label = "Other Functions"),
- description = "Casts a value to a specific Arrow data type.",
- syntax_example = "arrow_cast(expression, datatype)",
+ description = "Casts a value to a specific Arrow data type, returning NULL
if the cast fails.",
+ syntax_example = "arrow_try_cast(expression, datatype)",
sql_example = r#"```sql
-> select
- arrow_cast(-5, 'Int8') as a,
- arrow_cast('foo', 'Dictionary(Int32, Utf8)') as b,
- arrow_cast('bar', 'LargeUtf8') as c;
-
-+----+-----+-----+
-| a | b | c |
-+----+-----+-----+
-| -5 | foo | bar |
-+----+-----+-----+
-
-> select
- arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs, "+08:00")') as d,
- arrow_cast('2023-01-02T12:53:02', 'Timestamp(µs)') as e;
-
-+---------------------------+---------------------+
-| d | e |
-+---------------------------+---------------------+
-| 2023-01-02T12:53:02+08:00 | 2023-01-02T12:53:02 |
-+---------------------------+---------------------+
+> select arrow_try_cast('123', 'Int64') as a,
+ arrow_try_cast('not_a_number', 'Int64') as b;
+
++-----+------+
+| a | b |
++-----+------+
+| 123 | NULL |
++-----+------+
```"#,
argument(
name = "expression",
@@ -90,17 +62,17 @@ use datafusion_macros::user_doc;
)
)]
#[derive(Debug, PartialEq, Eq, Hash)]
-pub struct ArrowCastFunc {
+pub struct ArrowTryCastFunc {
signature: Signature,
}
-impl Default for ArrowCastFunc {
+impl Default for ArrowTryCastFunc {
fn default() -> Self {
Self::new()
}
}
-impl ArrowCastFunc {
+impl ArrowTryCastFunc {
pub fn new() -> Self {
Self {
signature: Signature::coercible(
@@ -114,13 +86,13 @@ impl ArrowCastFunc {
}
}
-impl ScalarUDFImpl for ArrowCastFunc {
+impl ScalarUDFImpl for ArrowTryCastFunc {
fn as_any(&self) -> &dyn Any {
self
}
fn name(&self) -> &str {
- "arrow_cast"
+ "arrow_try_cast"
}
fn signature(&self) -> &Signature {
@@ -132,8 +104,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
}
fn return_field_from_args(&self, args: ReturnFieldArgs) ->
Result<FieldRef> {
- let nullable = args.arg_fields.iter().any(|f| f.is_nullable());
-
+ // TryCast can always return NULL (on cast failure), so always nullable
let [_, type_arg] = take_function_args(self.name(),
args.scalar_arguments)?;
type_arg
@@ -146,7 +117,9 @@ impl ScalarUDFImpl for ArrowCastFunc {
)
},
|casted_type| match casted_type.parse::<DataType>() {
- Ok(data_type) => Ok(Field::new(self.name(), data_type,
nullable).into()),
+ Ok(data_type) => {
+ Ok(Field::new(self.name(), data_type, true).into())
+ }
Err(ArrowError::ParseError(e)) =>
Err(exec_datafusion_err!("{e}")),
Err(e) => Err(arrow_datafusion_err!(e)),
},
@@ -154,7 +127,7 @@ impl ScalarUDFImpl for ArrowCastFunc {
}
fn invoke_with_args(&self, _args: ScalarFunctionArgs) ->
Result<ColumnarValue> {
- internal_err!("arrow_cast should have been simplified to cast")
+ internal_err!("arrow_try_cast should have been simplified to try_cast")
}
fn simplify(
@@ -162,24 +135,20 @@ impl ScalarUDFImpl for ArrowCastFunc {
mut args: Vec<Expr>,
info: &SimplifyContext,
) -> Result<ExprSimplifyResult> {
- // convert this into a real cast
- let target_type = data_type_from_args(&args)?;
+ let target_type = data_type_from_args(self.name(), &args)?;
// remove second (type) argument
args.pop().unwrap();
let arg = args.pop().unwrap();
let source_type = info.get_data_type(&arg)?;
let new_expr = if source_type == target_type {
- // the argument's data type is already the correct type
arg
} else {
- // Use an actual cast to get the correct type
- Expr::Cast(datafusion_expr::Cast {
+ Expr::TryCast(datafusion_expr::TryCast {
expr: Box::new(arg),
field: target_type.into_nullable_field_ref(),
})
};
- // return the newly written argument to DataFusion
Ok(ExprSimplifyResult::Simplified(new_expr))
}
@@ -187,22 +156,3 @@ impl ScalarUDFImpl for ArrowCastFunc {
self.doc()
}
}
-
-/// Returns the requested type from the arguments
-fn data_type_from_args(args: &[Expr]) -> Result<DataType> {
- let [_, type_arg] = take_function_args("arrow_cast", args)?;
-
- let Expr::Literal(ScalarValue::Utf8(Some(val)), _) = type_arg else {
- return exec_err!(
- "arrow_cast requires its second argument to be a constant string,
got {:?}",
- type_arg
- );
- };
-
- val.parse().map_err(|e| match e {
- // If the data type cannot be parsed, return a Plan error to signal an
- // error in the input rather than a more general ArrowError
- ArrowError::ParseError(e) => exec_datafusion_err!("{e}"),
- e => arrow_datafusion_err!(e),
- })
-}
diff --git a/datafusion/functions/src/core/mod.rs
b/datafusion/functions/src/core/mod.rs
index a14d563737..e8737612a1 100644
--- a/datafusion/functions/src/core/mod.rs
+++ b/datafusion/functions/src/core/mod.rs
@@ -22,6 +22,7 @@ use std::sync::Arc;
pub mod arrow_cast;
pub mod arrow_metadata;
+pub mod arrow_try_cast;
pub mod arrowtypeof;
pub mod coalesce;
pub mod expr_ext;
@@ -42,6 +43,7 @@ pub mod version;
// create UDFs
make_udf_function!(arrow_cast::ArrowCastFunc, arrow_cast);
+make_udf_function!(arrow_try_cast::ArrowTryCastFunc, arrow_try_cast);
make_udf_function!(nullif::NullIfFunc, nullif);
make_udf_function!(nvl::NVLFunc, nvl);
make_udf_function!(nvl2::NVL2Func, nvl2);
@@ -67,7 +69,11 @@ pub mod expr_fn {
arg1 arg2
),(
arrow_cast,
- "Returns value2 if value1 is NULL; otherwise it returns value1",
+ "Casts a value to a specific Arrow data type",
+ arg1 arg2
+ ),(
+ arrow_try_cast,
+ "Casts a value to a specific Arrow data type, returning NULL if the
cast fails",
arg1 arg2
),(
nvl,
@@ -140,6 +146,7 @@ pub fn functions() -> Vec<Arc<ScalarUDF>> {
vec
- [arrow_metadata](#arrow_metadata)
+- [arrow_try_cast](#arrow_try_cast)
- [arrow_typeof](#arrow_typeof)
- [get_field](#get_field)
- [version](#version)
@@ -5257,6 +5258,32 @@ arrow_metadata(expression[, key])
+-------------------------------+
```
+### `arrow_try_cast`
+
+Casts a value to a specific Arrow data type, returning NULL if the cast fails.
+
+```sql
+arrow_try_cast(expression, datatype)
+```
+
+#### Arguments
+
+- **expression**: Expression to cast. The expression can be a constant,
column, or function, and any combination of operators.
+- **datatype**: [Arrow data
type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) name to
cast to, as a string. The format is the same as that returned by
[`arrow_typeof`]
+
+#### Example
+
+```sql
+> select arrow_try_cast('123', 'Int64') as a,
+ arrow_try_cast('not_a_number', 'Int64') as b;
+
++-----+------+
+| a | b |
++-----+------+
+| 123 | NULL |
++-----+------+
+```
+
### `arrow_typeof`
Returns the name of the underlying [Arrow data
type](https://docs.rs/arrow/latest/arrow/datatypes/enum.DataType.html) of the
expression.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]