This is an automated email from the ASF dual-hosted git repository.
dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 613188f add digest(utf8, method) function (#1090)
613188f is described below
commit 613188f5f53bbfffb83f0f7d60cac417980a20dc
Author: Jiayu Liu <[email protected]>
AuthorDate: Sun Oct 10 16:06:43 2021 +0800
add digest(utf8, method) function (#1090)
---
ballista/rust/core/proto/ballista.proto | 1 +
.../rust/core/src/serde/logical_plan/from_proto.rs | 7 +-
.../rust/core/src/serde/logical_plan/to_proto.rs | 1 +
.../core/src/serde/physical_plan/from_proto.rs | 1 +
datafusion/src/logical_plan/expr.rs | 14 +-
datafusion/src/logical_plan/mod.rs | 17 +-
datafusion/src/physical_plan/crypto_expressions.rs | 372 +++++++++++++--------
datafusion/src/physical_plan/functions.rs | 14 +-
datafusion/src/prelude.rs | 10 +-
datafusion/tests/sql.rs | 27 ++
10 files changed, 303 insertions(+), 161 deletions(-)
diff --git a/ballista/rust/core/proto/ballista.proto
b/ballista/rust/core/proto/ballista.proto
index 47cc801..8175156 100644
--- a/ballista/rust/core/proto/ballista.proto
+++ b/ballista/rust/core/proto/ballista.proto
@@ -153,6 +153,7 @@ enum ScalarFunction {
SHA512 = 33;
LN = 34;
TOTIMESTAMPMILLIS = 35;
+ DIGEST = 36;
}
message ScalarFunctionNode {
diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs
b/ballista/rust/core/src/serde/logical_plan/from_proto.rs
index c9ef97e..353be9a 100644
--- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs
+++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs
@@ -27,8 +27,8 @@ use datafusion::logical_plan::window_frames::{
WindowFrame, WindowFrameBound, WindowFrameUnits,
};
use datafusion::logical_plan::{
- abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round,
signum, sin,
- sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinConstraint,
JoinType,
+ abs, acos, asin, atan, ceil, cos, digest, exp, floor, ln, log10, log2,
round, signum,
+ sin, sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinConstraint,
JoinType,
LogicalPlan, LogicalPlanBuilder, Operator,
};
use datafusion::physical_plan::aggregates::AggregateFunction;
@@ -1152,6 +1152,9 @@ impl TryInto<Expr> for &protobuf::LogicalExprNode {
protobuf::ScalarFunction::Sha512 => {
Ok(sha512((&args[0]).try_into()?))
}
+ protobuf::ScalarFunction::Digest => {
+ Ok(digest((&args[0]).try_into()?,
(&args[1]).try_into()?))
+ }
_ => Err(proto_error(
"Protobuf deserialization error: Unsupported scalar
function",
)),
diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs
b/ballista/rust/core/src/serde/logical_plan/to_proto.rs
index bd7fc4d..c3ffb1a 100644
--- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs
+++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs
@@ -1485,6 +1485,7 @@ impl TryInto<protobuf::ScalarFunction> for
&BuiltinScalarFunction {
BuiltinScalarFunction::SHA256 =>
Ok(protobuf::ScalarFunction::Sha256),
BuiltinScalarFunction::SHA384 =>
Ok(protobuf::ScalarFunction::Sha384),
BuiltinScalarFunction::SHA512 =>
Ok(protobuf::ScalarFunction::Sha512),
+ BuiltinScalarFunction::Digest =>
Ok(protobuf::ScalarFunction::Digest),
BuiltinScalarFunction::ToTimestampMillis => {
Ok(protobuf::ScalarFunction::Totimestampmillis)
}
diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs
b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
index 0d23372..5241e8b 100644
--- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs
+++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
@@ -559,6 +559,7 @@ impl From<&protobuf::ScalarFunction> for
BuiltinScalarFunction {
ScalarFunction::Sha256 => BuiltinScalarFunction::SHA256,
ScalarFunction::Sha384 => BuiltinScalarFunction::SHA384,
ScalarFunction::Sha512 => BuiltinScalarFunction::SHA512,
+ ScalarFunction::Digest => BuiltinScalarFunction::Digest,
ScalarFunction::Ln => BuiltinScalarFunction::Ln,
ScalarFunction::Totimestampmillis =>
BuiltinScalarFunction::ToTimestampMillis,
}
diff --git a/datafusion/src/logical_plan/expr.rs
b/datafusion/src/logical_plan/expr.rs
index 0fc00f3..f61ed83 100644
--- a/datafusion/src/logical_plan/expr.rs
+++ b/datafusion/src/logical_plan/expr.rs
@@ -1508,7 +1508,7 @@ macro_rules! unary_scalar_expr {
};
}
-/// Create an convenience function representing a /binaryunary scalar function
+/// Create a convenience function representing a binary scalar function
macro_rules! binary_scalar_expr {
($ENUM:ident, $FUNC:ident) => {
#[doc = "this scalar function is not documented yet"]
@@ -1581,6 +1581,7 @@ unary_scalar_expr!(Upper, upper);
// date functions
binary_scalar_expr!(DatePart, date_part);
binary_scalar_expr!(DateTrunc, date_trunc);
+binary_scalar_expr!(Digest, digest);
/// returns an array of fixed size with each argument on it.
pub fn array(args: Vec<Expr>) -> Expr {
@@ -2127,6 +2128,17 @@ mod tests {
}
#[test]
+ fn digest_function_definitions() {
+ if let Expr::ScalarFunction { fun, args } = digest(col("tableA.a"),
lit("md5")) {
+ let name = functions::BuiltinScalarFunction::Digest;
+ assert_eq!(name, fun);
+ assert_eq!(2, args.len());
+ } else {
+ unreachable!();
+ }
+ }
+
+ #[test]
fn scalar_function_definitions() {
test_unary_scalar_expr!(Sqrt, sqrt);
test_unary_scalar_expr!(Sin, sin);
diff --git a/datafusion/src/logical_plan/mod.rs
b/datafusion/src/logical_plan/mod.rs
index 84ee9e5..3f0c7d2 100644
--- a/datafusion/src/logical_plan/mod.rs
+++ b/datafusion/src/logical_plan/mod.rs
@@ -38,14 +38,15 @@ pub use display::display_schema;
pub use expr::{
abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length,
btrim, case,
ceil, character_length, chr, col, columnize_expr, combine_filters, concat,
concat_ws,
- cos, count, count_distinct, create_udaf, create_udf, date_part,
date_trunc, exp,
- exprlist_to_fields, floor, in_list, initcap, left, length, lit,
lit_timestamp_nano,
- ln, log10, log2, lower, lpad, ltrim, max, md5, min, normalize_col,
normalize_cols,
- now, octet_length, or, random, regexp_match, regexp_replace, repeat,
replace,
- replace_col, reverse, right, round, rpad, rtrim, sha224, sha256, sha384,
sha512,
- signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan,
to_hex,
- translate, trim, trunc, unnormalize_col, unnormalize_cols, upper, when,
Column, Expr,
- ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion,
+ cos, count, count_distinct, create_udaf, create_udf, date_part,
date_trunc, digest,
+ exp, exprlist_to_fields, floor, in_list, initcap, left, length, lit,
+ lit_timestamp_nano, ln, log10, log2, lower, lpad, ltrim, max, md5, min,
+ normalize_col, normalize_cols, now, octet_length, or, random, regexp_match,
+ regexp_replace, repeat, replace, replace_col, reverse, right, round, rpad,
rtrim,
+ sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt,
starts_with, strpos,
+ substr, sum, tan, to_hex, translate, trim, trunc, unnormalize_col,
unnormalize_cols,
+ upper, when, Column, Expr, ExprRewriter, ExpressionVisitor, Literal,
Recursion,
+ RewriteRecursion,
};
pub use extension::UserDefinedLogicalNode;
pub use operators::Operator;
diff --git a/datafusion/src/physical_plan/crypto_expressions.rs
b/datafusion/src/physical_plan/crypto_expressions.rs
index 8ad876b..8fa9f44 100644
--- a/datafusion/src/physical_plan/crypto_expressions.rs
+++ b/datafusion/src/physical_plan/crypto_expressions.rs
@@ -16,183 +16,269 @@
// under the License.
//! Crypto expressions
-use std::sync::Arc;
-
-use md5::Md5;
-use sha2::{
- digest::Output as SHA2DigestOutput, Digest as SHA2Digest, Sha224, Sha256,
Sha384,
- Sha512,
-};
-
+use super::ColumnarValue;
use crate::{
error::{DataFusionError, Result},
scalar::ScalarValue,
};
use arrow::{
- array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait},
+ array::{
+ Array, ArrayRef, BinaryArray, GenericStringArray, StringArray,
+ StringOffsetSizeTrait,
+ },
datatypes::DataType,
};
+use md5::Md5;
+use sha2::{Digest as SHA2Digest, Sha224, Sha256, Sha384, Sha512};
+use std::any::type_name;
+use std::fmt::Write;
+use std::sync::Arc;
+use std::{fmt, str::FromStr};
-use super::{string_expressions::unary_string_function, ColumnarValue};
-
-/// Computes the md5 of a string.
-fn md5_process(input: &str) -> String {
- let mut digest = Md5::default();
- digest.update(&input);
-
- let mut result = String::new();
-
- for byte in &digest.finalize() {
- result.push_str(&format!("{:02x}", byte));
- }
-
- result
-}
-
-// It's not possible to return &[u8], because trait in trait without short
lifetime
-fn sha_process<D: SHA2Digest + Default>(input: &str) -> SHA2DigestOutput<D> {
- let mut digest = D::default();
- digest.update(&input);
-
- digest.finalize()
-}
-
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-fn unary_binary_function<T, R, F>(
- args: &[&dyn Array],
- op: F,
- name: &str,
-) -> Result<BinaryArray>
-where
- R: AsRef<[u8]>,
- T: StringOffsetSizeTrait,
- F: Fn(&str) -> R,
-{
- if args.len() != 1 {
- return Err(DataFusionError::Internal(format!(
- "{:?} args were supplied but {} takes exactly one argument",
- args.len(),
- name,
- )));
- }
-
- let array = args[0]
- .as_any()
- .downcast_ref::<GenericStringArray<T>>()
- .ok_or_else(|| {
- DataFusionError::Internal("failed to downcast to
string".to_string())
- })?;
-
- // first map is the iterator, second is for the `Option<_>`
- Ok(array.iter().map(|x| x.map(|x| op(x))).collect())
+/// Digest algorithms.
+///
+/// Note that by default all digest algorithms return BinaryArray or Binary
scalar data.
+/// However md5 when invoked by its name (rather than digest(value, 'md5'))
would return
+/// hex encoded utf8 values, due to historical reasons. You are advised to use the
+/// digest(utf8, utf8) function instead.
+#[derive(Debug, Copy, Clone)]
+enum DigestAlgorithm {
+ Md5,
+ Sha224,
+ Sha256,
+ Sha384,
+ Sha512,
}
-fn handle<F, R>(args: &[ColumnarValue], op: F, name: &str) ->
Result<ColumnarValue>
-where
- R: AsRef<[u8]>,
- F: Fn(&str) -> R,
-{
- match &args[0] {
+fn digest_process(
+ value: &ColumnarValue,
+ digest_algorithm: DigestAlgorithm,
+) -> Result<ColumnarValue> {
+ match value {
ColumnarValue::Array(a) => match a.data_type() {
- DataType::Utf8 => {
- Ok(ColumnarValue::Array(Arc::new(unary_binary_function::<
- i32,
- _,
- _,
- >(
- &[a.as_ref()], op, name
- )?)))
- }
- DataType::LargeUtf8 => {
- Ok(ColumnarValue::Array(Arc::new(unary_binary_function::<
- i64,
- _,
- _,
- >(
- &[a.as_ref()], op, name
- )?)))
- }
+ DataType::Utf8 => digest_algorithm.digest_array::<i32>(a.as_ref()),
+ DataType::LargeUtf8 =>
digest_algorithm.digest_array::<i64>(a.as_ref()),
other => Err(DataFusionError::Internal(format!(
"Unsupported data type {:?} for function {}",
- other, name,
+ other,
+ digest_algorithm.to_string(),
))),
},
ColumnarValue::Scalar(scalar) => match scalar {
- ScalarValue::Utf8(a) => {
- let result = a.as_ref().map(|x| (op)(x).as_ref().to_vec());
- Ok(ColumnarValue::Scalar(ScalarValue::Binary(result)))
- }
- ScalarValue::LargeUtf8(a) => {
- let result = a.as_ref().map(|x| (op)(x).as_ref().to_vec());
- Ok(ColumnarValue::Scalar(ScalarValue::Binary(result)))
+ ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+ Ok(digest_algorithm.digest_scalar(a))
}
other => Err(DataFusionError::Internal(format!(
"Unsupported data type {:?} for function {}",
- other, name,
+ other,
+ digest_algorithm.to_string(),
))),
},
}
}
-fn md5_array<T: StringOffsetSizeTrait>(
- args: &[&dyn Array],
-) -> Result<GenericStringArray<i32>> {
- unary_string_function::<T, i32, _, _>(args, md5_process, "md5")
+macro_rules! digest_to_array {
+ ($METHOD:ident, $INPUT:expr) => {{
+ let binary_array: BinaryArray = $INPUT
+ .iter()
+ .map(|x| {
+ x.map(|x| {
+ let mut digest = $METHOD::default();
+ digest.update(x);
+ digest.finalize()
+ })
+ })
+ .collect();
+ Arc::new(binary_array)
+ }};
}
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a
[`ColumnarValue`]
-pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
- match &args[0] {
- ColumnarValue::Array(a) => match a.data_type() {
- DataType::Utf8 =>
Ok(ColumnarValue::Array(Arc::new(md5_array::<i32>(&[
- a.as_ref()
- ])?))),
- DataType::LargeUtf8 => {
- Ok(ColumnarValue::Array(Arc::new(md5_array::<i64>(&[
- a.as_ref()
- ])?)))
- }
- other => Err(DataFusionError::Internal(format!(
- "Unsupported data type {:?} for function md5",
- other,
- ))),
- },
- ColumnarValue::Scalar(scalar) => match scalar {
- ScalarValue::Utf8(a) => {
- let result = a.as_ref().map(|x| md5_process(x));
- Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
- }
- ScalarValue::LargeUtf8(a) => {
- let result = a.as_ref().map(|x| md5_process(x));
- Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
+macro_rules! digest_to_scalar {
+ ($METHOD: ident, $INPUT:expr) => {{
+ ScalarValue::Binary($INPUT.as_ref().map(|v| {
+ let mut digest = $METHOD::default();
+ digest.update(v);
+ digest.finalize().as_slice().to_vec()
+ }))
+ }};
+}
+
+impl DigestAlgorithm {
+ /// digest an optional string to its hash value, null values are returned
as is
+ fn digest_scalar(self, value: &Option<String>) -> ColumnarValue {
+ ColumnarValue::Scalar(match self {
+ Self::Md5 => digest_to_scalar!(Md5, value),
+ Self::Sha224 => digest_to_scalar!(Sha224, value),
+ Self::Sha256 => digest_to_scalar!(Sha256, value),
+ Self::Sha384 => digest_to_scalar!(Sha384, value),
+ Self::Sha512 => digest_to_scalar!(Sha512, value),
+ })
+ }
+
+ /// digest a string array to their hash values
+ fn digest_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
+ where
+ T: StringOffsetSizeTrait,
+ {
+ let input_value = value
+ .as_any()
+ .downcast_ref::<GenericStringArray<T>>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(format!(
+ "could not cast value to {}",
+ type_name::<GenericStringArray<T>>()
+ ))
+ })?;
+ let array: ArrayRef = match self {
+ Self::Md5 => digest_to_array!(Md5, input_value),
+ Self::Sha224 => digest_to_array!(Sha224, input_value),
+ Self::Sha256 => digest_to_array!(Sha256, input_value),
+ Self::Sha384 => digest_to_array!(Sha384, input_value),
+ Self::Sha512 => digest_to_array!(Sha512, input_value),
+ };
+ Ok(ColumnarValue::Array(array))
+ }
+}
+
+impl fmt::Display for DigestAlgorithm {
+ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+ write!(f, "{}", format!("{:?}", self).to_lowercase())
+ }
+}
+
+impl FromStr for DigestAlgorithm {
+ type Err = DataFusionError;
+ fn from_str(name: &str) -> Result<DigestAlgorithm> {
+ Ok(match name {
+ "md5" => Self::Md5,
+ "sha224" => Self::Sha224,
+ "sha256" => Self::Sha256,
+ "sha384" => Self::Sha384,
+ "sha512" => Self::Sha512,
+ _ => {
+ return Err(DataFusionError::Plan(format!(
+ "There is no built-in digest algorithm named {}",
+ name
+ )))
}
- other => Err(DataFusionError::Internal(format!(
- "Unsupported data type {:?} for function md5",
- other,
- ))),
- },
+ })
}
}
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a
[`ColumnarValue`]
-pub fn sha224(args: &[ColumnarValue]) -> Result<ColumnarValue> {
- handle(args, sha_process::<Sha224>, "ssh224")
+macro_rules! define_digest_function {
+ ($NAME: ident, $METHOD: ident, $DOC: expr) => {
+ #[doc = $DOC]
+ pub fn $NAME(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ if args.len() != 1 {
+ return Err(DataFusionError::Internal(format!(
+ "{:?} args were supplied but {} takes exactly one
argument",
+ args.len(),
+ DigestAlgorithm::$METHOD.to_string(),
+ )));
+ }
+ digest_process(&args[0], DigestAlgorithm::$METHOD)
+ }
+ };
}
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a
[`ColumnarValue`]
-pub fn sha256(args: &[ColumnarValue]) -> Result<ColumnarValue> {
- handle(args, sha_process::<Sha256>, "sha256")
+/// This function exists so that we do not need to pull in the `hex` crate. It is only
+/// used by the md5 function below.
+#[inline]
+fn hex_encode<T: AsRef<[u8]>>(data: T) -> String {
+ let mut s = String::with_capacity(data.as_ref().len() * 2);
+ for b in data.as_ref() {
+ // Writing to a string never errors, so we can unwrap here.
+ write!(&mut s, "{:02x}", b).unwrap();
+ }
+ s
}
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a
[`ColumnarValue`]
-pub fn sha384(args: &[ColumnarValue]) -> Result<ColumnarValue> {
- handle(args, sha_process::<Sha384>, "sha384")
+/// computes md5 hash digest of the given input
+pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ if args.len() != 1 {
+ return Err(DataFusionError::Internal(format!(
+ "{:?} args were supplied but {} takes exactly one argument",
+ args.len(),
+ DigestAlgorithm::Md5.to_string(),
+ )));
+ }
+ let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
+ // md5 requires special handling because of its unique utf8 return type
+ Ok(match value {
+ ColumnarValue::Array(array) => {
+ let binary_array = array
+ .as_ref()
+ .as_any()
+ .downcast_ref::<BinaryArray>()
+ .ok_or_else(|| {
+ DataFusionError::Internal(
+ "Impossibly got non-binary array data from
digest".into(),
+ )
+ })?;
+ let string_array: StringArray = binary_array
+ .iter()
+ .map(|opt| opt.map(hex_encode::<_>))
+ .collect();
+ ColumnarValue::Array(Arc::new(string_array))
+ }
+ ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
+ ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
+ }
+ _ => {
+ return Err(DataFusionError::Internal(
+ "Impossibly got invalid results from digest".into(),
+ ))
+ }
+ })
}
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a
[`ColumnarValue`]
-pub fn sha512(args: &[ColumnarValue]) -> Result<ColumnarValue> {
- handle(args, sha_process::<Sha512>, "sha512")
+define_digest_function!(
+ sha224,
+ Sha224,
+ "computes sha224 hash digest of the given input"
+);
+
+define_digest_function!(
+ sha256,
+ Sha256,
+ "computes sha256 hash digest of the given input"
+);
+
+define_digest_function!(
+ sha384,
+ Sha384,
+ "computes sha384 hash digest of the given input"
+);
+
+define_digest_function!(
+ sha512,
+ Sha512,
+ "computes sha512 hash digest of the given input"
+);
+
+/// Digest computes a binary hash of the given data, accepts Utf8 or LargeUtf8
and returns a [`ColumnarValue`].
+/// Second argument is the algorithm to use.
+/// Supported algorithms are md5, sha224, sha256, sha384 and sha512.
+pub fn digest(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+ if args.len() != 2 {
+ return Err(DataFusionError::Internal(format!(
+ "{:?} args were supplied but digest takes exactly two arguments",
+ args.len(),
+ )));
+ }
+ let digest_algorithm = match &args[1] {
+ ColumnarValue::Scalar(scalar) => match scalar {
+ ScalarValue::Utf8(Some(method)) |
ScalarValue::LargeUtf8(Some(method)) => {
+ method.parse::<DigestAlgorithm>()
+ }
+ other => Err(DataFusionError::Internal(format!(
+ "Unsupported data type {:?} for function digest",
+ other,
+ ))),
+ },
+ ColumnarValue::Array(_) => Err(DataFusionError::Internal(
+ "Digest using dynamically decided method is not yet
supported".into(),
+ )),
+ }?;
+ digest_process(&args[0], digest_algorithm)
}
diff --git a/datafusion/src/physical_plan/functions.rs
b/datafusion/src/physical_plan/functions.rs
index a1d7d45..d00248e 100644
--- a/datafusion/src/physical_plan/functions.rs
+++ b/datafusion/src/physical_plan/functions.rs
@@ -185,6 +185,8 @@ pub enum BuiltinScalarFunction {
Ceil,
/// cos
Cos,
+ /// Digest
+ Digest,
/// exp
Exp,
/// floor
@@ -310,7 +312,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::Random | BuiltinScalarFunction::Now
)
}
- /// Returns the [Volatility] of the builtin function.
+ /// Returns the [Volatility] of the builtin function.
pub fn volatility(&self) -> Volatility {
match self {
//Immutable scalar builtins
@@ -350,7 +352,6 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::MD5 => Volatility::Immutable,
BuiltinScalarFunction::NullIf => Volatility::Immutable,
BuiltinScalarFunction::OctetLength => Volatility::Immutable,
-
BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
BuiltinScalarFunction::Repeat => Volatility::Immutable,
BuiltinScalarFunction::Replace => Volatility::Immutable,
@@ -362,6 +363,7 @@ impl BuiltinScalarFunction {
BuiltinScalarFunction::SHA256 => Volatility::Immutable,
BuiltinScalarFunction::SHA384 => Volatility::Immutable,
BuiltinScalarFunction::SHA512 => Volatility::Immutable,
+ BuiltinScalarFunction::Digest => Volatility::Immutable,
BuiltinScalarFunction::SplitPart => Volatility::Immutable,
BuiltinScalarFunction::StartsWith => Volatility::Immutable,
BuiltinScalarFunction::Strpos => Volatility::Immutable,
@@ -449,6 +451,7 @@ impl FromStr for BuiltinScalarFunction {
"sha256" => BuiltinScalarFunction::SHA256,
"sha384" => BuiltinScalarFunction::SHA384,
"sha512" => BuiltinScalarFunction::SHA512,
+ "digest" => BuiltinScalarFunction::Digest,
"split_part" => BuiltinScalarFunction::SplitPart,
"starts_with" => BuiltinScalarFunction::StartsWith,
"strpos" => BuiltinScalarFunction::Strpos,
@@ -554,6 +557,7 @@ pub fn return_type(
BuiltinScalarFunction::SHA256 => utf8_to_binary_type(&arg_types[0],
"sha256"),
BuiltinScalarFunction::SHA384 => utf8_to_binary_type(&arg_types[0],
"sha384"),
BuiltinScalarFunction::SHA512 => utf8_to_binary_type(&arg_types[0],
"sha512"),
+ BuiltinScalarFunction::Digest => utf8_to_binary_type(&arg_types[0],
"digest"),
BuiltinScalarFunction::SplitPart => utf8_to_str_type(&arg_types[0],
"split_part"),
BuiltinScalarFunction::StartsWith => Ok(DataType::Boolean),
BuiltinScalarFunction::Strpos => utf8_to_int_type(&arg_types[0],
"strpos"),
@@ -851,6 +855,9 @@ pub fn create_physical_fun(
BuiltinScalarFunction::MD5 => {
Arc::new(invoke_if_crypto_expressions_feature_flag!(md5, "md5"))
}
+ BuiltinScalarFunction::Digest => {
+ Arc::new(invoke_if_crypto_expressions_feature_flag!(digest,
"digest"))
+ }
BuiltinScalarFunction::NullIf => Arc::new(nullif_func),
BuiltinScalarFunction::OctetLength => Arc::new(|args| match &args[0] {
ColumnarValue::Array(v) =>
Ok(ColumnarValue::Array(length(v.as_ref())?)),
@@ -1354,6 +1361,9 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature {
],
fun.volatility(),
),
+ BuiltinScalarFunction::Digest => {
+ Signature::exact(vec![DataType::Utf8, DataType::Utf8],
fun.volatility())
+ }
BuiltinScalarFunction::DateTrunc => Signature::exact(
vec![
DataType::Utf8,
diff --git a/datafusion/src/prelude.rs b/datafusion/src/prelude.rs
index 168e1d5..02b9d4f 100644
--- a/datafusion/src/prelude.rs
+++ b/datafusion/src/prelude.rs
@@ -29,10 +29,10 @@ pub use crate::dataframe::DataFrame;
pub use crate::execution::context::{ExecutionConfig, ExecutionContext};
pub use crate::logical_plan::{
array, ascii, avg, bit_length, btrim, character_length, chr, col, concat,
concat_ws,
- count, create_udf, date_part, date_trunc, in_list, initcap, left, length,
lit, lower,
- lpad, ltrim, max, md5, min, now, octet_length, random, regexp_replace,
repeat,
- replace, reverse, right, rpad, rtrim, sha224, sha256, sha384, sha512,
split_part,
- starts_with, strpos, substr, sum, to_hex, translate, trim, upper, Column,
JoinType,
- Partitioning,
+ count, create_udf, date_part, date_trunc, digest, in_list, initcap, left,
length,
+ lit, lower, lpad, ltrim, max, md5, min, now, octet_length, random,
regexp_replace,
+ repeat, replace, reverse, right, rpad, rtrim, sha224, sha256, sha384,
sha512,
+ split_part, starts_with, strpos, substr, sum, to_hex, translate, trim,
upper, Column,
+ JoinType, Partitioning,
};
pub use crate::physical_plan::csv::CsvReadOptions;
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 32da908..4ed07af 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -3997,32 +3997,59 @@ async fn test_boolean_expressions() -> Result<()> {
#[cfg_attr(not(feature = "crypto_expressions"), ignore)]
async fn test_crypto_expressions() -> Result<()> {
test_expression!("md5('tom')", "34b7da764b21d298ef307d04d8152dc5");
+ test_expression!("digest('tom','md5')",
"34b7da764b21d298ef307d04d8152dc5");
test_expression!("md5('')", "d41d8cd98f00b204e9800998ecf8427e");
+ test_expression!("digest('','md5')", "d41d8cd98f00b204e9800998ecf8427e");
test_expression!("md5(NULL)", "NULL");
+ test_expression!("digest(NULL,'md5')", "NULL");
test_expression!(
"sha224('tom')",
"0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d"
);
test_expression!(
+ "digest('tom','sha224')",
+ "0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d"
+ );
+ test_expression!(
"sha224('')",
"d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"
);
+ test_expression!(
+ "digest('','sha224')",
+ "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"
+ );
test_expression!("sha224(NULL)", "NULL");
+ test_expression!("digest(NULL,'sha224')", "NULL");
test_expression!(
"sha256('tom')",
"e1608f75c5d7813f3d4031cb30bfb786507d98137538ff8e128a6ff74e84e643"
);
test_expression!(
+ "digest('tom','sha256')",
+ "e1608f75c5d7813f3d4031cb30bfb786507d98137538ff8e128a6ff74e84e643"
+ );
+ test_expression!(
"sha256('')",
"e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
);
+ test_expression!(
+ "digest('','sha256')",
+ "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+ );
test_expression!("sha256(NULL)", "NULL");
+ test_expression!("digest(NULL,'sha256')", "NULL");
test_expression!("sha384('tom')",
"096f5b68aa77848e4fdf5c1c0b350de2dbfad60ffd7c25d9ea07c6c19b8a4d55a9187eb117c557883f58c16dfac3e343");
+ test_expression!("digest('tom','sha384')",
"096f5b68aa77848e4fdf5c1c0b350de2dbfad60ffd7c25d9ea07c6c19b8a4d55a9187eb117c557883f58c16dfac3e343");
test_expression!("sha384('')",
"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b");
+ test_expression!("digest('','sha384')",
"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b");
test_expression!("sha384(NULL)", "NULL");
+ test_expression!("digest(NULL,'sha384')", "NULL");
test_expression!("sha512('tom')",
"6e1b9b3fe840680e37051f7ad5e959d6f39ad0f8885d855166f55c659469d3c8b78118c44a2a49c72ddb481cd6d8731034e11cc030070ba843a90b3495cb8d3e");
+ test_expression!("digest('tom','sha512')",
"6e1b9b3fe840680e37051f7ad5e959d6f39ad0f8885d855166f55c659469d3c8b78118c44a2a49c72ddb481cd6d8731034e11cc030070ba843a90b3495cb8d3e");
test_expression!("sha512('')",
"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e");
+ test_expression!("digest('','sha512')",
"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e");
test_expression!("sha512(NULL)", "NULL");
+ test_expression!("digest(NULL,'sha512')", "NULL");
Ok(())
}