This is an automated email from the ASF dual-hosted git repository.

dheres pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 613188f  add digest(utf8, method) function (#1090)
613188f is described below

commit 613188f5f53bbfffb83f0f7d60cac417980a20dc
Author: Jiayu Liu <[email protected]>
AuthorDate: Sun Oct 10 16:06:43 2021 +0800

    add digest(utf8, method) function (#1090)
---
 ballista/rust/core/proto/ballista.proto            |   1 +
 .../rust/core/src/serde/logical_plan/from_proto.rs |   7 +-
 .../rust/core/src/serde/logical_plan/to_proto.rs   |   1 +
 .../core/src/serde/physical_plan/from_proto.rs     |   1 +
 datafusion/src/logical_plan/expr.rs                |  14 +-
 datafusion/src/logical_plan/mod.rs                 |  17 +-
 datafusion/src/physical_plan/crypto_expressions.rs | 372 +++++++++++++--------
 datafusion/src/physical_plan/functions.rs          |  14 +-
 datafusion/src/prelude.rs                          |  10 +-
 datafusion/tests/sql.rs                            |  27 ++
 10 files changed, 303 insertions(+), 161 deletions(-)

diff --git a/ballista/rust/core/proto/ballista.proto 
b/ballista/rust/core/proto/ballista.proto
index 47cc801..8175156 100644
--- a/ballista/rust/core/proto/ballista.proto
+++ b/ballista/rust/core/proto/ballista.proto
@@ -153,6 +153,7 @@ enum ScalarFunction {
   SHA512 = 33;
   LN = 34;
   TOTIMESTAMPMILLIS = 35;
+  DIGEST = 36;
 }
 
 message ScalarFunctionNode {
diff --git a/ballista/rust/core/src/serde/logical_plan/from_proto.rs 
b/ballista/rust/core/src/serde/logical_plan/from_proto.rs
index c9ef97e..353be9a 100644
--- a/ballista/rust/core/src/serde/logical_plan/from_proto.rs
+++ b/ballista/rust/core/src/serde/logical_plan/from_proto.rs
@@ -27,8 +27,8 @@ use datafusion::logical_plan::window_frames::{
     WindowFrame, WindowFrameBound, WindowFrameUnits,
 };
 use datafusion::logical_plan::{
-    abs, acos, asin, atan, ceil, cos, exp, floor, ln, log10, log2, round, 
signum, sin,
-    sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinConstraint, 
JoinType,
+    abs, acos, asin, atan, ceil, cos, digest, exp, floor, ln, log10, log2, 
round, signum,
+    sin, sqrt, tan, trunc, Column, DFField, DFSchema, Expr, JoinConstraint, 
JoinType,
     LogicalPlan, LogicalPlanBuilder, Operator,
 };
 use datafusion::physical_plan::aggregates::AggregateFunction;
@@ -1152,6 +1152,9 @@ impl TryInto<Expr> for &protobuf::LogicalExprNode {
                     protobuf::ScalarFunction::Sha512 => {
                         Ok(sha512((&args[0]).try_into()?))
                     }
+                    protobuf::ScalarFunction::Digest => {
+                        Ok(digest((&args[0]).try_into()?, 
(&args[1]).try_into()?))
+                    }
                     _ => Err(proto_error(
                         "Protobuf deserialization error: Unsupported scalar 
function",
                     )),
diff --git a/ballista/rust/core/src/serde/logical_plan/to_proto.rs 
b/ballista/rust/core/src/serde/logical_plan/to_proto.rs
index bd7fc4d..c3ffb1a 100644
--- a/ballista/rust/core/src/serde/logical_plan/to_proto.rs
+++ b/ballista/rust/core/src/serde/logical_plan/to_proto.rs
@@ -1485,6 +1485,7 @@ impl TryInto<protobuf::ScalarFunction> for 
&BuiltinScalarFunction {
             BuiltinScalarFunction::SHA256 => 
Ok(protobuf::ScalarFunction::Sha256),
             BuiltinScalarFunction::SHA384 => 
Ok(protobuf::ScalarFunction::Sha384),
             BuiltinScalarFunction::SHA512 => 
Ok(protobuf::ScalarFunction::Sha512),
+            BuiltinScalarFunction::Digest => 
Ok(protobuf::ScalarFunction::Digest),
             BuiltinScalarFunction::ToTimestampMillis => {
                 Ok(protobuf::ScalarFunction::Totimestampmillis)
             }
diff --git a/ballista/rust/core/src/serde/physical_plan/from_proto.rs 
b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
index 0d23372..5241e8b 100644
--- a/ballista/rust/core/src/serde/physical_plan/from_proto.rs
+++ b/ballista/rust/core/src/serde/physical_plan/from_proto.rs
@@ -559,6 +559,7 @@ impl From<&protobuf::ScalarFunction> for 
BuiltinScalarFunction {
             ScalarFunction::Sha256 => BuiltinScalarFunction::SHA256,
             ScalarFunction::Sha384 => BuiltinScalarFunction::SHA384,
             ScalarFunction::Sha512 => BuiltinScalarFunction::SHA512,
+            ScalarFunction::Digest => BuiltinScalarFunction::Digest,
             ScalarFunction::Ln => BuiltinScalarFunction::Ln,
             ScalarFunction::Totimestampmillis => 
BuiltinScalarFunction::ToTimestampMillis,
         }
diff --git a/datafusion/src/logical_plan/expr.rs 
b/datafusion/src/logical_plan/expr.rs
index 0fc00f3..f61ed83 100644
--- a/datafusion/src/logical_plan/expr.rs
+++ b/datafusion/src/logical_plan/expr.rs
@@ -1508,7 +1508,7 @@ macro_rules! unary_scalar_expr {
     };
 }
 
-/// Create an convenience function representing a /binaryunary scalar function
+/// Create a convenience function representing a binary scalar function
 macro_rules! binary_scalar_expr {
     ($ENUM:ident, $FUNC:ident) => {
         #[doc = "this scalar function is not documented yet"]
@@ -1581,6 +1581,7 @@ unary_scalar_expr!(Upper, upper);
 // date functions
 binary_scalar_expr!(DatePart, date_part);
 binary_scalar_expr!(DateTrunc, date_trunc);
+binary_scalar_expr!(Digest, digest);
 
 /// returns an array of fixed size with each argument on it.
 pub fn array(args: Vec<Expr>) -> Expr {
@@ -2127,6 +2128,17 @@ mod tests {
     }
 
     #[test]
+    fn digest_function_definitions() {
+        if let Expr::ScalarFunction { fun, args } = digest(col("tableA.a"), 
lit("md5")) {
+            let name = functions::BuiltinScalarFunction::Digest;
+            assert_eq!(name, fun);
+            assert_eq!(2, args.len());
+        } else {
+            unreachable!();
+        }
+    }
+
+    #[test]
     fn scalar_function_definitions() {
         test_unary_scalar_expr!(Sqrt, sqrt);
         test_unary_scalar_expr!(Sin, sin);
diff --git a/datafusion/src/logical_plan/mod.rs 
b/datafusion/src/logical_plan/mod.rs
index 84ee9e5..3f0c7d2 100644
--- a/datafusion/src/logical_plan/mod.rs
+++ b/datafusion/src/logical_plan/mod.rs
@@ -38,14 +38,15 @@ pub use display::display_schema;
 pub use expr::{
     abs, acos, and, array, ascii, asin, atan, avg, binary_expr, bit_length, 
btrim, case,
     ceil, character_length, chr, col, columnize_expr, combine_filters, concat, 
concat_ws,
-    cos, count, count_distinct, create_udaf, create_udf, date_part, 
date_trunc, exp,
-    exprlist_to_fields, floor, in_list, initcap, left, length, lit, 
lit_timestamp_nano,
-    ln, log10, log2, lower, lpad, ltrim, max, md5, min, normalize_col, 
normalize_cols,
-    now, octet_length, or, random, regexp_match, regexp_replace, repeat, 
replace,
-    replace_col, reverse, right, round, rpad, rtrim, sha224, sha256, sha384, 
sha512,
-    signum, sin, split_part, sqrt, starts_with, strpos, substr, sum, tan, 
to_hex,
-    translate, trim, trunc, unnormalize_col, unnormalize_cols, upper, when, 
Column, Expr,
-    ExprRewriter, ExpressionVisitor, Literal, Recursion, RewriteRecursion,
+    cos, count, count_distinct, create_udaf, create_udf, date_part, 
date_trunc, digest,
+    exp, exprlist_to_fields, floor, in_list, initcap, left, length, lit,
+    lit_timestamp_nano, ln, log10, log2, lower, lpad, ltrim, max, md5, min,
+    normalize_col, normalize_cols, now, octet_length, or, random, regexp_match,
+    regexp_replace, repeat, replace, replace_col, reverse, right, round, rpad, 
rtrim,
+    sha224, sha256, sha384, sha512, signum, sin, split_part, sqrt, 
starts_with, strpos,
+    substr, sum, tan, to_hex, translate, trim, trunc, unnormalize_col, 
unnormalize_cols,
+    upper, when, Column, Expr, ExprRewriter, ExpressionVisitor, Literal, 
Recursion,
+    RewriteRecursion,
 };
 pub use extension::UserDefinedLogicalNode;
 pub use operators::Operator;
diff --git a/datafusion/src/physical_plan/crypto_expressions.rs 
b/datafusion/src/physical_plan/crypto_expressions.rs
index 8ad876b..8fa9f44 100644
--- a/datafusion/src/physical_plan/crypto_expressions.rs
+++ b/datafusion/src/physical_plan/crypto_expressions.rs
@@ -16,183 +16,269 @@
 // under the License.
 
 //! Crypto expressions
-use std::sync::Arc;
-
-use md5::Md5;
-use sha2::{
-    digest::Output as SHA2DigestOutput, Digest as SHA2Digest, Sha224, Sha256, 
Sha384,
-    Sha512,
-};
-
+use super::ColumnarValue;
 use crate::{
     error::{DataFusionError, Result},
     scalar::ScalarValue,
 };
 use arrow::{
-    array::{Array, BinaryArray, GenericStringArray, StringOffsetSizeTrait},
+    array::{
+        Array, ArrayRef, BinaryArray, GenericStringArray, StringArray,
+        StringOffsetSizeTrait,
+    },
     datatypes::DataType,
 };
+use md5::Md5;
+use sha2::{Digest as SHA2Digest, Sha224, Sha256, Sha384, Sha512};
+use std::any::type_name;
+use std::fmt::Write;
+use std::sync::Arc;
+use std::{fmt, str::FromStr};
 
-use super::{string_expressions::unary_string_function, ColumnarValue};
-
-/// Computes the md5 of a string.
-fn md5_process(input: &str) -> String {
-    let mut digest = Md5::default();
-    digest.update(&input);
-
-    let mut result = String::new();
-
-    for byte in &digest.finalize() {
-        result.push_str(&format!("{:02x}", byte));
-    }
-
-    result
-}
-
-// It's not possible to return &[u8], because trait in trait without short 
lifetime
-fn sha_process<D: SHA2Digest + Default>(input: &str) -> SHA2DigestOutput<D> {
-    let mut digest = D::default();
-    digest.update(&input);
-
-    digest.finalize()
-}
-
-/// # Errors
-/// This function errors when:
-/// * the number of arguments is not 1
-/// * the first argument is not castable to a `GenericStringArray`
-fn unary_binary_function<T, R, F>(
-    args: &[&dyn Array],
-    op: F,
-    name: &str,
-) -> Result<BinaryArray>
-where
-    R: AsRef<[u8]>,
-    T: StringOffsetSizeTrait,
-    F: Fn(&str) -> R,
-{
-    if args.len() != 1 {
-        return Err(DataFusionError::Internal(format!(
-            "{:?} args were supplied but {} takes exactly one argument",
-            args.len(),
-            name,
-        )));
-    }
-
-    let array = args[0]
-        .as_any()
-        .downcast_ref::<GenericStringArray<T>>()
-        .ok_or_else(|| {
-            DataFusionError::Internal("failed to downcast to 
string".to_string())
-        })?;
-
-    // first map is the iterator, second is for the `Option<_>`
-    Ok(array.iter().map(|x| x.map(|x| op(x))).collect())
+/// Digest algorithms.
+///
+/// Note that by default all digest algorithms return BinaryArray or Binary 
scalar data.
+/// However, md5 invoked by its name (rather than via digest(value, 'md5')) returns
+/// hex-encoded utf8 values for historical reasons. Prefer the digest(utf8, utf8)
+/// function.
+#[derive(Debug, Copy, Clone)]
+enum DigestAlgorithm {
+    Md5,
+    Sha224,
+    Sha256,
+    Sha384,
+    Sha512,
 }
 
-fn handle<F, R>(args: &[ColumnarValue], op: F, name: &str) -> 
Result<ColumnarValue>
-where
-    R: AsRef<[u8]>,
-    F: Fn(&str) -> R,
-{
-    match &args[0] {
+fn digest_process(
+    value: &ColumnarValue,
+    digest_algorithm: DigestAlgorithm,
+) -> Result<ColumnarValue> {
+    match value {
         ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_binary_function::<
-                    i32,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
-            DataType::LargeUtf8 => {
-                Ok(ColumnarValue::Array(Arc::new(unary_binary_function::<
-                    i64,
-                    _,
-                    _,
-                >(
-                    &[a.as_ref()], op, name
-                )?)))
-            }
+            DataType::Utf8 => digest_algorithm.digest_array::<i32>(a.as_ref()),
+            DataType::LargeUtf8 => 
digest_algorithm.digest_array::<i64>(a.as_ref()),
             other => Err(DataFusionError::Internal(format!(
                 "Unsupported data type {:?} for function {}",
-                other, name,
+                other,
+                digest_algorithm.to_string(),
             ))),
         },
         ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_vec());
-                Ok(ColumnarValue::Scalar(ScalarValue::Binary(result)))
-            }
-            ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| (op)(x).as_ref().to_vec());
-                Ok(ColumnarValue::Scalar(ScalarValue::Binary(result)))
+            ScalarValue::Utf8(a) | ScalarValue::LargeUtf8(a) => {
+                Ok(digest_algorithm.digest_scalar(a))
             }
             other => Err(DataFusionError::Internal(format!(
                 "Unsupported data type {:?} for function {}",
-                other, name,
+                other,
+                digest_algorithm.to_string(),
             ))),
         },
     }
 }
 
-fn md5_array<T: StringOffsetSizeTrait>(
-    args: &[&dyn Array],
-) -> Result<GenericStringArray<i32>> {
-    unary_string_function::<T, i32, _, _>(args, md5_process, "md5")
+macro_rules! digest_to_array {
+    ($METHOD:ident, $INPUT:expr) => {{
+        let binary_array: BinaryArray = $INPUT
+            .iter()
+            .map(|x| {
+                x.map(|x| {
+                    let mut digest = $METHOD::default();
+                    digest.update(x);
+                    digest.finalize()
+                })
+            })
+            .collect();
+        Arc::new(binary_array)
+    }};
 }
 
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a 
[`ColumnarValue`]
-pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    match &args[0] {
-        ColumnarValue::Array(a) => match a.data_type() {
-            DataType::Utf8 => 
Ok(ColumnarValue::Array(Arc::new(md5_array::<i32>(&[
-                a.as_ref()
-            ])?))),
-            DataType::LargeUtf8 => {
-                Ok(ColumnarValue::Array(Arc::new(md5_array::<i64>(&[
-                    a.as_ref()
-                ])?)))
-            }
-            other => Err(DataFusionError::Internal(format!(
-                "Unsupported data type {:?} for function md5",
-                other,
-            ))),
-        },
-        ColumnarValue::Scalar(scalar) => match scalar {
-            ScalarValue::Utf8(a) => {
-                let result = a.as_ref().map(|x| md5_process(x));
-                Ok(ColumnarValue::Scalar(ScalarValue::Utf8(result)))
-            }
-            ScalarValue::LargeUtf8(a) => {
-                let result = a.as_ref().map(|x| md5_process(x));
-                Ok(ColumnarValue::Scalar(ScalarValue::LargeUtf8(result)))
+macro_rules! digest_to_scalar {
+    ($METHOD: ident, $INPUT:expr) => {{
+        ScalarValue::Binary($INPUT.as_ref().map(|v| {
+            let mut digest = $METHOD::default();
+            digest.update(v);
+            digest.finalize().as_slice().to_vec()
+        }))
+    }};
+}
+
+impl DigestAlgorithm {
+    /// digest an optional string to its hash value, null values are returned 
as is
+    fn digest_scalar(self, value: &Option<String>) -> ColumnarValue {
+        ColumnarValue::Scalar(match self {
+            Self::Md5 => digest_to_scalar!(Md5, value),
+            Self::Sha224 => digest_to_scalar!(Sha224, value),
+            Self::Sha256 => digest_to_scalar!(Sha256, value),
+            Self::Sha384 => digest_to_scalar!(Sha384, value),
+            Self::Sha512 => digest_to_scalar!(Sha512, value),
+        })
+    }
+
+    /// digest a string array to their hash values
+    fn digest_array<T>(self, value: &dyn Array) -> Result<ColumnarValue>
+    where
+        T: StringOffsetSizeTrait,
+    {
+        let input_value = value
+            .as_any()
+            .downcast_ref::<GenericStringArray<T>>()
+            .ok_or_else(|| {
+                DataFusionError::Internal(format!(
+                    "could not cast value to {}",
+                    type_name::<GenericStringArray<T>>()
+                ))
+            })?;
+        let array: ArrayRef = match self {
+            Self::Md5 => digest_to_array!(Md5, input_value),
+            Self::Sha224 => digest_to_array!(Sha224, input_value),
+            Self::Sha256 => digest_to_array!(Sha256, input_value),
+            Self::Sha384 => digest_to_array!(Sha384, input_value),
+            Self::Sha512 => digest_to_array!(Sha512, input_value),
+        };
+        Ok(ColumnarValue::Array(array))
+    }
+}
+
+impl fmt::Display for DigestAlgorithm {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "{}", format!("{:?}", self).to_lowercase())
+    }
+}
+
+impl FromStr for DigestAlgorithm {
+    type Err = DataFusionError;
+    fn from_str(name: &str) -> Result<DigestAlgorithm> {
+        Ok(match name {
+            "md5" => Self::Md5,
+            "sha224" => Self::Sha224,
+            "sha256" => Self::Sha256,
+            "sha384" => Self::Sha384,
+            "sha512" => Self::Sha512,
+            _ => {
+                return Err(DataFusionError::Plan(format!(
+                    "There is no built-in digest algorithm named {}",
+                    name
+                )))
             }
-            other => Err(DataFusionError::Internal(format!(
-                "Unsupported data type {:?} for function md5",
-                other,
-            ))),
-        },
+        })
     }
 }
 
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a 
[`ColumnarValue`]
-pub fn sha224(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    handle(args, sha_process::<Sha224>, "ssh224")
+macro_rules! define_digest_function {
+    ($NAME: ident, $METHOD: ident, $DOC: expr) => {
+        #[doc = $DOC]
+        pub fn $NAME(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+            if args.len() != 1 {
+                return Err(DataFusionError::Internal(format!(
+                    "{:?} args were supplied but {} takes exactly one 
argument",
+                    args.len(),
+                    DigestAlgorithm::$METHOD.to_string(),
+                )));
+            }
+            digest_process(&args[0], DigestAlgorithm::$METHOD)
+        }
+    };
 }
 
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a 
[`ColumnarValue`]
-pub fn sha256(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    handle(args, sha_process::<Sha256>, "sha256")
+/// this function exists so that we do not need to pull in the crate hex. it 
is only used by md5
+/// function below
+#[inline]
+fn hex_encode<T: AsRef<[u8]>>(data: T) -> String {
+    let mut s = String::with_capacity(data.as_ref().len() * 2);
+    for b in data.as_ref() {
+        // Writing to a string never errors, so we can unwrap here.
+        write!(&mut s, "{:02x}", b).unwrap();
+    }
+    s
 }
 
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a 
[`ColumnarValue`]
-pub fn sha384(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    handle(args, sha_process::<Sha384>, "sha384")
+/// computes md5 hash digest of the given input
+pub fn md5(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    if args.len() != 1 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but {} takes exactly one argument",
+            args.len(),
+            DigestAlgorithm::Md5.to_string(),
+        )));
+    }
+    let value = digest_process(&args[0], DigestAlgorithm::Md5)?;
+    // md5 requires special handling because of its unique utf8 return type
+    Ok(match value {
+        ColumnarValue::Array(array) => {
+            let binary_array = array
+                .as_ref()
+                .as_any()
+                .downcast_ref::<BinaryArray>()
+                .ok_or_else(|| {
+                    DataFusionError::Internal(
+                        "Impossibly got non-binary array data from 
digest".into(),
+                    )
+                })?;
+            let string_array: StringArray = binary_array
+                .iter()
+                .map(|opt| opt.map(hex_encode::<_>))
+                .collect();
+            ColumnarValue::Array(Arc::new(string_array))
+        }
+        ColumnarValue::Scalar(ScalarValue::Binary(opt)) => {
+            ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>)))
+        }
+        _ => {
+            return Err(DataFusionError::Internal(
+                "Impossibly got invalid results from digest".into(),
+            ))
+        }
+    })
 }
 
-/// crypto function that accepts Utf8 or LargeUtf8 and returns a 
[`ColumnarValue`]
-pub fn sha512(args: &[ColumnarValue]) -> Result<ColumnarValue> {
-    handle(args, sha_process::<Sha512>, "sha512")
+define_digest_function!(
+    sha224,
+    Sha224,
+    "computes sha224 hash digest of the given input"
+);
+
+define_digest_function!(
+    sha256,
+    Sha256,
+    "computes sha256 hash digest of the given input"
+);
+
+define_digest_function!(
+    sha384,
+    Sha384,
+    "computes sha384 hash digest of the given input"
+);
+
+define_digest_function!(
+    sha512,
+    Sha512,
+    "computes sha512 hash digest of the given input"
+);
+
+/// Digest computes a binary hash of the given data, accepts Utf8 or LargeUtf8 
and returns a [`ColumnarValue`].
+/// Second argument is the algorithm to use.
+/// Supported algorithms are md5, sha224, sha256, sha384 and sha512.
+pub fn digest(args: &[ColumnarValue]) -> Result<ColumnarValue> {
+    if args.len() != 2 {
+        return Err(DataFusionError::Internal(format!(
+            "{:?} args were supplied but digest takes exactly two arguments",
+            args.len(),
+        )));
+    }
+    let digest_algorithm = match &args[1] {
+        ColumnarValue::Scalar(scalar) => match scalar {
+            ScalarValue::Utf8(Some(method)) | 
ScalarValue::LargeUtf8(Some(method)) => {
+                method.parse::<DigestAlgorithm>()
+            }
+            other => Err(DataFusionError::Internal(format!(
+                "Unsupported data type {:?} for function digest",
+                other,
+            ))),
+        },
+        ColumnarValue::Array(_) => Err(DataFusionError::Internal(
+            "Digest using dynamically decided method is not yet 
supported".into(),
+        )),
+    }?;
+    digest_process(&args[0], digest_algorithm)
 }
diff --git a/datafusion/src/physical_plan/functions.rs 
b/datafusion/src/physical_plan/functions.rs
index a1d7d45..d00248e 100644
--- a/datafusion/src/physical_plan/functions.rs
+++ b/datafusion/src/physical_plan/functions.rs
@@ -185,6 +185,8 @@ pub enum BuiltinScalarFunction {
     Ceil,
     /// cos
     Cos,
+    /// Digest
+    Digest,
     /// exp
     Exp,
     /// floor
@@ -310,7 +312,7 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::Random | BuiltinScalarFunction::Now
         )
     }
-    /// Returns the [Volatility] of the builtin function.  
+    /// Returns the [Volatility] of the builtin function.
     pub fn volatility(&self) -> Volatility {
         match self {
             //Immutable scalar builtins
@@ -350,7 +352,6 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::MD5 => Volatility::Immutable,
             BuiltinScalarFunction::NullIf => Volatility::Immutable,
             BuiltinScalarFunction::OctetLength => Volatility::Immutable,
-
             BuiltinScalarFunction::RegexpReplace => Volatility::Immutable,
             BuiltinScalarFunction::Repeat => Volatility::Immutable,
             BuiltinScalarFunction::Replace => Volatility::Immutable,
@@ -362,6 +363,7 @@ impl BuiltinScalarFunction {
             BuiltinScalarFunction::SHA256 => Volatility::Immutable,
             BuiltinScalarFunction::SHA384 => Volatility::Immutable,
             BuiltinScalarFunction::SHA512 => Volatility::Immutable,
+            BuiltinScalarFunction::Digest => Volatility::Immutable,
             BuiltinScalarFunction::SplitPart => Volatility::Immutable,
             BuiltinScalarFunction::StartsWith => Volatility::Immutable,
             BuiltinScalarFunction::Strpos => Volatility::Immutable,
@@ -449,6 +451,7 @@ impl FromStr for BuiltinScalarFunction {
             "sha256" => BuiltinScalarFunction::SHA256,
             "sha384" => BuiltinScalarFunction::SHA384,
             "sha512" => BuiltinScalarFunction::SHA512,
+            "digest" => BuiltinScalarFunction::Digest,
             "split_part" => BuiltinScalarFunction::SplitPart,
             "starts_with" => BuiltinScalarFunction::StartsWith,
             "strpos" => BuiltinScalarFunction::Strpos,
@@ -554,6 +557,7 @@ pub fn return_type(
         BuiltinScalarFunction::SHA256 => utf8_to_binary_type(&arg_types[0], 
"sha256"),
         BuiltinScalarFunction::SHA384 => utf8_to_binary_type(&arg_types[0], 
"sha384"),
         BuiltinScalarFunction::SHA512 => utf8_to_binary_type(&arg_types[0], 
"sha512"),
+        BuiltinScalarFunction::Digest => utf8_to_binary_type(&arg_types[0], 
"digest"),
         BuiltinScalarFunction::SplitPart => utf8_to_str_type(&arg_types[0], 
"split_part"),
         BuiltinScalarFunction::StartsWith => Ok(DataType::Boolean),
         BuiltinScalarFunction::Strpos => utf8_to_int_type(&arg_types[0], 
"strpos"),
@@ -851,6 +855,9 @@ pub fn create_physical_fun(
         BuiltinScalarFunction::MD5 => {
             Arc::new(invoke_if_crypto_expressions_feature_flag!(md5, "md5"))
         }
+        BuiltinScalarFunction::Digest => {
+            Arc::new(invoke_if_crypto_expressions_feature_flag!(digest, 
"digest"))
+        }
         BuiltinScalarFunction::NullIf => Arc::new(nullif_func),
         BuiltinScalarFunction::OctetLength => Arc::new(|args| match &args[0] {
             ColumnarValue::Array(v) => 
Ok(ColumnarValue::Array(length(v.as_ref())?)),
@@ -1354,6 +1361,9 @@ fn signature(fun: &BuiltinScalarFunction) -> Signature {
             ],
             fun.volatility(),
         ),
+        BuiltinScalarFunction::Digest => {
+            Signature::exact(vec![DataType::Utf8, DataType::Utf8], 
fun.volatility())
+        }
         BuiltinScalarFunction::DateTrunc => Signature::exact(
             vec![
                 DataType::Utf8,
diff --git a/datafusion/src/prelude.rs b/datafusion/src/prelude.rs
index 168e1d5..02b9d4f 100644
--- a/datafusion/src/prelude.rs
+++ b/datafusion/src/prelude.rs
@@ -29,10 +29,10 @@ pub use crate::dataframe::DataFrame;
 pub use crate::execution::context::{ExecutionConfig, ExecutionContext};
 pub use crate::logical_plan::{
     array, ascii, avg, bit_length, btrim, character_length, chr, col, concat, 
concat_ws,
-    count, create_udf, date_part, date_trunc, in_list, initcap, left, length, 
lit, lower,
-    lpad, ltrim, max, md5, min, now, octet_length, random, regexp_replace, 
repeat,
-    replace, reverse, right, rpad, rtrim, sha224, sha256, sha384, sha512, 
split_part,
-    starts_with, strpos, substr, sum, to_hex, translate, trim, upper, Column, 
JoinType,
-    Partitioning,
+    count, create_udf, date_part, date_trunc, digest, in_list, initcap, left, 
length,
+    lit, lower, lpad, ltrim, max, md5, min, now, octet_length, random, 
regexp_replace,
+    repeat, replace, reverse, right, rpad, rtrim, sha224, sha256, sha384, 
sha512,
+    split_part, starts_with, strpos, substr, sum, to_hex, translate, trim, 
upper, Column,
+    JoinType, Partitioning,
 };
 pub use crate::physical_plan::csv::CsvReadOptions;
diff --git a/datafusion/tests/sql.rs b/datafusion/tests/sql.rs
index 32da908..4ed07af 100644
--- a/datafusion/tests/sql.rs
+++ b/datafusion/tests/sql.rs
@@ -3997,32 +3997,59 @@ async fn test_boolean_expressions() -> Result<()> {
 #[cfg_attr(not(feature = "crypto_expressions"), ignore)]
 async fn test_crypto_expressions() -> Result<()> {
     test_expression!("md5('tom')", "34b7da764b21d298ef307d04d8152dc5");
+    test_expression!("digest('tom','md5')", 
"34b7da764b21d298ef307d04d8152dc5");
     test_expression!("md5('')", "d41d8cd98f00b204e9800998ecf8427e");
+    test_expression!("digest('','md5')", "d41d8cd98f00b204e9800998ecf8427e");
     test_expression!("md5(NULL)", "NULL");
+    test_expression!("digest(NULL,'md5')", "NULL");
     test_expression!(
         "sha224('tom')",
         "0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d"
     );
     test_expression!(
+        "digest('tom','sha224')",
+        "0bf6cb62649c42a9ae3876ab6f6d92ad36cb5414e495f8873292be4d"
+    );
+    test_expression!(
         "sha224('')",
         "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"
     );
+    test_expression!(
+        "digest('','sha224')",
+        "d14a028c2a3a2bc9476102bb288234c415a2b01f828ea62ac5b3e42f"
+    );
     test_expression!("sha224(NULL)", "NULL");
+    test_expression!("digest(NULL,'sha224')", "NULL");
     test_expression!(
         "sha256('tom')",
         "e1608f75c5d7813f3d4031cb30bfb786507d98137538ff8e128a6ff74e84e643"
     );
     test_expression!(
+        "digest('tom','sha256')",
+        "e1608f75c5d7813f3d4031cb30bfb786507d98137538ff8e128a6ff74e84e643"
+    );
+    test_expression!(
         "sha256('')",
         "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
     );
+    test_expression!(
+        "digest('','sha256')",
+        "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
+    );
     test_expression!("sha256(NULL)", "NULL");
+    test_expression!("digest(NULL,'sha256')", "NULL");
     test_expression!("sha384('tom')", 
"096f5b68aa77848e4fdf5c1c0b350de2dbfad60ffd7c25d9ea07c6c19b8a4d55a9187eb117c557883f58c16dfac3e343");
+    test_expression!("digest('tom','sha384')", 
"096f5b68aa77848e4fdf5c1c0b350de2dbfad60ffd7c25d9ea07c6c19b8a4d55a9187eb117c557883f58c16dfac3e343");
     test_expression!("sha384('')", 
"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b");
+    test_expression!("digest('','sha384')", 
"38b060a751ac96384cd9327eb1b1e36a21fdb71114be07434c0cc7bf63f6e1da274edebfe76f65fbd51ad2f14898b95b");
     test_expression!("sha384(NULL)", "NULL");
+    test_expression!("digest(NULL,'sha384')", "NULL");
     test_expression!("sha512('tom')", 
"6e1b9b3fe840680e37051f7ad5e959d6f39ad0f8885d855166f55c659469d3c8b78118c44a2a49c72ddb481cd6d8731034e11cc030070ba843a90b3495cb8d3e");
+    test_expression!("digest('tom','sha512')", 
"6e1b9b3fe840680e37051f7ad5e959d6f39ad0f8885d855166f55c659469d3c8b78118c44a2a49c72ddb481cd6d8731034e11cc030070ba843a90b3495cb8d3e");
     test_expression!("sha512('')", 
"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e");
+    test_expression!("digest('','sha512')", 
"cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e");
     test_expression!("sha512(NULL)", "NULL");
+    test_expression!("digest(NULL,'sha512')", "NULL");
     Ok(())
 }
 

Reply via email to