alamb commented on code in PR #17732:
URL: https://github.com/apache/datafusion/pull/17732#discussion_r2373055683


##########
datafusion/sqllogictest/test_files/array.slt:
##########
@@ -6497,7 +6495,7 @@ physical_plan
 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
 05)--------ProjectionExec: expr=[]
 06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, 
c], substr(md5(CAST(value@0 AS Utf8View)), 1, 32))
+07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN 
([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { 
name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: 
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: 
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: 
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }])

Review Comment:
   this is a pretty terrible plan display (not made worse by this PR, of course)



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -131,40 +131,42 @@ impl ScalarUDFImpl for ArrayHas {
 
         // if the haystack is a constant list, we can use an inlist expression 
which is more
         // efficient because the haystack is not varying per-row
-        if let Expr::Literal(ScalarValue::List(array), _) = haystack {
-            // TODO: support LargeList

Review Comment:
   🎉 



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -3305,17 +3306,29 @@ impl ScalarValue {
     /// assert_eq!(scalar_vec, expected);
     /// ```
     pub fn convert_array_to_scalar_vec(array: &dyn Array) -> 
Result<Vec<Vec<Self>>> {
-        let mut scalars = Vec::with_capacity(array.len());
-
-        for index in 0..array.len() {
-            let nested_array = array.as_list::<i32>().value(index);
-            let scalar_values = (0..nested_array.len())
-                .map(|i| ScalarValue::try_from_array(&nested_array, i))
-                .collect::<Result<Vec<_>>>()?;
-            scalars.push(scalar_values);
+        fn generic_collect<OffsetSize: OffsetSizeTrait>(
+            array: &dyn Array,
+        ) -> Result<Vec<Vec<ScalarValue>>> {
+            array
+                .as_list::<OffsetSize>()
+                .iter()
+                .map(|nested_array| match nested_array {
+                    Some(nested_array) => (0..nested_array.len())
+                        .map(|i| ScalarValue::try_from_array(&nested_array, i))
+                        .collect::<Result<Vec<_>>>(),
+                    // TODO: what can we put for null?
+                    None => Ok(vec![]),
+                })
+                .collect()

Review Comment:
   Maybe we can file a ticket / leave a reference to that ticket in the 
comments 🤔 



##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -131,40 +131,42 @@ impl ScalarUDFImpl for ArrayHas {
 
         // if the haystack is a constant list, we can use an inlist expression 
which is more
         // efficient because the haystack is not varying per-row
-        if let Expr::Literal(ScalarValue::List(array), _) = haystack {
-            // TODO: support LargeList
-            // (not supported by `convert_array_to_scalar_vec`)
-            // (FixedSizeList not supported either, but seems to have worked 
fine when attempting to
-            // build a reproducer)
-
-            assert_eq!(array.len(), 1); // guarantee of ScalarValue
-            if let Ok(scalar_values) =
-                ScalarValue::convert_array_to_scalar_vec(array.as_ref())
-            {
-                assert_eq!(scalar_values.len(), 1);
-                let list = scalar_values
-                    .into_iter()
-                    .flatten()
-                    .map(|v| Expr::Literal(v, None))
-                    .collect();
-
-                return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {
-                    expr: Box::new(std::mem::take(needle)),
-                    list,
-                    negated: false,
-                })));
+        match haystack {
+            Expr::Literal(
+                // FixedSizeList gets coerced to List
+                scalar @ ScalarValue::List(_) | scalar @ 
ScalarValue::LargeList(_),
+                _,
+            ) => {
+                let array = scalar.to_array().unwrap(); // guarantee of 
ScalarValue
+                if let Ok(scalar_values) =
+                    ScalarValue::convert_array_to_scalar_vec(&array)
+                {
+                    assert_eq!(scalar_values.len(), 1);
+                    let list = scalar_values
+                        .into_iter()
+                        .flatten()
+                        .map(|v| Expr::Literal(v, None))
+                        .collect();
+
+                    return 
Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {

Review Comment:
   You could potentially make this more concise using
   ```rust
                       return 
Ok(ExprSimplifyResult::Simplified(in_list(std::mem::take(needle), list, 
false)));
   ```



##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -3305,17 +3306,29 @@ impl ScalarValue {
     /// assert_eq!(scalar_vec, expected);
     /// ```
     pub fn convert_array_to_scalar_vec(array: &dyn Array) -> 
Result<Vec<Vec<Self>>> {
-        let mut scalars = Vec::with_capacity(array.len());
-
-        for index in 0..array.len() {
-            let nested_array = array.as_list::<i32>().value(index);
-            let scalar_values = (0..nested_array.len())
-                .map(|i| ScalarValue::try_from_array(&nested_array, i))
-                .collect::<Result<Vec<_>>>()?;
-            scalars.push(scalar_values);
+        fn generic_collect<OffsetSize: OffsetSizeTrait>(
+            array: &dyn Array,
+        ) -> Result<Vec<Vec<ScalarValue>>> {
+            array
+                .as_list::<OffsetSize>()
+                .iter()
+                .map(|nested_array| match nested_array {
+                    Some(nested_array) => (0..nested_array.len())
+                        .map(|i| ScalarValue::try_from_array(&nested_array, i))
+                        .collect::<Result<Vec<_>>>(),
+                    // TODO: what can we put for null?
+                    None => Ok(vec![]),
+                })
+                .collect()

Review Comment:
   I think we would have to research what the null semantics of array_has are 
in other implementations (postgres/duckdb/spark) and try and make this one 
consistent



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to