alamb commented on code in PR #17732:
URL: https://github.com/apache/datafusion/pull/17732#discussion_r2373055683
##########
datafusion/sqllogictest/test_files/array.slt:
##########
@@ -6497,7 +6495,7 @@ physical_plan
04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))]
05)--------ProjectionExec: expr=[]
06)----------CoalesceBatchesExec: target_batch_size=8192
-07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b,
c], substr(md5(CAST(value@0 AS Utf8View)), 1, 32))
+07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN
([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field {
name: "lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered:
false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name:
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered:
false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name:
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered:
false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name:
"lit", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered:
false, metadata: {} } }])
Review Comment:
this is a pretty terrible plan display (not made worse by this PR, of course)
##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -131,40 +131,42 @@ impl ScalarUDFImpl for ArrayHas {
// if the haystack is a constant list, we can use an inlist expression
which is more
// efficient because the haystack is not varying per-row
- if let Expr::Literal(ScalarValue::List(array), _) = haystack {
- // TODO: support LargeList
Review Comment:
🎉
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -3305,17 +3306,29 @@ impl ScalarValue {
/// assert_eq!(scalar_vec, expected);
/// ```
pub fn convert_array_to_scalar_vec(array: &dyn Array) ->
Result<Vec<Vec<Self>>> {
- let mut scalars = Vec::with_capacity(array.len());
-
- for index in 0..array.len() {
- let nested_array = array.as_list::<i32>().value(index);
- let scalar_values = (0..nested_array.len())
- .map(|i| ScalarValue::try_from_array(&nested_array, i))
- .collect::<Result<Vec<_>>>()?;
- scalars.push(scalar_values);
+ fn generic_collect<OffsetSize: OffsetSizeTrait>(
+ array: &dyn Array,
+ ) -> Result<Vec<Vec<ScalarValue>>> {
+ array
+ .as_list::<OffsetSize>()
+ .iter()
+ .map(|nested_array| match nested_array {
+ Some(nested_array) => (0..nested_array.len())
+ .map(|i| ScalarValue::try_from_array(&nested_array, i))
+ .collect::<Result<Vec<_>>>(),
+ // TODO: what can we put for null?
+ None => Ok(vec![]),
+ })
+ .collect()
Review Comment:
Maybe we can file a ticket / leave a reference to that ticket in the
comments 🤔
##########
datafusion/functions-nested/src/array_has.rs:
##########
@@ -131,40 +131,42 @@ impl ScalarUDFImpl for ArrayHas {
// if the haystack is a constant list, we can use an inlist expression
which is more
// efficient because the haystack is not varying per-row
- if let Expr::Literal(ScalarValue::List(array), _) = haystack {
- // TODO: support LargeList
- // (not supported by `convert_array_to_scalar_vec`)
- // (FixedSizeList not supported either, but seems to have worked
fine when attempting to
- // build a reproducer)
-
- assert_eq!(array.len(), 1); // guarantee of ScalarValue
- if let Ok(scalar_values) =
- ScalarValue::convert_array_to_scalar_vec(array.as_ref())
- {
- assert_eq!(scalar_values.len(), 1);
- let list = scalar_values
- .into_iter()
- .flatten()
- .map(|v| Expr::Literal(v, None))
- .collect();
-
- return Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {
- expr: Box::new(std::mem::take(needle)),
- list,
- negated: false,
- })));
+ match haystack {
+ Expr::Literal(
+ // FixedSizeList gets coerced to List
+ scalar @ ScalarValue::List(_) | scalar @
ScalarValue::LargeList(_),
+ _,
+ ) => {
+ let array = scalar.to_array().unwrap(); // guarantee of
ScalarValue
+ if let Ok(scalar_values) =
+ ScalarValue::convert_array_to_scalar_vec(&array)
+ {
+ assert_eq!(scalar_values.len(), 1);
+ let list = scalar_values
+ .into_iter()
+ .flatten()
+ .map(|v| Expr::Literal(v, None))
+ .collect();
+
+ return
Ok(ExprSimplifyResult::Simplified(Expr::InList(InList {
Review Comment:
You could potentially make this more concise using
```rust
return
Ok(ExprSimplifyResult::Simplified(in_list(std::mem::take(needle), list,
false)));
```
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -3305,17 +3306,29 @@ impl ScalarValue {
/// assert_eq!(scalar_vec, expected);
/// ```
pub fn convert_array_to_scalar_vec(array: &dyn Array) ->
Result<Vec<Vec<Self>>> {
- let mut scalars = Vec::with_capacity(array.len());
-
- for index in 0..array.len() {
- let nested_array = array.as_list::<i32>().value(index);
- let scalar_values = (0..nested_array.len())
- .map(|i| ScalarValue::try_from_array(&nested_array, i))
- .collect::<Result<Vec<_>>>()?;
- scalars.push(scalar_values);
+ fn generic_collect<OffsetSize: OffsetSizeTrait>(
+ array: &dyn Array,
+ ) -> Result<Vec<Vec<ScalarValue>>> {
+ array
+ .as_list::<OffsetSize>()
+ .iter()
+ .map(|nested_array| match nested_array {
+ Some(nested_array) => (0..nested_array.len())
+ .map(|i| ScalarValue::try_from_array(&nested_array, i))
+ .collect::<Result<Vec<_>>>(),
+ // TODO: what can we put for null?
+ None => Ok(vec![]),
+ })
+ .collect()
Review Comment:
I think we would have to research what the null semantics of array_has are
in other implementations (postgres/duckdb/spark) and try and make this one
consistent
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]