This is an automated email from the ASF dual-hosted git repository.
comphead pushed a commit to branch branch-53
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/branch-53 by this push:
new 518560246e [53] fix: Fix bug in `array_has` scalar path with sliced arrays (#20677) (#20700)
518560246e is described below
commit 518560246e87d489eba6d511fa167aa429b06728
Author: Neil Conway <[email protected]>
AuthorDate: Wed Mar 4 12:17:57 2026 -0500
[53] fix: Fix bug in `array_has` scalar path with sliced arrays (#20677) (#20700)
Backport #20677 to the 53 release branch.
---
datafusion/functions-nested/src/array_has.rs | 74 ++++++++++++++++++++++---
datafusion/functions-nested/src/position.rs | 81 +++++++++++++++++++++++++---
2 files changed, 140 insertions(+), 15 deletions(-)
diff --git a/datafusion/functions-nested/src/array_has.rs b/datafusion/functions-nested/src/array_has.rs
index ace69de66f..76cf786c95 100644
--- a/datafusion/functions-nested/src/array_has.rs
+++ b/datafusion/functions-nested/src/array_has.rs
@@ -352,8 +352,6 @@ fn array_has_dispatch_for_scalar(
haystack: ArrayWrapper<'_>,
needle: &dyn Datum,
) -> Result<ArrayRef> {
- let values = haystack.values();
- let is_nested = values.data_type().is_nested();
// If first argument is empty list (second argument is non-null), return false
// i.e. array_has([], non-null element) -> false
if haystack.len() == 0 {
@@ -362,7 +360,17 @@ fn array_has_dispatch_for_scalar(
None,
)));
}
- let eq_array = compare_with_eq(values, needle, is_nested)?;
+
+ // For sliced ListArrays, values() returns the full underlying array but
+ // only elements between the first and last offset are visible.
+ let offsets: Vec<usize> = haystack.offsets().collect();
+ let first_offset = offsets[0];
+ let visible_values = haystack
+ .values()
+ .slice(first_offset, offsets[offsets.len() - 1] - first_offset);
+
+ let is_nested = visible_values.data_type().is_nested();
+ let eq_array = compare_with_eq(&visible_values, needle, is_nested)?;
// When a haystack element is null, `eq()` returns null (not false).
// In Arrow, a null BooleanArray entry has validity=0 but an
@@ -382,10 +390,14 @@ fn array_has_dispatch_for_scalar(
ArrayWrapper::LargeList(arr) => arr.nulls(),
};
let mut matches = eq_bits.set_indices().peekable();
- let mut values = BooleanBufferBuilder::new(haystack.len());
- values.append_n(haystack.len(), false);
+ let mut result = BooleanBufferBuilder::new(haystack.len());
+ result.append_n(haystack.len(), false);
+
+ // Match positions are relative to visible_values (0-based), so
+ // subtract first_offset from each offset when comparing.
+ for (i, window) in offsets.windows(2).enumerate() {
+ let end = window[1] - first_offset;
- for (i, (_start, end)) in haystack.offsets().tuple_windows().enumerate() {
let has_match = matches.peek().is_some_and(|&p| p < end);
// Advance past all match positions in this row's range.
@@ -394,14 +406,14 @@ fn array_has_dispatch_for_scalar(
}
if has_match && validity.is_none_or(|v| v.is_valid(i)) {
- values.set_bit(i, true);
+ result.set_bit(i, true);
}
}
// A null haystack row always produces a null output, so we can
// reuse the haystack's null buffer directly.
Ok(Arc::new(BooleanArray::new(
- values.finish(),
+ result.finish(),
validity.cloned(),
)))
}
@@ -1066,6 +1078,52 @@ mod tests {
Ok(())
}
+ #[test]
+ fn test_array_has_sliced_list() -> Result<(), DataFusionError> {
+ // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(10), Some(20)]),
+ Some(vec![Some(30), Some(40)]),
+ Some(vec![Some(50), Some(60)]),
+ Some(vec![Some(70), Some(80)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+ let return_field = Arc::new(Field::new("return", DataType::Boolean, true));
+
+ // Search for elements that exist only in sliced-away rows:
+ // 10 is in the prefix row, 70 is in the suffix row.
+ let invoke = |needle: i32| -> Result<ArrayRef, DataFusionError> {
+ ArrayHas::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced.clone())),
+ ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+ ],
+ arg_fields: vec![
+ Arc::clone(&haystack_field),
+ Arc::clone(&needle_field),
+ ],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)
+ };
+
+ let output = invoke(10)?.as_boolean().clone();
+ assert!(!output.value(0));
+ assert!(!output.value(1));
+
+ let output = invoke(70)?.as_boolean().clone();
+ assert!(!output.value(0));
+ assert!(!output.value(1));
+
+ Ok(())
+ }
+
#[test]
fn test_array_has_list_null_haystack() -> Result<(), DataFusionError> {
let haystack_field = Arc::new(Field::new("haystack", DataType::Null, true));
diff --git a/datafusion/functions-nested/src/position.rs b/datafusion/functions-nested/src/position.rs
index ba16d08538..0214b1552b 100644
--- a/datafusion/functions-nested/src/position.rs
+++ b/datafusion/functions-nested/src/position.rs
@@ -230,26 +230,36 @@ fn array_position_scalar<O: OffsetSizeTrait>(
"array_position",
&[list_array.values(), element_array],
)?;
- let element_datum = Scalar::new(Arc::clone(element_array));
-
- let offsets = list_array.offsets();
- let validity = list_array.nulls();
if list_array.len() == 0 {
return Ok(Arc::new(UInt64Array::new_null(0)));
}
+ let element_datum = Scalar::new(Arc::clone(element_array));
+ let validity = list_array.nulls();
+
+ // Only compare the visible portion of the values buffer, which avoids
+ // wasted work for sliced ListArrays.
+ let offsets = list_array.offsets();
+ let first_offset = offsets[0].as_usize();
+ let last_offset = offsets[list_array.len()].as_usize();
+ let visible_values = list_array
+ .values()
+ .slice(first_offset, last_offset - first_offset);
+
// `not_distinct` treats NULL=NULL as true, matching the semantics of
// `array_position`
- let eq_array = arrow_ord::cmp::not_distinct(list_array.values(), &element_datum)?;
+ let eq_array = arrow_ord::cmp::not_distinct(&visible_values, &element_datum)?;
let eq_bits = eq_array.values();
let mut result: Vec<Option<u64>> = Vec::with_capacity(list_array.len());
let mut matches = eq_bits.set_indices().peekable();
+ // Match positions are relative to visible_values (0-based), so
+ // subtract first_offset from each offset when comparing.
for i in 0..list_array.len() {
- let start = offsets[i].as_usize();
- let end = offsets[i + 1].as_usize();
+ let start = offsets[i].as_usize() - first_offset;
+ let end = offsets[i + 1].as_usize() - first_offset;
if validity.is_some_and(|v| v.is_null(i)) {
// Null row -> null output; advance past matches in range
@@ -474,3 +484,60 @@ fn general_positions<OffsetSize: OffsetSizeTrait>(
ListArray::from_iter_primitive::<UInt64Type, _, _>(data),
))
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+ use arrow::array::AsArray;
+ use arrow::datatypes::Int32Type;
+ use datafusion_common::config::ConfigOptions;
+ use datafusion_expr::ScalarFunctionArgs;
+
+ #[test]
+ fn test_array_position_sliced_list() -> Result<()> {
+ // [[10, 20], [30, 40], [50, 60], [70, 80]] → slice(1,2) → [[30, 40], [50, 60]]
+ let list = ListArray::from_iter_primitive::<Int32Type, _, _>(vec![
+ Some(vec![Some(10), Some(20)]),
+ Some(vec![Some(30), Some(40)]),
+ Some(vec![Some(50), Some(60)]),
+ Some(vec![Some(70), Some(80)]),
+ ]);
+ let sliced = list.slice(1, 2);
+ let haystack_field =
+ Arc::new(Field::new("haystack", sliced.data_type().clone(), true));
+ let needle_field = Arc::new(Field::new("needle", DataType::Int32, true));
+ let return_field = Arc::new(Field::new("return", UInt64, true));
+
+ // Search for elements that exist only in sliced-away rows:
+ // 10 is in the prefix row, 70 is in the suffix row.
+ let invoke = |needle: i32| -> Result<ArrayRef> {
+ ArrayPosition::new()
+ .invoke_with_args(ScalarFunctionArgs {
+ args: vec![
+ ColumnarValue::Array(Arc::new(sliced.clone())),
+ ColumnarValue::Scalar(ScalarValue::Int32(Some(needle))),
+ ],
+ arg_fields: vec![
+ Arc::clone(&haystack_field),
+ Arc::clone(&needle_field),
+ ],
+ number_rows: 2,
+ return_field: Arc::clone(&return_field),
+ config_options: Arc::new(ConfigOptions::default()),
+ })?
+ .into_array(2)
+ };
+
+ let output = invoke(10)?;
+ let output = output.as_primitive::<UInt64Type>();
+ assert!(output.is_null(0));
+ assert!(output.is_null(1));
+
+ let output = invoke(70)?;
+ let output = output.as_primitive::<UInt64Type>();
+ assert!(output.is_null(0));
+ assert!(output.is_null(1));
+
+ Ok(())
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]