This is an automated email from the ASF dual-hosted git repository.
jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new aa9432c883 Fix `extend_nulls` panic for UnionArray (#9607)
aa9432c883 is described below
commit aa9432c8833f5701085e8b933b30560d21df9f80
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Sun Mar 29 23:11:05 2026 -0700
Fix `extend_nulls` panic for UnionArray (#9607)
## Summary
- Fix `MutableArrayData::extend_nulls` which previously panicked
unconditionally for both sparse and dense Union arrays
- For sparse unions: append the first type_id and extend nulls in all
children
- For dense unions: append the first type_id, compute offsets into the
first child, and extend nulls in that child only
## Background
This bug was discovered via DataFusion. `CaseExpr` uses
`MutableArrayData` via `scatter()` to build result arrays. When a `CASE`
expression returns a Union type (e.g., from `json_get` which returns a
JSON union) and there are rows where no `WHEN` branch matches (implicit
`ELSE NULL`), `scatter` calls `extend_nulls` which panics with "cannot
call extend_nulls on UnionArray as cannot infer type".
Any query like:
```sql
SELECT CASE WHEN condition THEN returns_union(col, 'key') END FROM table
```
would panic if `condition` is false for any row.
## Root Cause
The `extend_nulls` implementation for Union arrays unconditionally
panicked because it claimed it "cannot infer type". However, the Union's
field definitions (child types and type IDs) are available in the
`MutableArrayData`'s data type — there's enough information to produce
valid null entries by picking the first declared type_id.
## Test plan
- [x] Added test for sparse union `extend_nulls`
- [x] Added test for dense union `extend_nulls`
- [x] Existing `test_union_dense` continues to pass
- [x] All `array_transform` tests pass
🤖 Generated with [Claude Code](https://claude.com/claude-code)
---------
Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
Co-authored-by: Jeffrey Vo <[email protected]>
---
arrow-data/src/transform/mod.rs | 4 +-
arrow-data/src/transform/union.rs | 41 ++++++++++++++++--
arrow/tests/array_transform.rs | 88 +++++++++++++++++++++++++++++++++++++++
3 files changed, 127 insertions(+), 6 deletions(-)
diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs
index c6052817bf..66f6603f02 100644
--- a/arrow-data/src/transform/mod.rs
+++ b/arrow-data/src/transform/mod.rs
@@ -813,8 +813,8 @@ impl<'a> MutableArrayData<'a> {
};
let nulls = match data.data_type {
- // RunEndEncoded and Null arrays cannot have top-level null bitmasks
- DataType::RunEndEncoded(_, _) | DataType::Null => None,
+ // RunEndEncoded, Null, and Union arrays cannot have top-level null bitmasks
+ DataType::RunEndEncoded(_, _) | DataType::Null | DataType::Union(_, _) => None,
_ => data
.null_buffer
.map(|nulls| {
diff --git a/arrow-data/src/transform/union.rs b/arrow-data/src/transform/union.rs
index f6f291e3f0..d1301249d3 100644
--- a/arrow-data/src/transform/union.rs
+++ b/arrow-data/src/transform/union.rs
@@ -17,6 +17,7 @@
use super::{_MutableArrayData, Extend};
use crate::ArrayData;
+use arrow_schema::DataType;
pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend<'_> {
let type_ids = array.buffer::<i8>(0);
@@ -68,10 +69,42 @@ pub(super) fn build_extend_dense(array: &ArrayData) -> Extend<'_> {
)
}
-pub(super) fn extend_nulls_dense(_mutable: &mut _MutableArrayData, _len: usize) {
- panic!("cannot call extend_nulls on UnionArray as cannot infer type");
+pub(super) fn extend_nulls_dense(mutable: &mut _MutableArrayData, len: usize) {
+ let DataType::Union(fields, _) = &mutable.data_type else {
+ unreachable!()
+ };
+ let first_type_id = fields
+ .iter()
+ .next()
+ .expect("union must have at least one field")
+ .0;
+
+ // Extend type_ids buffer
+ mutable.buffer1.extend_from_slice(&vec![first_type_id; len]);
+
+ // Dense: extend offsets pointing into the first child, then extend nulls in that child
+ let child_offset = mutable.child_data[0].len();
+ let (start, end) = (child_offset as i32, (child_offset + len) as i32);
+ mutable.buffer2.extend(start..end);
+ mutable.child_data[0].extend_nulls(len);
}
-pub(super) fn extend_nulls_sparse(_mutable: &mut _MutableArrayData, _len: usize) {
- panic!("cannot call extend_nulls on UnionArray as cannot infer type");
+pub(super) fn extend_nulls_sparse(mutable: &mut _MutableArrayData, len: usize) {
+ let DataType::Union(fields, _) = &mutable.data_type else {
+ unreachable!()
+ };
+ let first_type_id = fields
+ .iter()
+ .next()
+ .expect("union must have at least one field")
+ .0;
+
+ // Extend type_ids buffer
+ mutable.buffer1.extend_from_slice(&vec![first_type_id; len]);
+
+ // Sparse: extend nulls in ALL children
+ mutable
+ .child_data
+ .iter_mut()
+ .for_each(|child| child.extend_nulls(len));
}
diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs
index 511dc1e8bf..c24d0992a4 100644
--- a/arrow/tests/array_transform.rs
+++ b/arrow/tests/array_transform.rs
@@ -1151,3 +1151,91 @@ fn test_fixed_size_list_append() {
.unwrap();
assert_eq!(finished, expected_fixed_size_list_data);
}
+
+#[test]
+fn test_extend_nulls_sparse_union() {
+ let fields = UnionFields::try_new(
+ vec![0, 1],
+ vec![
+ Field::new("null", DataType::Null, true),
+ Field::new("str", DataType::Utf8, true),
+ ],
+ )
+ .unwrap();
+
+ let type_ids = ScalarBuffer::from(vec![1i8]);
+ let child_null = Arc::new(NullArray::new(1)) as ArrayRef;
+ let child_str = Arc::new(StringArray::from(vec![Some("hello")])) as ArrayRef;
+ let union_array = UnionArray::try_new(
+ fields.clone(),
+ type_ids,
+ None, // sparse
+ vec![child_null, child_str],
+ )
+ .unwrap();
+
+ let data = union_array.to_data();
+ let mut mutable = MutableArrayData::new(vec![&data], true, 4);
+ mutable.extend(0, 0, 1); // copy the first element
+ mutable.extend_nulls(2); // add two nulls
+ let result = mutable.freeze();
+
+ // Union arrays must not have a null bitmap per Arrow spec
+ assert!(result.nulls().is_none());
+
+ let result_array = UnionArray::from(result);
+ assert_eq!(result_array.len(), 3);
+ // First element should be type_id 1 (str)
+ assert_eq!(result_array.type_id(0), 1);
+ // Null elements use the first type_id (0)
+ assert_eq!(result_array.type_id(1), 0);
+ assert_eq!(result_array.type_id(2), 0);
+ // All children should have length 3 (sparse invariant)
+ assert_eq!(result_array.child(0).len(), 3);
+ assert_eq!(result_array.child(1).len(), 3);
+}
+
+#[test]
+fn test_extend_nulls_dense_union() {
+ let fields = UnionFields::try_new(
+ vec![0, 1],
+ vec![
+ Field::new("i", DataType::Int32, true),
+ Field::new("str", DataType::Utf8, true),
+ ],
+ )
+ .unwrap();
+
+ let type_ids = ScalarBuffer::from(vec![1i8]);
+ let offsets = ScalarBuffer::from(vec![0i32]);
+ let child_int = Arc::new(Int32Array::new_null(0)) as ArrayRef;
+ let child_str = Arc::new(StringArray::from(vec![Some("hello")])) as ArrayRef;
+ let union_array = UnionArray::try_new(
+ fields.clone(),
+ type_ids,
+ Some(offsets),
+ vec![child_int, child_str],
+ )
+ .unwrap();
+
+ let data = union_array.to_data();
+ let mut mutable = MutableArrayData::new(vec![&data], true, 4);
+ mutable.extend(0, 0, 1); // copy the first element
+ mutable.extend_nulls(2); // add two nulls
+ let result = mutable.freeze();
+
+ // Union arrays must not have a null bitmap per Arrow spec
+ assert!(result.nulls().is_none());
+
+ let result_array = UnionArray::from(result);
+ assert_eq!(result_array.len(), 3);
+ // First element is type_id 1 (str)
+ assert_eq!(result_array.type_id(0), 1);
+ // Null elements use the first type_id (0)
+ assert_eq!(result_array.type_id(1), 0);
+ assert_eq!(result_array.type_id(2), 0);
+ // First child (int) should have 2 null entries from extend_nulls
+ assert_eq!(result_array.child(0).len(), 2);
+ // Second child (str) should have 1 entry from extend
+ assert_eq!(result_array.child(1).len(), 1);
+}