This is an automated email from the ASF dual-hosted git repository.

jeffreyvo pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/main by this push:
     new aa9432c883 Fix `extend_nulls` panic for UnionArray (#9607)
aa9432c883 is described below

commit aa9432c8833f5701085e8b933b30560d21df9f80
Author: Adrian Garcia Badaracco <[email protected]>
AuthorDate: Sun Mar 29 23:11:05 2026 -0700

    Fix `extend_nulls` panic for UnionArray (#9607)
    
    ## Summary
    
    - Fix `MutableArrayData::extend_nulls` which previously panicked
    unconditionally for both sparse and dense Union arrays
    - For sparse unions: append the first type_id and extend nulls in all
    children
    - For dense unions: append the first type_id, compute offsets into the
    first child, and extend nulls in that child only
    
    ## Background
    
    This bug was discovered via DataFusion. `CaseExpr` uses
    `MutableArrayData` via `scatter()` to build result arrays. When a `CASE`
    expression returns a Union type (e.g., from `json_get` which returns a
    JSON union) and there are rows where no `WHEN` branch matches (implicit
    `ELSE NULL`), `scatter` calls `extend_nulls` which panics with "cannot
    call extend_nulls on UnionArray as cannot infer type".
    
    Any query like:
    ```sql
    SELECT CASE WHEN condition THEN returns_union(col, 'key') END FROM table
    ```
    would panic if `condition` is false for any row.
    
    ## Root Cause
    
    The `extend_nulls` implementation for Union arrays unconditionally
    panicked because it claimed it "cannot infer type". However, the Union's
    field definitions (child types and type IDs) are available in the
    `MutableArrayData`'s data type — there's enough information to produce
    valid null entries by picking the first declared type_id.
    
    ## Test plan
    
    - [x] Added test for sparse union `extend_nulls`
    - [x] Added test for dense union `extend_nulls`
    - [x] Existing `test_union_dense` continues to pass
    - [x] All `array_transform` tests pass
    
    🤖 Generated with [Claude Code](https://claude.com/claude-code)
    
    ---------
    
    Co-authored-by: Claude Opus 4.6 (1M context) <[email protected]>
    Co-authored-by: Jeffrey Vo <[email protected]>
---
 arrow-data/src/transform/mod.rs   |  4 +-
 arrow-data/src/transform/union.rs | 41 ++++++++++++++++--
 arrow/tests/array_transform.rs    | 88 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 127 insertions(+), 6 deletions(-)

diff --git a/arrow-data/src/transform/mod.rs b/arrow-data/src/transform/mod.rs
index c6052817bf..66f6603f02 100644
--- a/arrow-data/src/transform/mod.rs
+++ b/arrow-data/src/transform/mod.rs
@@ -813,8 +813,8 @@ impl<'a> MutableArrayData<'a> {
         };
 
         let nulls = match data.data_type {
-            // RunEndEncoded and Null arrays cannot have top-level null 
bitmasks
-            DataType::RunEndEncoded(_, _) | DataType::Null => None,
+            // RunEndEncoded, Null, and Union arrays cannot have top-level 
null bitmasks
+            DataType::RunEndEncoded(_, _) | DataType::Null | 
DataType::Union(_, _) => None,
             _ => data
                 .null_buffer
                 .map(|nulls| {
diff --git a/arrow-data/src/transform/union.rs 
b/arrow-data/src/transform/union.rs
index f6f291e3f0..d1301249d3 100644
--- a/arrow-data/src/transform/union.rs
+++ b/arrow-data/src/transform/union.rs
@@ -17,6 +17,7 @@
 
 use super::{_MutableArrayData, Extend};
 use crate::ArrayData;
+use arrow_schema::DataType;
 
 pub(super) fn build_extend_sparse(array: &ArrayData) -> Extend<'_> {
     let type_ids = array.buffer::<i8>(0);
@@ -68,10 +69,42 @@ pub(super) fn build_extend_dense(array: &ArrayData) -> 
Extend<'_> {
     )
 }
 
-pub(super) fn extend_nulls_dense(_mutable: &mut _MutableArrayData, _len: 
usize) {
-    panic!("cannot call extend_nulls on UnionArray as cannot infer type");
+pub(super) fn extend_nulls_dense(mutable: &mut _MutableArrayData, len: usize) {
+    let DataType::Union(fields, _) = &mutable.data_type else {
+        unreachable!()
+    };
+    let first_type_id = fields
+        .iter()
+        .next()
+        .expect("union must have at least one field")
+        .0;
+
+    // Extend type_ids buffer
+    mutable.buffer1.extend_from_slice(&vec![first_type_id; len]);
+
+    // Dense: extend offsets pointing into the first child, then extend nulls 
in that child
+    let child_offset = mutable.child_data[0].len();
+    let (start, end) = (child_offset as i32, (child_offset + len) as i32);
+    mutable.buffer2.extend(start..end);
+    mutable.child_data[0].extend_nulls(len);
 }
 
-pub(super) fn extend_nulls_sparse(_mutable: &mut _MutableArrayData, _len: 
usize) {
-    panic!("cannot call extend_nulls on UnionArray as cannot infer type");
+pub(super) fn extend_nulls_sparse(mutable: &mut _MutableArrayData, len: usize) 
{
+    let DataType::Union(fields, _) = &mutable.data_type else {
+        unreachable!()
+    };
+    let first_type_id = fields
+        .iter()
+        .next()
+        .expect("union must have at least one field")
+        .0;
+
+    // Extend type_ids buffer
+    mutable.buffer1.extend_from_slice(&vec![first_type_id; len]);
+
+    // Sparse: extend nulls in ALL children
+    mutable
+        .child_data
+        .iter_mut()
+        .for_each(|child| child.extend_nulls(len));
 }
diff --git a/arrow/tests/array_transform.rs b/arrow/tests/array_transform.rs
index 511dc1e8bf..c24d0992a4 100644
--- a/arrow/tests/array_transform.rs
+++ b/arrow/tests/array_transform.rs
@@ -1151,3 +1151,91 @@ fn test_fixed_size_list_append() {
     .unwrap();
     assert_eq!(finished, expected_fixed_size_list_data);
 }
+
+#[test]
+fn test_extend_nulls_sparse_union() {
+    let fields = UnionFields::try_new(
+        vec![0, 1],
+        vec![
+            Field::new("null", DataType::Null, true),
+            Field::new("str", DataType::Utf8, true),
+        ],
+    )
+    .unwrap();
+
+    let type_ids = ScalarBuffer::from(vec![1i8]);
+    let child_null = Arc::new(NullArray::new(1)) as ArrayRef;
+    let child_str = Arc::new(StringArray::from(vec![Some("hello")])) as 
ArrayRef;
+    let union_array = UnionArray::try_new(
+        fields.clone(),
+        type_ids,
+        None, // sparse
+        vec![child_null, child_str],
+    )
+    .unwrap();
+
+    let data = union_array.to_data();
+    let mut mutable = MutableArrayData::new(vec![&data], true, 4);
+    mutable.extend(0, 0, 1); // copy the first element
+    mutable.extend_nulls(2); // add two nulls
+    let result = mutable.freeze();
+
+    // Union arrays must not have a null bitmap per Arrow spec
+    assert!(result.nulls().is_none());
+
+    let result_array = UnionArray::from(result);
+    assert_eq!(result_array.len(), 3);
+    // First element should be type_id 1 (str)
+    assert_eq!(result_array.type_id(0), 1);
+    // Null elements use the first type_id (0)
+    assert_eq!(result_array.type_id(1), 0);
+    assert_eq!(result_array.type_id(2), 0);
+    // All children should have length 3 (sparse invariant)
+    assert_eq!(result_array.child(0).len(), 3);
+    assert_eq!(result_array.child(1).len(), 3);
+}
+
+#[test]
+fn test_extend_nulls_dense_union() {
+    let fields = UnionFields::try_new(
+        vec![0, 1],
+        vec![
+            Field::new("i", DataType::Int32, true),
+            Field::new("str", DataType::Utf8, true),
+        ],
+    )
+    .unwrap();
+
+    let type_ids = ScalarBuffer::from(vec![1i8]);
+    let offsets = ScalarBuffer::from(vec![0i32]);
+    let child_int = Arc::new(Int32Array::new_null(0)) as ArrayRef;
+    let child_str = Arc::new(StringArray::from(vec![Some("hello")])) as 
ArrayRef;
+    let union_array = UnionArray::try_new(
+        fields.clone(),
+        type_ids,
+        Some(offsets),
+        vec![child_int, child_str],
+    )
+    .unwrap();
+
+    let data = union_array.to_data();
+    let mut mutable = MutableArrayData::new(vec![&data], true, 4);
+    mutable.extend(0, 0, 1); // copy the first element
+    mutable.extend_nulls(2); // add two nulls
+    let result = mutable.freeze();
+
+    // Union arrays must not have a null bitmap per Arrow spec
+    assert!(result.nulls().is_none());
+
+    let result_array = UnionArray::from(result);
+    assert_eq!(result_array.len(), 3);
+    // First element is type_id 1 (str)
+    assert_eq!(result_array.type_id(0), 1);
+    // Null elements use the first type_id (0)
+    assert_eq!(result_array.type_id(1), 0);
+    assert_eq!(result_array.type_id(2), 0);
+    // First child (int) should have 2 null entries from extend_nulls
+    assert_eq!(result_array.child(0).len(), 2);
+    // Second child (str) should have 1 entry from extend
+    assert_eq!(result_array.child(1).len(), 1);
+}

Reply via email to