This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 69cd666501 fix: skip empty metadata in intersect_metadata_for_union to 
prevent s… (#21127)
69cd666501 is described below

commit 69cd6665019e782cb2eb226d8e866c5c61e7d01e
Author: Rafael Herrero <[email protected]>
AuthorDate: Thu Mar 26 01:14:19 2026 +0100

    fix: skip empty metadata in intersect_metadata_for_union to prevent s… 
(#21127)
    
    ## Which issue does this PR close?
    
    - Closes #19049.
    
    ## Rationale for this change
    
    We're building a SQL engine on top of DataFusion and hit this while
    running benchmarks. A `UNION ALL` query against Parquet files that carry
    field metadata (like `PARQUET:field_id` or InfluxDB's
    `iox::column::type`). When one branch of the union has a NULL literal,
    `intersect_metadata_for_union` intersects the metadata from the data
    source with the empty metadata from the NULL — and since intersecting
    anything with an empty set gives empty, all metadata gets wiped out.
    
    Later, when `optimize_projections` prunes columns and `recompute_schema`
    rebuilds the Union schema, the logical schema has `{}` while the
    physical schema still has the original metadata from Parquet. This
    causes:
    
    ```
    Internal error: Physical input schema should be the same as the one
    converted from logical input schema.
    Differences:
      - field metadata at index 0 [usage_idle]: (physical) 
{"iox::column::type": "..."} vs (logical) {}
    ```
    
    As @erratic-pattern and @alamb discussed in the issue, empty metadata
    from NULL literals isn't saying "this field has no metadata" — it's
    saying "I don't know." It shouldn't erase metadata from branches that
    actually have it.
    
    I fixed this in `intersect_metadata_for_union` directly rather than
    patching `optimize_projections` or `recompute_schema`, since that's
    where the bad intersection happens and it covers all code paths that
    derive Union schemas.
    
    ## What changes are included in this PR?
    
    One change to `intersect_metadata_for_union` in
    `datafusion/expr/src/expr.rs`: branches with empty metadata are skipped
    during intersection instead of participating. Non-empty branches still
    intersect normally (conflicting values still get dropped). If every
    branch is empty, the result is empty — same as before.
    
    ## Are these changes tested?
    
    Added 7 unit tests for `intersect_metadata_for_union`:
    
    - Same metadata across branches — preserved
    - Conflicting non-empty values — dropped (existing behavior, unchanged)
    - One branch has metadata, other is empty — metadata preserved (the fix)
    - Empty branch comes first — still works
    - All branches empty — empty result
    - Mix of empty and conflicting non-empty — intersects only the non-empty
    ones
    - No inputs — empty result
    
    The full end-to-end reproduction needs Parquet files with field metadata
    as described in the issue. The unit tests cover the intersection logic
    directly.
    
    ## Are there any user-facing changes?
    
    No API changes. `UNION ALL` queries combining metadata-carrying sources
    with NULL literals will stop failing with schema mismatch errors.
---
 datafusion/expr/src/expr.rs                     | 86 +++++++++++++++++++++++--
 datafusion/sqllogictest/test_files/metadata.slt | 17 +++++
 2 files changed, 96 insertions(+), 7 deletions(-)

diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs
index 12c879a515..1f3b1d1811 100644
--- a/datafusion/expr/src/expr.rs
+++ b/datafusion/expr/src/expr.rs
@@ -512,17 +512,26 @@ pub type SchemaFieldMetadata = 
std::collections::HashMap<String, String>;
 pub fn intersect_metadata_for_union<'a>(
     metadatas: impl IntoIterator<Item = &'a SchemaFieldMetadata>,
 ) -> SchemaFieldMetadata {
-    let mut metadatas = metadatas.into_iter();
-    let Some(mut intersected) = metadatas.next().cloned() else {
-        return Default::default();
-    };
+    let mut intersected: Option<SchemaFieldMetadata> = None;
 
     for metadata in metadatas {
-        // Only keep keys that exist in both with the same value
-        intersected.retain(|k, v| metadata.get(k) == Some(v));
+        // Skip empty metadata (e.g. from NULL literals or computed 
expressions)
+        // to avoid dropping metadata from branches that have it.
+        if metadata.is_empty() {
+            continue;
+        }
+        match &mut intersected {
+            None => {
+                intersected = Some(metadata.clone());
+            }
+            Some(current) => {
+                // Only keep keys that exist in both with the same value
+                current.retain(|k, v| metadata.get(k) == Some(v));
+            }
+        }
     }
 
-    intersected
+    intersected.unwrap_or_default()
 }
 
 /// UNNEST expression.
@@ -4127,4 +4136,67 @@ mod test {
             }
         }
     }
+
+    mod intersect_metadata_tests {
+        use super::super::intersect_metadata_for_union;
+        use std::collections::HashMap;
+
+        #[test]
+        fn all_branches_same_metadata() {
+            let m1 = HashMap::from([("key".into(), "val".into())]);
+            let m2 = HashMap::from([("key".into(), "val".into())]);
+            let result = intersect_metadata_for_union([&m1, &m2]);
+            assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
+        }
+
+        #[test]
+        fn conflicting_metadata_dropped() {
+            let m1 = HashMap::from([("key".into(), "a".into())]);
+            let m2 = HashMap::from([("key".into(), "b".into())]);
+            let result = intersect_metadata_for_union([&m1, &m2]);
+            assert!(result.is_empty());
+        }
+
+        #[test]
+        fn empty_metadata_branch_skipped() {
+            let m1 = HashMap::from([("key".into(), "val".into())]);
+            let m2 = HashMap::new(); // e.g. NULL literal
+            let result = intersect_metadata_for_union([&m1, &m2]);
+            assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
+        }
+
+        #[test]
+        fn empty_metadata_first_branch_skipped() {
+            let m1 = HashMap::new();
+            let m2 = HashMap::from([("key".into(), "val".into())]);
+            let result = intersect_metadata_for_union([&m1, &m2]);
+            assert_eq!(result, HashMap::from([("key".into(), "val".into())]));
+        }
+
+        #[test]
+        fn all_branches_empty_metadata() {
+            let m1: HashMap<String, String> = HashMap::new();
+            let m2: HashMap<String, String> = HashMap::new();
+            let result = intersect_metadata_for_union([&m1, &m2]);
+            assert!(result.is_empty());
+        }
+
+        #[test]
+        fn mixed_empty_and_conflicting() {
+            let m1 = HashMap::from([("key".into(), "a".into())]);
+            let m2 = HashMap::new();
+            let m3 = HashMap::from([("key".into(), "b".into())]);
+            let result = intersect_metadata_for_union([&m1, &m2, &m3]);
+            // m2 is skipped; m1 and m3 conflict → dropped
+            assert!(result.is_empty());
+        }
+
+        #[test]
+        fn no_inputs() {
+            let result = intersect_metadata_for_union(std::iter::empty::<
+                &HashMap<String, String>,
+            >());
+            assert!(result.is_empty());
+        }
+    }
 }
diff --git a/datafusion/sqllogictest/test_files/metadata.slt 
b/datafusion/sqllogictest/test_files/metadata.slt
index 6ed461debb..f3836b23ec 100644
--- a/datafusion/sqllogictest/test_files/metadata.slt
+++ b/datafusion/sqllogictest/test_files/metadata.slt
@@ -139,6 +139,23 @@ ORDER BY id, name, l_name;
 NULL bar NULL
 NULL NULL l_bar
 
+# Regression test: UNION ALL + optimize_projections pruning unused columns 
causes
+# metadata loss when one branch has NULL literals (empty metadata) and the 
other
+# has field metadata. The optimizer prunes unused columns, triggering 
recompute_schema
+# which rebuilds the Union schema via intersect_metadata_for_union. 
Previously, this
+# intersection would drop metadata when any branch had empty metadata (from 
NULL literals).
+# See https://github.com/apache/datafusion/issues/19049
+query T
+SELECT name FROM (
+  SELECT id, name FROM "table_with_metadata"
+  UNION ALL
+  SELECT NULL::int as id, NULL::string as name
+) GROUP BY name ORDER BY name;
+----
+bar
+baz
+NULL
+
 # Regression test: missing field metadata from left side of the union when 
right side is chosen
 query T
 select name from (


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to