jonathanc-n commented on code in PR #20179:
URL: https://github.com/apache/datafusion/pull/20179#discussion_r2777810372


##########
datafusion/common/src/hash_utils.rs:
##########
@@ -630,6 +679,69 @@ fn hash_union_array(
     Ok(())
 }
 
+/// Hash a sparse union array.
+/// Sparse unions have child arrays with the same length as the union array.
+/// For 3+ types, we optimize by only hashing the N elements that are actually 
used
+/// (via take/scatter), instead of hashing all K*N elements.
+///
+/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we 
use
+/// the default approach of hashing all children (same as dense unions).
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_sparse_union_array(
+    array: &UnionArray,
+    union_fields: &UnionFields,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    use std::collections::HashMap;
+
+    // For 1-2 types, the take/scatter overhead isn't worth it.
+    // Fall back to the default approach (same as dense union).
+    if union_fields.len() <= 2 {
+        return hash_union_array_default(
+            array,
+            union_fields,
+            random_state,
+            hashes_buffer,
+        );
+    }
+
+    let type_ids = array.type_ids();
+
+    // Group indices by type_id
+    let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
+    for (i, &type_id) in type_ids.iter().enumerate() {
+        indices_by_type.entry(type_id).or_default().push(i as u32);
+    }
+
+    // For each type, extract only the needed elements, hash them, and scatter 
back
+    for (type_id, _field) in union_fields.iter() {
+        if let Some(indices) = indices_by_type.get(&type_id) {
+            if indices.is_empty() {
+                continue;
+            }
+
+            let child = array.child(type_id);
+            let indices_array = UInt32Array::from(indices.clone());
+
+            // Extract only the elements we need using take()
+            let filtered = take(child.as_ref(), &indices_array, None)?;

Review Comment:
   We get improvements, but I think this should be considered. Can write it on 
the issue.
   
   > I do wonder if theres a way to skip this extract/scatter pattern, maybe by 
trying to override the null buffer of these child arrays (though that runs into 
problems for arrays that don't have a null buffer like run arrays)
   I'll try to run some benchmarks on this idea
   
   
   
   



##########
datafusion/common/src/hash_utils.rs:
##########
@@ -630,6 +679,69 @@ fn hash_union_array(
     Ok(())
 }
 
+/// Hash a sparse union array.
+/// Sparse unions have child arrays with the same length as the union array.
+/// For 3+ types, we optimize by only hashing the N elements that are actually 
used
+/// (via take/scatter), instead of hashing all K*N elements.
+///
+/// For 1-2 types, the overhead of take/scatter outweighs the benefit, so we 
use
+/// the default approach of hashing all children (same as dense unions).
+#[cfg(not(feature = "force_hash_collisions"))]
+fn hash_sparse_union_array(
+    array: &UnionArray,
+    union_fields: &UnionFields,
+    random_state: &RandomState,
+    hashes_buffer: &mut [u64],
+) -> Result<()> {
+    use std::collections::HashMap;
+
+    // For 1-2 types, the take/scatter overhead isn't worth it.
+    // Fall back to the default approach (same as dense union).
+    if union_fields.len() <= 2 {
+        return hash_union_array_default(
+            array,
+            union_fields,
+            random_state,
+            hashes_buffer,
+        );
+    }
+
+    let type_ids = array.type_ids();
+
+    // Group indices by type_id
+    let mut indices_by_type: HashMap<i8, Vec<u32>> = HashMap::new();
+    for (i, &type_id) in type_ids.iter().enumerate() {
+        indices_by_type.entry(type_id).or_default().push(i as u32);
+    }
+
+    // For each type, extract only the needed elements, hash them, and scatter 
back
+    for (type_id, _field) in union_fields.iter() {
+        if let Some(indices) = indices_by_type.get(&type_id) {
+            if indices.is_empty() {
+                continue;
+            }
+
+            let child = array.child(type_id);
+            let indices_array = UInt32Array::from(indices.clone());
+
+            // Extract only the elements we need using take()
+            let filtered = take(child.as_ref(), &indices_array, None)?;

Review Comment:
   We get improvements, but I think this should be considered. Can write it on 
the issue.
   
   > I do wonder if theres a way to skip this extract/scatter pattern, maybe by 
trying to override the null buffer of these child arrays (though that runs into 
problems for arrays that don't have a null buffer like run arrays)
   
   I'll try to run some benchmarks on this idea
   
   
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to