Dandandan commented on code in PR #19411:
URL: https://github.com/apache/datafusion/pull/19411#discussion_r2635957210


##########
datafusion/physical-plan/src/joins/array_map.rs:
##########
@@ -0,0 +1,565 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow::buffer::MutableBuffer;
+use num_traits::AsPrimitive;
+use std::fmt;
+use std::sync::Arc;
+
+use crate::joins::chain::traverse_chain;
+use crate::joins::join_hash_map::JoinHashMapOffset;
+use crate::joins::utils::JoinHashMapType;
+use arrow::array::{Array, ArrayRef, AsArray};
+use arrow::datatypes::DataType;
+use arrow::datatypes::{
+    Int8Type, Int16Type, Int32Type, Int64Type, UInt8Type, UInt16Type, 
UInt32Type,
+    UInt64Type,
+};
+use datafusion_common::{Result, internal_err};
+
+/// A "perfect" hash map for single-column integer join keys, represented as a dense array.
+///
+/// This structure is highly optimized for joins where the keys are integers within a limited
+/// range. Instead of calculating hashes, it uses the integer key itself as an index into a
+/// `Vec`, achieving O(1) lookup performance.
+///
+/// # NULL Handling
+///
+/// This optimization can be used for joins with `NullEquality::NullEqualsNothing` even if the
+/// join keys contain `NULL`s. This is because:
+///
+/// 1. `try_new` (build side): Ignores rows with `NULL` keys when creating the map. This is
+///    correct as `NULL` keys would not match anything anyway.
+/// 2. `get_matched_indices_with_limit_offset` (probe side): Skips any `NULL` keys encountered
+///    in the probe side input.
+///
+/// This structure **cannot** be used for joins with `NullEquality::NullEqualsNull` if the
+/// build side contains `NULL`s, as it does not have a mechanism to store and match `NULL` values.
+#[derive(Debug)]
+pub struct ArrayMap {
+    // Dense lookup table indexed by key: data[probeSideVal - offset] holds
+    // valIdxInBuildSide + 1, with 0 reserved as the "absent" sentinel.
+    data: Vec<u32>,
+    offset: u64, // min val in buildSide; subtracted from a key to get its index into `data`
+    // NOTE(review): presumably links build-side rows that share a key (see the
+    // `traverse_chain` import); `try_new` starts it as `None` -- confirm usage.
+    next: Option<Vec<u32>>,
+}
+
+impl ArrayMap {
+    /// Creates a new [`ArrayMap`] from the given array of join keys.
+    ///
+    /// Note: This function processes only the non-null values in the input `array`,
+    /// effectively ignoring any rows where the key is `NULL`.
+    ///
+    /// TODO: Support `NullEquality::NullEqualsNull` by storing null indices in a
+    /// separate `Vec` to allow for `NULL=NULL` matching in the future.
+    pub(crate) fn try_new(
+        array: &ArrayRef,
+        offset_val: u64,
+        range: usize,
+    ) -> Result<Self> {
+        // Initialize with 0 (sentinel for not found)
+        let mut data: Vec<u32> = vec![0; range];
+        let mut next: Option<Vec<u32>> = None;

Review Comment:
   ```suggestion
           let mut next: Vec<u32> = vec![];
   ```
   
   I think a plain `Vec<u32>` that starts out empty should work here as well, instead of an `Option<Vec<u32>>`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to