Copilot commented on code in PR #16434: URL: https://github.com/apache/datafusion/pull/16434#discussion_r2156783610
########## datafusion/physical-plan/src/joins/hash_join.rs: ########## @@ -980,15 +981,26 @@ async fn collect_left_input( .await?; // Estimation of memory size, required for hashtable, prior to allocation. - // Final result can be verified using `RawTable.allocation_info()` - let fixed_size = size_of::<JoinHashMap>(); - let estimated_hashtable_size = - estimate_memory_size::<(u64, u64)>(num_rows, fixed_size)?; - - reservation.try_grow(estimated_hashtable_size)?; - metrics.build_mem_used.add(estimated_hashtable_size); + // Final result can be verifiedJoinHashMapTypele.allocation_info()` Review Comment: Fix the typo in this comment. It should read something like: “Final result can be verified using `JoinHashMapType::allocation_info()`.” ```suggestion // Final result can be verified using `JoinHashMapType::allocation_info()`. ``` ########## datafusion/physical-plan/src/joins/stream_join_utils.rs: ########## @@ -47,26 +51,45 @@ use hashbrown::HashTable; /// Implementation of `JoinHashMapType` for `PruningJoinHashMap`. impl JoinHashMapType for PruningJoinHashMap { - type NextType = VecDeque<u64>; - // Extend with zero fn extend_zero(&mut self, len: usize) { self.next.resize(self.next.len() + len, 0) } - /// Get mutable references to the hash map and the next. - fn get_mut(&mut self) -> (&mut HashTable<(u64, u64)>, &mut Self::NextType) { - (&mut self.map, &mut self.next) + fn update_from_iter<'a>( + &mut self, + iter: Box<dyn Iterator<Item = (usize, &'a u64)> + Send + 'a>, + deleted_offset: usize, + ) { + let slice: &mut [u64] = self.next.make_contiguous(); + update_from_iter::<u64>(&mut self.map, slice, iter, deleted_offset); } - /// Get a reference to the hash map. - fn get_map(&self) -> &HashTable<(u64, u64)> { - &self.map + fn get_matched_indices<'a>( + &self, + iter: Box<dyn Iterator<Item = (usize, &'a u64)> + 'a>, + deleted_offset: Option<usize>, + ) -> (Vec<u32>, Vec<u64>) { + // Flatten the deque + let next: Vec<u64> = self.next.iter().copied().collect(); + get_matched_indices::<u64>(&self.map, &next, iter, deleted_offset) Review Comment: [nitpick] Collecting the `VecDeque` into a new `Vec<u64>` on every call can be expensive. Consider reusing a buffer or accessing the deque’s contiguous slice when possible to reduce allocations. ```suggestion // Access the deque's slices directly let (head, tail) = self.next.as_slices(); get_matched_indices::<u64>(&self.map, head, tail, iter, deleted_offset) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org