Kontinuation commented on code in PR #443: URL: https://github.com/apache/sedona-db/pull/443#discussion_r2618256493
########## rust/sedona-spatial-join/src/partitioning/kdb.rs: ########## @@ -0,0 +1,843 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! KDB-tree spatial partitioning implementation. +//! +//! This module provides a K-D-B tree (K-Dimensional B-tree) implementation for spatial +//! partitioning, which is essential for out-of-core spatial joins. +//! +//! # Partitioning Strategy +//! +//! - **Partition Assignment**: +//! - `partition()`: Returns `SpatialPartition::Regular(id)` if the bbox intersects exactly one partition, +//! `SpatialPartition::Multi` if it intersects multiple, or `SpatialPartition::None` if it intersects none. +//! - `partition_no_multi()`: Assigns the bbox to the partition with the largest intersection area +//! (Maximum Overlap Strategy), or `SpatialPartition::None` if no intersection. +//! +//! # Algorithm +//! +//! The KDB tree partitions space by recursively splitting it along alternating axes: +//! +//! 1. Start with the full spatial extent +//! 2. When a node exceeds `max_items_per_node`, split it: +//! - Choose the longer dimension (x or y) +//! - Sort items by their minimum coordinate on that dimension +//! - Split at the median item's coordinate +//! 3. Continue until reaching `max_levels` depth or items fit in nodes +//! 4. Assign sequential IDs to leaf nodes + +use std::sync::Arc; + +use crate::partitioning::{ + util::{bbox_to_geo_rect, rect_contains_point, rect_intersection_area, rects_intersect}, + SpatialPartition, SpatialPartitioner, +}; +use datafusion_common::Result; +use geo::{Coord, Rect}; +use sedona_geometry::bounding_box::BoundingBox; + +/// K-D-B tree spatial partitioner implementation. +/// +/// See https://en.wikipedia.org/wiki/K-D-B-tree +/// +/// The KDB tree is a hierarchical spatial data structure that recursively +/// partitions space using axis-aligned splits. It adapts to the spatial +/// distribution of data, making it effective for spatial partitioning. +/// +/// # Example +/// +/// ``` +/// use sedona_geometry::bounding_box::BoundingBox; +/// use sedona_spatial_join::partitioning::kdb::KDBPartitioner; +/// use sedona_spatial_join::partitioning::{SpatialPartitioner, SpatialPartition}; +/// +/// // Create sample bounding boxes +/// let bboxes = (0..20).map(|i| { +/// let x = (i % 10) as f64 * 10.0; +/// let y = (i / 10) as f64 * 10.0; +/// BoundingBox::xy((x, x + 5.0), (y, y + 5.0)) +/// }); +/// +/// // Build the KDB partitioner +/// let extent = BoundingBox::xy((0.0, 100.0), (0.0, 100.0)); +/// let partitioner = KDBPartitioner::build(bboxes, 5, 3, extent) +/// .expect("failed to build KDB partitioner"); +/// +/// // Query which partition a bounding box belongs to +/// let test_bbox = BoundingBox::xy((1.0, 4.0), (1.0, 4.0)); +/// match partitioner.partition(&test_bbox).unwrap() { +/// SpatialPartition::Regular(id) => println!("Partition ID: {}", id), +/// _ => unreachable!(), +/// } +/// ``` +pub(crate) struct KDBTree { + max_items_per_node: usize, + max_levels: usize, + extent: Rect<f32>, + level: usize, + items: Vec<Rect<f32>>, + children: Option<Box<[KDBTree; 2]>>, + leaf_id: u32, +} + +#[cfg(test)] +pub(crate) struct KDBResult { + extent: Rect<f32>, + leaf_id: u32, +} + +#[cfg(test)] +impl KDBResult { + fn new(kdb: &KDBTree) -> Self { + Self { + extent: kdb.extent, + leaf_id: kdb.leaf_id, + } + } +} + +impl KDBTree { + /// Create a new KDB tree with the given parameters. + /// + /// # Arguments + /// * `max_items_per_node` - Maximum number of items before splitting a node + /// * `max_levels` - Maximum depth of the tree + /// * `extent` - The spatial extent covered by this tree + pub fn new(max_items_per_node: usize, max_levels: usize, extent: BoundingBox) -> Result<Self> { + let extent_rect = bbox_to_geo_rect(&extent)?; + Ok(Self::new_with_level( + max_items_per_node, + max_levels, + 0, + extent_rect, + )) + } + + fn new_with_level( + max_items_per_node: usize, + max_levels: usize, + level: usize, + extent: Rect<f32>, + ) -> Self { + KDBTree { + max_items_per_node, + max_levels, + extent, + level, + items: Vec::new(), + children: None, + leaf_id: 0, + } + } + + /// Insert a bounding box into the tree. + pub fn insert(&mut self, bbox: BoundingBox) -> Result<()> { + let rect = bbox_to_geo_rect(&bbox)?; + self.insert_rect(rect); + Ok(()) + } + + fn insert_rect(&mut self, rect: Rect<f32>) { + if self.items.len() < self.max_items_per_node || self.level >= self.max_levels { + self.items.push(rect); + } else { + if self.children.is_none() { + // Split over longer side + let split_x = self.extent.width() > self.extent.height(); + let mut ok = self.split(split_x); + if !ok { + // Try splitting by the other side + ok = self.split(!split_x); + } + + if !ok { + // This could happen if all envelopes are the same. + self.items.push(rect); + return; + } + } + + // Insert into appropriate child + if let Some(ref mut children) = self.children { + let min_point = rect.min(); + for child in children.iter_mut() { + if rect_contains_point(child.extent(), &min_point) { + child.insert_rect(rect); + break; + } + } + } + } + } + + /// Check if this node is a leaf node. + pub fn is_leaf(&self) -> bool { + self.children.is_none() + } + + /// Get the leaf ID (only valid for leaf nodes). + pub fn leaf_id(&self) -> u32 { + assert!(self.is_leaf(), "leaf_id() called on non-leaf node"); + self.leaf_id + } + + /// Get the spatial extent of this node. + pub fn extent(&self) -> &Rect<f32> { + &self.extent + } + + /// Assign leaf IDs to all leaf nodes in the tree (breadth-first traversal). + pub fn assign_leaf_ids(&mut self) { + let mut next_id = 0; + self.assign_leaf_ids_recursive(&mut next_id); + } + + fn assign_leaf_ids_recursive(&mut self, next_id: &mut u32) { + if self.is_leaf() { + self.leaf_id = *next_id; + *next_id += 1; + } else if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.assign_leaf_ids_recursive(next_id); + } + } + } + + /// Find all leaf nodes that intersect with the given bounding box. + #[cfg(test)] + pub fn find_leaf_nodes(&self, rect: &Rect<f32>, matches: &mut Vec<KDBResult>) { + self.visit_intersecting_leaf_nodes(rect, &mut |kdb| { + matches.push(KDBResult::new(kdb)); + }); + } + + pub fn visit_intersecting_leaf_nodes<'a>( + &'a self, + rect: &Rect<f32>, + f: &mut impl FnMut(&'a KDBTree), + ) { + if !rects_intersect(&self.extent, rect) { + return; + } + + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_intersecting_leaf_nodes(rect, f); + } + } + } + + pub fn visit_leaf_nodes<'a>(&'a self, f: &mut impl FnMut(&'a KDBTree)) { + if self.is_leaf() { + f(self) + } else if let Some(ref children) = self.children { + for child in children.iter() { + child.visit_leaf_nodes(f); + } + } + } + + #[cfg(test)] + pub fn collect_leaf_nodes(&self) -> Vec<&KDBTree> { + let mut leaf_nodes = Vec::new(); + self.visit_leaf_nodes(&mut |kdb| { + leaf_nodes.push(kdb); + }); + leaf_nodes + } + + pub fn num_leaf_nodes(&self) -> usize { + let mut num = 0; + self.visit_leaf_nodes(&mut |_| { + num += 1; + }); + num + } + + /// Drop all stored items to save memory after tree construction. + pub fn drop_elements(&mut self) { + self.items.clear(); + if let Some(ref mut children) = self.children { + for child in children.iter_mut() { + child.drop_elements(); + } + } + } + + /// Split this node along the specified axis. + /// Returns true if the split was successful, false otherwise. + fn split(&mut self, split_x: bool) -> bool { + // Sort items by the appropriate coordinate + if split_x { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } else { + self.items.sort_by(|a, b| { + let a_min = a.min(); + let b_min = b.min(); + a_min + .y + .partial_cmp(&b_min.y) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a_min + .x + .partial_cmp(&b_min.x) + .unwrap_or(std::cmp::Ordering::Equal) + }) + }); + } + + // Find the split coordinate from the middle item + let middle_idx = self.items.len() / 2; + let middle_min = self.items[middle_idx].min(); + + let split_coord = if split_x { middle_min.x } else { middle_min.y }; + + let extent_min = self.extent.min(); + let extent_max = self.extent.max(); + + // Check if split coordinate is valid + if split_x { + if split_coord <= extent_min.x || split_coord >= extent_max.x { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + } else if split_coord <= extent_min.y || split_coord >= extent_max.y { + // Too many objects are crowded at the edge of the extent. Can't split. + return false; + } + + // Create child extents + let (left_extent, right_extent) = if split_x { + let left = Rect::new( + extent_min, + Coord { + x: split_coord, + y: extent_max.y, + }, + ); + let right = Rect::new( + Coord { + x: split_coord, + y: extent_min.y, + }, + extent_max, + ); + (left, right) + } else { + let left = Rect::new( + extent_min, + Coord { + x: extent_max.x, + y: split_coord, + }, + ); + let right = Rect::new( + Coord { + x: extent_min.x, + y: split_coord, + }, + extent_max, + ); + (left, right) + }; + + // Create children + let mut left_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + left_extent, + ); + let mut right_child = KDBTree::new_with_level( + self.max_items_per_node, + self.max_levels, + self.level + 1, + right_extent, + ); + + // Distribute items to children + for item in self.items.drain(..) { + let belongs_to_left = if split_x { + item.min().x <= split_coord + } else { + item.min().y <= split_coord + }; + + if belongs_to_left { + left_child.insert_rect(item); + } else { + right_child.insert_rect(item); + } + } Review Comment: Switched to proposed code for splitting leaf nodes. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
