This is an automated email from the ASF dual-hosted git repository.
kontinuation pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 54ee94e5 chore: Make KnnComponents field optional in SpatialIndex
(#437)
54ee94e5 is described below
commit 54ee94e50a72e60193c5ecc3d765161e4c557cc2
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Fri Dec 12 12:15:44 2025 +0800
chore: Make KnnComponents field optional in SpatialIndex (#437)
`KnnComponents` is only used when running KNN join (spatial predicate is
`SpatialPredicate::KNearestNeighbors(...)`), and it reserves lots of memory for
cached geometry objects. Currently the KnnComponents is always initialized even
when running range join or distance join. This patch changes `knn_components`
to be optional and only present when running KNN join.
---
.../sedona-spatial-join/src/index/spatial_index.rs | 99 ++++++++++++++--------
.../src/index/spatial_index_builder.rs | 8 +-
2 files changed, 72 insertions(+), 35 deletions(-)
diff --git a/rust/sedona-spatial-join/src/index/spatial_index.rs
b/rust/sedona-spatial-join/src/index/spatial_index.rs
index d70f2a63..0733571b 100644
--- a/rust/sedona-spatial-join/src/index/spatial_index.rs
+++ b/rust/sedona-spatial-join/src/index/spatial_index.rs
@@ -46,7 +46,7 @@ use crate::{
utils::concurrent_reservation::ConcurrentReservation,
};
use arrow::array::BooleanBufferBuilder;
-use sedona_common::{option::SpatialJoinOptions, ExecutionMode};
+use sedona_common::{option::SpatialJoinOptions, sedona_internal_err,
ExecutionMode};
pub struct SpatialIndex {
pub(crate) schema: SchemaRef,
@@ -89,7 +89,7 @@ pub struct SpatialIndex {
pub(crate) probe_threads_counter: AtomicUsize,
/// Shared KNN components (distance metrics and geometry cache) for
efficient KNN queries
- pub(crate) knn_components: KnnComponents,
+ pub(crate) knn_components: Option<KnnComponents>,
/// Memory reservation for tracking the memory usage of the spatial index
/// Cleared on `SpatialIndex` drop
@@ -117,6 +117,8 @@ impl SpatialIndex {
let refiner_reservation = reservation.split(0);
let refiner_reservation = ConcurrentReservation::try_new(0,
refiner_reservation).unwrap();
let rtree = RTreeBuilder::<f32>::new(0).finish::<HilbertSort>();
+ let knn_components = matches!(spatial_predicate,
SpatialPredicate::KNearestNeighbors(_))
+ .then(|| KnnComponents::new(0, &[], memory_pool.clone()).unwrap());
Self {
schema,
evaluator,
@@ -128,7 +130,7 @@ impl SpatialIndex {
geom_idx_vec: Vec::new(),
visited_left_side: None,
probe_threads_counter,
- knn_components: KnnComponents::new(0, &[],
memory_pool.clone()).unwrap(), // Empty index has no cache
+ knn_components,
reservation,
}
}
@@ -138,12 +140,15 @@ impl SpatialIndex {
}
/// Create a KNN geometry accessor for accessing geometries with caching
- fn create_knn_accessor(&self) -> SedonaKnnAdapter<'_> {
- SedonaKnnAdapter::new(
+ fn create_knn_accessor(&self) -> Result<SedonaKnnAdapter<'_>> {
+ let Some(knn_components) = self.knn_components.as_ref() else {
+ return sedona_internal_err!("knn_components is not initialized
when running KNN join");
+ };
+ Ok(SedonaKnnAdapter::new(
&self.indexed_batches,
&self.data_id_to_batch_pos,
- &self.knn_components,
- )
+ knn_components,
+ ))
}
/// Get the batch at the given index.
@@ -249,14 +254,21 @@ impl SpatialIndex {
};
// Select the appropriate distance metric
- let distance_metric: &dyn DistanceMetric<f32> = if use_spheroid {
- &self.knn_components.haversine_metric
- } else {
- &self.knn_components.euclidean_metric
+ let distance_metric: &dyn DistanceMetric<f32> = {
+ let Some(knn_components) = self.knn_components.as_ref() else {
+ return sedona_internal_err!(
+ "knn_components is not initialized when running KNN join"
+ );
+ };
+ if use_spheroid {
+ &knn_components.haversine_metric
+ } else {
+ &knn_components.euclidean_metric
+ }
};
// Create geometry accessor for on-demand WKB decoding and caching
- let geometry_accessor = self.create_knn_accessor();
+ let geometry_accessor = self.create_knn_accessor()?;
// Use neighbors_geometry to find k nearest neighbors
let initial_results = self.rtree.neighbors_geometry(
@@ -486,12 +498,13 @@ mod tests {
use crate::{
index::{SpatialIndexBuilder, SpatialJoinBuildMetrics},
operand_evaluator::EvaluatedGeometryArray,
- spatial_predicate::{RelationPredicate, SpatialRelationType},
+ spatial_predicate::{KNNPredicate, RelationPredicate,
SpatialRelationType},
};
use super::*;
use arrow_array::RecordBatch;
use arrow_schema::{DataType, Field};
+ use datafusion_common::JoinSide;
use datafusion_execution::memory_pool::GreedyMemoryPool;
use datafusion_expr::JoinType;
use datafusion_physical_expr::expressions::Column;
@@ -597,10 +610,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
// Create sample geometry data - points at known locations
@@ -694,10 +709,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
@@ -783,10 +800,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ true,
+ JoinSide::Left,
));
let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
@@ -878,10 +897,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
@@ -970,10 +991,12 @@ mod tests {
let metrics = SpatialJoinBuildMetrics::default();
let schema = Arc::new(arrow_schema::Schema::empty());
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
let builder = SpatialIndexBuilder::new(
@@ -1015,10 +1038,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
@@ -1128,10 +1153,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
// Create sample geometry data - points at known locations
@@ -1214,10 +1241,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
// Create different geometry types
@@ -1296,10 +1325,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
let schema = Arc::new(arrow_schema::Schema::new(vec![Field::new(
@@ -1390,10 +1421,12 @@ mod tests {
};
let metrics = SpatialJoinBuildMetrics::default();
- let spatial_predicate =
SpatialPredicate::Relation(RelationPredicate::new(
+ let spatial_predicate =
SpatialPredicate::KNearestNeighbors(KNNPredicate::new(
Arc::new(Column::new("geom", 0)),
Arc::new(Column::new("geom", 1)),
- SpatialRelationType::Intersects,
+ 5,
+ false,
+ JoinSide::Left,
));
// Create geometry batch using the same pattern as other tests
diff --git a/rust/sedona-spatial-join/src/index/spatial_index_builder.rs
b/rust/sedona-spatial-join/src/index/spatial_index_builder.rs
index 0ab90322..abbe60a3 100644
--- a/rust/sedona-spatial-join/src/index/spatial_index_builder.rs
+++ b/rust/sedona-spatial-join/src/index/spatial_index_builder.rs
@@ -264,8 +264,12 @@ impl SpatialIndexBuilder {
.unwrap();
let cache_size = batch_pos_vec.len();
- let knn_components =
- KnnComponents::new(cache_size, &self.indexed_batches,
self.memory_pool.clone())?;
+ let knn_components = matches!(
+ self.spatial_predicate,
+ SpatialPredicate::KNearestNeighbors(_)
+ )
+ .then(|| KnnComponents::new(cache_size, &self.indexed_batches,
self.memory_pool.clone()))
+ .transpose()?;
Ok(SpatialIndex {
schema: self.schema,