Kontinuation commented on code in PR #53:
URL: https://github.com/apache/sedona-db/pull/53#discussion_r2338854370
##########
rust/sedona-spatial-join/src/index.rs:
##########
@@ -235,25 +235,76 @@ impl SpatialIndexBuilder {
geom_idx_vec
}
- /// Build cached geometries and positions for KNN queries to avoid
repeated WKB conversions
- fn build_cached_geometries(
- indexed_batches: &[IndexedBatch],
- ) -> (Vec<Geometry<f64>>, Vec<(i32, i32)>) {
+ /// Build cached geometries for KNN queries to avoid repeated WKB
conversions
+ fn build_cached_geometries(indexed_batches: &[IndexedBatch]) ->
Vec<Geometry<f64>> {
let mut geometries = Vec::new();
- let mut geometry_positions = Vec::new();
- for (batch_idx, indexed_batch) in indexed_batches.iter().enumerate() {
- for (row_idx, wkb_opt) in
indexed_batch.geom_array.wkbs().iter().enumerate() {
+ for indexed_batch in indexed_batches.iter() {
+ for wkb_opt in indexed_batch.geom_array.wkbs().iter() {
if let Some(wkb) = wkb_opt.as_ref() {
if let Ok(geom) = item_to_geometry(wkb) {
geometries.push(geom);
- geometry_positions.push((batch_idx as i32, row_idx as
i32));
}
}
}
}
- (geometries, geometry_positions)
+ geometries
+ }
+
+ /// Estimate the memory usage of cached geometries for memory reservation
+ /// This provides a rough approximation of the memory used by
geo::Geometry objects
+ fn estimate_geometry_memory(geometries: &[Geometry<f64>]) -> usize {
+ // Rough estimation based on geometry types:
+ // - Each Point: ~32 bytes (2 f64s + overhead)
+ // - Each LineString: base size + (num_coords * 16 bytes)
+ // - Each Polygon: base size + exterior ring + interior rings
+ // - Add some overhead for Vec storage and enum variants
+
+ const BASE_GEOMETRY_SIZE: usize = 64; // Base size for geometry enum +
overhead
+ const POINT_SIZE: usize = 16; // 2 f64s (x, y)
+ const COORDINATE_SIZE: usize = 16; // 2 f64s per coordinate
+
+ geometries
+ .iter()
+ .map(|geom| {
+ BASE_GEOMETRY_SIZE
+ + match geom {
+ Geometry::Point(_) => POINT_SIZE,
+ Geometry::LineString(ls) => ls.coords().count() *
COORDINATE_SIZE,
+ Geometry::Polygon(poly) => {
+ let exterior_coords =
+ poly.exterior().coords().count() *
COORDINATE_SIZE;
+ let interior_coords: usize = poly
+ .interiors()
+ .iter()
+ .map(|ring| ring.coords().count() *
COORDINATE_SIZE)
+ .sum();
+ exterior_coords + interior_coords
+ }
+ Geometry::MultiPoint(mp) => mp.0.len() * POINT_SIZE,
+ Geometry::MultiLineString(mls) => mls
+ .0
+ .iter()
+ .map(|ls| ls.coords().count() * COORDINATE_SIZE)
+ .sum::<usize>(),
+ Geometry::MultiPolygon(mp) => mp
+ .0
+ .iter()
+ .map(|poly| {
+ let exterior =
poly.exterior().coords().count() * COORDINATE_SIZE;
+ let interiors: usize = poly
+ .interiors()
+ .iter()
+ .map(|ring| ring.coords().count() *
COORDINATE_SIZE)
+ .sum();
+ exterior + interiors
+ })
+ .sum::<usize>(),
+ _ => 256, // Conservative estimate for other geometry
types
+ }
+ })
+ .sum()
}
Review Comment:
This is overly complicated, yet less accurate than directly using the length
of the WKB buffer. I'd rather just take the length of the WKB buffer and add
or multiply by some overhead factor.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]