This is an automated email from the ASF dual-hosted git repository.
kontinuation pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new f49016d Add an efficient GeometryTypeAndDimensionsSet to improve the
performance of geo statistics collection and ST_Analyze_Aggr (#285)
f49016d is described below
commit f49016d6e99e1e38ee9efac5b451dd399b3cf740
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Sun Nov 9 12:24:29 2025 +0800
Add an efficient GeometryTypeAndDimensionsSet to improve the performance of
geo statistics collection and ST_Analyze_Aggr (#285)
The idea is pretty simple: we use a 32-bit bitset to represent geometry
type and dimensions. This replaces usages of HashSet<GeometryTypeAndDimensions>
in `GeoStatistics` and reduces the overhead of updating geo statistics in
`AnalyzeAccumulator`. We care about the performance of the geo statistics analyzer
because it is applied to all geometries on the indexed side when running a
spatial join. `AnalyzeAccumulator` and ST_Analyze_Aggr can be useful in some
other places as well, so we'd like [...]
Here is the benchmark result of st_analyze_aggr after applying this patch:
```
Gnuplot not found, using plotters backend
native-st_analyze_aggr-Array(Point)
time: [4.1267 ms 4.2026 ms 4.3423 ms]
change: [-87.458% -87.216% -86.808%] (p = 0.00 < 0.05)
Performance has improved.
Found 8 outliers among 100 measurements (8.00%)
6 (6.00%) high mild
2 (2.00%) high severe
native-st_analyze_aggr-Array(LineString(10))
time: [5.6607 ms 5.6728 ms 5.6868 ms]
change: [-83.578% -83.529% -83.482%] (p = 0.00 < 0.05)
Performance has improved.
Found 1 outliers among 100 measurements (1.00%)
1 (1.00%) high severe
```
Co-authored-by: Copilot <[email protected]>
---
rust/sedona-expr/src/statistics.rs | 109 ++++---
rust/sedona-functions/src/st_analyze_aggr.rs | 20 +-
rust/sedona-geometry/src/types.rs | 438 +++++++++++++++++++++++++++
rust/sedona-geoparquet/src/metadata.rs | 18 +-
4 files changed, 511 insertions(+), 74 deletions(-)
diff --git a/rust/sedona-expr/src/statistics.rs
b/rust/sedona-expr/src/statistics.rs
index 37e6e80..eafa3d1 100644
--- a/rust/sedona-expr/src/statistics.rs
+++ b/rust/sedona-expr/src/statistics.rs
@@ -14,11 +14,14 @@
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
-use std::{collections::HashSet, str::FromStr};
+use std::str::FromStr;
use datafusion_common::{stats::Precision, ColumnStatistics, DataFusionError,
Result, ScalarValue};
use sedona_geometry::interval::{Interval, IntervalTrait};
-use sedona_geometry::{bounding_box::BoundingBox,
types::GeometryTypeAndDimensions};
+use sedona_geometry::{
+ bounding_box::BoundingBox,
+ types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet},
+};
use serde::{Deserialize, Serialize};
/// Statistics specific to spatial data types
@@ -33,7 +36,7 @@ use serde::{Deserialize, Serialize};
pub struct GeoStatistics {
// Core spatial statistics for pruning
bbox: Option<BoundingBox>, // The overall bounding box (min/max
coordinates) containing all geometries
- geometry_types: Option<HashSet<GeometryTypeAndDimensions>>, // Set of all
geometry types and dimensions present
+ geometry_types: Option<GeometryTypeAndDimensionsSet>, // Set of all
geometry types and dimensions present
// Extended statistics for analysis
total_geometries: Option<i64>, // Total count of all geometries
@@ -73,16 +76,16 @@ impl GeoStatistics {
pub fn empty() -> Self {
Self {
bbox: Some(BoundingBox::xy(Interval::empty(), Interval::empty())),
- geometry_types: Some(HashSet::new()), // Empty set of geometry
types
- total_geometries: Some(0), // Zero geometries
- total_size_bytes: Some(0), // Zero bytes
- total_points: Some(0), // Zero points
- puntal_count: Some(0), // Zero point geometries
- lineal_count: Some(0), // Zero line geometries
- polygonal_count: Some(0), // Zero polygon geometries
- collection_count: Some(0), // Zero collection geometries
- total_envelope_width: Some(0.0), // Zero width
- total_envelope_height: Some(0.0), // Zero height
+ geometry_types: Some(GeometryTypeAndDimensionsSet::new()), //
Empty set of geometry types
+ total_geometries: Some(0), // Zero
geometries
+ total_size_bytes: Some(0), // Zero
bytes
+ total_points: Some(0), // Zero
points
+ puntal_count: Some(0), // Zero
point geometries
+ lineal_count: Some(0), // Zero
line geometries
+ polygonal_count: Some(0), // Zero
polygon geometries
+ collection_count: Some(0), // Zero
collection geometries
+ total_envelope_width: Some(0.0), // Zero
width
+ total_envelope_height: Some(0.0), // Zero
height
}
}
@@ -92,20 +95,10 @@ impl GeoStatistics {
}
/// Update the geometry types and return self
- pub fn with_geometry_types(self, types:
Option<&[GeometryTypeAndDimensions]>) -> Self {
- match types {
- Some(type_slice) => {
- let type_set: HashSet<GeometryTypeAndDimensions> =
- type_slice.iter().cloned().collect();
- Self {
- geometry_types: Some(type_set),
- ..self
- }
- }
- None => Self {
- geometry_types: None,
- ..self
- },
+ pub fn with_geometry_types(self, types:
Option<GeometryTypeAndDimensionsSet>) -> Self {
+ Self {
+ geometry_types: types,
+ ..self
}
}
@@ -115,7 +108,7 @@ impl GeoStatistics {
}
/// Get the geometry types if available
- pub fn geometry_types(&self) ->
Option<&HashSet<GeometryTypeAndDimensions>> {
+ pub fn geometry_types(&self) -> Option<&GeometryTypeAndDimensionsSet> {
self.geometry_types.as_ref()
}
@@ -290,9 +283,7 @@ impl GeoStatistics {
if let Some(other_types) = &other.geometry_types {
match &mut self.geometry_types {
Some(types) => {
- let mut new_types = types.clone();
- new_types.extend(other_types.iter().cloned());
- self.geometry_types = Some(new_types);
+ types.merge(other_types);
}
None => self.geometry_types = Some(other_types.clone()),
}
@@ -374,13 +365,12 @@ impl GeoStatistics {
pub fn try_with_str_geometry_types(self, geometry_types: Option<&[&str]>)
-> Result<Self> {
match geometry_types {
Some(strings) => {
- let new_geometry_types = strings
- .iter()
- .map(|string| {
- GeometryTypeAndDimensions::from_str(string)
- .map_err(|e|
DataFusionError::External(Box::new(e)))
- })
- .collect::<Result<HashSet<GeometryTypeAndDimensions>>>()?;
+ let mut new_geometry_types =
GeometryTypeAndDimensionsSet::new();
+ for string in strings {
+ let type_and_dim =
GeometryTypeAndDimensions::from_str(string)
+ .map_err(|e| DataFusionError::External(Box::new(e)))?;
+ new_geometry_types.insert_or_ignore(&type_and_dim);
+ }
Ok(Self {
geometry_types: Some(new_geometry_types),
@@ -442,7 +432,10 @@ mod test {
// Test with_bbox
let stats = GeoStatistics::empty().with_bbox(Some(bbox.clone()));
assert_eq!(stats.bbox(), Some(&bbox));
- assert_eq!(stats.geometry_types(), Some(HashSet::new()).as_ref());
+ assert_eq!(
+ stats.geometry_types(),
+ Some(&GeometryTypeAndDimensionsSet::new())
+ );
let regular_stats = stats.to_column_statistics().unwrap();
assert_eq!(
@@ -459,15 +452,17 @@ mod test {
#[test]
fn specified_geometry_types() {
- let type_array = [GeometryTypeAndDimensions::new(
- GeometryTypeId::Polygon,
- Dimensions::Xy,
- )];
+ let mut types = GeometryTypeAndDimensionsSet::new();
+ types
+ .insert(&GeometryTypeAndDimensions::new(
+ GeometryTypeId::Polygon,
+ Dimensions::Xy,
+ ))
+ .unwrap();
// Test with_geometry_types
- let stats =
GeoStatistics::empty().with_geometry_types(Some(&type_array));
- let expected_set: HashSet<GeometryTypeAndDimensions> =
type_array.iter().cloned().collect();
- assert_eq!(stats.geometry_types(), Some(&expected_set));
+ let stats =
GeoStatistics::empty().with_geometry_types(Some(types.clone()));
+ assert_eq!(stats.geometry_types(), Some(&types));
assert_eq!(
stats.bbox(),
Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
@@ -493,15 +488,19 @@ mod test {
.try_with_str_geometry_types(Some(&["polygon", "point"]))
.unwrap();
- let mut expected_types = HashSet::new();
- expected_types.insert(GeometryTypeAndDimensions::new(
- GeometryTypeId::Polygon,
- Dimensions::Xy,
- ));
- expected_types.insert(GeometryTypeAndDimensions::new(
- GeometryTypeId::Point,
- Dimensions::Xy,
- ));
+ let mut expected_types = GeometryTypeAndDimensionsSet::new();
+ expected_types
+ .insert(&GeometryTypeAndDimensions::new(
+ GeometryTypeId::Polygon,
+ Dimensions::Xy,
+ ))
+ .unwrap();
+ expected_types
+ .insert(&GeometryTypeAndDimensions::new(
+ GeometryTypeId::Point,
+ Dimensions::Xy,
+ ))
+ .unwrap();
assert_eq!(stats.geometry_types(), Some(&expected_types));
assert_eq!(
diff --git a/rust/sedona-functions/src/st_analyze_aggr.rs
b/rust/sedona-functions/src/st_analyze_aggr.rs
index f300b07..a3c6d8c 100644
--- a/rust/sedona-functions/src/st_analyze_aggr.rs
+++ b/rust/sedona-functions/src/st_analyze_aggr.rs
@@ -35,7 +35,7 @@ use sedona_expr::aggregate_udf::SedonaAggregateUDF;
use sedona_expr::{aggregate_udf::SedonaAccumulator, statistics::GeoStatistics};
use sedona_geometry::analyze::GeometryAnalysis;
use sedona_geometry::interval::IntervalTrait;
-use sedona_geometry::types::GeometryTypeAndDimensions;
+use sedona_geometry::types::{GeometryTypeAndDimensions,
GeometryTypeAndDimensionsSet};
use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
use wkb::reader::Wkb;
@@ -353,18 +353,15 @@ impl AnalyzeAccumulator {
let current_types = stats.geometry_types();
let types = if let Some(existing_types) = current_types {
let mut new_types = existing_types.clone();
- new_types.insert(geometry_type);
+ new_types.insert_or_ignore(&geometry_type);
Some(new_types)
} else {
- Some(std::collections::HashSet::from([geometry_type]))
+ let mut new_set = GeometryTypeAndDimensionsSet::new();
+ new_set.insert_or_ignore(&geometry_type);
+ Some(new_set)
};
- if let Some(type_set) = &types {
- let type_vec: Vec<GeometryTypeAndDimensions> =
type_set.iter().cloned().collect();
- stats.with_geometry_types(Some(&type_vec))
- } else {
- stats.with_geometry_types(None)
- }
+ stats.with_geometry_types(types)
}
fn execute_update(&mut self, executor: WkbExecutor) -> Result<()> {
@@ -414,9 +411,10 @@ impl Accumulator for AnalyzeAccumulator {
// Add approximate size for geometry types if present
let types_size = match self.stats.geometry_types() {
Some(types) => {
+ // GeometryTypeAndDimensionsSet is a u32 bitset
let elem_size = size_of::<GeometryTypeAndDimensions>();
- let capacity = types.capacity();
- capacity * elem_size
+ let count = types.size();
+ count * elem_size
}
None => 0,
};
diff --git a/rust/sedona-geometry/src/types.rs
b/rust/sedona-geometry/src/types.rs
index 26e853c..0042b19 100644
--- a/rust/sedona-geometry/src/types.rs
+++ b/rust/sedona-geometry/src/types.rs
@@ -292,6 +292,198 @@ impl FromStr for GeometryTypeAndDimensions {
}
}
+/// A set containing [`GeometryTypeAndDimensions`] values
+///
+/// This set is conceptually similar to `HashSet<GeometryTypeAndDimensions>`
but
+/// uses a compact bitset representation for efficiency.
+///
+/// This set only supports the standard dimensions: XY, XYZ, XYM, and XYZM.
+/// Unknown dimensions (other than these four standard types) are not supported
+/// and will be rejected by [`insert`](Self::insert) or silently ignored by
+/// [`insert_or_ignore`](Self::insert_or_ignore).
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+pub struct GeometryTypeAndDimensionsSet {
+ /// Bitset encoding geometry types and dimensions.
+ ///
+ /// Uses bits 0-31 where each geometry type's WKB ID (0-7) is encoded
+ /// at different offsets based on dimensions:
+ /// - XY: bits 0-7
+ /// - XYZ: bits 8-15
+ /// - XYM: bits 16-23
+ /// - XYZM: bits 24-31
+ types: u32,
+}
+
+impl GeometryTypeAndDimensionsSet {
+ #[inline]
+ pub fn new() -> Self {
+ Self { types: 0 }
+ }
+
+ /// Insert a geometry type and dimensions into the set.
+ ///
+ /// Returns an error if the dimensions are unknown (not one of XY, XYZ,
XYM, or XYZM).
+ /// Only the standard four dimension types are supported; attempting to
insert
+ /// a geometry with `Dimensions::Unknown(_)` will result in an error.
+ #[inline]
+ pub fn insert(
+ &mut self,
+ type_and_dim: &GeometryTypeAndDimensions,
+ ) -> Result<(), SedonaGeometryError> {
+ if let Dimensions::Unknown(n) = type_and_dim.dimensions() {
+ return Err(SedonaGeometryError::Invalid(format!(
+ "Unknown dimensions {} in
GeometryTypeAndDimensionsSet::insert",
+ n
+ )));
+ }
+ self.insert_or_ignore(type_and_dim);
+ Ok(())
+ }
+
+ /// Insert a geometry type and dimensions into the set, ignoring unknown
dimensions.
+ ///
+ /// If the dimensions are unknown (not one of XY, XYZ, XYM, or XYZM), this
method
+ /// silently ignores the insertion without returning an error. This is
useful when
+ /// processing data that may contain unsupported dimension types that
should be
+ /// skipped rather than causing an error.
+ #[inline]
+ pub fn insert_or_ignore(&mut self, type_and_dim:
&GeometryTypeAndDimensions) {
+ let geom_shift = type_and_dim.geometry_type().wkb_id();
+ // WKB ID must be < 8 to fit in the bitset layout (8 bits per
dimension)
+ if geom_shift >= 8 {
+ panic!(
+ "Invalid geometry type wkb_id {} in
GeometryTypeAndDimensionsSet::insert_or_ignore",
+ geom_shift
+ );
+ }
+ let dim_shift = match type_and_dim.dimensions() {
+ geo_traits::Dimensions::Unknown(_) => {
+ // Ignore unknown dimensions
+ return;
+ }
+ geo_traits::Dimensions::Xy => 0,
+ geo_traits::Dimensions::Xyz => 8,
+ geo_traits::Dimensions::Xym => 16,
+ geo_traits::Dimensions::Xyzm => 24,
+ };
+ let bit_position = geom_shift + dim_shift;
+ self.types |= 1 << bit_position;
+ }
+
+ /// Merge the given set into this set.
+ #[inline]
+ pub fn merge(&mut self, other: &Self) {
+ self.types |= other.types;
+ }
+
+ /// Returns `true` if the set contains no geometry types.
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.types == 0
+ }
+
+ /// Returns the number of geometry types in the set.
+ #[inline]
+ pub fn size(&self) -> usize {
+ self.types.count_ones() as usize
+ }
+
+ /// Clears the set, removing all geometry types.
+ #[inline]
+ pub fn clear(&mut self) {
+ self.types = 0;
+ }
+
+ /// Returns an iterator over the geometry types in the set.
+ pub fn iter(&self) -> GeometryTypeSetIter {
+ GeometryTypeSetIter {
+ types: self.types,
+ current_bit: 0,
+ }
+ }
+}
+
+/// Iterator over [`GeometryTypeAndDimensions`] values in a
[`GeometryTypeAndDimensionsSet`]
+pub struct GeometryTypeSetIter {
+ types: u32,
+ current_bit: u32,
+}
+
+impl Iterator for GeometryTypeSetIter {
+ type Item = GeometryTypeAndDimensions;
+
+ fn next(&mut self) -> Option<Self::Item> {
+ // Find the next set bit
+ while self.current_bit < 32 {
+ let bit = self.current_bit;
+ self.current_bit += 1;
+
+ if (self.types & (1 << bit)) != 0 {
+ // Decode the bit position into geometry type and dimensions
+ let dim_shift = (bit / 8) * 8;
+ let geom_shift = bit % 8;
+ let dimensions = match dim_shift {
+ 0 => Dimensions::Xy,
+ 8 => Dimensions::Xyz,
+ 16 => Dimensions::Xym,
+ 24 => Dimensions::Xyzm,
+ _ => panic!(
+ "Invalid dimension bits at position {} in
GeometryTypeAndDimensionsSet",
+ bit
+ ),
+ };
+
+ let geometry_type = GeometryTypeId::try_from_wkb_id(geom_shift)
+ .expect("Invalid geometry type wkb_id in
GeometryTypeAndDimensionsSet");
+
+ return Some(GeometryTypeAndDimensions::new(geometry_type,
dimensions));
+ }
+ }
+
+ None
+ }
+}
+
+impl IntoIterator for &GeometryTypeAndDimensionsSet {
+ type Item = GeometryTypeAndDimensions;
+ type IntoIter = GeometryTypeSetIter;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+// Serialize as a Vec to maintain compatibility with HashSet JSON format
+impl Serialize for GeometryTypeAndDimensionsSet {
+ fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+ where
+ S: serde::Serializer,
+ {
+ use serde::ser::SerializeSeq; // codespell:ignore ser
+ let mut seq = serializer.serialize_seq(Some(self.size()))?;
+ for item in self.iter() {
+ seq.serialize_element(&item)?;
+ }
+ seq.end()
+ }
+}
+
+// Deserialize from a Vec (which is what HashSet was serialized as)
+impl<'de> Deserialize<'de> for GeometryTypeAndDimensionsSet {
+ fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+ where
+ D: serde::Deserializer<'de>,
+ {
+ use serde::de::Error;
+ let items: Vec<GeometryTypeAndDimensions> =
Vec::deserialize(deserializer)?;
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ for item in items {
+ set.insert(&item).map_err(D::Error::custom)?;
+ }
+ Ok(set)
+ }
+}
+
#[cfg(test)]
mod test {
use super::*;
@@ -432,4 +624,250 @@ mod test {
value
);
}
+
+ #[test]
+ fn geometry_type_set_new_is_empty() {
+ let set = GeometryTypeAndDimensionsSet::new();
+ assert!(set.is_empty());
+ assert_eq!(set.size(), 0);
+ assert_eq!(set.iter().count(), 0);
+ }
+
+ #[test]
+ fn geometry_type_set_insert_single() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+
+ set.insert(&point_xy).unwrap();
+ assert!(!set.is_empty());
+ assert_eq!(set.size(), 1);
+
+ let items: Vec<_> = set.iter().collect();
+ assert_eq!(items.len(), 1);
+ assert_eq!(items[0], point_xy);
+ }
+
+ #[test]
+ fn geometry_type_set_insert_duplicate() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+
+ set.insert(&point_xy).unwrap();
+ set.insert(&point_xy).unwrap();
+ set.insert(&point_xy).unwrap();
+
+ assert_eq!(set.size(), 1);
+ let items: Vec<_> = set.iter().collect();
+ assert_eq!(items.len(), 1);
+ assert_eq!(items[0], point_xy);
+ }
+
+ #[test]
+ fn geometry_type_set_insert_all_types() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+
+ // Insert all geometry types with XY dimension
+ for geom_type in [
+ Geometry,
+ Point,
+ LineString,
+ Polygon,
+ MultiPoint,
+ MultiLineString,
+ MultiPolygon,
+ GeometryCollection,
+ ] {
+ set.insert(&GeometryTypeAndDimensions::new(geom_type, Xy))
+ .unwrap();
+ }
+
+ assert_eq!(set.size(), 8);
+ let items: Vec<_> = set.iter().collect();
+ assert_eq!(items.len(), 8);
+ }
+
+ #[test]
+ fn geometry_type_set_insert_unknown_dimension() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ let point_unknown = GeometryTypeAndDimensions::new(Point,
Dimensions::Unknown(2));
+
+ let result = set.insert(&point_unknown);
+
+ // Unknown dimensions should return an error
+ assert!(result.is_err());
+ assert_eq!(
+ result.unwrap_err().to_string(),
+ "Unknown dimensions 2 in GeometryTypeAndDimensionsSet::insert"
+ );
+ assert!(set.is_empty());
+ }
+
+ #[test]
+ fn geometry_type_set_clear() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+ let linestring_xyz = GeometryTypeAndDimensions::new(LineString, Xyz);
+
+ set.insert(&point_xy).unwrap();
+ set.insert(&linestring_xyz).unwrap();
+ assert!(!set.is_empty());
+ assert_eq!(set.size(), 2);
+
+ set.clear();
+ assert!(set.is_empty());
+ assert_eq!(set.size(), 0);
+ assert_eq!(set.iter().count(), 0);
+ }
+
+ #[test]
+ fn geometry_type_set_merge() {
+ let mut set1 = GeometryTypeAndDimensionsSet::new();
+ let mut set2 = GeometryTypeAndDimensionsSet::new();
+
+ let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+ let linestring_xy = GeometryTypeAndDimensions::new(LineString, Xy);
+ let polygon_xyz = GeometryTypeAndDimensions::new(Polygon, Xyz);
+
+ set1.insert(&point_xy).unwrap();
+ set1.insert(&linestring_xy).unwrap();
+
+ set2.insert(&linestring_xy).unwrap(); // Duplicate
+ set2.insert(&polygon_xyz).unwrap();
+
+ set1.merge(&set2);
+
+ assert_eq!(set1.size(), 3);
+ let items: Vec<_> = set1.iter().collect();
+ assert_eq!(items.len(), 3);
+ assert!(items.contains(&point_xy));
+ assert!(items.contains(&linestring_xy));
+ assert!(items.contains(&polygon_xyz));
+ }
+
+ #[test]
+ fn geometry_type_set_comprehensive() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+
+ // Add a mix of geometry types and dimensions
+ let test_types = vec![
+ GeometryTypeAndDimensions::new(Geometry, Xy),
+ GeometryTypeAndDimensions::new(Point, Xy),
+ GeometryTypeAndDimensions::new(LineString, Xyz),
+ GeometryTypeAndDimensions::new(Polygon, Xym),
+ GeometryTypeAndDimensions::new(MultiPoint, Xyzm),
+ GeometryTypeAndDimensions::new(MultiLineString, Xy),
+ GeometryTypeAndDimensions::new(MultiPolygon, Xyz),
+ GeometryTypeAndDimensions::new(GeometryCollection, Xym),
+ GeometryTypeAndDimensions::new(GeometryCollection, Xyzm),
+ ];
+
+ for type_and_dim in &test_types {
+ set.insert(type_and_dim).unwrap();
+ }
+
+ assert_eq!(set.size(), test_types.len());
+ let items: Vec<_> = set.iter().collect();
+ assert_eq!(items.len(), test_types.len());
+
+ for type_and_dim in &test_types {
+ assert!(items.contains(type_and_dim));
+ }
+ }
+
+ #[test]
+ fn geometry_type_set_serde_empty() {
+ let set = GeometryTypeAndDimensionsSet::new();
+
+ // Serialize
+ let json = serde_json::to_string(&set).unwrap();
+ assert_eq!(json, "[]");
+
+ // Deserialize
+ let deserialized: GeometryTypeAndDimensionsSet =
serde_json::from_str(&json).unwrap();
+ assert!(deserialized.is_empty());
+ assert_eq!(deserialized.size(), 0);
+ }
+
+ #[test]
+ fn geometry_type_set_serde_single() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+ let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+ set.insert(&point_xy).unwrap();
+
+ // Serialize
+ let json = serde_json::to_string(&set).unwrap();
+ assert_eq!(json, "[\"Point\"]");
+
+ // Deserialize
+ let deserialized: GeometryTypeAndDimensionsSet =
serde_json::from_str(&json).unwrap();
+ assert_eq!(deserialized.size(), 1);
+ let items: Vec<_> = deserialized.iter().collect();
+ assert_eq!(items[0], point_xy);
+ }
+
+ #[test]
+ fn geometry_type_set_serde_multiple() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+
+ let test_types = vec![
+ GeometryTypeAndDimensions::new(Point, Xy),
+ GeometryTypeAndDimensions::new(LineString, Xyz),
+ GeometryTypeAndDimensions::new(Polygon, Xyzm),
+ ];
+
+ for type_and_dim in &test_types {
+ set.insert(type_and_dim).unwrap();
+ }
+
+ // Serialize
+ let json = serde_json::to_string(&set).unwrap();
+ assert_eq!(json, "[\"Point\",\"LineString Z\",\"Polygon ZM\"]");
+
+ // Deserialize
+ let deserialized: GeometryTypeAndDimensionsSet =
serde_json::from_str(&json).unwrap();
+ assert_eq!(deserialized.size(), test_types.len());
+
+ let items: Vec<_> = deserialized.iter().collect();
+ for type_and_dim in &test_types {
+ assert!(items.contains(type_and_dim));
+ }
+ }
+
+ #[test]
+ fn geometry_type_set_serde_roundtrip() {
+ let mut set = GeometryTypeAndDimensionsSet::new();
+
+ // Add all combinations of one geometry type with different dimensions
+ set.insert(&GeometryTypeAndDimensions::new(Point, Xy))
+ .unwrap();
+ set.insert(&GeometryTypeAndDimensions::new(Point, Xyz))
+ .unwrap();
+ set.insert(&GeometryTypeAndDimensions::new(Point, Xym))
+ .unwrap();
+ set.insert(&GeometryTypeAndDimensions::new(Point, Xyzm))
+ .unwrap();
+ set.insert(&GeometryTypeAndDimensions::new(LineString, Xy))
+ .unwrap();
+
+ // Serialize to JSON
+ let json = serde_json::to_string(&set).unwrap();
+ assert_eq!(
+ json,
+ "[\"Point\",\"LineString\",\"Point Z\",\"Point M\",\"Point ZM\"]"
+ );
+
+ // Deserialize back
+ let deserialized: GeometryTypeAndDimensionsSet =
serde_json::from_str(&json).unwrap();
+
+ // Verify the deserialized set matches the original
+ assert_eq!(set.size(), deserialized.size());
+
+ let original_items: Vec<_> = set.iter().collect();
+ let deserialized_items: Vec<_> = deserialized.iter().collect();
+
+ assert_eq!(original_items.len(), deserialized_items.len());
+ for item in &original_items {
+ assert!(deserialized_items.contains(item));
+ }
+ }
}
diff --git a/rust/sedona-geoparquet/src/metadata.rs
b/rust/sedona-geoparquet/src/metadata.rs
index 09caac7..aac33cc 100644
--- a/rust/sedona-geoparquet/src/metadata.rs
+++ b/rust/sedona-geoparquet/src/metadata.rs
@@ -26,8 +26,8 @@ use parquet::file::metadata::ParquetMetaData;
use sedona_expr::statistics::GeoStatistics;
use sedona_geometry::bounding_box::BoundingBox;
use sedona_geometry::interval::{Interval, IntervalTrait};
-use sedona_geometry::types::GeometryTypeAndDimensions;
-use std::collections::{HashMap, HashSet};
+use sedona_geometry::types::GeometryTypeAndDimensionsSet;
+use std::collections::HashMap;
use std::fmt::Display;
use std::fmt::Write;
@@ -329,7 +329,7 @@ pub struct GeoParquetColumnMetadata {
/// and multipolygons, it is not sufficient to specify `["MultiPolygon"]`,
but it is expected
/// to specify `["Polygon", "MultiPolygon"]`. Or if having 3D points, it
is not sufficient to
/// specify `["Point"]`, but it is expected to list `["Point Z"]`.
- pub geometry_types: HashSet<GeometryTypeAndDimensions>,
+ pub geometry_types: GeometryTypeAndDimensionsSet,
/// [PROJJSON](https://proj.org/specifications/projjson.html) object
representing the
/// Coordinate Reference System (CRS) of the geometry. If the field is not
provided, the
@@ -414,7 +414,10 @@ impl GeoParquetMetadata {
column_meta.geometry_types.clear();
} else {
for item in &other_column_meta.geometry_types {
- column_meta.geometry_types.insert(*item);
+ column_meta
+ .geometry_types
+ .insert(&item)
+ .map_err(|e| DataFusionError::External(Box::new(e)))?;
}
}
}
@@ -517,8 +520,7 @@ impl GeoParquetColumnMetadata {
if self.geometry_types.is_empty() {
stats
} else {
- let geometry_types =
self.geometry_types.iter().cloned().collect::<Vec<_>>();
- stats.with_geometry_types(Some(&geometry_types))
+ stats.with_geometry_types(Some(self.geometry_types.clone()))
}
}
@@ -543,7 +545,7 @@ impl GeoParquetColumnMetadata {
#[cfg(test)]
mod test {
use geo_traits::Dimensions;
- use sedona_geometry::types::GeometryTypeId;
+ use sedona_geometry::types::{GeometryTypeAndDimensions, GeometryTypeId};
use super::*;
@@ -560,7 +562,7 @@ mod test {
assert_eq!(meta.encoding, GeoParquetColumnEncoding::WKB);
assert_eq!(
meta.geometry_types.iter().next().unwrap(),
- &GeometryTypeAndDimensions::new(GeometryTypeId::Point,
Dimensions::Xy)
+ GeometryTypeAndDimensions::new(GeometryTypeId::Point,
Dimensions::Xy)
);
}
}