This is an automated email from the ASF dual-hosted git repository.

kontinuation pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git


The following commit(s) were added to refs/heads/main by this push:
     new f49016d  Add an efficient GeometryTypeAndDimensionsSet to improve the 
performance of geo statistics collection and ST_Analyze_Aggr (#285)
f49016d is described below

commit f49016d6e99e1e38ee9efac5b451dd399b3cf740
Author: Kristin Cowalcijk <[email protected]>
AuthorDate: Sun Nov 9 12:24:29 2025 +0800

    Add an efficient GeometryTypeAndDimensionsSet to improve the performance of 
geo statistics collection and ST_Analyze_Aggr (#285)
    
    The idea is pretty simple: we use a 32-bit bitset to represent geometry 
type and dimensions. This replaces usages of HashSet<GeometryTypeAndDimensions> 
in `GeoStatistics` and reduces the overhead of updating geo statistics in 
`AnalyzeAccumulator`. We care about the performance of the geo statistics analyzer 
because it is applied to all geometries on the indexed side when running a 
spatial join. `AnalyzeAccumulator` and ST_Analyze_Aggr can be useful in some 
other places as well, so we'd like  [...]
    
    Here is the benchmark result of st_analyze_aggr after applying this patch:
    
    ```
    Gnuplot not found, using plotters backend
    native-st_analyze_aggr-Array(Point)
                            time:   [4.1267 ms 4.2026 ms 4.3423 ms]
                            change: [-87.458% -87.216% -86.808%] (p = 0.00 < 
0.05)
                            Performance has improved.
    Found 8 outliers among 100 measurements (8.00%)
      6 (6.00%) high mild
      2 (2.00%) high severe
    
    native-st_analyze_aggr-Array(LineString(10))
                            time:   [5.6607 ms 5.6728 ms 5.6868 ms]
                            change: [-83.578% -83.529% -83.482%] (p = 0.00 < 
0.05)
                            Performance has improved.
    Found 1 outliers among 100 measurements (1.00%)
      1 (1.00%) high severe
    ```
    
    Co-authored-by: Copilot <[email protected]>
---
 rust/sedona-expr/src/statistics.rs           | 109 ++++---
 rust/sedona-functions/src/st_analyze_aggr.rs |  20 +-
 rust/sedona-geometry/src/types.rs            | 438 +++++++++++++++++++++++++++
 rust/sedona-geoparquet/src/metadata.rs       |  18 +-
 4 files changed, 511 insertions(+), 74 deletions(-)

diff --git a/rust/sedona-expr/src/statistics.rs 
b/rust/sedona-expr/src/statistics.rs
index 37e6e80..eafa3d1 100644
--- a/rust/sedona-expr/src/statistics.rs
+++ b/rust/sedona-expr/src/statistics.rs
@@ -14,11 +14,14 @@
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.
-use std::{collections::HashSet, str::FromStr};
+use std::str::FromStr;
 
 use datafusion_common::{stats::Precision, ColumnStatistics, DataFusionError, 
Result, ScalarValue};
 use sedona_geometry::interval::{Interval, IntervalTrait};
-use sedona_geometry::{bounding_box::BoundingBox, 
types::GeometryTypeAndDimensions};
+use sedona_geometry::{
+    bounding_box::BoundingBox,
+    types::{GeometryTypeAndDimensions, GeometryTypeAndDimensionsSet},
+};
 use serde::{Deserialize, Serialize};
 
 /// Statistics specific to spatial data types
@@ -33,7 +36,7 @@ use serde::{Deserialize, Serialize};
 pub struct GeoStatistics {
     // Core spatial statistics for pruning
     bbox: Option<BoundingBox>, // The overall bounding box (min/max 
coordinates) containing all geometries
-    geometry_types: Option<HashSet<GeometryTypeAndDimensions>>, // Set of all 
geometry types and dimensions present
+    geometry_types: Option<GeometryTypeAndDimensionsSet>, // Set of all 
geometry types and dimensions present
 
     // Extended statistics for analysis
     total_geometries: Option<i64>, // Total count of all geometries
@@ -73,16 +76,16 @@ impl GeoStatistics {
     pub fn empty() -> Self {
         Self {
             bbox: Some(BoundingBox::xy(Interval::empty(), Interval::empty())),
-            geometry_types: Some(HashSet::new()), // Empty set of geometry 
types
-            total_geometries: Some(0),            // Zero geometries
-            total_size_bytes: Some(0),            // Zero bytes
-            total_points: Some(0),                // Zero points
-            puntal_count: Some(0),                // Zero point geometries
-            lineal_count: Some(0),                // Zero line geometries
-            polygonal_count: Some(0),             // Zero polygon geometries
-            collection_count: Some(0),            // Zero collection geometries
-            total_envelope_width: Some(0.0),      // Zero width
-            total_envelope_height: Some(0.0),     // Zero height
+            geometry_types: Some(GeometryTypeAndDimensionsSet::new()), // 
Empty set of geometry types
+            total_geometries: Some(0),                                 // Zero 
geometries
+            total_size_bytes: Some(0),                                 // Zero 
bytes
+            total_points: Some(0),                                     // Zero 
points
+            puntal_count: Some(0),                                     // Zero 
point geometries
+            lineal_count: Some(0),                                     // Zero 
line geometries
+            polygonal_count: Some(0),                                  // Zero 
polygon geometries
+            collection_count: Some(0),                                 // Zero 
collection geometries
+            total_envelope_width: Some(0.0),                           // Zero 
width
+            total_envelope_height: Some(0.0),                          // Zero 
height
         }
     }
 
@@ -92,20 +95,10 @@ impl GeoStatistics {
     }
 
     /// Update the geometry types and return self
-    pub fn with_geometry_types(self, types: 
Option<&[GeometryTypeAndDimensions]>) -> Self {
-        match types {
-            Some(type_slice) => {
-                let type_set: HashSet<GeometryTypeAndDimensions> =
-                    type_slice.iter().cloned().collect();
-                Self {
-                    geometry_types: Some(type_set),
-                    ..self
-                }
-            }
-            None => Self {
-                geometry_types: None,
-                ..self
-            },
+    pub fn with_geometry_types(self, types: 
Option<GeometryTypeAndDimensionsSet>) -> Self {
+        Self {
+            geometry_types: types,
+            ..self
         }
     }
 
@@ -115,7 +108,7 @@ impl GeoStatistics {
     }
 
     /// Get the geometry types if available
-    pub fn geometry_types(&self) -> 
Option<&HashSet<GeometryTypeAndDimensions>> {
+    pub fn geometry_types(&self) -> Option<&GeometryTypeAndDimensionsSet> {
         self.geometry_types.as_ref()
     }
 
@@ -290,9 +283,7 @@ impl GeoStatistics {
         if let Some(other_types) = &other.geometry_types {
             match &mut self.geometry_types {
                 Some(types) => {
-                    let mut new_types = types.clone();
-                    new_types.extend(other_types.iter().cloned());
-                    self.geometry_types = Some(new_types);
+                    types.merge(other_types);
                 }
                 None => self.geometry_types = Some(other_types.clone()),
             }
@@ -374,13 +365,12 @@ impl GeoStatistics {
     pub fn try_with_str_geometry_types(self, geometry_types: Option<&[&str]>) 
-> Result<Self> {
         match geometry_types {
             Some(strings) => {
-                let new_geometry_types = strings
-                    .iter()
-                    .map(|string| {
-                        GeometryTypeAndDimensions::from_str(string)
-                            .map_err(|e| 
DataFusionError::External(Box::new(e)))
-                    })
-                    .collect::<Result<HashSet<GeometryTypeAndDimensions>>>()?;
+                let mut new_geometry_types = 
GeometryTypeAndDimensionsSet::new();
+                for string in strings {
+                    let type_and_dim = 
GeometryTypeAndDimensions::from_str(string)
+                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
+                    new_geometry_types.insert_or_ignore(&type_and_dim);
+                }
 
                 Ok(Self {
                     geometry_types: Some(new_geometry_types),
@@ -442,7 +432,10 @@ mod test {
         // Test with_bbox
         let stats = GeoStatistics::empty().with_bbox(Some(bbox.clone()));
         assert_eq!(stats.bbox(), Some(&bbox));
-        assert_eq!(stats.geometry_types(), Some(HashSet::new()).as_ref());
+        assert_eq!(
+            stats.geometry_types(),
+            Some(&GeometryTypeAndDimensionsSet::new())
+        );
 
         let regular_stats = stats.to_column_statistics().unwrap();
         assert_eq!(
@@ -459,15 +452,17 @@ mod test {
 
     #[test]
     fn specified_geometry_types() {
-        let type_array = [GeometryTypeAndDimensions::new(
-            GeometryTypeId::Polygon,
-            Dimensions::Xy,
-        )];
+        let mut types = GeometryTypeAndDimensionsSet::new();
+        types
+            .insert(&GeometryTypeAndDimensions::new(
+                GeometryTypeId::Polygon,
+                Dimensions::Xy,
+            ))
+            .unwrap();
 
         // Test with_geometry_types
-        let stats = 
GeoStatistics::empty().with_geometry_types(Some(&type_array));
-        let expected_set: HashSet<GeometryTypeAndDimensions> = 
type_array.iter().cloned().collect();
-        assert_eq!(stats.geometry_types(), Some(&expected_set));
+        let stats = 
GeoStatistics::empty().with_geometry_types(Some(types.clone()));
+        assert_eq!(stats.geometry_types(), Some(&types));
         assert_eq!(
             stats.bbox(),
             Some(&BoundingBox::xy(Interval::empty(), Interval::empty()))
@@ -493,15 +488,19 @@ mod test {
             .try_with_str_geometry_types(Some(&["polygon", "point"]))
             .unwrap();
 
-        let mut expected_types = HashSet::new();
-        expected_types.insert(GeometryTypeAndDimensions::new(
-            GeometryTypeId::Polygon,
-            Dimensions::Xy,
-        ));
-        expected_types.insert(GeometryTypeAndDimensions::new(
-            GeometryTypeId::Point,
-            Dimensions::Xy,
-        ));
+        let mut expected_types = GeometryTypeAndDimensionsSet::new();
+        expected_types
+            .insert(&GeometryTypeAndDimensions::new(
+                GeometryTypeId::Polygon,
+                Dimensions::Xy,
+            ))
+            .unwrap();
+        expected_types
+            .insert(&GeometryTypeAndDimensions::new(
+                GeometryTypeId::Point,
+                Dimensions::Xy,
+            ))
+            .unwrap();
 
         assert_eq!(stats.geometry_types(), Some(&expected_types));
         assert_eq!(
diff --git a/rust/sedona-functions/src/st_analyze_aggr.rs 
b/rust/sedona-functions/src/st_analyze_aggr.rs
index f300b07..a3c6d8c 100644
--- a/rust/sedona-functions/src/st_analyze_aggr.rs
+++ b/rust/sedona-functions/src/st_analyze_aggr.rs
@@ -35,7 +35,7 @@ use sedona_expr::aggregate_udf::SedonaAggregateUDF;
 use sedona_expr::{aggregate_udf::SedonaAccumulator, statistics::GeoStatistics};
 use sedona_geometry::analyze::GeometryAnalysis;
 use sedona_geometry::interval::IntervalTrait;
-use sedona_geometry::types::GeometryTypeAndDimensions;
+use sedona_geometry::types::{GeometryTypeAndDimensions, 
GeometryTypeAndDimensionsSet};
 use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
 use wkb::reader::Wkb;
 
@@ -353,18 +353,15 @@ impl AnalyzeAccumulator {
         let current_types = stats.geometry_types();
         let types = if let Some(existing_types) = current_types {
             let mut new_types = existing_types.clone();
-            new_types.insert(geometry_type);
+            new_types.insert_or_ignore(&geometry_type);
             Some(new_types)
         } else {
-            Some(std::collections::HashSet::from([geometry_type]))
+            let mut new_set = GeometryTypeAndDimensionsSet::new();
+            new_set.insert_or_ignore(&geometry_type);
+            Some(new_set)
         };
 
-        if let Some(type_set) = &types {
-            let type_vec: Vec<GeometryTypeAndDimensions> = 
type_set.iter().cloned().collect();
-            stats.with_geometry_types(Some(&type_vec))
-        } else {
-            stats.with_geometry_types(None)
-        }
+        stats.with_geometry_types(types)
     }
 
     fn execute_update(&mut self, executor: WkbExecutor) -> Result<()> {
@@ -414,9 +411,10 @@ impl Accumulator for AnalyzeAccumulator {
         // Add approximate size for geometry types if present
         let types_size = match self.stats.geometry_types() {
             Some(types) => {
+                // GeometryTypeAndDimensionsSet is a u32 bitset
                 let elem_size = size_of::<GeometryTypeAndDimensions>();
-                let capacity = types.capacity();
-                capacity * elem_size
+                let count = types.size();
+                count * elem_size
             }
             None => 0,
         };
diff --git a/rust/sedona-geometry/src/types.rs 
b/rust/sedona-geometry/src/types.rs
index 26e853c..0042b19 100644
--- a/rust/sedona-geometry/src/types.rs
+++ b/rust/sedona-geometry/src/types.rs
@@ -292,6 +292,198 @@ impl FromStr for GeometryTypeAndDimensions {
     }
 }
 
+/// A set containing [`GeometryTypeAndDimensions`] values
+///
+/// This set is conceptually similar to `HashSet<GeometryTypeAndDimensions>` 
but
+/// uses a compact bitset representation for efficiency.
+///
+/// This set only supports the standard dimensions: XY, XYZ, XYM, and XYZM.
+/// Unknown dimensions (other than these four standard types) are not supported
+/// and will be rejected by [`insert`](Self::insert) or silently ignored by
+/// [`insert_or_ignore`](Self::insert_or_ignore).
+#[derive(Clone, Debug, Default, PartialEq, Eq)]
+pub struct GeometryTypeAndDimensionsSet {
+    /// Bitset encoding geometry types and dimensions.
+    ///
+    /// Uses bits 0-31 where each geometry type's WKB ID (0-7) is encoded
+    /// at different offsets based on dimensions:
+    /// - XY: bits 0-7
+    /// - XYZ: bits 8-15
+    /// - XYM: bits 16-23
+    /// - XYZM: bits 24-31
+    types: u32,
+}
+
+impl GeometryTypeAndDimensionsSet {
+    #[inline]
+    pub fn new() -> Self {
+        Self { types: 0 }
+    }
+
+    /// Insert a geometry type and dimensions into the set.
+    ///
+    /// Returns an error if the dimensions are unknown (not one of XY, XYZ, 
XYM, or XYZM).
+    /// Only the standard four dimension types are supported; attempting to 
insert
+    /// a geometry with `Dimensions::Unknown(_)` will result in an error.
+    #[inline]
+    pub fn insert(
+        &mut self,
+        type_and_dim: &GeometryTypeAndDimensions,
+    ) -> Result<(), SedonaGeometryError> {
+        if let Dimensions::Unknown(n) = type_and_dim.dimensions() {
+            return Err(SedonaGeometryError::Invalid(format!(
+                "Unknown dimensions {} in 
GeometryTypeAndDimensionsSet::insert",
+                n
+            )));
+        }
+        self.insert_or_ignore(type_and_dim);
+        Ok(())
+    }
+
+    /// Insert a geometry type and dimensions into the set, ignoring unknown 
dimensions.
+    ///
+    /// If the dimensions are unknown (not one of XY, XYZ, XYM, or XYZM), this 
method
+    /// silently ignores the insertion without returning an error. This is 
useful when
+    /// processing data that may contain unsupported dimension types that 
should be
+    /// skipped rather than causing an error.
+    #[inline]
+    pub fn insert_or_ignore(&mut self, type_and_dim: 
&GeometryTypeAndDimensions) {
+        let geom_shift = type_and_dim.geometry_type().wkb_id();
+        // WKB ID must be < 8 to fit in the bitset layout (8 bits per 
dimension)
+        if geom_shift >= 8 {
+            panic!(
+                "Invalid geometry type wkb_id {} in 
GeometryTypeAndDimensionsSet::insert_or_ignore",
+                geom_shift
+            );
+        }
+        let dim_shift = match type_and_dim.dimensions() {
+            geo_traits::Dimensions::Unknown(_) => {
+                // Ignore unknown dimensions
+                return;
+            }
+            geo_traits::Dimensions::Xy => 0,
+            geo_traits::Dimensions::Xyz => 8,
+            geo_traits::Dimensions::Xym => 16,
+            geo_traits::Dimensions::Xyzm => 24,
+        };
+        let bit_position = geom_shift + dim_shift;
+        self.types |= 1 << bit_position;
+    }
+
+    /// Merge the given set into this set.
+    #[inline]
+    pub fn merge(&mut self, other: &Self) {
+        self.types |= other.types;
+    }
+
+    /// Returns `true` if the set contains no geometry types.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.types == 0
+    }
+
+    /// Returns the number of geometry types in the set.
+    #[inline]
+    pub fn size(&self) -> usize {
+        self.types.count_ones() as usize
+    }
+
+    /// Clears the set, removing all geometry types.
+    #[inline]
+    pub fn clear(&mut self) {
+        self.types = 0;
+    }
+
+    /// Returns an iterator over the geometry types in the set.
+    pub fn iter(&self) -> GeometryTypeSetIter {
+        GeometryTypeSetIter {
+            types: self.types,
+            current_bit: 0,
+        }
+    }
+}
+
+/// Iterator over [`GeometryTypeAndDimensions`] values in a 
[`GeometryTypeAndDimensionsSet`]
+pub struct GeometryTypeSetIter {
+    types: u32,
+    current_bit: u32,
+}
+
+impl Iterator for GeometryTypeSetIter {
+    type Item = GeometryTypeAndDimensions;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // Find the next set bit
+        while self.current_bit < 32 {
+            let bit = self.current_bit;
+            self.current_bit += 1;
+
+            if (self.types & (1 << bit)) != 0 {
+                // Decode the bit position into geometry type and dimensions
+                let dim_shift = (bit / 8) * 8;
+                let geom_shift = bit % 8;
+                let dimensions = match dim_shift {
+                    0 => Dimensions::Xy,
+                    8 => Dimensions::Xyz,
+                    16 => Dimensions::Xym,
+                    24 => Dimensions::Xyzm,
+                    _ => panic!(
+                        "Invalid dimension bits at position {} in 
GeometryTypeAndDimensionsSet",
+                        bit
+                    ),
+                };
+
+                let geometry_type = GeometryTypeId::try_from_wkb_id(geom_shift)
+                    .expect("Invalid geometry type wkb_id in 
GeometryTypeAndDimensionsSet");
+
+                return Some(GeometryTypeAndDimensions::new(geometry_type, 
dimensions));
+            }
+        }
+
+        None
+    }
+}
+
+impl IntoIterator for &GeometryTypeAndDimensionsSet {
+    type Item = GeometryTypeAndDimensions;
+    type IntoIter = GeometryTypeSetIter;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+// Serialize as a Vec to maintain compatibility with HashSet JSON format
+impl Serialize for GeometryTypeAndDimensionsSet {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        use serde::ser::SerializeSeq; // codespell:ignore ser
+        let mut seq = serializer.serialize_seq(Some(self.size()))?;
+        for item in self.iter() {
+            seq.serialize_element(&item)?;
+        }
+        seq.end()
+    }
+}
+
+// Deserialize from a Vec (which is what HashSet was serialized as)
+impl<'de> Deserialize<'de> for GeometryTypeAndDimensionsSet {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        use serde::de::Error;
+        let items: Vec<GeometryTypeAndDimensions> = 
Vec::deserialize(deserializer)?;
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        for item in items {
+            set.insert(&item).map_err(D::Error::custom)?;
+        }
+        Ok(set)
+    }
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -432,4 +624,250 @@ mod test {
             value
         );
     }
+
+    #[test]
+    fn geometry_type_set_new_is_empty() {
+        let set = GeometryTypeAndDimensionsSet::new();
+        assert!(set.is_empty());
+        assert_eq!(set.size(), 0);
+        assert_eq!(set.iter().count(), 0);
+    }
+
+    #[test]
+    fn geometry_type_set_insert_single() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+
+        set.insert(&point_xy).unwrap();
+        assert!(!set.is_empty());
+        assert_eq!(set.size(), 1);
+
+        let items: Vec<_> = set.iter().collect();
+        assert_eq!(items.len(), 1);
+        assert_eq!(items[0], point_xy);
+    }
+
+    #[test]
+    fn geometry_type_set_insert_duplicate() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+
+        set.insert(&point_xy).unwrap();
+        set.insert(&point_xy).unwrap();
+        set.insert(&point_xy).unwrap();
+
+        assert_eq!(set.size(), 1);
+        let items: Vec<_> = set.iter().collect();
+        assert_eq!(items.len(), 1);
+        assert_eq!(items[0], point_xy);
+    }
+
+    #[test]
+    fn geometry_type_set_insert_all_types() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+
+        // Insert all geometry types with XY dimension
+        for geom_type in [
+            Geometry,
+            Point,
+            LineString,
+            Polygon,
+            MultiPoint,
+            MultiLineString,
+            MultiPolygon,
+            GeometryCollection,
+        ] {
+            set.insert(&GeometryTypeAndDimensions::new(geom_type, Xy))
+                .unwrap();
+        }
+
+        assert_eq!(set.size(), 8);
+        let items: Vec<_> = set.iter().collect();
+        assert_eq!(items.len(), 8);
+    }
+
+    #[test]
+    fn geometry_type_set_insert_unknown_dimension() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        let point_unknown = GeometryTypeAndDimensions::new(Point, 
Dimensions::Unknown(2));
+
+        let result = set.insert(&point_unknown);
+
+        // Unknown dimensions should return an error
+        assert!(result.is_err());
+        assert_eq!(
+            result.unwrap_err().to_string(),
+            "Unknown dimensions 2 in GeometryTypeAndDimensionsSet::insert"
+        );
+        assert!(set.is_empty());
+    }
+
+    #[test]
+    fn geometry_type_set_clear() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+        let linestring_xyz = GeometryTypeAndDimensions::new(LineString, Xyz);
+
+        set.insert(&point_xy).unwrap();
+        set.insert(&linestring_xyz).unwrap();
+        assert!(!set.is_empty());
+        assert_eq!(set.size(), 2);
+
+        set.clear();
+        assert!(set.is_empty());
+        assert_eq!(set.size(), 0);
+        assert_eq!(set.iter().count(), 0);
+    }
+
+    #[test]
+    fn geometry_type_set_merge() {
+        let mut set1 = GeometryTypeAndDimensionsSet::new();
+        let mut set2 = GeometryTypeAndDimensionsSet::new();
+
+        let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+        let linestring_xy = GeometryTypeAndDimensions::new(LineString, Xy);
+        let polygon_xyz = GeometryTypeAndDimensions::new(Polygon, Xyz);
+
+        set1.insert(&point_xy).unwrap();
+        set1.insert(&linestring_xy).unwrap();
+
+        set2.insert(&linestring_xy).unwrap(); // Duplicate
+        set2.insert(&polygon_xyz).unwrap();
+
+        set1.merge(&set2);
+
+        assert_eq!(set1.size(), 3);
+        let items: Vec<_> = set1.iter().collect();
+        assert_eq!(items.len(), 3);
+        assert!(items.contains(&point_xy));
+        assert!(items.contains(&linestring_xy));
+        assert!(items.contains(&polygon_xyz));
+    }
+
+    #[test]
+    fn geometry_type_set_comprehensive() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+
+        // Add a mix of geometry types and dimensions
+        let test_types = vec![
+            GeometryTypeAndDimensions::new(Geometry, Xy),
+            GeometryTypeAndDimensions::new(Point, Xy),
+            GeometryTypeAndDimensions::new(LineString, Xyz),
+            GeometryTypeAndDimensions::new(Polygon, Xym),
+            GeometryTypeAndDimensions::new(MultiPoint, Xyzm),
+            GeometryTypeAndDimensions::new(MultiLineString, Xy),
+            GeometryTypeAndDimensions::new(MultiPolygon, Xyz),
+            GeometryTypeAndDimensions::new(GeometryCollection, Xym),
+            GeometryTypeAndDimensions::new(GeometryCollection, Xyzm),
+        ];
+
+        for type_and_dim in &test_types {
+            set.insert(type_and_dim).unwrap();
+        }
+
+        assert_eq!(set.size(), test_types.len());
+        let items: Vec<_> = set.iter().collect();
+        assert_eq!(items.len(), test_types.len());
+
+        for type_and_dim in &test_types {
+            assert!(items.contains(type_and_dim));
+        }
+    }
+
+    #[test]
+    fn geometry_type_set_serde_empty() {
+        let set = GeometryTypeAndDimensionsSet::new();
+
+        // Serialize
+        let json = serde_json::to_string(&set).unwrap();
+        assert_eq!(json, "[]");
+
+        // Deserialize
+        let deserialized: GeometryTypeAndDimensionsSet = 
serde_json::from_str(&json).unwrap();
+        assert!(deserialized.is_empty());
+        assert_eq!(deserialized.size(), 0);
+    }
+
+    #[test]
+    fn geometry_type_set_serde_single() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+        let point_xy = GeometryTypeAndDimensions::new(Point, Xy);
+        set.insert(&point_xy).unwrap();
+
+        // Serialize
+        let json = serde_json::to_string(&set).unwrap();
+        assert_eq!(json, "[\"Point\"]");
+
+        // Deserialize
+        let deserialized: GeometryTypeAndDimensionsSet = 
serde_json::from_str(&json).unwrap();
+        assert_eq!(deserialized.size(), 1);
+        let items: Vec<_> = deserialized.iter().collect();
+        assert_eq!(items[0], point_xy);
+    }
+
+    #[test]
+    fn geometry_type_set_serde_multiple() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+
+        let test_types = vec![
+            GeometryTypeAndDimensions::new(Point, Xy),
+            GeometryTypeAndDimensions::new(LineString, Xyz),
+            GeometryTypeAndDimensions::new(Polygon, Xyzm),
+        ];
+
+        for type_and_dim in &test_types {
+            set.insert(type_and_dim).unwrap();
+        }
+
+        // Serialize
+        let json = serde_json::to_string(&set).unwrap();
+        assert_eq!(json, "[\"Point\",\"LineString Z\",\"Polygon ZM\"]");
+
+        // Deserialize
+        let deserialized: GeometryTypeAndDimensionsSet = 
serde_json::from_str(&json).unwrap();
+        assert_eq!(deserialized.size(), test_types.len());
+
+        let items: Vec<_> = deserialized.iter().collect();
+        for type_and_dim in &test_types {
+            assert!(items.contains(type_and_dim));
+        }
+    }
+
+    #[test]
+    fn geometry_type_set_serde_roundtrip() {
+        let mut set = GeometryTypeAndDimensionsSet::new();
+
+        // Add all combinations of one geometry type with different dimensions
+        set.insert(&GeometryTypeAndDimensions::new(Point, Xy))
+            .unwrap();
+        set.insert(&GeometryTypeAndDimensions::new(Point, Xyz))
+            .unwrap();
+        set.insert(&GeometryTypeAndDimensions::new(Point, Xym))
+            .unwrap();
+        set.insert(&GeometryTypeAndDimensions::new(Point, Xyzm))
+            .unwrap();
+        set.insert(&GeometryTypeAndDimensions::new(LineString, Xy))
+            .unwrap();
+
+        // Serialize to JSON
+        let json = serde_json::to_string(&set).unwrap();
+        assert_eq!(
+            json,
+            "[\"Point\",\"LineString\",\"Point Z\",\"Point M\",\"Point ZM\"]"
+        );
+
+        // Deserialize back
+        let deserialized: GeometryTypeAndDimensionsSet = 
serde_json::from_str(&json).unwrap();
+
+        // Verify the deserialized set matches the original
+        assert_eq!(set.size(), deserialized.size());
+
+        let original_items: Vec<_> = set.iter().collect();
+        let deserialized_items: Vec<_> = deserialized.iter().collect();
+
+        assert_eq!(original_items.len(), deserialized_items.len());
+        for item in &original_items {
+            assert!(deserialized_items.contains(item));
+        }
+    }
 }
diff --git a/rust/sedona-geoparquet/src/metadata.rs 
b/rust/sedona-geoparquet/src/metadata.rs
index 09caac7..aac33cc 100644
--- a/rust/sedona-geoparquet/src/metadata.rs
+++ b/rust/sedona-geoparquet/src/metadata.rs
@@ -26,8 +26,8 @@ use parquet::file::metadata::ParquetMetaData;
 use sedona_expr::statistics::GeoStatistics;
 use sedona_geometry::bounding_box::BoundingBox;
 use sedona_geometry::interval::{Interval, IntervalTrait};
-use sedona_geometry::types::GeometryTypeAndDimensions;
-use std::collections::{HashMap, HashSet};
+use sedona_geometry::types::GeometryTypeAndDimensionsSet;
+use std::collections::HashMap;
 use std::fmt::Display;
 use std::fmt::Write;
 
@@ -329,7 +329,7 @@ pub struct GeoParquetColumnMetadata {
     /// and multipolygons, it is not sufficient to specify `["MultiPolygon"]`, 
but it is expected
     /// to specify `["Polygon", "MultiPolygon"]`. Or if having 3D points, it 
is not sufficient to
     /// specify `["Point"]`, but it is expected to list `["Point Z"]`.
-    pub geometry_types: HashSet<GeometryTypeAndDimensions>,
+    pub geometry_types: GeometryTypeAndDimensionsSet,
 
     /// [PROJJSON](https://proj.org/specifications/projjson.html) object 
representing the
     /// Coordinate Reference System (CRS) of the geometry. If the field is not 
provided, the
@@ -414,7 +414,10 @@ impl GeoParquetMetadata {
                 column_meta.geometry_types.clear();
             } else {
                 for item in &other_column_meta.geometry_types {
-                    column_meta.geometry_types.insert(*item);
+                    column_meta
+                        .geometry_types
+                        .insert(&item)
+                        .map_err(|e| DataFusionError::External(Box::new(e)))?;
                 }
             }
         }
@@ -517,8 +520,7 @@ impl GeoParquetColumnMetadata {
         if self.geometry_types.is_empty() {
             stats
         } else {
-            let geometry_types = 
self.geometry_types.iter().cloned().collect::<Vec<_>>();
-            stats.with_geometry_types(Some(&geometry_types))
+            stats.with_geometry_types(Some(self.geometry_types.clone()))
         }
     }
 
@@ -543,7 +545,7 @@ impl GeoParquetColumnMetadata {
 #[cfg(test)]
 mod test {
     use geo_traits::Dimensions;
-    use sedona_geometry::types::GeometryTypeId;
+    use sedona_geometry::types::{GeometryTypeAndDimensions, GeometryTypeId};
 
     use super::*;
 
@@ -560,7 +562,7 @@ mod test {
         assert_eq!(meta.encoding, GeoParquetColumnEncoding::WKB);
         assert_eq!(
             meta.geometry_types.iter().next().unwrap(),
-            &GeometryTypeAndDimensions::new(GeometryTypeId::Point, 
Dimensions::Xy)
+            GeometryTypeAndDimensions::new(GeometryTypeId::Point, 
Dimensions::Xy)
         );
     }
 }

Reply via email to