petern48 commented on code in PR #171:
URL: https://github.com/apache/sedona-db/pull/171#discussion_r2471762144


##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,1016 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use geo_traits::Dimensions;
+
+use crate::error::SedonaGeometryError;
+use crate::types::GeometryTypeId;
+
+const Z_FLAG_BIT: u32 = 0x80000000;
+const M_FLAG_BIT: u32 = 0x40000000;
+const SRID_FLAG_BIT: u32 = 0x20000000;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+#[derive(Debug)]
+pub struct WkbHeader {
+    geometry_type: u32,
+    // Not applicable for a point
+    // number of points for a linestring
+    // number of rings for a polygon
+    // number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, 
or GEOMETRYCOLLECTION
+    size: u32,
+    // SRID if given buffer was EWKB. Otherwise, 0.
+    srid: u32,
+    // First x,y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if 
empty
+    first_xy: (f64, f64),
+    // Dimensions of the first nested geometry of a collection or None if empty
+    // For POINT, LINESTRING, POLYGON, returns the dimensions of the geometry
+    first_geom_dimensions: Option<Dimensions>,
+}
+
+impl WkbHeader {
+    /// Creates a new [WkbHeader] from a buffer
+    pub fn try_new(buf: &[u8]) -> Result<Self, SedonaGeometryError> {
+        let mut wkb_buffer = WkbBuffer::new(buf);
+
+        wkb_buffer.read_endian()?;
+
+        let geometry_type = wkb_buffer.read_u32()?;
+
+        let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 
0x7)?;
+
+        let mut srid = 0;
+        // if EWKB
+        if geometry_type & SRID_FLAG_BIT != 0 {
+            srid = wkb_buffer.read_u32()?;
+        }
+
+        let size = if geometry_type_id == GeometryTypeId::Point {
+            // Dummy value for a point
+            1
+        } else {
+            wkb_buffer.read_u32()?
+        };
+
+        // Default values for empty geometries
+        let first_x;
+        let first_y;
+        let first_geom_dimensions: Option<Dimensions>;
+
+        wkb_buffer.set_offset(0);
+
+        let first_geom_idx = wkb_buffer.first_geom_idx()?;
+        if let Some(i) = first_geom_idx {
+            // Reset to first_geom_idx and parse the dimensions
+            wkb_buffer.set_offset(i);
+            // Parse dimension
+            wkb_buffer.read_endian()?;
+            let code = wkb_buffer.read_u32()?;
+            first_geom_dimensions = Some(calc_dimensions(code)?);
+
+            // For first_xy_coord, we need to pass the buffer starting from 
the geometry header
+            wkb_buffer.set_offset(i);
+            (first_x, first_y) = wkb_buffer.first_xy_coord()?;
+        } else {
+            first_geom_dimensions = None;
+            first_x = f64::NAN;
+            first_y = f64::NAN;
+        }
+
+        Ok(Self {
+            geometry_type,
+            srid,
+            size,
+            first_xy: (first_x, first_y),
+            first_geom_dimensions,
+        })
+    }
+
+    /// Returns the geometry type id of the WKB by only parsing the header 
instead of the entire WKB
+    /// 1 -> Point
+    /// 2 -> LineString
+    /// 3 -> Polygon
+    /// 4 -> MultiPoint
+    /// 5 -> MultiLineString
+    /// 6 -> MultiPolygon
+    /// 7 -> GeometryCollection
+    ///
+    /// Spec: https://libgeos.org/specifications/wkb/
+    pub fn geometry_type_id(&self) -> Result<GeometryTypeId, 
SedonaGeometryError> {
+        // Only low 3 bits is for the base type, high bits include additional 
info
+        let code = self.geometry_type & 0x7;
+
+        let geometry_type_id = GeometryTypeId::try_from_wkb_id(code)?;
+
+        Ok(geometry_type_id)
+    }
+
+    /// Returns the size of the geometry
+    /// Not applicable for a point
+    /// Number of points for a linestring
+    /// Number of rings for a polygon
+    /// Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, 
or GEOMETRYCOLLECTION
+    pub fn size(&self) -> u32 {
+        self.size
+    }
+
+    /// Returns the SRID if given buffer was EWKB. Otherwise, 0.
+    pub fn srid(&self) -> u32 {
+        self.srid
+    }
+
+    /// Returns the first x, y coordinates for a point. Otherwise (f64::NAN, 
f64::NAN) if empty
+    pub fn first_xy(&self) -> (f64, f64) {
+        self.first_xy
+    }
+
+    /// Returns the top-level dimension of the WKB
+    pub fn dimensions(&self) -> Result<Dimensions, SedonaGeometryError> {
+        calc_dimensions(self.geometry_type)
+    }
+
+    /// Returns the dimensions of the first coordinate of the geometry
+    pub fn first_geom_dimensions(&self) -> Option<Dimensions> {
+        self.first_geom_dimensions
+    }
+}
+
+// A helper struct for calculating the WKBHeader
+struct WkbBuffer<'a> {
+    buf: &'a [u8],
+    offset: usize,
+    remaining: usize,
+    last_endian: u8,
+}
+
+impl<'a> WkbBuffer<'a> {
+    fn new(buf: &'a [u8]) -> Self {
+        Self {
+            buf,
+            offset: 0,
+            remaining: buf.len(),
+            last_endian: 0,
+        }
+    }
+
+    // For MULITPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION, 
returns the index to the first nested
+    // non-collection geometry (POINT, LINESTRING, or POLYGON), or None if 
empty
+    // For POINT, LINESTRING, POLYGON, returns 0 as it already is a 
non-collection geometry
+    fn first_geom_idx(&mut self) -> Result<Option<usize>, SedonaGeometryError> 
{
+        self.read_endian()?;
+        let geometry_type = self.read_u32()?;
+        let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 
0x7)?;
+
+        match geometry_type_id {
+            GeometryTypeId::Point | GeometryTypeId::LineString | 
GeometryTypeId::Polygon => {
+                Ok(Some(0))
+            }
+            GeometryTypeId::MultiPoint
+            | GeometryTypeId::MultiLineString
+            | GeometryTypeId::MultiPolygon
+            | GeometryTypeId::GeometryCollection => {
+                if geometry_type & SRID_FLAG_BIT != 0 {
+                    // Skip the SRID
+                    self.read_u32()?;
+                }
+
+                let num_geometries = self.read_u32()?;
+
+                if num_geometries == 0 {
+                    return Ok(None);
+                }
+
+                // Recursive call to get the first geom of the first nested 
geometry
+                // Add to current offset of i
+                let mut nested_buffer = 
WkbBuffer::new(&self.buf[self.offset..]);
+                let off = nested_buffer.first_geom_idx()?;
+                if let Some(off) = off {
+                    Ok(Some(self.offset + off))
+                } else {
+                    Ok(None)
+                }
+            }
+            _ => Err(SedonaGeometryError::Invalid(format!(
+                "Unexpected geometry type: {geometry_type_id:?}"
+            ))),
+        }
+    }
+
+    // Given a point, linestring, or polygon, return the first xy coordinate
+    // If the geometry, is empty, (NaN, NaN) is returned
+    fn first_xy_coord(&mut self) -> Result<(f64, f64), SedonaGeometryError> {
+        self.read_endian()?;
+        let geometry_type = self.read_u32()?;
+
+        let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 
0x7)?;
+
+        // Skip the SRID if it's present
+        if geometry_type & SRID_FLAG_BIT != 0 {
+            self.read_u32()?;
+        }
+
+        if matches!(
+            geometry_type_id,
+            GeometryTypeId::LineString | GeometryTypeId::Polygon
+        ) {
+            let size = self.read_u32()?;
+
+            // (NaN, NaN) for empty geometries
+            if size == 0 {
+                return Ok((f64::NAN, f64::NAN));
+            }
+
+            // For POLYGON, after the number of rings, the next 4 bytes are the
+            // number of points in the exterior ring. We must skip that count 
to
+            // land on the first coordinate's x value.
+            if geometry_type_id == GeometryTypeId::Polygon {
+                let ring0_num_points = self.read_u32()?;
+
+                // (NaN, NaN) for empty first ring
+                if ring0_num_points == 0 {
+                    return Ok((f64::NAN, f64::NAN));
+                }
+            }
+        }
+
+        let x = self.read_coord()?;
+        let y = self.read_coord()?;
+        Ok((x, y))
+    }
+
+    fn read_endian(&mut self) -> Result<(), SedonaGeometryError> {
+        if self.remaining < 1 {
+            return Err(SedonaGeometryError::Invalid(format!(
+                "Invalid WKB: buffer too small. At offset: {}. Need 1 byte.",
+                self.offset
+            )));
+        }
+        self.last_endian = self.buf[self.offset];
+        self.remaining -= 1;
+        self.offset += 1;
+        Ok(())
+    }
+
+    fn read_u32(&mut self) -> Result<u32, SedonaGeometryError> {
+        if self.remaining < 4 {
+            return Err(SedonaGeometryError::Invalid(format!(
+                "Invalid WKB: buffer too small. At offset: {}. Need 4 bytes.",
+                self.offset
+            )));
+        }
+
+        let off = self.offset;
+        let num = match self.last_endian {
+            0 => u32::from_be_bytes([
+                self.buf[off],
+                self.buf[off + 1],
+                self.buf[off + 2],
+                self.buf[off + 3],
+            ]),
+            1 => u32::from_le_bytes([
+                self.buf[off],
+                self.buf[off + 1],
+                self.buf[off + 2],
+                self.buf[off + 3],
+            ]),
+            other => {
+                return Err(SedonaGeometryError::Invalid(format!(
+                    "Unexpected byte order: {other:?}"
+                )))
+            }
+        };
+        self.remaining -= 4;
+        self.offset += 4;
+        Ok(num)
+    }
+
+    // Given a buffer starting at the coordinate itself, parse the x and y 
coordinates
+    fn read_coord(&mut self) -> Result<f64, SedonaGeometryError> {
+        if self.remaining < 8 {
+            return Err(SedonaGeometryError::Invalid(format!(
+                "Invalid WKB: buffer too small. At offset: {}. Need 8 bytes.",
+                self.offset
+            )));
+        }
+
+        let buf = &self.buf;
+        let off = self.offset;
+        let coord: f64 = match self.last_endian {
+            0 => f64::from_be_bytes([

Review Comment:
   I see what you mean, but I think it would look / feel weird to pull it out 
prematurely. The loop doesn't exist yet (I assume you're talking about a loop 
iterating over the coords, bc I'm not seeing any loop in the existing code). If 
I'm understanding you right. It wouldn't make a difference now performance-wise 
wise since we're only reading one xy coord. I'd rather leave it like this for 
now, and pull it out if / when we read more coords.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to