petern48 commented on code in PR #171:
URL: https://github.com/apache/sedona-db/pull/171#discussion_r2404278128


##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+    buf: &'a [u8],
+    geometry_type_id: Option<GeometryTypeId>,
+    dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+    /// Creates a new [WkbHeader] from a buffer
+    pub fn new(buf: &'a [u8]) -> Result<Self> {
+        Ok(Self {
+            buf,
+            geometry_type_id: None,
+            dimensions: None,
+        })
+    }
+
+    /// Returns the geometry type id of the WKB by only parsing the header 
instead of the entire WKB
+    /// 1 -> Point
+    /// 2 -> LineString
+    /// 3 -> Polygon
+    /// 4 -> MultiPoint
+    /// 5 -> MultiLineString
+    /// 6 -> MultiPolygon
+    /// 7 -> GeometryCollection
+    ///
+    /// Spec: https://libgeos.org/specifications/wkb/
+    pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+        if self.geometry_type_id.is_none() {
+            self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+        }
+        self.geometry_type_id.ok_or_else(|| {
+            DataFusionError::External("Unexpected internal state in 
WkbHeader".into())
+        })
+    }
+
+    /// Returns the dimension of the WKB by only parsing what's minimally 
necessary instead of the entire WKB
+    pub fn dimension(mut self) -> Result<Dimensions> {
+        // Calculate the dimension if we haven't already
+        if self.dimensions.is_none() {
+            self.dimensions = Some(parse_dimension(self.buf)?);
+        }
+        self.dimensions.ok_or_else(|| {
+            DataFusionError::External("Unexpected internal state in 
WkbHeader".into())
+        })
+    }
+}
+
+fn parse_dimension(buf: &[u8]) -> Result<Dimensions> {
+    if buf.len() < 5 {
+        return sedona_internal_err!("Invalid WKB: buffer too small ({} 
bytes)", buf.len());
+    }
+
+    let byte_order = buf[0];
+
+    let code = match byte_order {
+        0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        other => return sedona_internal_err!("Unexpected byte order: {other}"),
+    };
+
+    // 0000 -> xy or unspecified
+    // 1000 -> xyz
+    // 2000 -> xym
+    // 3000 -> xyzm
+    match code / 1000 {
+        // If xy, it's possible we need to infer the dimension
+        0 => {}
+        1 => return Ok(Dimensions::Xyz),
+        2 => return Ok(Dimensions::Xym),
+        3 => return Ok(Dimensions::Xyzm),
+        _ => return sedona_internal_err!("Unexpected code: {code}"),
+    };
+
+    // Try to infer dimension
+    // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code 
4-7), we need to check the dimension of the first geometry
+    if code & 0x7 >= 4 {
+        // The next 4 bytes are the number of geometries in the collection
+        let num_geometries = match byte_order {
+            0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]),
+            1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]),
+            other => return sedona_internal_err!("Unexpected byte order: 
{other}"),
+        };
+        // Check the dimension of the first geometry since they all have to be 
the same dimension
+        // Note: Attempting to create the following geometries error and are 
thus not possible to create:
+        // - Nested geometry dimension doesn't match the **specified** geom 
collection z-dimension
+        //   - GEOMETRYCOLLECTION M (POINT Z (1 1 1))
+        // - Nested geometry doesn't have the specified dimension
+        //   - GEOMETRYCOLLECTION Z (POINT (1 1))
+        // - Nested geometries have different dimensions
+        //   - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1))

Review Comment:
   I think this logic should be kept here, actually. The logic of `st_hasz()` 
is simply to get the dimensionality of the object and see if it has a 
z-dimension. No other special logic. This logic here you're referring to is for 
handling a slight nuance in how SedonaDB converts WKT to WKB. Specifically, it 
translates the following geometry into WKB where the top-most dimension is 
specified as xy, while all of the actual coordinates in the geometry have z 
dimension.
   
   e.g `GEOMETRYCOLLECTION (POINT Z (1 2 3))`
   (I'd expect the same issue with `MULTIPOINT ((1 1 1))` is WKT supported 
parsing it)
   
   I think of these examples as geometries that really are xyz dimension, but 
rely on us to infer the z-dimension.
   
   SedonaDB parses the first example as follows:
   ```sql
   select st_asbinary(st_geomfromtext('geometrycollection (point z (1 2 3))'));
   -- 
01**07000000**0100000001e9030000000000000000f03f00000000000000400000000000000840
   ```
   
   Notably, the top-level dimensionality is simply `7` (xy), whereas we should 
really be interpretting the whole thing as an xyz.
   
   Interestingly, the same query on PostGIS, returns the binary as the 
following where the top-level dimensionality is xyz.
   ```
   
01**ef030000**0100000001e9030000000000000000f03f00000000000000400000000000000840
   ```
   
   I'm not sure if this is a bug in how WKT is translated into WKB, but this 
logic should be necessary to interpret that WKT the same way as PostGIS 
interprets it. We'd want to kept this logic for ST_ZMFlag, for example. Are 
there any concrete functions you can think of where we'd want to take the 
top-level dimension and ignore any potential extra dimensions in the points?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to