petern48 commented on code in PR #171: URL: https://github.com/apache/sedona-db/pull/171#discussion_r2404289847
########## rust/sedona-geometry/src/wkb_header.rs: ########## @@ -0,0 +1,363 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::GeometryTypeId; +use datafusion_common::error::{DataFusionError, Result}; +use geo_traits::Dimensions; +use sedona_common::sedona_internal_err; + +/// Fast-path WKB header parser +/// Performs operations lazily and caches them after the first computation +pub struct WkbHeader<'a> { + buf: &'a [u8], + geometry_type_id: Option<GeometryTypeId>, + dimensions: Option<Dimensions>, +} + +impl<'a> WkbHeader<'a> { + /// Creates a new [WkbHeader] from a buffer + pub fn new(buf: &'a [u8]) -> Result<Self> { + Ok(Self { + buf, + geometry_type_id: None, + dimensions: None, + }) + } + + /// Returns the geometry type id of the WKB by only parsing the header instead of the entire WKB + /// 1 -> Point + /// 2 -> LineString + /// 3 -> Polygon + /// 4 -> MultiPoint + /// 5 -> MultiLineString + /// 6 -> MultiPolygon + /// 7 -> GeometryCollection + /// + /// Spec: https://libgeos.org/specifications/wkb/ + pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> { + if self.geometry_type_id.is_none() { + self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?); + } + self.geometry_type_id.ok_or_else(|| { + DataFusionError::External("Unexpected internal state in WkbHeader".into()) + }) + } + + /// Returns the dimension of the WKB by only parsing what's minimally necessary instead of the entire WKB + pub fn dimension(mut self) -> Result<Dimensions> { + // Calculate the dimension if we haven't already + if self.dimensions.is_none() { + self.dimensions = Some(parse_dimension(self.buf)?); + } + self.dimensions.ok_or_else(|| { + DataFusionError::External("Unexpected internal state in WkbHeader".into()) + }) + } +} + +fn parse_dimension(buf: &[u8]) -> Result<Dimensions> { + if buf.len() < 5 { + return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len()); + } + + let byte_order = buf[0]; + + let code = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + // 0000 -> xy or unspecified + // 1000 -> xyz + // 2000 -> xym + // 3000 -> xyzm + match code / 1000 { + // If xy, it's possible we need to infer the dimension + 0 => {} + 1 => return Ok(Dimensions::Xyz), + 2 => return Ok(Dimensions::Xym), + 3 => return Ok(Dimensions::Xyzm), + _ => return sedona_internal_err!("Unexpected code: {code}"), + }; + + // Try to infer dimension + // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code 4-7), we need to check the dimension of the first geometry + if code & 0x7 >= 4 { + // The next 4 bytes are the number of geometries in the collection + let num_geometries = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + // Check the dimension of the first geometry since they all have to be the same dimension + // Note: Attempting to create the following geometries error and are thus not possible to create: + // - Nested geometry dimension doesn't match the **specified** geom collection z-dimension + // - GEOMETRYCOLLECTION M (POINT Z (1 1 1)) + // - Nested geometry doesn't have the specified dimension + // - GEOMETRYCOLLECTION Z (POINT (1 1)) + // - Nested geometries have different dimensions + // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1)) Review Comment: Hmm, unless you want to move this `dimensions()` method from the `WKBHeader` class entirely. I do like your idea of removing the `buf` field from the class, but considering this edge case, I see two options we could do to maintain correctness: 1. Move the `dimensions()` function outside and don't provide any method inside of `WKBHeader` 2. In `try_new()` also check the dimension of the first coordinate (e.g, it's xyz) and store that as a separate field to be retrieved in the `dimensions()` method. We could get this info during our pass to getting `first_xy`. edit: I'm working on option 2 atm, unless you say otherwise -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
