petern48 commented on code in PR #171: URL: https://github.com/apache/sedona-db/pull/171#discussion_r2468209793
########## rust/sedona-geometry/src/wkb_header.rs: ########## @@ -0,0 +1,728 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use crate::types::GeometryTypeId; +use datafusion_common::{ + error::{DataFusionError, Result}, + exec_err, +}; +use geo_traits::Dimensions; +use sedona_common::sedona_internal_err; + +const SRID_FLAG_BIT: u32 = 0x20000000; + +/// Fast-path WKB header parser +/// Performs operations lazily and caches them after the first computation +pub struct WkbHeader { + geometry_type: u32, + // Not applicable for a point + // number of points for a linestring + // number of rings for a polygon + // number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION + size: u32, + // SRID if given buffer was EWKB. Otherwise, 0. + srid: u32, + // First x,y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty + first_xy: (f64, f64), + // Dimensions of the first nested geometry of a collection or None if empty + // For POINT, LINESTRING, POLYGON, returns the dimensions of the geometry + first_geom_dimensions: Option<Dimensions>, +} + +impl WkbHeader { + /// Creates a new [WkbHeader] from a buffer + pub fn try_new(buf: &[u8]) -> Result<Self> { + if buf.len() < 5 { + return exec_err!("Invalid WKB: buffer too small -> try_new"); + }; + + let byte_order = buf[0]; + + // Parse geometry type + let geometry_type = match byte_order { + 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]), + 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(geometry_type & 0x7) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + let mut i = 5; + let mut srid = 0; + // if EWKB + if geometry_type & SRID_FLAG_BIT != 0 { + srid = match byte_order { + 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]), + 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + }; + i = 9; + } + + let size = if geometry_type_id == GeometryTypeId::Point { + // Dummy value for a point + 1 + } else { + match byte_order { + 0 => u32::from_be_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), + 1 => u32::from_le_bytes([buf[i], buf[i + 1], buf[i + 2], buf[i + 3]]), + other => return sedona_internal_err!("Unexpected byte order: {other}"), + } + }; + + // Default values for empty geometries + let first_x; + let first_y; + let first_geom_dimensions: Option<Dimensions>; + + let first_geom_idx = first_geom_idx(buf)?; + if let Some(i) = first_geom_idx { + first_geom_dimensions = Some(parse_dimensions(&buf[i..])?); + (first_x, first_y) = first_xy(&buf[i..])?; + } else { + first_geom_dimensions = None; + first_x = f64::NAN; + first_y = f64::NAN; + } + + Ok(Self { + geometry_type, + srid, + size, + first_xy: (first_x, first_y), + first_geom_dimensions, + }) + } + + /// Returns the geometry type id of the WKB by only parsing the header instead of the entire WKB + /// 1 -> Point + /// 2 -> LineString + /// 3 -> Polygon + /// 4 -> MultiPoint + /// 5 -> MultiLineString + /// 6 -> MultiPolygon + /// 7 -> GeometryCollection + /// + /// Spec: https://libgeos.org/specifications/wkb/ + pub fn geometry_type_id(&self) -> Result<GeometryTypeId> { + // Only low 3 bits is for the base type, high bits include additional info + let code = self.geometry_type & 0x7; + + let geometry_type_id = GeometryTypeId::try_from_wkb_id(code) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(geometry_type_id) + } + + /// Returns the size of the geometry + /// Not applicable for a point + /// Number of points for a linestring + /// Number of rings for a polygon + /// Number of geometries for a MULTIPOINT, MULTILINESTRING, MULTIPOLYGON, or GEOMETRYCOLLECTION + pub fn size(&self) -> u32 { + self.size + } + + /// Returns the SRID if given buffer was EWKB. Otherwise, 0. + pub fn srid(&self) -> u32 { + self.srid + } + + /// Returns the first x, y coordinates for a point. Otherwise (f64::NAN, f64::NAN) if empty + pub fn first_xy(&self) -> (f64, f64) { + self.first_xy + } + + /// Returns the top-level dimension of the WKB + pub fn dimensions(&self) -> Result<Dimensions> { + let dimensions = match self.geometry_type / 1000 { + 0 => Dimensions::Xy, + 1 => Dimensions::Xyz, + 2 => Dimensions::Xym, + 3 => Dimensions::Xyzm, + _ => exec_err!("Unexpected code: {}", self.geometry_type)?, + }; + Ok(dimensions) Review Comment: did not know what you meant here, until I had to debug it 🙃 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
