paleolimbot commented on code in PR #171:
URL: https://github.com/apache/sedona-db/pull/171#discussion_r2404234856
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -481,8 +483,18 @@ def test_st_geomfromwkb(eng, geom):
("LINESTRING Z (0 0 0, 1 1 1)", True),
("POLYGON EMPTY", False),
("MULTIPOINT ((0 0), (1 1))", False),
+ ("MULTIPOINT Z ((0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("MULTIPOINT ((0 0 0))", True),
+ ("MULTIPOINT ZM ((0 0 0 0))", True),
("GEOMETRYCOLLECTION EMPTY", False),
+ # Z-dim specified only in the nested geometry
("GEOMETRYCOLLECTION (POINT Z (0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # Z-dim specified only on the geom collection level but not the nested
geometry level
+ # ("GEOMETRYCOLLECTION Z (POINT (0 0 0))", True),
Review Comment:
```suggestion
```
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -481,8 +483,18 @@ def test_st_geomfromwkb(eng, geom):
("LINESTRING Z (0 0 0, 1 1 1)", True),
("POLYGON EMPTY", False),
("MULTIPOINT ((0 0), (1 1))", False),
+ ("MULTIPOINT Z ((0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("MULTIPOINT ((0 0 0))", True),
Review Comment:
(We should still remove these, though. These comments highlight WKT parsing
issues and this is a WKB test)
```suggestion
```
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+ /// Creates a new [WkbHeader] from a buffer
+ pub fn new(buf: &'a [u8]) -> Result<Self> {
+ Ok(Self {
+ buf,
+ geometry_type_id: None,
+ dimensions: None,
+ })
+ }
+
+ /// Returns the geometry type id of the WKB by only parsing the header
instead of the entire WKB
+ /// 1 -> Point
+ /// 2 -> LineString
+ /// 3 -> Polygon
+ /// 4 -> MultiPoint
+ /// 5 -> MultiLineString
+ /// 6 -> MultiPolygon
+ /// 7 -> GeometryCollection
+ ///
+ /// Spec: https://libgeos.org/specifications/wkb/
+ pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+ if self.geometry_type_id.is_none() {
+ self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+ }
+ self.geometry_type_id.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+
+ /// Returns the dimension of the WKB by only parsing what's minimally
necessary instead of the entire WKB
+ pub fn dimension(mut self) -> Result<Dimensions> {
+ // Calculate the dimension if we haven't already
+ if self.dimensions.is_none() {
+ self.dimensions = Some(parse_dimension(self.buf)?);
+ }
+ self.dimensions.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+}
+
+fn parse_dimension(buf: &[u8]) -> Result<Dimensions> {
+ if buf.len() < 5 {
+ return sedona_internal_err!("Invalid WKB: buffer too small ({}
bytes)", buf.len());
+ }
+
+ let byte_order = buf[0];
+
+ let code = match byte_order {
+ 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ other => return sedona_internal_err!("Unexpected byte order: {other}"),
+ };
Review Comment:
I would put this in `try_new()`
(Also, `sedona_internal_err!()` should be `exec_err!()`)
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -473,6 +473,8 @@ def test_st_geomfromwkb(eng, geom):
("POINT Z EMPTY", True),
("POINT M EMPTY", False),
("POINT ZM EMPTY", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("POINT (0 0 0)", True),
Review Comment:
(Not relevant to this test, which is not about parsing WKT)
```suggestion
```
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+ /// Creates a new [WkbHeader] from a buffer
+ pub fn new(buf: &'a [u8]) -> Result<Self> {
Review Comment:
```suggestion
pub fn try_new(buf: &'a [u8]) -> Result<Self> {
```
(I believe it's `Wkb::try_new()` as well)
##########
rust/sedona-testing/src/fixtures.rs:
##########
@@ -22,3 +22,17 @@ pub const MULTIPOINT_WITH_EMPTY_CHILD_WKB: [u8; 30] = [
0x01, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x01, 0x00,
0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0xf8, 0x7f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0xf8, 0x7f,
];
+
+/// A well-known binary blob of MULTIPOINT ((0 0 0)) where outer dimension is
specified for xy
+/// while inner point's dimension is actually xyz
+pub const MULTIPOINT_WITH_INFERRED_Z_DIMENSION_WKB: [u8; 38] = [
+ 0x01, // byte-order
+ 0x04, 0x00, 0x00, 0x00, // multipoint with xy-dimension specified
+ 0x01, 0x00, 0x00, 0x00, // 1 point
+ // nested point geom
+ 0x01, // byte-order
+ 0xe9, 0x03, 0x00, 0x00, // point with xyz-dimension specified
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x-coordinate of point
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y-coordinate of point
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // z-coordinate of point
Review Comment:
optional nit: This would be a slightly better fixture if these were non-zero
(1, 2, 3, for example).
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+ /// Creates a new [WkbHeader] from a buffer
+ pub fn new(buf: &'a [u8]) -> Result<Self> {
+ Ok(Self {
+ buf,
+ geometry_type_id: None,
+ dimensions: None,
+ })
+ }
+
+ /// Returns the geometry type id of the WKB by only parsing the header
instead of the entire WKB
+ /// 1 -> Point
+ /// 2 -> LineString
+ /// 3 -> Polygon
+ /// 4 -> MultiPoint
+ /// 5 -> MultiLineString
+ /// 6 -> MultiPolygon
+ /// 7 -> GeometryCollection
+ ///
+ /// Spec: https://libgeos.org/specifications/wkb/
+ pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+ if self.geometry_type_id.is_none() {
+ self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+ }
+ self.geometry_type_id.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+
+ /// Returns the dimension of the WKB by only parsing what's minimally
necessary instead of the entire WKB
+ pub fn dimension(mut self) -> Result<Dimensions> {
+ // Calculate the dimension if we haven't already
+ if self.dimensions.is_none() {
+ self.dimensions = Some(parse_dimension(self.buf)?);
+ }
+ self.dimensions.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+}
+
+fn parse_dimension(buf: &[u8]) -> Result<Dimensions> {
+ if buf.len() < 5 {
+ return sedona_internal_err!("Invalid WKB: buffer too small ({}
bytes)", buf.len());
+ }
+
+ let byte_order = buf[0];
+
+ let code = match byte_order {
+ 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ other => return sedona_internal_err!("Unexpected byte order: {other}"),
+ };
+
+ // 0000 -> xy or unspecified
+ // 1000 -> xyz
+ // 2000 -> xym
+ // 3000 -> xyzm
+ match code / 1000 {
+ // If xy, it's possible we need to infer the dimension
+ 0 => {}
+ 1 => return Ok(Dimensions::Xyz),
+ 2 => return Ok(Dimensions::Xym),
+ 3 => return Ok(Dimensions::Xyzm),
+ _ => return sedona_internal_err!("Unexpected code: {code}"),
+ };
Review Comment:
This should also handle EWKB high bit flags. Most of the time this will be
ISO WKB from GeoParquet but not all tools have control over the type of WKB
they generate and we're better for dealing with it (unless you can demonstrate
measurable performance overhead, which I doubt is the case here). One notable
data point is that WKB coming from Sedona Spark's `dataframe_to_arrow()` is
EWKB.
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+ /// Creates a new [WkbHeader] from a buffer
+ pub fn new(buf: &'a [u8]) -> Result<Self> {
+ Ok(Self {
+ buf,
+ geometry_type_id: None,
+ dimensions: None,
+ })
+ }
+
+ /// Returns the geometry type id of the WKB by only parsing the header
instead of the entire WKB
+ /// 1 -> Point
+ /// 2 -> LineString
+ /// 3 -> Polygon
+ /// 4 -> MultiPoint
+ /// 5 -> MultiLineString
+ /// 6 -> MultiPolygon
+ /// 7 -> GeometryCollection
+ ///
+ /// Spec: https://libgeos.org/specifications/wkb/
+ pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+ if self.geometry_type_id.is_none() {
+ self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+ }
+ self.geometry_type_id.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+
+ /// Returns the dimension of the WKB by only parsing what's minimally
necessary instead of the entire WKB
+ pub fn dimension(mut self) -> Result<Dimensions> {
+ // Calculate the dimension if we haven't already
+ if self.dimensions.is_none() {
+ self.dimensions = Some(parse_dimension(self.buf)?);
+ }
+ self.dimensions.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+}
+
+fn parse_dimension(buf: &[u8]) -> Result<Dimensions> {
+ if buf.len() < 5 {
+ return sedona_internal_err!("Invalid WKB: buffer too small ({}
bytes)", buf.len());
+ }
+
+ let byte_order = buf[0];
+
+ let code = match byte_order {
+ 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ other => return sedona_internal_err!("Unexpected byte order: {other}"),
+ };
+
+ // 0000 -> xy or unspecified
+ // 1000 -> xyz
+ // 2000 -> xym
+ // 3000 -> xyzm
+ match code / 1000 {
+ // If xy, it's possible we need to infer the dimension
+ 0 => {}
+ 1 => return Ok(Dimensions::Xyz),
+ 2 => return Ok(Dimensions::Xym),
+ 3 => return Ok(Dimensions::Xyzm),
+ _ => return sedona_internal_err!("Unexpected code: {code}"),
+ };
+
+ // Try to infer dimension
+ // If geometry is a collection (MULTIPOINT, ... GEOMETRYCOLLECTION, code
4-7), we need to check the dimension of the first geometry
+ if code & 0x7 >= 4 {
+ // The next 4 bytes are the number of geometries in the collection
+ let num_geometries = match byte_order {
+ 0 => u32::from_be_bytes([buf[5], buf[6], buf[7], buf[8]]),
+ 1 => u32::from_le_bytes([buf[5], buf[6], buf[7], buf[8]]),
+ other => return sedona_internal_err!("Unexpected byte order:
{other}"),
+ };
+ // Check the dimension of the first geometry since they all have to be
the same dimension
+ // Note: Attempting to create the following geometries error and are
thus not possible to create:
+ // - Nested geometry dimension doesn't match the **specified** geom
collection z-dimension
+ // - GEOMETRYCOLLECTION M (POINT Z (1 1 1))
+ // - Nested geometry doesn't have the specified dimension
+ // - GEOMETRYCOLLECTION Z (POINT (1 1))
+ // - Nested geometries have different dimensions
+ // - GEOMETRYCOLLECTION (POINT Z (1 1 1), POINT (1 1))
Review Comment:
This part is, I believe unique to `st_hasz()` and should possibly live in
the file implementing that function (or be explicit in the name of the
function...I think of `dimensions` as the explicitly declared dimensions at the
top-level WKB).
##########
rust/sedona-functions/src/st_haszm.rs:
##########
@@ -107,28 +106,27 @@ impl SedonaScalarKernel for STHasZm {
}
}
-fn invoke_scalar(item: &Wkb, dim_index: usize) -> Result<Option<bool>> {
- match item.as_type() {
- geo_traits::GeometryType::GeometryCollection(collection) => {
- use geo_traits::GeometryCollectionTrait;
- if collection.num_geometries() == 0 {
- Ok(Some(false))
- } else {
- // PostGIS doesn't allow creating a GeometryCollection with
geometries of different dimensions
- // so we can just check the dimension of the first one
- let first_geom = unsafe { collection.geometry_unchecked(0) };
- invoke_scalar(first_geom, dim_index)
- }
- }
- _ => {
- let geom_dim = item.dim();
- match dim_index {
- 2 => Ok(Some(matches!(geom_dim, Dimensions::Xyz |
Dimensions::Xyzm))),
- 3 => Ok(Some(matches!(geom_dim, Dimensions::Xym |
Dimensions::Xyzm))),
- _ => sedona_internal_err!("unexpected dim_index"),
- }
- }
+/// Fast-path inference of geometry type name from raw WKB bytes
+/// An error will be thrown for invalid WKB bytes input
+///
+/// Spec: https://libgeos.org/specifications/wkb/
Review Comment:
```suggestion
```
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
+}
+
+impl<'a> WkbHeader<'a> {
+ /// Creates a new [WkbHeader] from a buffer
+ pub fn new(buf: &'a [u8]) -> Result<Self> {
+ Ok(Self {
+ buf,
+ geometry_type_id: None,
+ dimensions: None,
+ })
+ }
+
+ /// Returns the geometry type id of the WKB by only parsing the header
instead of the entire WKB
+ /// 1 -> Point
+ /// 2 -> LineString
+ /// 3 -> Polygon
+ /// 4 -> MultiPoint
+ /// 5 -> MultiLineString
+ /// 6 -> MultiPolygon
+ /// 7 -> GeometryCollection
+ ///
+ /// Spec: https://libgeos.org/specifications/wkb/
+ pub fn geometry_type_id(mut self) -> Result<GeometryTypeId> {
+ if self.geometry_type_id.is_none() {
+ self.geometry_type_id = Some(parse_geometry_type_id(self.buf)?);
+ }
+ self.geometry_type_id.ok_or_else(|| {
+ DataFusionError::External("Unexpected internal state in
WkbHeader".into())
+ })
+ }
+
+ /// Returns the dimension of the WKB by only parsing what's minimally
necessary instead of the entire WKB
+ pub fn dimension(mut self) -> Result<Dimensions> {
Review Comment:
```suggestion
pub fn dimensions(mut self) -> Result<Dimensions> {
```
(`dimension` is something different)
##########
rust/sedona-geometry/src/wkb_header.rs:
##########
@@ -0,0 +1,363 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::types::GeometryTypeId;
+use datafusion_common::error::{DataFusionError, Result};
+use geo_traits::Dimensions;
+use sedona_common::sedona_internal_err;
+
+/// Fast-path WKB header parser
+/// Performs operations lazily and caches them after the first computation
+pub struct WkbHeader<'a> {
+ buf: &'a [u8],
+ geometry_type_id: Option<GeometryTypeId>,
+ dimensions: Option<Dimensions>,
Review Comment:
How about:
```suggestion
geometry_type: u32,
size: u32,
srid: u32,
first_xy: (f64, f64)
```
- I don't think you need to keep a reference to the slice here...whoever was
calling this has access to that already.
- The laziness here I don't think is helping performance (these objects
aren't in places where the values are getting reused and caching the very cheap
calculation would help)
- Size + first_xy are enough to ensure non-empty in most cases (we need the
first_xy for points in this case)
- first_xy is something I need shortly to implement spatial sorting
- srid is something I need shortly to implement EWKB parsing into our new
row-level SRID type
- Other things can be implemented as methods on top of those values
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]