paleolimbot commented on code in PR #171:
URL: https://github.com/apache/sedona-db/pull/171#discussion_r2393285912
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -473,6 +473,8 @@ def test_st_geomfromwkb(eng, geom):
("POINT Z EMPTY", True),
("POINT M EMPTY", False),
("POINT ZM EMPTY", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("POINT (0 0 0)", True),
Review Comment:
From a WKB perspective, this is actually identical...that is a WKT parser
issue, not an underlying data representation issue for us.
##########
rust/sedona-functions/src/st_haszm.rs:
##########
@@ -107,28 +107,63 @@ impl SedonaScalarKernel for STHasZm {
}
}
-fn invoke_scalar(item: &Wkb, dim_index: usize) -> Result<Option<bool>> {
- match item.as_type() {
- geo_traits::GeometryType::GeometryCollection(collection) => {
- use geo_traits::GeometryCollectionTrait;
- if collection.num_geometries() == 0 {
- Ok(Some(false))
- } else {
- // PostGIS doesn't allow creating a GeometryCollection with
geometries of different dimensions
- // so we can just check the dimension of the first one
- let first_geom = unsafe { collection.geometry_unchecked(0) };
- invoke_scalar(first_geom, dim_index)
- }
- }
- _ => {
- let geom_dim = item.dim();
- match dim_index {
- 2 => Ok(Some(matches!(geom_dim, Dimensions::Xyz |
Dimensions::Xyzm))),
- 3 => Ok(Some(matches!(geom_dim, Dimensions::Xym |
Dimensions::Xyzm))),
- _ => sedona_internal_err!("unexpected dim_index"),
- }
+/// Fast-path inference of geometry type name from raw WKB bytes
+/// An error will be thrown for invalid WKB bytes input
+///
+/// Spec: https://libgeos.org/specifications/wkb/
+fn infer_haszm(buf: &[u8], dim_index: usize) -> Result<Option<bool>> {
+ if buf.len() < 5 {
+ return sedona_internal_err!("Invalid WKB: buffer too small ({}
bytes)", buf.len());
+ }
+
+ let byte_order = buf[0];
+ let code = match byte_order {
+ 0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ 1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+ other => return sedona_internal_err!("Unexpected byte order: {other}"),
+ };
+
+ // 0000 -> xy or unspecified
+ // 1000 -> xyz
+ // 2000 -> xym
+ // 3000 -> xyzm
+ match code / 1000 {
+ // If xy, it's possible we need to infer the dimension
+ 0 => {}
+ 1 => return Ok(Some(dim_index == 2)),
+ 2 => return Ok(Some(dim_index == 3)),
+ 3 => return Ok(Some(true)),
+ _ => return sedona_internal_err!("Unexpected code: {code}"),
+ };
Review Comment:
I am not sure whether or not we should handle EWKB here (the rest of the
package does via parsing by the wkb crate). In that case you would also need to
check the high bits for ZM flags too.
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -481,8 +483,18 @@ def test_st_geomfromwkb(eng, geom):
("LINESTRING Z (0 0 0, 1 1 1)", True),
("POLYGON EMPTY", False),
("MULTIPOINT ((0 0), (1 1))", False),
+ ("MULTIPOINT Z ((0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("MULTIPOINT ((0 0 0))", True),
Review Comment:
This one we actually can test, but you need to generate the WKB bytes
yourself. Something like `0x01, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
<the WKB for POINT Z (0 0 0)>` (0x01 for little endian, 4 as a little endian
int 32 for multipoint, 1 as a little endian int for one point, the point).
##########
rust/sedona-functions/src/st_haszm.rs:
##########
@@ -172,6 +207,16 @@ mod tests {
let result = m_tester.invoke_scalar("POINT (1 2)").unwrap();
m_tester.assert_scalar_result_equals(result, false);
+ // SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ // Infer z-dimension
+ // let result = z_tester.invoke_scalar("POINT (1 2 3)").unwrap();
+ // z_tester.assert_scalar_result_equals(result, true);
Review Comment:
See my comment above...WKT parsers that accept this (GEOS/PostGIS) would
write out the WKB identically to `POINT Z (1 2 3)`.
`MULTIPOINT ((1 2 3))` is a bit different. You should be able to generate
bytes for that using shapely (or use my sketch above to do it by hand). We have
a place for WKB bytes of things that are hard to generate using WKT:
https://github.com/apache/sedona-db/blob/a15844b4ff9c7e9416b8f1c1c07ad81d908a89cd/rust/sedona-testing/src/fixtures.rs#L17-L24
##########
python/sedonadb/tests/functions/test_functions.py:
##########
@@ -481,8 +483,18 @@ def test_st_geomfromwkb(eng, geom):
("LINESTRING Z (0 0 0, 1 1 1)", True),
("POLYGON EMPTY", False),
("MULTIPOINT ((0 0), (1 1))", False),
+ ("MULTIPOINT Z ((0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # ("MULTIPOINT ((0 0 0))", True),
+ ("MULTIPOINT ZM ((0 0 0 0))", True),
("GEOMETRYCOLLECTION EMPTY", False),
+ # Z-dim specified only in the nested geometry
("GEOMETRYCOLLECTION (POINT Z (0 0 0))", True),
+ # SedonaDB can't parse this yet:
https://github.com/apache/sedona-db/issues/162
+ # Z-dim specified only on the geom collection level but not the nested
geometry level
+ # ("GEOMETRYCOLLECTION Z (POINT (0 0 0))", True),
Review Comment:
This is also a parser issue, not a data representation issue.
##########
rust/sedona-functions/src/st_haszm.rs:
##########
@@ -107,28 +107,63 @@ impl SedonaScalarKernel for STHasZm {
}
}
-fn invoke_scalar(item: &Wkb, dim_index: usize) -> Result<Option<bool>> {
- match item.as_type() {
- geo_traits::GeometryType::GeometryCollection(collection) => {
- use geo_traits::GeometryCollectionTrait;
- if collection.num_geometries() == 0 {
- Ok(Some(false))
- } else {
- // PostGIS doesn't allow creating a GeometryCollection with
geometries of different dimensions
- // so we can just check the dimension of the first one
- let first_geom = unsafe { collection.geometry_unchecked(0) };
- invoke_scalar(first_geom, dim_index)
- }
- }
- _ => {
- let geom_dim = item.dim();
- match dim_index {
- 2 => Ok(Some(matches!(geom_dim, Dimensions::Xyz |
Dimensions::Xyzm))),
- 3 => Ok(Some(matches!(geom_dim, Dimensions::Xym |
Dimensions::Xyzm))),
- _ => sedona_internal_err!("unexpected dim_index"),
- }
+/// Fast-path inference of geometry type name from raw WKB bytes
+/// An error will be thrown for invalid WKB bytes input
+///
+/// Spec: https://libgeos.org/specifications/wkb/
+fn infer_haszm(buf: &[u8], dim_index: usize) -> Result<Option<bool>> {
+ if buf.len() < 5 {
+ return sedona_internal_err!("Invalid WKB: buffer too small ({}
bytes)", buf.len());
+ }
Review Comment:
We should consider consolidating this with the geometry type optimization
like:
```rust
struct WkbHeader {
geometry_type: u32,
size: u32,
first_coord_geometry_type: u32,
first_coord_offset: u32
}
impl WkbHeader {
pub fn geometry_type_id(&self) -> GeometryTypeId {...}
pub fn dimensions(&self) -> Dimensions { ... }
pub fn num_dimensions(&self) -> usize { ... }
}
```
There are a few functions that can benefit from this (npoints in a few
cases, hilbert, isempty), although it might make a better PR into the `wkb`
crate where there's already the logic for the parsing.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]