This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/sedona-db.git
The following commit(s) were added to refs/heads/main by this push:
new 8d7d778 perf: Faster `st_geometrytype()` function (#90)
8d7d778 is described below
commit 8d7d77850f5aad0be9ca6be203ad7ee513f8aa77
Author: Yongting You <[email protected]>
AuthorDate: Thu Sep 18 00:23:55 2025 +0800
perf: Faster `st_geometrytype()` function (#90)
---
rust/sedona-functions/src/executor.rs | 24 +++++++++++
rust/sedona-functions/src/st_geometrytype.rs | 61 ++++++++++++++++------------
2 files changed, 60 insertions(+), 25 deletions(-)
diff --git a/rust/sedona-functions/src/executor.rs
b/rust/sedona-functions/src/executor.rs
index 081b627..0a98e71 100644
--- a/rust/sedona-functions/src/executor.rs
+++ b/rust/sedona-functions/src/executor.rs
@@ -246,6 +246,30 @@ impl GeometryFactory for WkbGeometryFactory {
}
}
+/// A [GeometryFactory] whose geometry type is a slice of raw WKB bytes
+///
+/// Using this geometry factory iterates over items as references to the raw
+/// underlying bytes, which is useful for writing optimized kernels that do not
+/// need the full buffer to be validated and/or parsed.
+#[derive(Default)]
+pub struct WkbBytesFactory {}
+
+impl GeometryFactory for WkbBytesFactory {
+ // Each item is handed out as a byte slice borrowed from the input.
+ type Geom<'a> = &'a [u8];
+
+ // Returns the input slice unchanged: nothing is parsed, validated or copied
+ // here, so this implementation never produces an error itself.
+ fn try_from_wkb<'a>(&self, wkb_bytes: &'a [u8]) -> Result<Self::Geom<'a>> {
+ Ok(wkb_bytes)
+ }
+}
+
+/// Alias for an executor that iterates over geometries as their raw [Wkb] bytes.
+///
+/// This [GenericExecutor] implementation provides more optimization
+/// opportunities, but it requires additional manual processing of the raw
+/// [Wkb] bytes compared to the [WkbExecutor].
+pub(crate) type WkbBytesExecutor<'a, 'b> =
+ GenericExecutor<'a, 'b, WkbBytesFactory, WkbBytesFactory>;
+
/// Trait for iterating over a container type as geometry scalars
///
/// Currently the only scalar type supported is [Wkb]; however, for future
diff --git a/rust/sedona-functions/src/st_geometrytype.rs
b/rust/sedona-functions/src/st_geometrytype.rs
index 0cdd9a8..6cccf0e 100644
--- a/rust/sedona-functions/src/st_geometrytype.rs
+++ b/rust/sedona-functions/src/st_geometrytype.rs
@@ -16,18 +16,16 @@
// under the License.
use std::sync::Arc;
-use crate::executor::WkbExecutor;
+use crate::executor::WkbBytesExecutor;
use arrow_array::builder::StringBuilder;
use arrow_schema::DataType;
use datafusion_common::error::Result;
use datafusion_expr::{
scalar_doc_sections::DOC_SECTION_OTHER, ColumnarValue, Documentation,
Volatility,
};
-use geo_traits::GeometryTrait;
use sedona_common::sedona_internal_err;
use sedona_expr::scalar_udf::{SedonaScalarKernel, SedonaScalarUDF};
use sedona_schema::{datatypes::SedonaType, matchers::ArgMatcher};
-use wkb::reader::Wkb;
pub fn st_geometry_type_udf() -> SedonaScalarUDF {
SedonaScalarUDF::new(
@@ -67,16 +65,16 @@ impl SedonaScalarKernel for STGeometryType {
arg_types: &[SedonaType],
args: &[ColumnarValue],
) -> Result<ColumnarValue> {
- let executor = WkbExecutor::new(arg_types, args);
- let min_output_size = "POINT".len() * executor.num_iterations();
+ let executor = WkbBytesExecutor::new(arg_types, args);
+ let min_output_size = "ST_POINT".len() * executor.num_iterations();
let mut builder =
StringBuilder::with_capacity(executor.num_iterations(), min_output_size);
- // We can do quite a lot better than this with some vectorized WKB
processing,
- // but for now we just do a slow iteration
- executor.execute_wkb_void(|maybe_item| {
- match maybe_item {
- Some(item) => {
- builder.append_option(invoke_scalar(&item)?);
+ // Iterate over raw WKB bytes for faster type inference
+ executor.execute_wkb_void(|maybe_bytes| {
+ match maybe_bytes {
+ Some(bytes) => {
+ let name = infer_geometry_type_name(bytes)?;
+ builder.append_value(name);
}
None => builder.append_null(),
}
@@ -87,20 +85,33 @@ impl SedonaScalarKernel for STGeometryType {
}
}
-fn invoke_scalar(item: &Wkb) -> Result<Option<String>> {
- match item.as_type() {
- geo_traits::GeometryType::Point(_) => Ok(Some("ST_Point".to_string())),
- geo_traits::GeometryType::LineString(_) =>
Ok(Some("ST_LineString".to_string())),
- geo_traits::GeometryType::Polygon(_) =>
Ok(Some("ST_Polygon".to_string())),
- geo_traits::GeometryType::MultiPoint(_) =>
Ok(Some("ST_MultiPoint".to_string())),
- geo_traits::GeometryType::MultiLineString(_) =>
Ok(Some("ST_MultiLineString".to_string())),
- geo_traits::GeometryType::MultiPolygon(_) =>
Ok(Some("ST_MultiPolygon".to_string())),
- geo_traits::GeometryType::GeometryCollection(_) => {
- Ok(Some("ST_GeometryCollection".to_string()))
- }
-
- // Other geometry types in geo that we should not get here: Rect,
Triangle, Line
- _ => sedona_internal_err!("unexpected geometry type"),
+/// Fast-path inference of the geometry type name from raw WKB bytes
+///
+/// Only the 5-byte header is inspected: one byte-order byte (0 = big endian,
+/// 1 = little endian) followed by a `u32` geometry type code. The remainder of
+/// the buffer is never read, so it is not validated here.
+///
+/// The type code is decoded in two steps:
+/// - The high nibble is cleared. EWKB stores its Z/M/SRID flags in the top
+///   bits, so they must not take part in the type lookup.
+/// - The remaining value is split into an ISO dimension offset (`/ 1000`:
+///   0 = XY, 1 = Z, 2 = M, 3 = ZM) and a base type (`% 1000`).
+///
+/// Masking with `0x7` alone is not sufficient: codes above 7 (for example 9,
+/// CompoundCurve) would be reported as one of the seven basic types instead of
+/// being rejected.
+///
+/// # Errors
+///
+/// Returns an error (built with `sedona_internal_err!`) if the buffer is
+/// shorter than 5 bytes, if the byte-order byte is neither 0 nor 1, or if the
+/// type code does not denote one of the seven supported geometry types
+/// (Point through GeometryCollection) in a known dimension.
+///
+/// Spec: https://libgeos.org/specifications/wkb/
+#[inline]
+fn infer_geometry_type_name(buf: &[u8]) -> Result<&'static str> {
+    // Bits that carry EWKB flags rather than the geometry type itself.
+    const EWKB_FLAG_BITS: u32 = 0xF000_0000;
+
+    if buf.len() < 5 {
+        return sedona_internal_err!("Invalid WKB: buffer too small ({} bytes)", buf.len());
+    }
+
+    let byte_order = buf[0];
+    let code = match byte_order {
+        0 => u32::from_be_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        1 => u32::from_le_bytes([buf[1], buf[2], buf[3], buf[4]]),
+        other => return sedona_internal_err!("Unexpected byte order: {other}"),
+    };
+
+    // Drop the EWKB flags, then separate the ISO dimension offset from the
+    // base geometry type.
+    let iso_code = code & !EWKB_FLAG_BITS;
+    let dimension = iso_code / 1000;
+    let base_type = iso_code % 1000;
+
+    // Offsets beyond ZM (3000) are not defined, so treat them as invalid
+    // rather than guessing a type from the low digits.
+    if dimension > 3 {
+        return sedona_internal_err!("WKB type code out of range. Got: {}", code);
+    }
+
+    match base_type {
+        1 => Ok("ST_Point"),
+        2 => Ok("ST_LineString"),
+        3 => Ok("ST_Polygon"),
+        4 => Ok("ST_MultiPoint"),
+        5 => Ok("ST_MultiLineString"),
+        6 => Ok("ST_MultiPolygon"),
+        7 => Ok("ST_GeometryCollection"),
+        _ => sedona_internal_err!("WKB type code out of range. Got: {}", code),
+    }
+}