paleolimbot commented on code in PR #91:
URL: https://github.com/apache/sedona-db/pull/91#discussion_r2353005980
##########
rust/sedona-spatial-join/src/optimizer.rs:
##########
@@ -89,6 +90,123 @@ impl PhysicalOptimizerRule for SpatialJoinOptimizer {
}
}
+impl OptimizerRule for SpatialJoinOptimizer {
+ fn name(&self) -> &str {
+ "spatial_join_optimizer"
+ }
+
+ fn apply_order(&self) -> Option<ApplyOrder> {
+ Some(ApplyOrder::BottomUp)
+ }
+
+ /// Try to rewrite the plan containing a spatial Filter on top of a cross
join without on or filter
+ /// to a theta-join with filter. For instance, the following query plan:
+ ///
+ /// ```text
+ /// Filter: st_intersects(l.geom, _scalar_sq_1.geom)
+ /// Left Join (no on, no filter):
+ /// TableScan: l projection=[id, geom]
+ /// SubqueryAlias: __scalar_sq_1
+ /// Projection: r.geom
+ /// Filter: r.id = Int32(1)
+ /// TableScan: r projection=[id, geom]
+ /// ```
+ ///
+ /// will be rewritten to
+ ///
+ /// ```text
+ /// Inner Join: Filter: st_intersects(l.geom, _scalar_sq_1.geom)
+ /// TableScan: l projection=[id, geom]
+ /// SubqueryAlias: __scalar_sq_1
+ /// Projection: r.geom
+ /// Filter: r.id = Int32(1)
+ /// TableScan: r projection=[id, geom]
+ /// ```
+ ///
+ /// This is for enabling this logical join operator to be converted to a
NestedLoopJoin physical
+ /// node with a spatial predicate, so that it could subsequently be
optimized to a SpatialJoin
+ /// physical node. Please refer to the `PhysicalOptimizerRule`
implementation of this struct
+ /// and [SpatialJoinOptimizer::try_optimize_join] for details.
+ fn rewrite(
+ &self,
+ plan: LogicalPlan,
+ _config: &dyn OptimizerConfig,
+ ) -> Result<Transformed<LogicalPlan>> {
+ let LogicalPlan::Filter(Filter {
+ predicate, input, ..
+ }) = &plan
+ else {
+ return Ok(Transformed::no(plan));
+ };
+ if !is_spatial_predicate(predicate) {
+ return Ok(Transformed::no(plan));
+ }
+
+ let LogicalPlan::Join(Join {
+ ref left,
+ ref right,
+ ref on,
+ ref filter,
+ join_type,
+ ref join_constraint,
+ ref null_equality,
+ ..
+ }) = input.as_ref()
+ else {
+ return Ok(Transformed::no(plan));
+ };
+
+ // Check if this is a suitable join for rewriting
+ if !matches!(
+ join_type,
+ JoinType::Inner | JoinType::Left | JoinType::Right
+ ) || !on.is_empty()
+ || filter.is_some()
+ {
+ return Ok(Transformed::no(plan));
+ }
+
+ let rewritten_plan = Join::try_new(
+ Arc::clone(left),
+ Arc::clone(right),
+ on.clone(),
+ Some(predicate.clone()),
+ JoinType::Inner,
+ *join_constraint,
+ *null_equality,
+ )?;
+
+ Ok(Transformed::yes(LogicalPlan::Join(rewritten_plan)))
+ }
+}
+
+/// Check if a given logical expression contains a spatial predicate component
or not. We assume that the given
+/// `expr` evaluates to a boolean value and originates from a filter logical
node.
+fn is_spatial_predicate(expr: &Expr) -> bool {
+ fn is_distance_expr(expr: &Expr) -> bool {
+ let Expr::ScalarFunction(datafusion_expr::expr::ScalarFunction { func,
.. }) = expr else {
+ return false;
+ };
+ func.name().to_lowercase() == "st_distance"
+ }
+
+ match expr {
+ Expr::BinaryExpr(datafusion_expr::expr::BinaryExpr {
+ left, right, op, ..
+ }) => match op {
+ Operator::And => is_spatial_predicate(left) ||
is_spatial_predicate(right),
+ Operator::Lt | Operator::LtEq => is_distance_expr(left),
+ Operator::Gt | Operator::GtEq => is_distance_expr(right),
+ _ => false,
+ },
+ Expr::ScalarFunction(datafusion_expr::expr::ScalarFunction { func, ..
}) => {
+ let func_name = func.name().to_lowercase();
+ func_name.starts_with("st_")
Review Comment:
Should this be more specific about which st_ function will return `true`
here? (e.g., `ST_IsEmpty()` might make it through because it's a predicate, but
does this optimization apply if that's the case?)
##########
rust/sedona-spatial-join/src/exec.rs:
##########
@@ -1028,6 +1028,16 @@ mod tests {
Ok(())
}
+ #[tokio::test]
+ async fn test_query_window_in_subquery() -> Result<()> {
+ let ((left_schema, left_partitions), (right_schema, right_partitions))
=
+ create_test_data_with_size_range((50.0, 60.0), WKB_GEOMETRY)?;
+ let options = SpatialJoinOptions::default();
+ test_spatial_join_query(&left_schema, &right_schema,
left_partitions.clone(), right_partitions.clone(), &options, 10,
+ "SELECT id FROM L WHERE ST_Intersects(L.geometry, (SELECT
R.geometry FROM R WHERE R.id = 1))").await?;
Review Comment:
This may also benefit from one more more cases where there's a subquery that
should *not* be optimized or that should error. (Or perhaps that would be a
better fit for the Python test above).
##########
python/sedonadb/tests/test_sjoin.py:
##########
@@ -140,3 +140,51 @@ def test_spatial_join_geography(join_type, on):
sedonadb_results = eng_sedonadb.execute_and_collect(sql).to_pandas()
eng_postgis.assert_query_result(sql, sedonadb_results)
+
+
+def test_query_window_in_subquery():
+ with (
+ SedonaDB.create_or_skip() as eng_sedonadb,
+ PostGIS.create_or_skip() as eng_postgis,
+ ):
+ options = json.dumps(
+ {
+ "geom_type": "Point",
+ "seed": 42,
+ }
+ )
Review Comment:
I'll follow up with a PR shortly with a shortcut for this since we're using
it a lot now!
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]