This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git


The following commit(s) were added to refs/heads/master by this push:
     new e6b8a1a99c [SEDONA-742] Push down ST_DistanceSphere filter (#2238)
e6b8a1a99c is described below

commit e6b8a1a99c86620cdee60fdffd381fabe1ef532d
Author: Pranav Toggi <[email protected]>
AuthorDate: Tue Aug 5 20:42:09 2025 -0700

    [SEDONA-742] Push down ST_DistanceSphere filter (#2238)
---
 .../org/apache/sedona/common/sphere/Haversine.java |  8 +-
 .../SpatialFilterPushDownForGeoParquet.scala       | 98 +++++++++++++++-------
 .../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
 .../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
 .../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
 5 files changed, 177 insertions(+), 34 deletions(-)

diff --git a/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java b/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
index 3e4b77a63c..698ef36d14 100644
--- a/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
+++ b/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
@@ -38,7 +38,9 @@ public class Haversine {
    * @param geom2 The second geometry. Each coordinate is in lon/lat order
    * @return
    */
-  public static double distance(Geometry geom1, Geometry geom2, double AVG_EARTH_RADIUS) {
+  public static final double AVG_EARTH_RADIUS = 6371008.0;
+
+  public static double distance(Geometry geom1, Geometry geom2, double avg_earth_radius) {
     Coordinate coordinate1 =
         geom1.getGeometryType().equals("Point")
             ? geom1.getCoordinate()
@@ -61,13 +63,13 @@ public class Haversine {
                 * sin(lngDistance / 2)
                 * sin(lngDistance / 2);
     double c = 2 * atan2(sqrt(a), sqrt(1 - a));
-    return AVG_EARTH_RADIUS * c * 1.0;
+    return avg_earth_radius * c * 1.0;
   }
 
   // Calculate the distance between two points on the earth using the "haversine" formula.
   // The radius of the earth is 6371.0 km
   public static double distance(Geometry geom1, Geometry geom2) {
-    return distance(geom1, geom2, 6371008.0);
+    return distance(geom1, geom2, AVG_EARTH_RADIUS);
   }
 
   /**
diff --git a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
index b18a74831a..a287343ddf 100644
--- a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
+++ b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
@@ -18,7 +18,7 @@
  */
 package org.apache.spark.sql.sedona_sql.optimization
 
-import org.apache.sedona.common.geometryObjects.Circle
+import org.apache.sedona.common.sphere.Haversine
 import org.apache.sedona.core.spatialOperator.SpatialPredicate
 import org.apache.sedona.sql.utils.GeometrySerializer
 import org.apache.spark.sql.SparkSession
@@ -45,7 +45,7 @@ import 
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilte
 import 
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter.LeafFilter
 import 
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter.OrFilter
 import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer, 
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance, 
ST_DistanceSpheroid, ST_Equals, ST_Intersects, ST_OrderingEquals, ST_Overlaps, 
ST_Touches, ST_Within}
+import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer, 
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance, 
ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, ST_Intersects, 
ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
 import 
org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
 import org.apache.spark.sql.types.DoubleType
 import org.locationtech.jts.geom.Geometry
@@ -158,7 +158,7 @@ class SpatialFilterPushDownForGeoParquet(sparkSession: 
SparkSession) extends Rul
             name,
             GeometryUDT.deserialize(value),
             d.asInstanceOf[Double],
-            useSpheroid = true)
+            distanceType = "spheroid")
 
       case LessThanOrEqual(ST_DistanceSpheroid(distArgs), Literal(d, 
DoubleType)) =>
         for ((name, value) <- resolveNameAndLiteral(distArgs, pushableColumn))
@@ -166,21 +166,47 @@ class SpatialFilterPushDownForGeoParquet(sparkSession: 
SparkSession) extends Rul
             name,
             GeometryUDT.deserialize(value),
             d.asInstanceOf[Double],
-            useSpheroid = true)
+            distanceType = "spheroid")
+
+      case LessThan(ST_DistanceSphere(distArgs), Literal(d, DoubleType)) =>
+        val radiusOpt = distArgs.lift(2).collect {
+          case Literal(customRadius: Double, DoubleType) => customRadius
+        }
+
+        resolveNameAndLiteral(distArgs.take(2), pushableColumn).map { case (name, value) =>
+          distanceFilter(
+            name,
+            GeometryUDT.deserialize(value),
+            d.asInstanceOf[Double],
+            distanceType = "sphere",
+            sphereRadiusOverride = radiusOpt)
+        }
+
+      case LessThanOrEqual(ST_DistanceSphere(distArgs), Literal(d, DoubleType)) =>
+        val radiusOpt = distArgs.lift(2).collect {
+          case Literal(customRadius: Double, DoubleType) => customRadius
+        }
+
+        resolveNameAndLiteral(distArgs.take(2), pushableColumn).map { case 
(name, value) =>
+          distanceFilter(
+            name,
+            GeometryUDT.deserialize(value),
+            d.asInstanceOf[Double],
+            distanceType = "sphere",
+            sphereRadiusOverride = radiusOpt)
+        }
 
       case ST_DWithin(args) if args.length == 3 || args.length == 4 =>
         val distanceLit = args(2)
-        val useSpheroid = if (args.length == 4) {
-          args(3) match {
-            case Literal(flag: Boolean, _) => flag
-            case _ => false
-          }
-        } else false
+        val distanceType = args.lift(3) match {
+          case Some(Literal(flag: Boolean, _)) => if (flag) "spheroid" else "planar"
+          case _ => "planar"
+        }
 
         distanceLit match {
           case Literal(distance: Double, DoubleType) =>
             resolveNameAndLiteral(args.take(2), pushableColumn).map { case 
(name, value) =>
-              distanceFilter(name, GeometryUDT.deserialize(value), distance, 
useSpheroid)
+              distanceFilter(name, GeometryUDT.deserialize(value), distance, 
distanceType)
             }
           case _ => None
         }
@@ -192,26 +218,36 @@ class SpatialFilterPushDownForGeoParquet(sparkSession: 
SparkSession) extends Rul
       name: String,
       geom: Geometry,
       distance: Double,
-      useSpheroid: Boolean = false): GeoParquetSpatialFilter = {
-    val queryWindow: Geometry = if (useSpheroid) {
-      // Spheroidal buffer
-      // Increase buffer distance by 3% to account for false negatives with 
Spheroidal Buffer calculations
-      val distanceLit = Literal(distance * 1.03)
-      val spheroidLit = Literal(true)
-      val geomLit = Literal.create(GeometrySerializer.serialize(geom), new 
GeometryUDT())
-
-      val bufferGeometry = {
-        val bufferExpr = ST_Buffer(
-          scala.collection.immutable.Seq(geomLit, distanceLit, spheroidLit))
-        val wkb = bufferExpr.eval().asInstanceOf[Array[Byte]]
-        GeometrySerializer.deserialize(wkb)
-      }
-      bufferGeometry
-    } else {
-      // Euclidean distance
-      val envelope = geom.getEnvelopeInternal
-      envelope.expandBy(distance)
-      geom.getFactory.toGeometry(envelope)
+      distanceType: String = "planar",
+      sphereRadiusOverride: Option[Double] = None): GeoParquetSpatialFilter = {
+    val queryWindow: Geometry = distanceType match {
+      case "spheroid" =>
+        // Spheroidal buffer
+        // Increase buffer distance by 3% to account for false negatives with Spheroidal Buffer calculations
+        val distanceLit = Literal(distance * 1.03)
+        val spheroidLit = Literal(true)
+        val geomLit = Literal.create(GeometrySerializer.serialize(geom), new 
GeometryUDT())
+
+        val bufferGeometry = {
+          val bufferExpr = ST_Buffer(
+            scala.collection.immutable.Seq(geomLit, distanceLit, spheroidLit))
+          val wkb = bufferExpr.eval().asInstanceOf[Array[Byte]]
+          GeometrySerializer.deserialize(wkb)
+        }
+        bufferGeometry
+
+      case "sphere" =>
+        // The Haversine expandEnvelope already conservatively expands envelope by 10% to avoid false negatives
+        val radius = sphereRadiusOverride.getOrElse(Haversine.AVG_EARTH_RADIUS)
+        val expandedEnvelope =
+          Haversine.expandEnvelope(geom.getEnvelopeInternal, distance, radius)
+        geom.getFactory.toGeometry(expandedEnvelope)
+
+      case _ =>
+        // Euclidean distance
+        val envelope = geom.getEnvelopeInternal
+        envelope.expandBy(distance)
+        geom.getFactory.toGeometry(envelope)
     }
     LeafFilter(unquote(name), SpatialPredicate.INTERSECTS, queryWindow)
   }
diff --git 
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
 
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
--- 
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++ 
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
           s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
           Seq(1))
       }
+
+      it(s"Push down ST_DistanceSphere $op d") {
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 100", Seq.empty)
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 500", Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op 
808691.391",
+          Seq(0, 1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op 
7100000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op 
500000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 2",
+          Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
+          Seq(1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000) 
$op 808691.391",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000) 
$op 400000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))'), 7000000) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)'), 6000000) $op 500000",
+          Seq(1))
+      }
     }
 
     it("Push down And(filters...)") {
diff --git 
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
 
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
--- 
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++ 
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
           s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
           Seq(1))
       }
+
+      it(s"Push down ST_DistanceSphere $op d") {
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 100", Seq.empty)
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 500", Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op 
808691.391",
+          Seq(0, 1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op 
7100000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op 
500000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 2",
+          Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
+          Seq(1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000) 
$op 808691.391",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000) 
$op 400000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))'), 7000000) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)'), 6000000) $op 500000",
+          Seq(1))
+      }
     }
 
     it("Push down And(filters...)") {
diff --git 
a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
 
b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
--- 
a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++ 
b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends 
TestBaseScala with TableDrive
           s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
           Seq(1))
       }
+
+      it(s"Push down ST_DistanceSphere $op d") {
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 100", Seq.empty)
+        testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) 
$op 500", Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op 
808691.391",
+          Seq(0, 1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op 
7100000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op 
500000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 2",
+          Seq.empty)
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))')) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)')) $op 500000",
+          Seq(1))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000) 
$op 808691.391",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000) 
$op 400000",
+          Seq(2))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1 
1, -1 1, -1 -1))'), 7000000) $op 500000",
+          Seq(0, 1, 2, 3))
+        testFilter(
+          s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18 
18)'), 6000000) $op 500000",
+          Seq(1))
+      }
     }
 
     it("Push down And(filters...)") {

Reply via email to