This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new e6b8a1a99c [SEDONA-742] Push down ST_DistanceSphere filter (#2238)
e6b8a1a99c is described below
commit e6b8a1a99c86620cdee60fdffd381fabe1ef532d
Author: Pranav Toggi <[email protected]>
AuthorDate: Tue Aug 5 20:42:09 2025 -0700
[SEDONA-742] Push down ST_DistanceSphere filter (#2238)
---
.../org/apache/sedona/common/sphere/Haversine.java | 8 +-
.../SpatialFilterPushDownForGeoParquet.scala | 98 +++++++++++++++-------
.../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
.../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
.../sql/GeoParquetSpatialFilterPushDownSuite.scala | 35 ++++++++
5 files changed, 177 insertions(+), 34 deletions(-)
diff --git
a/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
b/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
index 3e4b77a63c..698ef36d14 100644
--- a/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
+++ b/common/src/main/java/org/apache/sedona/common/sphere/Haversine.java
@@ -38,7 +38,9 @@ public class Haversine {
* @param geom2 The second geometry. Each coordinate is in lon/lat order
* @return
*/
- public static double distance(Geometry geom1, Geometry geom2, double
AVG_EARTH_RADIUS) {
+ public static final double AVG_EARTH_RADIUS = 6371008.0;
+
+ public static double distance(Geometry geom1, Geometry geom2, double
avg_earth_radius) {
Coordinate coordinate1 =
geom1.getGeometryType().equals("Point")
? geom1.getCoordinate()
@@ -61,13 +63,13 @@ public class Haversine {
* sin(lngDistance / 2)
* sin(lngDistance / 2);
double c = 2 * atan2(sqrt(a), sqrt(1 - a));
- return AVG_EARTH_RADIUS * c * 1.0;
+ return avg_earth_radius * c * 1.0;
}
// Calculate the distance between two points on the earth using the
"haversine" formula.
// The radius of the earth is 6371.0 km
public static double distance(Geometry geom1, Geometry geom2) {
- return distance(geom1, geom2, 6371008.0);
+ return distance(geom1, geom2, AVG_EARTH_RADIUS);
}
/**
diff --git
a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
index b18a74831a..a287343ddf 100644
---
a/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
+++
b/spark/common/src/main/scala/org/apache/spark/sql/sedona_sql/optimization/SpatialFilterPushDownForGeoParquet.scala
@@ -18,7 +18,7 @@
*/
package org.apache.spark.sql.sedona_sql.optimization
-import org.apache.sedona.common.geometryObjects.Circle
+import org.apache.sedona.common.sphere.Haversine
import org.apache.sedona.core.spatialOperator.SpatialPredicate
import org.apache.sedona.sql.utils.GeometrySerializer
import org.apache.spark.sql.SparkSession
@@ -45,7 +45,7 @@ import
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilte
import
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter.LeafFilter
import
org.apache.spark.sql.execution.datasources.parquet.GeoParquetSpatialFilter.OrFilter
import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer,
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance,
ST_DistanceSpheroid, ST_Equals, ST_Intersects, ST_OrderingEquals, ST_Overlaps,
ST_Touches, ST_Within}
+import org.apache.spark.sql.sedona_sql.expressions.{ST_AsEWKT, ST_Buffer,
ST_Contains, ST_CoveredBy, ST_Covers, ST_Crosses, ST_DWithin, ST_Distance,
ST_DistanceSphere, ST_DistanceSpheroid, ST_Equals, ST_Intersects,
ST_OrderingEquals, ST_Overlaps, ST_Touches, ST_Within}
import
org.apache.spark.sql.sedona_sql.optimization.ExpressionUtils.splitConjunctivePredicates
import org.apache.spark.sql.types.DoubleType
import org.locationtech.jts.geom.Geometry
@@ -158,7 +158,7 @@ class SpatialFilterPushDownForGeoParquet(sparkSession:
SparkSession) extends Rul
name,
GeometryUDT.deserialize(value),
d.asInstanceOf[Double],
- useSpheroid = true)
+ distanceType = "spheroid")
case LessThanOrEqual(ST_DistanceSpheroid(distArgs), Literal(d,
DoubleType)) =>
for ((name, value) <- resolveNameAndLiteral(distArgs, pushableColumn))
@@ -166,21 +166,47 @@ class SpatialFilterPushDownForGeoParquet(sparkSession:
SparkSession) extends Rul
name,
GeometryUDT.deserialize(value),
d.asInstanceOf[Double],
- useSpheroid = true)
+ distanceType = "spheroid")
+
+ case LessThan(ST_DistanceSphere(distArgs), Literal(d, DoubleType)) =>
+ val radiusOpt = distArgs.lift(2).collect {
+ case Literal(customRadius: Double, DoubleType) => customRadius
+ }
+
+ resolveNameAndLiteral(distArgs.take(2), pushableColumn).map { case
(name, value) =>
+ distanceFilter(
+ name,
+ GeometryUDT.deserialize(value),
+ d.asInstanceOf[Double],
+ distanceType = "sphere",
+ sphereRadiusOverride = radiusOpt)
+ }
+
+ case LessThanOrEqual(ST_DistanceSphere(distArgs), Literal(d,
DoubleType)) =>
+ val radiusOpt = distArgs.lift(2).collect {
+ case Literal(customRadius: Double, DoubleType) => customRadius
+ }
+
+ resolveNameAndLiteral(distArgs.take(2), pushableColumn).map { case
(name, value) =>
+ distanceFilter(
+ name,
+ GeometryUDT.deserialize(value),
+ d.asInstanceOf[Double],
+ distanceType = "sphere",
+ sphereRadiusOverride = radiusOpt)
+ }
case ST_DWithin(args) if args.length == 3 || args.length == 4 =>
val distanceLit = args(2)
- val useSpheroid = if (args.length == 4) {
- args(3) match {
- case Literal(flag: Boolean, _) => flag
- case _ => false
- }
- } else false
+ val distanceType = args.lift(3) match {
+ case Some(Literal(flag: Boolean, _)) => if (flag) "spheroid" else
"planar"
+ case _ => "planar"
+ }
distanceLit match {
case Literal(distance: Double, DoubleType) =>
resolveNameAndLiteral(args.take(2), pushableColumn).map { case
(name, value) =>
- distanceFilter(name, GeometryUDT.deserialize(value), distance,
useSpheroid)
+ distanceFilter(name, GeometryUDT.deserialize(value), distance,
distanceType)
}
case _ => None
}
@@ -192,26 +218,36 @@ class SpatialFilterPushDownForGeoParquet(sparkSession:
SparkSession) extends Rul
name: String,
geom: Geometry,
distance: Double,
- useSpheroid: Boolean = false): GeoParquetSpatialFilter = {
- val queryWindow: Geometry = if (useSpheroid) {
- // Spheroidal buffer
- // Increase buffer distance by 3% to account for false negatives with
Spheroidal Buffer calculations
- val distanceLit = Literal(distance * 1.03)
- val spheroidLit = Literal(true)
- val geomLit = Literal.create(GeometrySerializer.serialize(geom), new
GeometryUDT())
-
- val bufferGeometry = {
- val bufferExpr = ST_Buffer(
- scala.collection.immutable.Seq(geomLit, distanceLit, spheroidLit))
- val wkb = bufferExpr.eval().asInstanceOf[Array[Byte]]
- GeometrySerializer.deserialize(wkb)
- }
- bufferGeometry
- } else {
- // Euclidean distance
- val envelope = geom.getEnvelopeInternal
- envelope.expandBy(distance)
- geom.getFactory.toGeometry(envelope)
+ distanceType: String = "planar",
+ sphereRadiusOverride: Option[Double] = None): GeoParquetSpatialFilter = {
+ val queryWindow: Geometry = distanceType match {
+ case "spheroid" =>
+ // Spheroidal buffer
+ // Increase buffer distance by 3% to account for false negatives with
Spheroidal Buffer calculations
+ val distanceLit = Literal(distance * 1.03)
+ val spheroidLit = Literal(true)
+ val geomLit = Literal.create(GeometrySerializer.serialize(geom), new
GeometryUDT())
+
+ val bufferGeometry = {
+ val bufferExpr = ST_Buffer(
+ scala.collection.immutable.Seq(geomLit, distanceLit, spheroidLit))
+ val wkb = bufferExpr.eval().asInstanceOf[Array[Byte]]
+ GeometrySerializer.deserialize(wkb)
+ }
+ bufferGeometry
+
+ case "sphere" =>
+ // The Haversine expandEnvelope already conservatively expands
envelope by 10% to avoid false negatives
+ val radius = sphereRadiusOverride.getOrElse(Haversine.AVG_EARTH_RADIUS)
+ val expandedEnvelope =
+ Haversine.expandEnvelope(geom.getEnvelopeInternal, distance, radius)
+ geom.getFactory.toGeometry(expandedEnvelope)
+
+ case _ =>
+ // Euclidean distance
+ val envelope = geom.getEnvelopeInternal
+ envelope.expandBy(distance)
+ geom.getFactory.toGeometry(envelope)
}
LeafFilter(unquote(name), SpatialPredicate.INTERSECTS, queryWindow)
}
diff --git
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
---
a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++
b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
Seq(1))
}
+
+ it(s"Push down ST_DistanceSphere $op d") {
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 100", Seq.empty)
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 500", Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op
808691.391",
+ Seq(0, 1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op
7100000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op
500000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 2",
+ Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
+ Seq(1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000)
$op 808691.391",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000)
$op 400000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))'), 7000000) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)'), 6000000) $op 500000",
+ Seq(1))
+ }
}
it("Push down And(filters...)") {
diff --git
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
---
a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++
b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
Seq(1))
}
+
+ it(s"Push down ST_DistanceSphere $op d") {
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 100", Seq.empty)
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 500", Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op
808691.391",
+ Seq(0, 1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op
7100000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op
500000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 2",
+ Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
+ Seq(1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000)
$op 808691.391",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000)
$op 400000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))'), 7000000) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)'), 6000000) $op 500000",
+ Seq(1))
+ }
}
it("Push down And(filters...)") {
diff --git
a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
index 7e8e900b2b..c38b1730c8 100644
---
a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
+++
b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoParquetSpatialFilterPushDownSuite.scala
@@ -229,6 +229,41 @@ class GeoParquetSpatialFilterPushDownSuite extends
TestBaseScala with TableDrive
s"ST_DistanceSpheroid(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
Seq(1))
}
+
+ it(s"Push down ST_DistanceSphere $op d") {
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 100", Seq.empty)
+ testFilter(s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)'))
$op 500", Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)')) $op
808691.391",
+ Seq(0, 1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (0 0)')) $op
7100000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)')) $op
500000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 2",
+ Seq.empty)
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))')) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)')) $op 500000",
+ Seq(1))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (3 4)'), 5000000)
$op 808691.391",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POINT (-5 -5)'), 5000000)
$op 400000",
+ Seq(2))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('POLYGON ((-1 -1, 1 -1, 1
1, -1 1, -1 -1))'), 7000000) $op 500000",
+ Seq(0, 1, 2, 3))
+ testFilter(
+ s"ST_DistanceSphere(geom, ST_GeomFromText('LINESTRING (17 17, 18
18)'), 6000000) $op 500000",
+ Seq(1))
+ }
}
it("Push down And(filters...)") {