This is an automated email from the ASF dual-hosted git repository.
jiayu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/sedona.git
The following commit(s) were added to refs/heads/master by this push:
new 4b2b8a6889 [GH-2337] Fix the DateTimeParseException in GeoPackage reader (#2339)
4b2b8a6889 is described below
commit 4b2b8a68897a22aa929f5a65c8be94c49ee7c6dd
Author: Jia Yu <[email protected]>
AuthorDate: Sun Sep 7 00:43:52 2025 -0700
[GH-2337] Fix the DateTimeParseException in GeoPackage reader (#2339)
---
.../transform/DataTypesTransformations.scala | 41 +++++++++++++---
.../resources/geopackage/test_datetime_issue.gpkg | Bin 0 -> 53248 bytes
.../apache/sedona/sql/GeoPackageReaderTest.scala | 52 +++++++++++++++++++--
.../apache/sedona/sql/GeoPackageReaderTest.scala | 44 +++++++++++++++++
.../apache/sedona/sql/GeoPackageReaderTest.scala | 44 +++++++++++++++++
5 files changed, 170 insertions(+), 11 deletions(-)
diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
index 9a23f0a088..c0e532b08a 100644
--- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
+++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
@@ -18,22 +18,49 @@
*/
package org.apache.sedona.sql.datasources.geopackage.transform
-import java.time.{Instant, LocalDate}
+import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset}
import java.time.format.DateTimeFormatter
+import java.time.format.DateTimeParseException
import java.time.temporal.ChronoUnit
object DataTypesTransformations {
- def getDays(dateString: String): Int = {
- val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
-
- val date = LocalDate.parse(dateString, formatter)
+ // Pre-created formatters to avoid repeated object creation
+ private val dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd")
+ private val datetimeFormatters = Array(
+ DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SSS"), // 3 digits
+ DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.SS"), // 2 digits
+ DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss.S"), // 1 digit
+ DateTimeFormatter.ofPattern("yyyy-MM-dd'T'HH:mm:ss") // no milliseconds
+ )
+ def getDays(dateString: String): Int = {
+ val date = LocalDate.parse(dateString, dateFormatter)
val epochDate = LocalDate.of(1970, 1, 1)
-
ChronoUnit.DAYS.between(epochDate, date).toInt
}
def epoch(timestampStr: String): Long = {
- Instant.parse(timestampStr).toEpochMilli
+ try {
+ // Try parsing as-is first (works for timestamps with timezone info)
+ Instant.parse(timestampStr).toEpochMilli
+ } catch {
+ case _: DateTimeParseException =>
+ // If parsing fails, try treating it as UTC (common case for GeoPackage)
+ // Handle various datetime formats without timezone info
+ // Try different patterns to handle various millisecond formats
+ for (formatter <- datetimeFormatters) {
+ try {
+ val localDateTime = LocalDateTime.parse(timestampStr, formatter)
+ return localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli
+ } catch {
+ case _: DateTimeParseException =>
+ // Continue to next formatter
+ }
+ }
+
+ // If all formatters failed, throw a descriptive exception
+ throw new IllegalArgumentException(s"Unable to parse datetime: $timestampStr. " +
+ s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]' or 'yyyy-MM-ddTHH:mm:ss[.SSS|.SS|.S]Z'")
+ }
}
}
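For readers skimming the diff: the new epoch() still defers to Instant.parse first, so only timestamps without a zone designator go through the LocalDateTime fallback, which pins them to UTC. A minimal usage sketch of the behavior this enables (the input values below are illustrative, not taken from the commit):

    // import org.apache.sedona.sql.datasources.geopackage.transform.DataTypesTransformations

    // Has a zone designator: parsed directly by Instant.parse
    DataTypesTransformations.epoch("2024-01-15T10:30:00Z")    // 1705314600000L
    // No zone info: matched by a fallback formatter and treated as UTC
    DataTypesTransformations.epoch("2024-01-15T10:30:00.123") // 1705314600123L
    // Matches none of the patterns: throws IllegalArgumentException
    DataTypesTransformations.epoch("15/01/2024 10:30")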
diff --git a/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg b/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg
new file mode 100644
index 0000000000..f53a11fe08
Binary files /dev/null and b/spark/common/src/test/resources/geopackage/test_datetime_issue.gpkg differ
diff --git a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index ee9931cbf4..1e4b071361 100644
--- a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -18,11 +18,11 @@
*/
package org.apache.sedona.sql
-import io.minio.{MakeBucketArgs, MinioClient, PutObjectArgs}
-import org.apache.spark.sql.{DataFrame, SparkSession}
+import io.minio.{MakeBucketArgs, MinioClient}
+import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.expr
import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT
-import org.apache.spark.sql.types.{BinaryType, BooleanType, DateType, DoubleType, IntegerType, StringType, StructField, StructType, TimestampType}
+import org.apache.spark.sql.types._
import org.scalatest.matchers.should.Matchers
import org.scalatest.prop.TableDrivenPropertyChecks._
import org.testcontainers.containers.MinIOContainer
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
df.count() shouldEqual expectedCount
}
}
+
+ it("should handle datetime fields without timezone information") {
+ // This test verifies the fix for DateTimeParseException when reading
+ // GeoPackage files with datetime fields that don't include timezone info
+ val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+ // Test reading the test_features table with problematic datetime formats
+ val df = sparkSession.read
+ .format("geopackage")
+ .option("tableName", "test_features")
+ .load(testFilePath)
+
+ // The test should not throw DateTimeParseException when reading datetime fields
+ noException should be thrownBy {
+ df.select("created_at", "updated_at").collect()
+ }
+
+ // Verify that datetime fields are properly parsed as TimestampType
+ df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+ df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+ // Verify that we can read the datetime values
+ val datetimeValues = df.select("created_at", "updated_at").collect()
+ datetimeValues should not be empty
+
+ // Verify that datetime values are valid timestamps
+ datetimeValues.foreach { row =>
+ val createdTimestamp = row.getAs[Timestamp]("created_at")
+ val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+ createdTimestamp should not be null
+ updatedTimestamp should not be null
+ createdTimestamp.getTime should be > 0L
+ updatedTimestamp.getTime should be > 0L
+ }
+
+ // Test showMetadata option with the same file
+ noException should be thrownBy {
+ val metadataDf = sparkSession.read
+ .format("geopackage")
+ .option("showMetadata", "true")
+ .load(testFilePath)
+ metadataDf.select("last_change").collect()
+ }
+ }
}
describe("GeoPackage Raster Data Test") {
@@ -257,7 +301,7 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
.load(inputPath)
.count shouldEqual 34
- val df = sparkSessionMinio.read
+ val df = sparkSession.read
.format("geopackage")
.option("tableName", "point1")
.load(inputPath)
diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
df.count() shouldEqual expectedCount
}
}
+
+ it("should handle datetime fields without timezone information") {
+ // This test verifies the fix for DateTimeParseException when reading
+ // GeoPackage files with datetime fields that don't include timezone info
+ val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+ // Test reading the test_features table with problematic datetime formats
+ val df = sparkSession.read
+ .format("geopackage")
+ .option("tableName", "test_features")
+ .load(testFilePath)
+
+ // The test should not throw DateTimeParseException when reading datetime fields
+ noException should be thrownBy {
+ df.select("created_at", "updated_at").collect()
+ }
+
+ // Verify that datetime fields are properly parsed as TimestampType
+ df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+ df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+ // Verify that we can read the datetime values
+ val datetimeValues = df.select("created_at", "updated_at").collect()
+ datetimeValues should not be empty
+
+ // Verify that datetime values are valid timestamps
+ datetimeValues.foreach { row =>
+ val createdTimestamp = row.getAs[Timestamp]("created_at")
+ val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+ createdTimestamp should not be null
+ updatedTimestamp should not be null
+ createdTimestamp.getTime should be > 0L
+ updatedTimestamp.getTime should be > 0L
+ }
+
+ // Test showMetadata option with the same file
+ noException should be thrownBy {
+ val metadataDf = sparkSession.read
+ .format("geopackage")
+ .option("showMetadata", "true")
+ .load(testFilePath)
+ metadataDf.select("last_change").collect()
+ }
+ }
}
describe("GeoPackage Raster Data Test") {
diff --git a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
df.count() shouldEqual expectedCount
}
}
+
+ it("should handle datetime fields without timezone information") {
+ // This test verifies the fix for DateTimeParseException when reading
+ // GeoPackage files with datetime fields that don't include timezone info
+ val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+ // Test reading the test_features table with problematic datetime formats
+ val df = sparkSession.read
+ .format("geopackage")
+ .option("tableName", "test_features")
+ .load(testFilePath)
+
+ // The test should not throw DateTimeParseException when reading datetime fields
+ noException should be thrownBy {
+ df.select("created_at", "updated_at").collect()
+ }
+
+ // Verify that datetime fields are properly parsed as TimestampType
+ df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+ df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+ // Verify that we can read the datetime values
+ val datetimeValues = df.select("created_at", "updated_at").collect()
+ datetimeValues should not be empty
+
+ // Verify that datetime values are valid timestamps
+ datetimeValues.foreach { row =>
+ val createdTimestamp = row.getAs[Timestamp]("created_at")
+ val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+ createdTimestamp should not be null
+ updatedTimestamp should not be null
+ createdTimestamp.getTime should be > 0L
+ updatedTimestamp.getTime should be > 0L
+ }
+
+ // Test showMetadata option with the same file
+ noException should be thrownBy {
+ val metadataDf = sparkSession.read
+ .format("geopackage")
+ .option("showMetadata", "true")
+ .load(testFilePath)
+ metadataDf.select("last_change").collect()
+ }
+ }
}
describe("GeoPackage Raster Data Test") {