This is an automated email from the ASF dual-hosted git repository.

jiayu pushed a commit to branch geopackage-bug
in repository https://gitbox.apache.org/repos/asf/sedona.git
commit a6362240e29899d57188f456be47989f2b113eda
Author: Jia Yu <[email protected]>
AuthorDate: Sat Sep 6 23:43:48 2025 -0700

    Fix the DateTimeParseException
---
 .../transform/DataTypesTransformations.scala       | 47 +++++++++++++++++++-
 .../apache/sedona/sql/GeoPackageReaderTest.scala   | 51 ++++++++++++++++++++--
 .../apache/sedona/sql/GeoPackageReaderTest.scala   | 44 +++++++++++++++++++
 .../apache/sedona/sql/GeoPackageReaderTest.scala   | 44 +++++++++++++++++++
 4 files changed, 181 insertions(+), 5 deletions(-)

diff --git a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
index 9a23f0a088..2207194157 100644
--- a/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
+++ b/spark/common/src/main/scala/org/apache/sedona/sql/datasources/geopackage/transform/DataTypesTransformations.scala
@@ -18,8 +18,9 @@
  */
 package org.apache.sedona.sql.datasources.geopackage.transform
 
-import java.time.{Instant, LocalDate}
+import java.time.{Instant, LocalDate, LocalDateTime, ZoneOffset}
 import java.time.format.DateTimeFormatter
+import java.time.format.DateTimeParseException
 import java.time.temporal.ChronoUnit
 
 object DataTypesTransformations {
@@ -34,6 +35,48 @@ object DataTypesTransformations {
   }
 
   def epoch(timestampStr: String): Long = {
-    Instant.parse(timestampStr).toEpochMilli
+    try {
+      // Try parsing as-is first (works for timestamps with timezone info)
+      Instant.parse(timestampStr).toEpochMilli
+    } catch {
+      case _: DateTimeParseException =>
+        // If parsing fails, try treating it as UTC (common case for GeoPackage)
+        try {
+          // Handle various datetime formats without timezone info
+          // Try different patterns to handle various millisecond formats
+          val patterns = Array(
+            "yyyy-MM-dd'T'HH:mm:ss.SSS", // 3 digits
+            "yyyy-MM-dd'T'HH:mm:ss.SS", // 2 digits
+            "yyyy-MM-dd'T'HH:mm:ss.S", // 1 digit
+            "yyyy-MM-dd'T'HH:mm:ss" // no milliseconds
+          )
+
+          var localDateTime: LocalDateTime = null
+          var lastException: DateTimeParseException = null
+
+          for (pattern <- patterns) {
+            try {
+              val formatter = DateTimeFormatter.ofPattern(pattern)
+              localDateTime = LocalDateTime.parse(timestampStr, formatter)
+              lastException = null
+            } catch {
+              case e: DateTimeParseException =>
+                lastException = e
+            }
+          }
+
+          if (localDateTime != null) {
+            localDateTime.toInstant(ZoneOffset.UTC).toEpochMilli
+          } else {
+            throw lastException
+          }
+        } catch {
+          case e: DateTimeParseException =>
+            throw new IllegalArgumentException(
+              s"Unable to parse datetime: $timestampStr. " +
+                s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'",
+              e)
+        }
+    }
   }
 }
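Note: as a rough sketch (not part of this commit), the zone-less fallback above could equivalently be built as a single java.time formatter whose fractional-seconds field is optional, avoiding the pattern loop; names here are illustrative:

    import java.time.{LocalDateTime, ZoneOffset}
    import java.time.format.DateTimeFormatterBuilder
    import java.time.temporal.ChronoField

    // One formatter that accepts 0 to 9 fractional digits (the '.' included
    // only when digits are present), covering all four patterns above.
    val flexible = new DateTimeFormatterBuilder()
      .appendPattern("yyyy-MM-dd'T'HH:mm:ss")
      .appendFraction(ChronoField.NANO_OF_SECOND, 0, 9, true)
      .toFormatter()

    // Interpret a zone-less timestamp as UTC and convert to epoch millis.
    def epochUtc(s: String): Long =
      LocalDateTime.parse(s, flexible).toInstant(ZoneOffset.UTC).toEpochMilli

With this, both "2025-09-06T23:43:48" and "2025-09-06T23:43:48.500" parse cleanly, whereas Instant.parse alone rejects any string without zone information.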
" + + s"Expected formats: 'yyyy-MM-ddTHH:mm:ss[.S]' or 'yyyy-MM-ddTHH:mm:ss[.S]Z'", + e) + } + } } } diff --git a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala index ee9931cbf4..61b78e2c56 100644 --- a/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala +++ b/spark/spark-3.4/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala @@ -18,11 +18,11 @@ */ package org.apache.sedona.sql -import io.minio.{MakeBucketArgs, MinioClient, PutObjectArgs} -import org.apache.spark.sql.{DataFrame, SparkSession} +import io.minio.{MakeBucketArgs, MinioClient} +import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.expr import org.apache.spark.sql.sedona_sql.UDT.GeometryUDT -import org.apache.spark.sql.types.{BinaryType, BooleanType, DateType, DoubleType, IntegerType, StringType, StructField, StructType, TimestampType} +import org.apache.spark.sql.types._ import org.scalatest.matchers.should.Matchers import org.scalatest.prop.TableDrivenPropertyChecks._ import org.testcontainers.containers.MinIOContainer @@ -38,6 +38,7 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers { val path: String = resourceFolder + "geopackage/example.gpkg" val polygonsPath: String = resourceFolder + "geopackage/features.gpkg" val rasterPath: String = resourceFolder + "geopackage/raster.gpkg" + val datetimePath: String = resourceFolder + "geopackage/test_datetime_issue.gpkg" val wktReader = new org.locationtech.jts.io.WKTReader() val wktWriter = new org.locationtech.jts.io.WKTWriter() @@ -168,6 +169,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers { df.count() shouldEqual expectedCount } } + + it("should handle datetime fields without timezone information") { + // This test verifies the fix for DateTimeParseException when reading + // GeoPackage files with datetime fields that don't include timezone info + val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg" + + // Test reading the test_features table with problematic datetime formats + val df = sparkSession.read + .format("geopackage") + .option("tableName", "test_features") + .load(testFilePath) + + // The test should not throw DateTimeParseException when reading datetime fields + noException should be thrownBy { + df.select("created_at", "updated_at").collect() + } + + // Verify that datetime fields are properly parsed as TimestampType + df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType + df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType + + // Verify that we can read the datetime values + val datetimeValues = df.select("created_at", "updated_at").collect() + datetimeValues should not be empty + + // Verify that datetime values are valid timestamps + datetimeValues.foreach { row => + val createdTimestamp = row.getAs[Timestamp]("created_at") + val updatedTimestamp = row.getAs[Timestamp]("updated_at") + createdTimestamp should not be null + updatedTimestamp should not be null + createdTimestamp.getTime should be > 0L + updatedTimestamp.getTime should be > 0L + } + + // Test showMetadata option with the same file + noException should be thrownBy { + val metadataDf = sparkSession.read + .format("geopackage") + .option("showMetadata", "true") + .load(testFilePath) + metadataDf.select("last_change").collect() + } + } } describe("GeoPackage Raster Data Test") { diff --git 
diff --git a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-3.5/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         df.count() shouldEqual expectedCount
       }
     }
+
+    it("should handle datetime fields without timezone information") {
+      // This test verifies the fix for DateTimeParseException when reading
+      // GeoPackage files with datetime fields that don't include timezone info
+      val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+      // Test reading the test_features table with problematic datetime formats
+      val df = sparkSession.read
+        .format("geopackage")
+        .option("tableName", "test_features")
+        .load(testFilePath)
+
+      // The test should not throw DateTimeParseException when reading datetime fields
+      noException should be thrownBy {
+        df.select("created_at", "updated_at").collect()
+      }
+
+      // Verify that datetime fields are properly parsed as TimestampType
+      df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+      df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+      // Verify that we can read the datetime values
+      val datetimeValues = df.select("created_at", "updated_at").collect()
+      datetimeValues should not be empty
+
+      // Verify that datetime values are valid timestamps
+      datetimeValues.foreach { row =>
+        val createdTimestamp = row.getAs[Timestamp]("created_at")
+        val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+        createdTimestamp should not be null
+        updatedTimestamp should not be null
+        createdTimestamp.getTime should be > 0L
+        updatedTimestamp.getTime should be > 0L
+      }
+
+      // Test showMetadata option with the same file
+      noException should be thrownBy {
+        val metadataDf = sparkSession.read
+          .format("geopackage")
+          .option("showMetadata", "true")
+          .load(testFilePath)
+        metadataDf.select("last_change").collect()
+      }
+    }
   }
 
   describe("GeoPackage Raster Data Test") {
diff --git a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
index 9de19c3c48..6d9f41bf4e 100644
--- a/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
+++ b/spark/spark-4.0/src/test/scala/org/apache/sedona/sql/GeoPackageReaderTest.scala
@@ -168,6 +168,50 @@ class GeoPackageReaderTest extends TestBaseScala with Matchers {
         df.count() shouldEqual expectedCount
       }
     }
+
+    it("should handle datetime fields without timezone information") {
+      // This test verifies the fix for DateTimeParseException when reading
+      // GeoPackage files with datetime fields that don't include timezone info
+      val testFilePath = resourceFolder + "geopackage/test_datetime_issue.gpkg"
+
+      // Test reading the test_features table with problematic datetime formats
+      val df = sparkSession.read
+        .format("geopackage")
+        .option("tableName", "test_features")
+        .load(testFilePath)
+
+      // The test should not throw DateTimeParseException when reading datetime fields
+      noException should be thrownBy {
+        df.select("created_at", "updated_at").collect()
+      }
+
+      // Verify that datetime fields are properly parsed as TimestampType
+      df.schema.fields.find(_.name == "created_at").get.dataType shouldEqual TimestampType
+      df.schema.fields.find(_.name == "updated_at").get.dataType shouldEqual TimestampType
+
+      // Verify that we can read the datetime values
+      val datetimeValues = df.select("created_at", "updated_at").collect()
+      datetimeValues should not be empty
+
+      // Verify that datetime values are valid timestamps
+      datetimeValues.foreach { row =>
+        val createdTimestamp = row.getAs[Timestamp]("created_at")
+        val updatedTimestamp = row.getAs[Timestamp]("updated_at")
+        createdTimestamp should not be null
+        updatedTimestamp should not be null
+        createdTimestamp.getTime should be > 0L
+        updatedTimestamp.getTime should be > 0L
+      }
+
+      // Test showMetadata option with the same file
+      noException should be thrownBy {
+        val metadataDf = sparkSession.read
+          .format("geopackage")
+          .option("showMetadata", "true")
+          .load(testFilePath)
+        metadataDf.select("last_change").collect()
+      }
+    }
   }
 
   describe("GeoPackage Raster Data Test") {
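Note: a minimal usage sketch of the user-facing effect (not part of this commit; `spark` stands for a Sedona-enabled SparkSession, and the path and table name mirror the test fixture). A read like this previously failed with DateTimeParseException on zone-less DATETIME columns and now succeeds:

    val df = spark.read
      .format("geopackage")
      .option("tableName", "test_features")
      .load("/path/to/test_datetime_issue.gpkg")

    // Zone-less values are interpreted as UTC by the patched epoch() fallback.
    df.select("created_at", "updated_at").show()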
