zhangfengcdt commented on code in PR #561:
URL: https://github.com/apache/sedona-db/pull/561#discussion_r2755798311
##########
rust/sedona-geoparquet/src/file_opener.rs:
##########
@@ -371,6 +453,109 @@ fn parse_column_coverings(
.collect()
}
+/// Calculates a Vec of [GeoStatistics] based on Parquet-native GeoStatistics
+///
+/// Each element is either a [GeoStatistics] populated with a [BoundingBox]
+/// or [GeoStatistics::unspecified], which is a value that will ensure that
+/// any spatial predicate that references those statistics will evaluate to
+/// true.
+fn row_group_native_geo_stats(
+ row_group_metadata: &RowGroupMetaData,
+ column_indices: &[usize],
+) -> Vec<GeoStatistics> {
+ column_indices
+ .iter()
+ .map(|column_index| {
+ let native_geo_stats_opt =
row_group_metadata.column(*column_index).geo_statistics();
+ native_geo_stats_opt
+ .map(parquet_geo_stats_to_sedona_geo_stats)
+ .unwrap_or(GeoStatistics::unspecified())
+ })
+ .collect()
+}
+
+/// Convert Parquet [GeospatialStatistics] into Sedona [GeoStatistics]
+///
+/// This also sanity checks the Parquet statistics for non-finite or
non-sensical
+/// ranges, treating the information as unknown if it fails the sanity check.
+fn parquet_geo_stats_to_sedona_geo_stats(
+ parquet_geo_stats: &GeospatialStatistics,
+) -> GeoStatistics {
+ let mut out = GeoStatistics::unspecified();
+
+ if let Some(native_bbox) = parquet_geo_stats.bounding_box() {
+ let x_range = (native_bbox.get_xmin(), native_bbox.get_xmax());
+ let y_range = (native_bbox.get_ymin(), native_bbox.get_ymax());
+ let z_range = match (native_bbox.get_zmin(), native_bbox.get_zmax()) {
+ (Some(lo), Some(hi)) => Some(Interval::new(lo, hi)),
+ _ => None,
+ };
+ let m_range = match (native_bbox.get_mmin(), native_bbox.get_mmax()) {
+ (Some(lo), Some(hi)) => Some(Interval::new(lo, hi)),
+ _ => None,
+ };
+
+ let bbox = BoundingBox::xyzm(x_range, y_range, z_range, m_range);
+
+ // Sanity check the bbox statistics. If the sanity check fails, don't
set
+ // a bounding box for pruning. Note that the x width can be < 0
(wraparound).
+ let mut bbox_is_valid =
Review Comment:
Does this also cover NaN case?
##########
rust/sedona-geoparquet/src/metadata.rs:
##########
@@ -382,20 +383,50 @@ impl GeoParquetMetadata {
}
/// Construct a [`GeoParquetMetadata`] from a [`ParquetMetaData`]
+ ///
+ /// This constructor considers (1) the GeoParquet metadata in the key/value
+ /// metadata and (2) Geometry/Geography types present in the Parquet
schema.
+ /// Specification of a column in the GeoParquet metadata takes precedence.
pub fn try_from_parquet_metadata(metadata: &ParquetMetaData) ->
Result<Option<Self>> {
- if let Some(kv) = metadata.file_metadata().key_value_metadata() {
- for item in kv {
- if item.key != "geo" {
- continue;
- }
+ let schema = metadata.file_metadata().schema_descr().root_schema();
+ let kv_metadata = metadata.file_metadata().key_value_metadata();
+ Self::try_from_parquet_metadata_impl(schema, kv_metadata)
+ }
- if let Some(value) = &item.value {
- return Ok(Some(Self::try_new(value)?));
- }
+ /// For testing, as it is easier to simulate the schema and key/value
metadata
+ /// than the whole ParquetMetaData.
+ fn try_from_parquet_metadata_impl(
+ root_schema: &parquet::schema::types::Type,
+ kv_metadata: Option<&Vec<KeyValue>>,
+ ) -> Result<Option<Self>> {
+ let mut columns_from_schema = columns_from_parquet_schema(root_schema,
kv_metadata)?;
+
+ if let Some(value) = get_parquet_key_value("geo", kv_metadata) {
+ // Values in the GeoParquet metadata take precedence over those
from the
+ // Parquet schema
+ let mut out = Self::try_new(&value)?;
+ for (k, v) in columns_from_schema.drain() {
+ out.columns.entry(k).or_insert(v);
}
+
+ return Ok(Some(out));
}
- Ok(None)
+ // No geo metadata key, but we have geo columns from the schema
+ if !columns_from_schema.is_empty() {
+ // To keep metadata valid, ensure we set a primary column
deterministically
+ let mut column_names =
columns_from_schema.keys().collect::<Vec<_>>();
+ column_names.sort();
+ let primary_column = column_names[0].to_string();
Review Comment:
This picks the column alphabetically I think, do we want to prefer column
names like "geo", "gemo", ...
Ignore this if this is more derterministic.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]