2010YOUY01 commented on code in PR #578:
URL: https://github.com/apache/sedona-db/pull/578#discussion_r2778495539


##########
python/sedonadb/tests/test_context.py:
##########
@@ -313,3 +313,79 @@ def test_read_parquet_invalid_azure_option():
         match="Unknown Azure option.*azure.unknown_option.*Valid options are",
     ):
         con.read_parquet(url, options={"azure.unknown_option": "value"})
+
+
+def test_read_parquet_validate_wkb_single_valid_row(con, tmp_path):
+    valid_wkb = bytes.fromhex("0101000000000000000000F03F0000000000000040")
+
+    table = pa.table({"id": [1], "geom": [valid_wkb]})
+    path = tmp_path / "single_valid_wkb.parquet"
+    pq.write_table(table, path)

Review Comment:
   updated in 99d52a2



##########
rust/sedona-geoparquet/src/file_opener.rs:
##########
@@ -158,12 +170,108 @@ impl FileOpener for GeoParquetFileOpener {
             // We could also consider filtering using null_count here in the future (i.e.,
             // skip row groups that are all null)
             let file = file.with_extensions(Arc::new(access_plan));
+            let stream = self_clone.inner.open(file)?.await?;
+
+            // Validate geometry columns when enabled from read option.
+            let validation_columns = if self_clone.options.validate {
+                maybe_geoparquet_metadata
+                    .as_ref()
+                    .map(|metadata| wkb_validation_columns(&self_clone.file_schema, metadata))
+                    .unwrap_or_default()
+            } else {
+                Vec::new()
+            };
 
-            self_clone.inner.open(file)?.await
+            if !self_clone.options.validate || validation_columns.is_empty() {
+                return Ok(stream);
+            }
+
+            let validated_stream = stream.map(move |batch_result| {
+                let batch = batch_result?;
+                validate_wkb_batch(&batch, &validation_columns)?;
+                Ok(batch)
+            });
+
+            Ok(Box::pin(validated_stream))
         }))
     }
 }
 
+fn wkb_validation_columns(
+    file_schema: &SchemaRef,
+    metadata: &GeoParquetMetadata,
+) -> Vec<(usize, String)> {
+    file_schema
+        .fields()
+        .iter()
+        .enumerate()
+        .filter_map(|(column_index, field)| {
+            metadata
+                .columns
+                .get(field.name())
+                .and_then(|column_metadata| {
+                    if matches!(column_metadata.encoding, GeoParquetColumnEncoding::WKB) {
+                        Some((column_index, field.name().clone()))
+                    } else {
+                        None
+                    }
+                })
+        })
+        .collect()
+}
+
+fn validate_wkb_batch(batch: &RecordBatch, validation_columns: &[(usize, String)]) -> Result<()> {
+    for (column_index, column_name) in validation_columns {
+        let column = batch.column(*column_index);
+        validate_wkb_array(column.as_ref(), column_name)?;
+    }
+    Ok(())
+}
+
+fn validate_wkb_array(array: &dyn Array, column_name: &str) -> Result<()> {
+    match array.data_type() {
+        DataType::Binary => {
+            let array = as_binary_array(array)?;
+            for (row_index, maybe_wkb) in array.iter().enumerate() {
+                if let Some(wkb_bytes) = maybe_wkb {
+                    if let Err(e) = wkb::reader::read_wkb(wkb_bytes) {
+                        return exec_err!(
+                            "WKB validation failed for column '{}' at row {}: {}",
+                            column_name,
+                            row_index,
+                            e
+                        );
+                    }
+                }
+            }
+        }

Review Comment:
   fixed in 99d52a2



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to