efredine commented on code in PR #11289:
URL: https://github.com/apache/datafusion/pull/11289#discussion_r1676020569
##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -946,13 +946,29 @@ pub(crate) fn parquet_column<'a>(
) -> Option<(usize, &'a FieldRef)> {
let (root_idx, field) = arrow_schema.fields.find(name)?;
if field.data_type().is_nested() {
- // Nested fields are not supported and require non-trivial logic
- // to correctly walk the parquet schema accounting for the
- // logical type rules -
<https://github.com/apache/parquet-format/blob/master/LogicalTypes.md>
- //
- // For example a ListArray could correspond to anything from 1 to 3
levels
- // in the parquet schema
- return None;
+ match field.data_type() {
Review Comment:
Perhaps we should extract the repeated code into a helper function? Could
also reduce nesting with an early return for the case where it's not nested. So
something like:
```suggestion
pub(crate) fn parquet_column<'a>(
parquet_schema: &SchemaDescriptor,
arrow_schema: &'a Schema,
name: &str,
) -> Option<(usize, &'a FieldRef)> {
let (root_idx, field) = arrow_schema.fields.find(name)?;
if !field.data_type().is_nested() {
let parquet_idx = find_parquet_idx(parquet_schema, root_idx)?;
return Some((parquet_idx, field));
}
// Nested field
match field.data_type() {
DataType::Struct(_) => {
let parquet_idx = find_parquet_idx(parquet_schema, root_idx)?;
Some((parquet_idx, field))
}
_ => {
if field.data_type().is_nested() {
// Nested fields are not supported and require non-trivial
logic
// to correctly walk the parquet schema accounting for the
// logical type rules -
<https://github.com/apache/parquet-format/blob/master/LogicalTypes.md>
//
// For example a ListArray could correspond to anything from
1 to 3 levels
// in the parquet schema
None
} else {
let parquet_idx = find_parquet_idx(parquet_schema,
root_idx)?;
Some((parquet_idx, field))
}
}
}
}
// Helper function to find parquet index - TBD: this could be more efficient
fn find_parquet_idx(parquet_schema: &SchemaDescriptor, root_idx: usize) ->
Option<usize> {
(0..parquet_schema.columns().len())
.find(|x| parquet_schema.get_column_root_idx(*x) == root_idx)
}
```
##########
datafusion/core/tests/parquet/arrow_statistics.rs:
##########
@@ -1984,7 +1981,96 @@ async fn test_struct() {
}
.run();
}
+// test nested struct
+#[tokio::test]
+async fn test_nested_struct() {
+ // This creates a parquet file with 1 column named "nested_struct"
+ // The file is created by 1 record batch with 3 rows in the nested struct
array
+ let reader = TestReader {
+ scenario: Scenario::StructArrayNested,
+ row_per_group: 5,
+ }
+ .build()
+ .await;
+ // Expected minimum and maximum values for nested struct fields
+ let inner_min = StructArray::from(vec![
+ (
+ Arc::new(Field::new("b", DataType::Boolean, false)),
+ Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("c", DataType::Int32, false)),
+ Arc::new(Int32Array::from(vec![Some(42)])) as ArrayRef,
+ ),
+ ]);
+ let inner_max = StructArray::from(vec![
+ (
+ Arc::new(Field::new("b", DataType::Boolean, false)),
+ Arc::new(BooleanArray::from(vec![Some(true)])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("c", DataType::Int32, false)),
+ Arc::new(Int32Array::from(vec![Some(44)])) as ArrayRef,
+ ),
+ ]);
+
+ let inner_fields = Fields::from(vec![
+ Field::new("b", DataType::Boolean, false),
+ Field::new("c", DataType::Int32, false),
+ ]);
+
+ // Expected minimum outer struct
+ let expected_min_outer = StructArray::from(vec![
+ (
+ Arc::new(Field::new(
+ "inner_struct",
+ DataType::Struct(inner_fields.clone()),
+ false,
+ )),
+ Arc::new(inner_min) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("outer_float", DataType::Float64, false)),
+ Arc::new(Float64Array::from(vec![Some(5.0)])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("outer_boolean", DataType::Boolean, false)),
+ Arc::new(BooleanArray::from(vec![Some(false)])) as ArrayRef,
+ ),
+ ]);
+
+ // Expected maximum outer struct
+ let expected_max_outer = StructArray::from(vec![
+ (
+ Arc::new(Field::new(
+ "inner_struct",
+ DataType::Struct(inner_fields),
+ false,
+ )),
+ Arc::new(inner_max) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("outer_float", DataType::Float64, false)),
+ Arc::new(Float64Array::from(vec![Some(7.0)])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("outer_boolean", DataType::Boolean, false)),
+ Arc::new(BooleanArray::from(vec![Some(true)])) as ArrayRef,
+ ),
+ ]);
+
Review Comment:
I wondered if we needed a struct array with top-level nulls. But for the
stats, it shouldn't matter whether the null is coming from the top level or the
nested arrays. That would be testing the how the stats are calculated. (This
assumes I'm interpreting the way nulls are handled correctly.)
But it might be worth having some null counts to see that they get
calculated correctly?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]