alamb commented on code in PR #11319:
URL: https://github.com/apache/datafusion/pull/11319#discussion_r1667756939


##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -875,14 +914,14 @@ macro_rules! get_data_page_statistics {
                 Some(DataType::Date32) => 
Ok(Arc::new(Date32Array::from_iter([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator).flatten()))),
                 Some(DataType::Date64) => Ok(
                     Arc::new(
-                        Date64Array::from([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
+                        Date64Array::from_iter([<$stat_type_prefix 
Int32DataPageStatsIterator>]::new($iterator)
                             .map(|x| {
                                 x.into_iter()
-                                .filter_map(|x| {
+                                .map(|x| {

Review Comment:
   I think these changes are similar to what is in 
https://github.com/apache/datafusion/pull/11295 -- perhaps we should merge that 
one first (as it has test) and then merge this PR



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -392,51 +393,73 @@ macro_rules! get_statistics {
                 })
             },
             DataType::Binary => Ok(Arc::new(BinaryArray::from_iter(
-                [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x| x.to_vec())),
+                [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator)
             ))),
             DataType::LargeBinary => Ok(Arc::new(LargeBinaryArray::from_iter(
-                [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator).map(|x| x.map(|x|x.to_vec())),
-            ))),
-            DataType::Utf8 => Ok(Arc::new(StringArray::from_iter(
-                [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator).map(|x| {
-                    x.and_then(|x| {
-                        let res = std::str::from_utf8(x).map(|s| 
s.to_string()).ok();
-                        if res.is_none() {
-                            log::debug!("Utf8 statistics is a non-UTF8 value, 
ignoring it.");
-                        }
-                        res
-                    })
-                }),
+                [<$stat_type_prefix ByteArrayStatsIterator>]::new($iterator)
             ))),
+            DataType::Utf8 => {
+                let iterator = [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator);
+                let mut builder = StringBuilder::new();
+                for x in iterator {
+                    let Some(x) = x else {
+                        builder.append_null(); // no statistics value
+                        continue;
+                    };
+
+                    let Ok(x) = std::str::from_utf8(x) else {
+                        log::debug!("Utf8 statistics is a non-UTF8 value, 
ignoring it.");
+                        builder.append_null();
+                        continue;
+                    };
+
+                    builder.append_value(x);
+                }
+                Ok(Arc::new(builder.finish()))
+            },
             DataType::LargeUtf8 => {
-                Ok(Arc::new(LargeStringArray::from_iter(
-                    [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator).map(|x| {
-                        x.and_then(|x| {
-                            let res = std::str::from_utf8(x).map(|s| 
s.to_string()).ok();
-                            if res.is_none() {
-                                log::debug!("LargeUtf8 statistics is a 
non-UTF8 value, ignoring it.");
-                            }
-                            res
-                        })
-                    }),
-                )))
-            }
-            DataType::FixedSizeBinary(size) => 
Ok(Arc::new(FixedSizeBinaryArray::from(
-                [<$stat_type_prefix 
FixedLenByteArrayStatsIterator>]::new($iterator).map(|x| {
-                    x.and_then(|x| {
-                        if x.len().try_into() == Ok(*size) {
-                            Some(x)
-                        } else {
-                            log::debug!(
-                                "FixedSizeBinary({}) statistics is a binary of 
size {}, ignoring it.",
-                                size,
-                                x.len(),
-                            );
-                            None
-                        }
-                    })
-                }).collect::<Vec<_>>(),
-            ))),
+                let iterator = [<$stat_type_prefix 
ByteArrayStatsIterator>]::new($iterator);
+                let mut builder = LargeStringBuilder::new();
+                for x in iterator {
+                    let Some(x) = x else {
+                        builder.append_null(); // no statistics value
+                        continue;
+                    };
+
+                    let Ok(x) = std::str::from_utf8(x) else {
+                        log::debug!("Utf8 statistics is a non-UTF8 value, 
ignoring it.");
+                        builder.append_null();
+                        continue;
+                    };
+
+                    builder.append_value(x);
+                }
+                Ok(Arc::new(builder.finish()))
+            },
+            DataType::FixedSizeBinary(size) => {
+                let iterator = [<$stat_type_prefix 
FixedLenByteArrayStatsIterator>]::new($iterator);
+                let mut builder = FixedSizeBinaryBuilder::new(*size);
+                for x in iterator {
+                    let Some(x) = x else {
+                        builder.append_null(); // no statistics value
+                        continue;
+                    };
+
+                    // ignore invalid values
+                    if x.len().try_into() != Ok(*size){
+                        log::debug!(
+                            "FixedSizeBinary({}) statistics is a binary of 
size {}, ignoring it.",
+                            size,
+                            x.len(),
+                        );
+                        builder.append_null();
+                        continue;
+                    }
+
+                    builder.append_value(x).expect("ensure to append 
successfully here, because size have been checked before");

Review Comment:
   Rather than using `expect` it is likely possible to check the value of 
`builder.append_value` and if it is an error append null. But perhaps that is 
overly confusing. This formulation is quite clear and explicit



##########
datafusion/core/src/datasource/physical_plan/parquet/statistics.rs:
##########
@@ -747,10 +770,10 @@ macro_rules! get_data_page_statistics {
                 Some(DataType::Boolean) => Ok(Arc::new(
                     BooleanArray::from_iter(
                         [<$stat_type_prefix 
BooleanDataPageStatsIterator>]::new($iterator)
-                            .flatten()
-                            // BooleanArray::from_iter required a sized 
iterator, so collect into Vec first
-                            .collect::<Vec<_>>()
-                            .into_iter()
+                        .flatten()
+                        // BooleanArray::from_iter required a sized iterator, 
so collect into Vec first

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to