Re: [PR] test: some tests to write data to a parquet file and read its metadata [datafusion]

via GitHub Wed, 15 May 2024 14:51:03 -0700


NGA-TRAN commented on code in PR #10537:
URL: https://github.com/apache/datafusion/pull/10537#discussion_r1602292079



##########
datafusion/core/tests/parquet/arrow_statistics.rs:
##########
@@ -0,0 +1,528 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! This file contains an end-to-end test of extracting statistics from parquet 
files.
+//! It writes data into a parquet file, reads statistics and verifies they are 
correct
+
+use std::sync::Arc;
+
+// use arrow::json::reader;
+use arrow_array::{make_array, Array, Int64Array, RecordBatch};
+use arrow_schema::{DataType, Field, Schema};
+use parquet::arrow::arrow_reader::ArrowReaderBuilder;
+use parquet::arrow::ArrowWriter;
+use parquet::file::properties::{EnabledStatistics, WriterProperties};
+use parquet::file::statistics::Statistics as ParquetFileStatistics;
+use parquet::format::Statistics as ParquetFormatStatistics;
+
+// TEST HELPERS
+
+/// Return a record batch with i64 with Null values
+fn make_int64_batches_with_null(
+    null_values: usize,
+    no_null_values_start: i64,
+    no_null_values_end: i64,
+) -> RecordBatch {
+    let schema = Arc::new(Schema::new(vec![Field::new("i64", DataType::Int64, 
true)]));
+
+    let v64: Vec<i64> = (no_null_values_start as _..no_null_values_end as 
_).collect();
+
+    RecordBatch::try_new(
+        schema,
+        vec![make_array(
+            Int64Array::from_iter(
+                v64.into_iter()
+                    .map(Some)
+                    .chain(std::iter::repeat(None).take(null_values)),
+            )
+            .to_data(),
+        )],
+    )
+    .unwrap()
+}
+
+// TODO: Eventually, we will need only read_statistics and read_row_count,
+// but for my understanding, we will keep both write and read statistics.
+// Their values should be the same
+pub struct RowGroupsStatistics {
+    // Statistics from the file writer
+    pub write_statistics: Vec<WriteRowGroupStatistics>,
+
+    // Statistics from the file reader
+    pub read_statistics: Vec<ReadRowGroupStatistics>,
+}
+
+pub struct WriteRowGroupStatistics {
+    pub statistics: ParquetFormatStatistics,
+    pub row_count: i64,
+}
+
+pub struct ReadRowGroupStatistics {
+    pub statistics: ParquetFileStatistics,
+    pub row_count: i64,
+}
+
+// Create a parquet file with one column for data type i64
+// Data of the file include
+//   . Number of null rows is the given num_null
+//   . There are non-null values in the range [no_null_values_start, 
no_null_values_end], one value each row
+//   . The file is divided into row groups of size row_per_group
+pub fn parquet_row_group_statistics(
+    num_null: usize,
+    no_null_values_start: i64,
+    no_null_values_end: i64,
+    row_per_group: usize,
+) -> RowGroupsStatistics {
+    let mut output_file = tempfile::Builder::new()
+        .prefix("parquert_statistics_test")
+        .suffix(".parquet")
+        .tempfile()
+        .expect("tempfile creation");
+
+    let props = WriterProperties::builder()
+        .set_max_row_group_size(row_per_group)
+        .set_statistics_enabled(EnabledStatistics::Chunk)
+        .build();
+
+    let batches = vec![
+        make_int64_batches_with_null(num_null, no_null_values_start, 
no_null_values_end), // TODO: likely make this more general using
+                                                                               
           // create_data_batch(scenario); where scenario is the respective 
enum data type
+    ];
+
+    let schema = batches[0].schema();
+
+    let mut writer = ArrowWriter::try_new(&mut output_file, schema, 
Some(props)).unwrap();
+
+    for batch in batches {
+        writer.write(&batch).expect("writing batch");
+    }
+
+    //////////////// WRITE STATISTICS ///////////////////////
+    let file_meta = writer.close().unwrap();

Review Comment:
   To my surprise, the write and read statistics, even though they have the 
same content, are stored in different structures. Write is 
`parquet::format::Statistics` and read is 
`parquet::file::statistics::Statistics`. Why?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] test: some tests to write data to a parquet file and read its metadata [datafusion]

Reply via email to