This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0e8be3c ARROW-10620: [Rust][Parquet] move column chunk range logic to
metadata.rs
0e8be3c is described below
commit 0e8be3caa4a6a2dc71e4cc5cb6fe9ac375f5648d
Author: rdettai <[email protected]>
AuthorDate: Sat Nov 21 18:16:50 2020 -0500
ARROW-10620: [Rust][Parquet] move column chunk range logic to metadata.rs
> Getting the range of bytes of a column chunk inside a parquet file can be
useful for external crates (for instance if they want to pre-fetch the
columns), and is not completely obvious (it is enough to take a look at [1] and
[2] to see that things can quickly get messy).
>
> I think it would be nice to move this logic into the metadata definition
rather than leave it buried in the middle of the reader implementation.
>
> [1]
https://stackoverflow.com/questions/55225108/why-is-dictionary-page-offset-0-for-plain-dictionary-encoding/
> [2] https://issues.apache.org/jira/browse/PARQUET-816
https://issues.apache.org/jira/browse/ARROW-10620
Closes #8682 from rdettai/ARROW-10620-chunk-range
Authored-by: rdettai <[email protected]>
Signed-off-by: Andrew Lamb <[email protected]>
---
rust/parquet/src/file/metadata.rs | 15 +++++++++++++++
rust/parquet/src/file/serialized_reader.rs | 11 ++---------
2 files changed, 17 insertions(+), 9 deletions(-)
diff --git a/rust/parquet/src/file/metadata.rs
b/rust/parquet/src/file/metadata.rs
index b6b489d..8565115 100644
--- a/rust/parquet/src/file/metadata.rs
+++ b/rust/parquet/src/file/metadata.rs
@@ -433,6 +433,21 @@ impl ColumnChunkMetaData {
self.dictionary_page_offset
}
+ /// Returns the offset and length in bytes of the column chunk within the file
+ pub fn byte_range(&self) -> (u64, u64) {
+ let col_start = if self.has_dictionary_page() {
+ self.dictionary_page_offset().unwrap()
+ } else {
+ self.data_page_offset()
+ };
+ let col_len = self.compressed_size();
+ assert!(
+ col_start >= 0 && col_len >= 0,
+ "column start and length should not be negative"
+ );
+ (col_start as u64, col_len as u64)
+ }
+
/// Returns statistics that are set for this column chunk,
/// or `None` if no statistics are available.
pub fn statistics(&self) -> Option<&Statistics> {
diff --git a/rust/parquet/src/file/serialized_reader.rs
b/rust/parquet/src/file/serialized_reader.rs
index bd246af..663412d 100644
--- a/rust/parquet/src/file/serialized_reader.rs
+++ b/rust/parquet/src/file/serialized_reader.rs
@@ -191,15 +191,8 @@ impl<'a, R: 'static + ChunkReader> RowGroupReader for
SerializedRowGroupReader<'
// TODO: fix PARQUET-816
fn get_column_page_reader(&self, i: usize) -> Result<Box<PageReader>> {
let col = self.metadata.column(i);
- let col_start = if col.has_dictionary_page() {
- col.dictionary_page_offset().unwrap()
- } else {
- col.data_page_offset()
- };
- let col_length = col.compressed_size();
- let file_chunk = self
- .chunk_reader
- .get_read(col_start as u64, col_length as usize)?;
+ let (col_start, col_length) = col.byte_range();
+ let file_chunk = self.chunk_reader.get_read(col_start, col_length as
usize)?;
let page_reader = SerializedPageReader::new(
file_chunk,
col.num_values(),