This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new e4bc3f6b6f [feature-wip] (parquet-reader) add parquet reader impl template (#11285) e4bc3f6b6f is described below commit e4bc3f6b6f47e352067bf3bd7b95b0359af4baca Author: slothever <18522955+w...@users.noreply.github.com> AuthorDate: Fri Jul 29 14:30:31 2022 +0800 [feature-wip] (parquet-reader) add parquet reader impl template (#11285) --- be/src/vec/CMakeLists.txt | 10 ++- be/src/vec/exec/file_hdfs_scanner.cpp | 20 +++++ be/src/vec/exec/file_hdfs_scanner.h | 26 +++++++ .../vec/exec/format/parquet/parquet_thrift_util.h | 87 ++++++++++++++++++++++ be/src/vec/exec/format/parquet/schema_desc.cpp | 33 ++++++++ be/src/vec/exec/format/parquet/schema_desc.h | 46 ++++++++++++ .../parquet/vparquet_column_chunk_reader.cpp | 36 +++++++++ .../format/parquet/vparquet_column_chunk_reader.h | 33 ++++++++ .../exec/format/parquet/vparquet_file_metadata.cpp | 52 +++++++++++++ .../exec/format/parquet/vparquet_file_metadata.h | 45 +++++++++++ .../exec/format/parquet/vparquet_group_reader.cpp | 20 +++++ .../exec/format/parquet/vparquet_group_reader.h | 24 ++++++ .../exec/format/parquet/vparquet_page_index.cpp | 29 ++++++++ .../vec/exec/format/parquet/vparquet_page_index.h | 35 +++++++++ .../exec/format/parquet/vparquet_page_reader.cpp | 33 ++++++++ .../vec/exec/format/parquet/vparquet_page_reader.h | 34 +++++++++ be/src/vec/exec/format/parquet/vparquet_reader.cpp | 61 +++++++++++++++ be/src/vec/exec/format/parquet/vparquet_reader.h | 87 ++++++++++++++++++++++ be/test/CMakeLists.txt | 1 + be/test/vec/exec/parquet/parquet_thrift_test.cpp | 66 ++++++++++++++++ 20 files changed, 777 insertions(+), 1 deletion(-) diff --git a/be/src/vec/CMakeLists.txt b/be/src/vec/CMakeLists.txt index 9ee5a5066f..6be1a9c797 100644 --- a/be/src/vec/CMakeLists.txt +++ b/be/src/vec/CMakeLists.txt @@ -221,7 +221,15 @@ set(VEC_FILES exec/file_arrow_scanner.cpp exec/file_scanner.cpp exec/file_scan_node.cpp - exec/file_text_scanner.cpp) + 
exec/file_text_scanner.cpp + exec/file_hdfs_scanner.cpp + exec/format/parquet/vparquet_column_chunk_reader.cpp + exec/format/parquet/vparquet_group_reader.cpp + exec/format/parquet/vparquet_page_index.cpp + exec/format/parquet/vparquet_reader.cpp + exec/format/parquet/vparquet_file_metadata.cpp + exec/format/parquet/vparquet_page_reader.cpp + exec/format/parquet/schema_desc.cpp) add_library(Vec STATIC ${VEC_FILES} diff --git a/be/src/vec/exec/file_hdfs_scanner.cpp b/be/src/vec/exec/file_hdfs_scanner.cpp new file mode 100644 index 0000000000..d4c7398913 --- /dev/null +++ b/be/src/vec/exec/file_hdfs_scanner.cpp @@ -0,0 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "file_hdfs_scanner.h" + +namespace doris::vectorized {} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/exec/file_hdfs_scanner.h b/be/src/vec/exec/file_hdfs_scanner.h new file mode 100644 index 0000000000..e88e3a887a --- /dev/null +++ b/be/src/vec/exec/file_hdfs_scanner.h @@ -0,0 +1,26 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+namespace doris::vectorized {
+
+// Empty placeholder class: no members or behavior are defined yet.
+class HdfsFileScanner {};
+
+// Empty placeholder class deriving from HdfsFileScanner; it adds nothing so far.
+class ParquetFileHdfsScanner : public HdfsFileScanner {};
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/parquet_thrift_util.h b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
new file mode 100644
index 0000000000..3e2cbec60e
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/parquet_thrift_util.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+#include "common/logging.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "util/coding.h"
+#include "util/thrift_util.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+// Magic bytes expected at the very end of a parquet file.
+constexpr uint8_t PARQUET_VERSION_NUMBER[4] = {'P', 'A', 'R', '1'};
+// Number of trailing bytes fetched by the first (speculative) read.
+constexpr int64_t PARQUET_FOOTER_READ_SIZE = 64 * 1024;
+// Size of the fixed trailer: 4-byte metadata length followed by the 4-byte magic.
+constexpr uint32_t PARQUET_FOOTER_SIZE = 8;
+
+// Reads the tail of `file`, validates the trailing magic, decodes the thrift
+// FileMetaData and stores a freshly built FileMetaData in `file_metadata`.
+//
+// @param file           reader positioned on the parquet file; only size() and
+//                       readat() are used here.
+// @param file_metadata  output; replaced on success.
+// @return Status::OK() on success; Status::Corruption when the file is too
+//         short, the magic does not match, the tail cannot be read completely,
+//         or the recorded metadata size exceeds the file; otherwise whatever
+//         error readat(), deserialize_thrift_msg() or init_schema() reports.
+//
+// Declared inline because it is defined in a header that several translation
+// units include.
+inline Status parse_thrift_footer(FileReader* file, std::shared_ptr<FileMetaData>& file_metadata) {
+    const int64_t file_size = file->size();
+    // Without at least the fixed trailer the pointer arithmetic below would
+    // step in front of the buffer.
+    if (file_size < static_cast<int64_t>(PARQUET_FOOTER_SIZE)) {
+        return Status::Corruption("Parquet file is too small to contain a footer");
+    }
+
+    // Heap buffer instead of a 64 KiB stack array.
+    const int64_t footer_read_size = std::min(file_size, PARQUET_FOOTER_READ_SIZE);
+    std::vector<uint8_t> buff(static_cast<size_t>(footer_read_size));
+
+    int64_t bytes_read = 0;
+    RETURN_IF_ERROR(file->readat(file_size - footer_read_size, footer_read_size, &bytes_read,
+                                 buff.data()));
+    if (bytes_read != footer_read_size) {
+        return Status::Corruption("Failed to read the footer of parquet file");
+    }
+
+    // validate magic
+    uint8_t* magic_ptr = buff.data() + footer_read_size - sizeof(PARQUET_VERSION_NUMBER);
+    if (memcmp(magic_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) {
+        return Status::Corruption("Invalid magic number in parquet file");
+    }
+
+    // get metadata_size
+    uint8_t* footer_buff = buff.data() + footer_read_size - PARQUET_FOOTER_SIZE;
+    uint32_t metadata_size = decode_fixed32_le(footer_buff);
+    if (metadata_size > file_size - PARQUET_FOOTER_SIZE) {
+        // The original built this Status and dropped it; it must be returned.
+        return Status::Corruption("Parquet file size is ", file_size,
+                                  " bytes, smaller than the size reported by footer's (",
+                                  metadata_size, "bytes)");
+    }
+
+    uint8_t* metadata_ptr = nullptr;
+    if (metadata_size > footer_read_size - PARQUET_FOOTER_SIZE) {
+        // The metadata starts before the bytes already fetched: read exactly
+        // the metadata region instead of pointing in front of the buffer.
+        buff.resize(metadata_size);
+        RETURN_IF_ERROR(file->readat(file_size - PARQUET_FOOTER_SIZE - metadata_size,
+                                     metadata_size, &bytes_read, buff.data()));
+        if (bytes_read != static_cast<int64_t>(metadata_size)) {
+            return Status::Corruption("Failed to read the footer of parquet file");
+        }
+        metadata_ptr = buff.data();
+    } else {
+        metadata_ptr = footer_buff - metadata_size;
+    }
+
+    tparquet::FileMetaData t_metadata;
+    // deserialize footer
+    RETURN_IF_ERROR(deserialize_thrift_msg(metadata_ptr, &metadata_size, true, &t_metadata));
+    file_metadata = std::make_shared<FileMetaData>(t_metadata);
+    RETURN_IF_ERROR(file_metadata->init_schema());
+    return Status::OK();
+}
+
+// Status parse_page_header() {
+// uint8_t* page_buf;
+//
+// }
+
+// Status parse_page_index() {
+//
+// }
+
+// void deserialize_column_index(int64_t start_offset,
tparquet::ColumnIndex) {
+//
+// }
+//
+// void deserialize_offset_index(int64_t start_offset, tparquet::OffsetIndex) {
+//
+// }
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.cpp b/be/src/vec/exec/format/parquet/schema_desc.cpp
new file mode 100644
index 0000000000..21275d2bb4
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+// Nothing to release yet; the field list this would clear is still commented out.
+SchemaDescriptor::~SchemaDescriptor() {
+    // fields.clear();
+}
+
+// Placeholder: always yields an empty string.
+std::string SchemaDescriptor::debug_string() const {
+    return {};
+}
+
+// Placeholder: always yields an empty string.
+std::string FieldSchema::debug_string() const {
+    return {};
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/schema_desc.h b/be/src/vec/exec/format/parquet/schema_desc.h
new file mode 100644
index 0000000000..678f633e29
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/schema_desc.h
@@ -0,0 +1,46 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+#include "common/status.h"
+
+namespace doris::vectorized {
+// Holds the two level limits of one schema field and exposes them read-only.
+class FieldSchema {
+public:
+    // Returns the stored maximum definition level.
+    int16_t max_def_level() const { return _max_def_level; }
+    // Returns the stored maximum repetition level.
+    int16_t max_rep_level() const { return _max_rep_level; }
+    // Textual dump of this field; defined out of line.
+    std::string debug_string() const;
+
+private:
+    // Zero-initialized so the accessors never read an indeterminate value
+    // on a default-constructed object.
+    int16_t _max_def_level = 0;
+    int16_t _max_rep_level = 0;
+    // std::vector<FieldSchema> children;
+};
+
+// Container for the schema of a file; it carries no state yet (the field
+// list is still commented out).
+class SchemaDescriptor {
+public:
+    SchemaDescriptor() = default;
+    ~SchemaDescriptor();
+
+    // Textual dump of the schema; defined out of line.
+    std::string debug_string() const;
+
+private:
+    // std::vector<FieldSchema> fields;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
new file mode 100644
index 0000000000..d9fc6313f1
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.cpp
@@ -0,0 +1,36 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#include "vparquet_column_chunk_reader.h"
+
+namespace doris::vectorized {
+
+// Stub: performs no work and returns a default-constructed Status.
+Status ColumnChunkReader::init() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status ColumnChunkReader::read_min_max_stat() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status ColumnChunkReader::decode_dict_page() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status ColumnChunkReader::decode_nested_page() {
+    return Status();
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
new file mode 100644
index 0000000000..efc9880ed2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_column_chunk_reader.h
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+// Declares four operations that each report a Status; the class holds no
+// data members yet and every operation is defined out of line.
+class ColumnChunkReader {
+public:
+    Status init();
+    Status read_min_max_stat();
+    Status decode_dict_page();
+    Status decode_nested_page();
+
+private:
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
new file mode 100644
index 0000000000..6e6ead39d2
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.cpp
@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_file_metadata.h"
+
+#include <sstream>
+
+namespace doris::vectorized {
+
+// Copies the thrift metadata and caches the row count, the number of row
+// groups and, when at least one row group exists, the column count of the
+// first row group.
+FileMetaData::FileMetaData(tparquet::FileMetaData& metadata) : _metadata(metadata) {
+    _num_rows = metadata.num_rows;
+    _num_groups = static_cast<int32_t>(metadata.row_groups.size());
+    if (_num_groups != 0) {
+        _num_columns = static_cast<int32_t>(metadata.row_groups[0].columns.size());
+    }
+    // The original also evaluated metadata.schema[0].num_children inside an
+    // empty `if`; that indexed the schema vector without checking it was
+    // non-empty and had no effect, so the access is removed.
+    // TODO: validate the root schema element once init_schema() is implemented.
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status FileMetaData::init_schema() {
+    return Status();
+}
+
+// Gives read-only access to the stored thrift metadata copy.
+const tparquet::FileMetaData& FileMetaData::to_thrift_metadata() {
+    return _metadata;
+}
+
+// Formats the version, row-group count and row count into one line.
+std::string FileMetaData::debug_string() const {
+    std::stringstream out;
+    out << "Parquet Metadata(";
+    out << "; version=" << _metadata.version;
+    out << "; num row groups=" << _num_groups;
+    out << "; num rows=" << _num_rows;
+    out << ")";
+    return out.str();
+}
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_file_metadata.h b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
new file mode 100644
index 0000000000..1ad15edf7f
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_file_metadata.h
@@ -0,0 +1,45 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <cstdint>
+#include <string>
+
+#include "common/status.h"
+#include "gen_cpp/parquet_types.h"
+#include "schema_desc.h"
+
+namespace doris::vectorized {
+
+// Wraps a copy of the thrift FileMetaData together with a few cached counts.
+class FileMetaData {
+public:
+    FileMetaData(tparquet::FileMetaData& metadata);
+    ~FileMetaData() = default;
+    // Defined out of line.
+    Status init_schema();
+    // Read-only view of the stored thrift metadata.
+    const tparquet::FileMetaData& to_thrift_metadata();
+    int32_t num_row_groups() const { return _num_groups; }
+    int32_t num_columns() const { return _num_columns; }
+    // Returns the full 64-bit row count; the previous int32_t return type
+    // silently truncated _num_rows.
+    int64_t num_rows() const { return _num_rows; }
+    // Returns a copy of the schema descriptor.
+    SchemaDescriptor schema() const { return _schema; }
+    std::string debug_string() const;
+
+private:
+    tparquet::FileMetaData _metadata;
+    int32_t _num_groups = 0;
+    int32_t _num_columns = 0;
+    int64_t _num_rows = 0;
+    SchemaDescriptor _schema;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
new file mode 100644
index 0000000000..543b1dd450
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.cpp
@@ -0,0 +1,20 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_group_reader.h"
+
+namespace doris::vectorized {} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_group_reader.h b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
new file mode 100644
index 0000000000..adb150e696
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_group_reader.h
@@ -0,0 +1,24 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+#pragma once
+#include <common/status.h>
+
+namespace doris::vectorized {
+
+// Empty placeholder class: no members or behavior are defined yet.
+class RowGroupReader {};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.cpp b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
new file mode 100644
index 0000000000..fedf544bbd
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.cpp
@@ -0,0 +1,29 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_index.h"
+
+namespace doris::vectorized {
+
+// Stub: performs no work and returns a default-constructed Status.
+Status PageIndex::get_row_range_for_page() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status PageIndex::collect_skipped_page_range() {
+    return Status();
+}
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_index.h b/be/src/vec/exec/format/parquet/vparquet_page_index.h
new file mode 100644
index 0000000000..801de39b8a
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_index.h
@@ -0,0 +1,35 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+// Declares two operations that each report a Status; both are defined out of
+// line. The data members sketched below are still commented out, so the
+// class currently holds no state.
+class PageIndex {
+public:
+    Status get_row_range_for_page();
+    Status collect_skipped_page_range();
+
+    //private:
+    // // row range define
+    // tparquet::ColumnIndex _column_index;
+    // tparquet::OffsetIndex _offset_index;
+};
+
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
new file mode 100644
index 0000000000..d781159cab
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.cpp
@@ -0,0 +1,33 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_page_reader.h"
+
+namespace doris::vectorized {
+
+// Stub: performs no work and returns a default-constructed Status.
+Status PageReader::read_page_header() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status PageReader::read_page_data() {
+    return Status();
+}
+
+// Stub: performs no work and returns a default-constructed Status.
+Status PageReader::init() {
+    return Status();
+}
+} // namespace doris::vectorized
diff --git a/be/src/vec/exec/format/parquet/vparquet_page_reader.h b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
new file mode 100644
index 0000000000..9b66896aaf
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_page_reader.h
@@ -0,0 +1,34 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+#include <common/status.h>
+#include <gen_cpp/parquet_types.h>
+
+namespace doris::vectorized {
+
+// Declares three operations that each report a Status; all are defined out
+// of line. The page-header member sketched below is still commented out, so
+// the class currently holds no state.
+class PageReader {
+public:
+    Status init();
+    Status read_page_header();
+    Status read_page_data();
+
+    //private:
+    // tparquet::PageHeader* _page_header;
+};
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
new file mode 100644
index 0000000000..215dec1dca
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "vparquet_reader.h"
+
+#include "parquet_thrift_util.h"
+
+namespace doris::vectorized {
+
+// Stores the file reader and starts with zero row groups. The remaining
+// parameters are accepted but not used yet (their members are commented out).
+ParquetReader::ParquetReader(FileReader* file_reader, int64_t batch_size,
+                             int32_t num_of_columns_from_file, int64_t range_start_offset,
+                             int64_t range_size)
+        : _file_reader(file_reader), _total_groups(0) {
+    // : _batch_size(batch_size), _num_of_columns_from_file(num_of_columns_from_file) {
+    // _current_group = 0;
+    // _statistics = std::make_shared<Statistics>();
+}
+
+ParquetReader::~ParquetReader() {
+    // _batch.clear();
+}
+
+// Opens the file, parses the thrift footer into _file_metadata and records
+// the number of row groups.
+//
+// @return the error from open() or parse_thrift_footer() if either fails;
+//         Status::EndOfFile when the file has no row groups;
+//         Status::OK() otherwise.
+// The descriptor, conjunct and timezone arguments are not used yet.
+Status ParquetReader::init_reader(const TupleDescriptor* tuple_desc,
+                                  const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                                  const std::vector<ExprContext*>& conjunct_ctxs,
+                                  const std::string& timezone) {
+    // A failed open must stop here rather than be discarded.
+    RETURN_IF_ERROR(_file_reader->open());
+    RETURN_IF_ERROR(parse_thrift_footer(_file_reader, _file_metadata));
+    // Bind by reference: copying the whole thrift structure is unnecessary
+    // just to read the row-group count.
+    const auto& metadata = _file_metadata->to_thrift_metadata();
+
+    _total_groups = metadata.row_groups.size();
+    if (_total_groups == 0) {
+        return Status::EndOfFile("Empty Parquet File");
+    }
+
+    return Status::OK();
+}
+
+// Returns row_group.file_offset when that optional field is set, otherwise
+// the data_page_offset of the first column chunk.
+// NOTE(review): the fallback indexes columns[0] without checking that the
+// row group has any column — confirm callers guarantee that.
+int64_t ParquetReader::_get_row_group_start_offset(const tparquet::RowGroup& row_group) {
+    if (row_group.__isset.file_offset) {
+        return row_group.file_offset;
+    }
+    const tparquet::ColumnMetaData& first_column = row_group.columns[0].meta_data;
+    return first_column.data_page_offset;
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
new file mode 100644
index 0000000000..2f5d4aa480
--- /dev/null
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -0,0 +1,87 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <parquet/api/reader.h>
+#include <parquet/api/writer.h>
+#include <parquet/exception.h>
+#include <stdint.h>
+
+#include <atomic>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "common/status.h"
+#include "exprs/expr_context.h"
+#include "gen_cpp/PaloBrokerService_types.h"
+#include "gen_cpp/PlanNodes_types.h"
+#include "gen_cpp/Types_types.h"
+#include "gen_cpp/parquet_types.h"
+#include "io/file_reader.h"
+#include "vec/core/block.h"
+#include "vparquet_file_metadata.h"
+
+namespace doris::vectorized {
+
+// struct Statistics {
+// int32_t filtered_row_groups = 0;
+// int32_t total_groups = 0;
+// int64_t filtered_rows = 0;
+// int64_t total_rows = 0;
+// int64_t filtered_total_bytes = 0;
+// int64_t total_bytes = 0;
+// };
+
+// Abstract reader interface over a parquet file: init_reader() and
+// next_batch() are pure virtual, so only derived classes can be instantiated.
+class ParquetReader {
+public:
+    ParquetReader(FileReader* file_reader, int64_t batch_size, int32_t num_of_columns_from_file,
+                  int64_t range_start_offset, int64_t range_size);
+    // Virtual because the class is polymorphic: deleting a derived reader
+    // through a ParquetReader* is undefined behavior otherwise.
+    virtual ~ParquetReader();
+    virtual Status init_reader(const TupleDescriptor* tuple_desc,
+                               const std::vector<SlotDescriptor*>& tuple_slot_descs,
+                               const std::vector<ExprContext*>& conjunct_ctxs,
+                               const std::string& timezone) = 0;
+    virtual Status next_batch(bool* eof) = 0;
+    // std::shared_ptr<Statistics>& statistics() { return _statistics; }
+    // Currently a no-op.
+    void close() {}
+    // Returns the size reported by the underlying file reader. The pointer
+    // parameter is never written; it is kept only for source compatibility.
+    int64_t size(int64_t* size) { return _file_reader->size(); }
+
+private:
+    int64_t _get_row_group_start_offset(const tparquet::RowGroup& row_group);
+
+private:
+    FileReader* _file_reader = nullptr;
+    std::shared_ptr<FileMetaData> _file_metadata;
+    // const int64_t _batch_size;
+    // const int32_t _num_of_columns_from_file;
+    int _total_groups = 0; // num of groups(stripes) of a parquet(orc) file
+    // int _current_group; // current group(stripe)
+    // std::map<std::string, int> _map_column; // column-name <---> column-index
+    // std::vector<int> _include_column_ids; // columns that need to get from file
+    // std::shared_ptr<Statistics> _statistics;
+
+    // parquet file reader object
+    // std::vector<Block*> _batch;
+    // std::string _timezone;
+    // int64_t _range_start_offset;
+    // int64_t _range_size;
+
+private:
+    std::atomic<bool> _closed {false};
+};
+
+} // namespace doris::vectorized
diff --git a/be/test/CMakeLists.txt b/be/test/CMakeLists.txt
index 0bf06f7b45..d05fe0a976 100644
--- a/be/test/CMakeLists.txt
+++ b/be/test/CMakeLists.txt
@@ -59,6 +59,7 @@ set(EXEC_TEST_FILES
     exec/s3_reader_test.cpp
     exec/multi_bytes_separator_test.cpp
     exec/hdfs_file_reader_test.cpp
+    vec/exec/parquet/parquet_thrift_test.cpp
     # exec/new_olap_scan_node_test.cpp
     # exec/pre_aggregation_node_test.cpp
     # exec/partitioned_hash_table_test.cpp
diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
new file mode 100644
index 0000000000..d5ac78264b
--- /dev/null
+++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <memory>
+#include <string>
+
+#include "io/buffered_reader.h"
+#include "io/file_reader.h"
+#include "io/local_file_reader.h"
+#include "util/runtime_profile.h"
+#include "vec/exec/format/parquet/parquet_thrift_util.h"
+#include "vec/exec/format/parquet/vparquet_file_metadata.h"
+
+namespace doris {
+namespace vectorized {
+
+class ParquetThriftReaderTest : public testing::Test {
+public:
+    ParquetThriftReaderTest() {}
+};
+
+// Parses the footer of a local parquet file and logs the decoded metadata.
+TEST_F(ParquetThriftReaderTest, normal) {
+    LocalFileReader reader("./be/test/exec/test_data/parquet_scanner/localfile.parquet", 0);
+
+    // ASSERT (not EXPECT): nothing below is meaningful if the file is not open.
+    auto st = reader.open();
+    ASSERT_TRUE(st.ok());
+
+    std::shared_ptr<FileMetaData> metaData;
+    // The parse result must be checked; on failure metaData stays null and
+    // the dereferences below would crash the test binary.
+    st = parse_thrift_footer(&reader, metaData);
+    ASSERT_TRUE(st.ok());
+    ASSERT_TRUE(metaData != nullptr);
+
+    const tparquet::FileMetaData& t_metadata = metaData->to_thrift_metadata();
+    LOG(WARNING) << "num row groups: " << metaData->num_row_groups();
+    LOG(WARNING) << "num columns: " << metaData->num_columns();
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.row_groups) {
+        LOG(WARNING) << "row group num_rows: " << value.num_rows;
+    }
+    LOG(WARNING) << "=====================================";
+    for (const auto& value : t_metadata.schema) {
+        LOG(WARNING) << "schema column name: " << value.name;
+        LOG(WARNING) << "schema column type: " << value.type;
+        LOG(WARNING) << "schema column repetition_type: " << value.repetition_type;
+        LOG(WARNING) << "schema column num children: " << value.num_children;
+    }
+}
+
+} // namespace vectorized
+
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org