AshinGau commented on code in PR #32438: URL: https://github.com/apache/doris/pull/32438#discussion_r1529656171
########## be/src/vec/exec/format/parquet/vparquet_column_reader.cpp: ########## @@ -728,28 +729,40 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr } auto& doris_struct = static_cast<ColumnStruct&>(*data_column); - if (_child_readers.size() != doris_struct.tuple_size()) { - return Status::InternalError("Wrong number of struct fields"); - } const DataTypeStruct* doris_struct_type = reinterpret_cast<const DataTypeStruct*>(remove_nullable(type).get()); - for (int i = 0; i < doris_struct.tuple_size(); ++i) { + + bool least_one_reader = false; + size_t least_one_idx = 0; + std::vector<size_t> missing_column_idxs {}; + + for (size_t i = 0; i < doris_struct.tuple_size(); ++i) { ColumnPtr& doris_field = doris_struct.get_column_ptr(i); DataTypePtr& doris_type = const_cast<DataTypePtr&>(doris_struct_type->get_element(i)); + String& doris_name = const_cast<String&>(doris_struct_type->get_element_name(i)); Review Comment: Compatible with the situation where the field names is case-insensitive? ########## be/src/vec/exec/format/parquet/vparquet_column_reader.cpp: ########## @@ -759,9 +772,31 @@ Status StructColumnReader::read_column_data(ColumnPtr& doris_column, DataTypePtr } } + if (!least_one_reader) { + // TODO: support read struct which columns are all missing + return Status::Corruption("Not support read struct '{}' which columns are all missing", + _field_schema->name); + } + + // fill missing column with null or default value Review Comment: No default value for nested fields, just call `insert_many_defaults` ########## be/src/vec/exec/format/parquet/vparquet_column_reader.h: ########## @@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader { : ParquetColumnReader(row_ranges, ctz, io_ctx) {} ~StructColumnReader() override { close(); } - Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& child_readers, - FieldSchema* field); + Status init( + std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>>&& child_readers, + FieldSchema* field); Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) override; const std::vector<level_t>& get_rep_level() const override { - return _child_readers[0]->get_rep_level(); + return _child_readers.begin()->second->get_rep_level(); Review Comment: Only the read fields can return right levels, the missing fields will return empty level value. ########## be/src/vec/exec/format/parquet/vparquet_column_reader.h: ########## @@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader { : ParquetColumnReader(row_ranges, ctz, io_ctx) {} ~StructColumnReader() override { close(); } - Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& child_readers, - FieldSchema* field); + Status init( + std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>>&& child_readers, + FieldSchema* field); Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) override; const std::vector<level_t>& get_rep_level() const override { - return _child_readers[0]->get_rep_level(); + return _child_readers.begin()->second->get_rep_level(); } const std::vector<level_t>& get_def_level() const override { - return _child_readers[0]->get_def_level(); + return _child_readers.begin()->second->get_def_level(); } Statistics statistics() override { Statistics st; for (const auto& reader : _child_readers) { - Statistics cst = reader->statistics(); + Statistics cst = reader.second->statistics(); Review Comment: This may be correct, but some fields may not have been read ########## be/src/vec/exec/format/parquet/vparquet_column_reader.h: ########## @@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader { : ParquetColumnReader(row_ranges, ctz, io_ctx) {} ~StructColumnReader() override { close(); } - Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& child_readers, - FieldSchema* field); + Status init( + std::unordered_map<std::string, std::unique_ptr<ParquetColumnReader>>&& child_readers, + FieldSchema* field); Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type, ColumnSelectVector& select_vector, size_t batch_size, size_t* read_rows, bool* eof, bool is_dict_filter) override; const std::vector<level_t>& get_rep_level() const override { - return _child_readers[0]->get_rep_level(); + return _child_readers.begin()->second->get_rep_level(); Review Comment: So the tow level nested type like `struct<struct<>>` will be parsed badly. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org