AshinGau commented on code in PR #32438:
URL: https://github.com/apache/doris/pull/32438#discussion_r1529656171


##########
be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:
##########
@@ -728,28 +729,40 @@ Status StructColumnReader::read_column_data(ColumnPtr& 
doris_column, DataTypePtr
     }
 
     auto& doris_struct = static_cast<ColumnStruct&>(*data_column);
-    if (_child_readers.size() != doris_struct.tuple_size()) {
-        return Status::InternalError("Wrong number of struct fields");
-    }
     const DataTypeStruct* doris_struct_type =
             reinterpret_cast<const 
DataTypeStruct*>(remove_nullable(type).get());
-    for (int i = 0; i < doris_struct.tuple_size(); ++i) {
+
+    bool least_one_reader = false;
+    size_t least_one_idx = 0;
+    std::vector<size_t> missing_column_idxs {};
+
+    for (size_t i = 0; i < doris_struct.tuple_size(); ++i) {
         ColumnPtr& doris_field = doris_struct.get_column_ptr(i);
         DataTypePtr& doris_type = 
const_cast<DataTypePtr&>(doris_struct_type->get_element(i));
+        String& doris_name = 
const_cast<String&>(doris_struct_type->get_element_name(i));

Review Comment:
   Compatible with the situation where the field names is case-insensitive?



##########
be/src/vec/exec/format/parquet/vparquet_column_reader.cpp:
##########
@@ -759,9 +772,31 @@ Status StructColumnReader::read_column_data(ColumnPtr& 
doris_column, DataTypePtr
         }
     }
 
+    if (!least_one_reader) {
+        // TODO: support read struct which columns are all missing
+        return Status::Corruption("Not support read struct '{}' which columns 
are all missing",
+                                  _field_schema->name);
+    }
+
+    // fill missing column with null or default value

Review Comment:
   No default value for nested fields, just call `insert_many_defaults`



##########
be/src/vec/exec/format/parquet/vparquet_column_reader.h:
##########
@@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader {
             : ParquetColumnReader(row_ranges, ctz, io_ctx) {}
     ~StructColumnReader() override { close(); }
 
-    Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& 
child_readers,
-                FieldSchema* field);
+    Status init(
+            std::unordered_map<std::string, 
std::unique_ptr<ParquetColumnReader>>&& child_readers,
+            FieldSchema* field);
     Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type,
                             ColumnSelectVector& select_vector, size_t 
batch_size, size_t* read_rows,
                             bool* eof, bool is_dict_filter) override;
 
     const std::vector<level_t>& get_rep_level() const override {
-        return _child_readers[0]->get_rep_level();
+        return _child_readers.begin()->second->get_rep_level();

Review Comment:
   Only the read fields can return right levels, the missing fields will return 
empty level value.



##########
be/src/vec/exec/format/parquet/vparquet_column_reader.h:
##########
@@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader {
             : ParquetColumnReader(row_ranges, ctz, io_ctx) {}
     ~StructColumnReader() override { close(); }
 
-    Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& 
child_readers,
-                FieldSchema* field);
+    Status init(
+            std::unordered_map<std::string, 
std::unique_ptr<ParquetColumnReader>>&& child_readers,
+            FieldSchema* field);
     Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type,
                             ColumnSelectVector& select_vector, size_t 
batch_size, size_t* read_rows,
                             bool* eof, bool is_dict_filter) override;
 
     const std::vector<level_t>& get_rep_level() const override {
-        return _child_readers[0]->get_rep_level();
+        return _child_readers.begin()->second->get_rep_level();
     }
     const std::vector<level_t>& get_def_level() const override {
-        return _child_readers[0]->get_def_level();
+        return _child_readers.begin()->second->get_def_level();
     }
 
     Statistics statistics() override {
         Statistics st;
         for (const auto& reader : _child_readers) {
-            Statistics cst = reader->statistics();
+            Statistics cst = reader.second->statistics();

Review Comment:
   This may be correct, but some fields may not have been read



##########
be/src/vec/exec/format/parquet/vparquet_column_reader.h:
##########
@@ -262,23 +263,24 @@ class StructColumnReader : public ParquetColumnReader {
             : ParquetColumnReader(row_ranges, ctz, io_ctx) {}
     ~StructColumnReader() override { close(); }
 
-    Status init(std::vector<std::unique_ptr<ParquetColumnReader>>&& 
child_readers,
-                FieldSchema* field);
+    Status init(
+            std::unordered_map<std::string, 
std::unique_ptr<ParquetColumnReader>>&& child_readers,
+            FieldSchema* field);
     Status read_column_data(ColumnPtr& doris_column, DataTypePtr& type,
                             ColumnSelectVector& select_vector, size_t 
batch_size, size_t* read_rows,
                             bool* eof, bool is_dict_filter) override;
 
     const std::vector<level_t>& get_rep_level() const override {
-        return _child_readers[0]->get_rep_level();
+        return _child_readers.begin()->second->get_rep_level();

Review Comment:
   So the tow level nested type like `struct<struct<>>` will be parsed badly.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to