xiaokang commented on code in PR #26749: URL: https://github.com/apache/doris/pull/26749#discussion_r1393507262
########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1462,5 +1481,73 @@ void DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP } } +Status VariantRootColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, + bool* has_null) { + size_t size = dst->size(); + auto& obj = + dst->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*dst); + if (obj.is_null_root()) { + obj.create_root(); + } + auto root_column = obj.get_root(); + RETURN_IF_ERROR(_inner_iter->next_batch(n, root_column, has_null)); + obj.incr_num_rows(*n); + for (auto& entry : obj.get_subcolumns()) { + if (entry->data.size() != size + *n) { + entry->data.insertManyDefaults(*n); Review Comment: size + *n - data.size() ########## be/src/olap/rowset/segment_v2/column_reader.cpp: ########## @@ -1462,5 +1481,73 @@ void DefaultValueColumnIterator::_insert_many_default(vectorized::MutableColumnP } } +Status VariantRootColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr& dst, + bool* has_null) { + size_t size = dst->size(); + auto& obj = + dst->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*dst); + if (obj.is_null_root()) { + obj.create_root(); + } + auto root_column = obj.get_root(); + RETURN_IF_ERROR(_inner_iter->next_batch(n, root_column, has_null)); + obj.incr_num_rows(*n); + for (auto& entry : obj.get_subcolumns()) { + if (entry->data.size() != size + *n) { + entry->data.insertManyDefaults(*n); + } + } + // fill nullmap + if (root_column->is_nullable()) { + DCHECK(dst->is_nullable()); + vectorized::ColumnUInt8& dst_null_map = + assert_cast<vectorized::ColumnNullable&>(*dst).get_null_map_column(); + vectorized::ColumnUInt8& src_null_map = + assert_cast<vectorized::ColumnNullable&>(*root_column).get_null_map_column(); + dst_null_map.insert_range_from(src_null_map, 0, src_null_map.size()); + } +#ifndef NDEBUG + obj.check_consistency(); +#endif + return Status::OK(); +} + +Status VariantRootColumnIterator::read_by_rowids(const rowid_t* rowids, const size_t count, + vectorized::MutableColumnPtr& dst) { + size_t size = dst->size(); + auto& obj = + dst->is_nullable() + ? assert_cast<vectorized::ColumnObject&>( + assert_cast<vectorized::ColumnNullable&>(*dst).get_nested_column()) + : assert_cast<vectorized::ColumnObject&>(*dst); + if (obj.is_null_root()) { + obj.create_root(); + } + auto root_column = obj.get_root(); + RETURN_IF_ERROR(_inner_iter->read_by_rowids(rowids, count, root_column)); + obj.incr_num_rows(count); + for (auto& entry : obj.get_subcolumns()) { + if (entry->data.size() != size + count) { + entry->data.insertManyDefaults(count); Review Comment: same as above ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -318,14 +330,44 @@ Status Segment::_load_index_impl() { }); } +static vectorized::DataTypePtr get_data_type_from_column_meta( + const segment_v2::ColumnMetaPB& column) { + return vectorized::DataTypeFactory::instance().create_data_type(column); +} + +vectorized::DataTypePtr Segment::get_data_type_of(const Field& field, bool ignore_children) const { + // Path has higher priority + if (!field.path().empty()) { + auto node = _sub_column_tree.find_leaf(field.path()); + if (node) { + if (ignore_children || node->children.empty()) { + return node->data.file_column_type; + } + } + // it contains children or column missing in storage, so treat it as variant + return field.is_nullable() + ? vectorized::make_nullable(std::make_shared<vectorized::DataTypeObject>()) + : std::make_shared<vectorized::DataTypeObject>(); + } + // TODO support normal column type + return nullptr; Review Comment: It's not intuitive to return nullprt of normal column. May be we can change the function name related to variant. ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -2002,6 +2123,7 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { _second_read_column_ids.end()) { _replace_version_col(selected_size); } + RETURN_IF_ERROR(_convert_to_expected_type(_second_read_column_ids)); Review Comment: Should _convert_to_expected_type be called before _output_column_by_sel_idx? ########## be/src/olap/rowset/segment_v2/segment_iterator.h: ########## @@ -224,16 +227,46 @@ class SegmentIterator : public RowwiseIterator { uint16_t* sel_rowid_idx, size_t select_size, vectorized::MutableColumns* mutable_columns); + Status copy_column_data_by_selector(vectorized::IColumn* input_col_ptr, + vectorized::MutableColumnPtr& output_col, + uint16_t* sel_rowid_idx, uint16_t select_size, + size_t batch_size); + template <class Container> [[nodiscard]] Status _output_column_by_sel_idx(vectorized::Block* block, const Container& column_ids, uint16_t* sel_rowid_idx, uint16_t select_size) { SCOPED_RAW_TIMER(&_opts.stats->output_col_ns); for (auto cid : column_ids) { int block_cid = _schema_block_id_map[cid]; - RETURN_IF_ERROR(block->copy_column_data_to_block(_current_return_columns[cid].get(), - sel_rowid_idx, select_size, block_cid, + // Only the additional deleted filter condition need to materialize column be at the end of the block + // We should not to materialize the column of query engine do not need. So here just return OK. + // Eg: + // `delete from table where a = 10;` + // `select b from table;` + // a column only effective in segment iterator, the block from query engine only contain the b column. + // so the `block_cid >= data.size()` is true + if (block_cid >= block->columns()) { + continue; Review Comment: We can refactor Block::copy_column_data_to_block and call it with different args instead of copy its code and modify it. ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -1632,19 +1685,35 @@ void SegmentIterator::_init_current_block( for (size_t i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); auto column_desc = _schema->column(cid); - // the column in block must clear() here to insert new data - if (_is_pred_column[cid] || - i >= block->columns()) { //todo(wb) maybe we can release it after output block - current_columns[cid]->clear(); - } else { // non-predicate column - current_columns[cid] = std::move(*block->get_by_position(i).column).mutate(); - - if (column_desc->type() == FieldType::OLAP_FIELD_TYPE_DATE) { - current_columns[cid]->set_date_type(); - } else if (column_desc->type() == FieldType::OLAP_FIELD_TYPE_DATETIME) { - current_columns[cid]->set_datetime_type(); - } + if (!_is_pred_column[cid] && + !_segment->same_with_storage_type( Review Comment: It will affect column which is not variant. Is it safe? ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -318,14 +330,44 @@ Status Segment::_load_index_impl() { }); } +static vectorized::DataTypePtr get_data_type_from_column_meta( Review Comment: Is it worth to wrap a function ? ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -1891,6 +1960,49 @@ Status SegmentIterator::next_batch(vectorized::Block* block) { return status; } +Status SegmentIterator::_convert_to_expected_type(const std::vector<ColumnId>& col_ids) { + for (ColumnId i : col_ids) { + if (_current_return_columns[i] == nullptr || _converted_column_ids[i] || + _is_pred_column[i]) { + continue; + } + if (!_segment->same_with_storage_type( + i, *_schema, _opts.io_ctx.reader_type != ReaderType::READER_QUERY)) { + const Field* field_type = _schema->column(i); + vectorized::DataTypePtr expected_type = Schema::get_data_type_ptr(*field_type); + vectorized::DataTypePtr file_column_type = _storage_name_and_type[i].second; + vectorized::ColumnPtr expected; + vectorized::ColumnPtr original = + _current_return_columns[i]->assume_mutable()->get_ptr(); + RETURN_IF_ERROR(vectorized::schema_util::cast_column({original, file_column_type, ""}, + expected_type, &expected)); + _current_return_columns[i] = expected->assume_mutable(); + _converted_column_ids[i] = 1; + VLOG_DEBUG << fmt::format("Convert {} fom file column type {} to {}, num_rows {}", + field_type->path().get_path(), file_column_type->get_name(), + expected_type->get_name(), + _current_return_columns[i]->size()); + } + } + return Status::OK(); +} + +Status SegmentIterator::copy_column_data_by_selector(vectorized::IColumn* input_col_ptr, + vectorized::MutableColumnPtr& output_col, Review Comment: We can refactor Block::copy_column_data_to_block and call it with different args instead of copy its code and modify it. ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -1903,13 +2015,17 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { _block_rowids.resize(_opts.block_row_max); } _current_return_columns.resize(_schema->columns().size()); + _converted_column_ids.resize(_schema->columns().size(), 0); for (size_t i = 0; i < _schema->num_column_ids(); i++) { auto cid = _schema->column_id(i); auto column_desc = _schema->column(cid); if (_is_pred_column[cid]) { - RETURN_IF_CATCH_EXCEPTION(_current_return_columns[cid] = - Schema::get_predicate_column_ptr( - *column_desc, _opts.io_ctx.reader_type)); + auto storage_column_type = _storage_name_and_type[cid].second; + RETURN_IF_CATCH_EXCEPTION( + _current_return_columns[cid] = Schema::get_predicate_column_ptr( + _is_char_type[cid] ? FieldType::OLAP_FIELD_TYPE_CHAR Review Comment: Why do special process for char? ########## be/src/olap/rowset/segment_v2/segment.h: ########## @@ -84,7 +91,14 @@ class Segment : public std::enable_shared_from_this<Segment> { uint32_t num_rows() const { return _num_rows; } Status new_column_iterator(const TabletColumn& tablet_column, - std::unique_ptr<ColumnIterator>* iter); + std::unique_ptr<ColumnIterator>* iter, + StorageReadOptions* opt = nullptr); + + Status new_iterator_with_path(const TabletColumn& tablet_column, Review Comment: use consistent name new_column_iterator_with_path ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -276,7 +282,12 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { _file_reader = _segment->_file_reader; _opts = opts; _col_predicates.clear(); + for (auto& predicate : opts.column_predicates) { + if (!_segment->can_apply_predicate_safely(predicate->column_id(), predicate, *_schema, + _opts.io_ctx.reader_type)) { + continue; Review Comment: column_predicates id assumed to be push down in storage layer and removed from expr in compute layer. I think it will cause wrongt result if continue. ########## be/src/olap/rowset/segment_v2/segment_iterator.cpp: ########## @@ -1923,12 +2039,13 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { // TODO: skip read the not effective delete column to speed up segment read. _current_return_columns[cid] = Schema::get_data_type_ptr(*column_desc)->create_column(); + ; Review Comment: uesless code ########## be/src/olap/rowset/segment_v2/segment.cpp: ########## @@ -483,5 +649,25 @@ Status Segment::read_key_by_rowid(uint32_t row_id, std::string* key) { return Status::OK(); } +bool Segment::same_with_storage_type(int32_t cid, const Schema& schema, + bool ignore_children) const { + auto file_column_type = get_data_type_of(*schema.column(cid), ignore_children); + auto expected_type = Schema::get_data_type_ptr(*schema.column(cid)); + // ignore struct and map now + auto type_without_nullable = vectorized::remove_nullable(expected_type); + if (vectorized::WhichDataType(type_without_nullable).is_struct() || + vectorized::WhichDataType(type_without_nullable).is_map()) { + return true; Review Comment: Why? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org