github-actions[bot] commented on code in PR #24403: URL: https://github.com/apache/doris/pull/24403#discussion_r1389097936
########## be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp: ########## @@ -0,0 +1,1008 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "olap/rowset/segment_v2/vertical_segment_writer.h" + +#include <gen_cpp/segment_v2.pb.h> +#include <parallel_hashmap/phmap.h> + +#include <algorithm> +#include <cassert> +#include <ostream> +#include <unordered_map> +#include <utility> + +#include "cloud/config.h" +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" +#include "common/logging.h" // LOG +#include "gutil/port.h" +#include "io/fs/file_writer.h" +#include "olap/data_dir.h" +#include "olap/key_coder.h" +#include "olap/olap_common.h" +#include "olap/primary_key_index.h" +#include "olap/row_cursor.h" // RowCursor // IWYU pragma: keep +#include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext +#include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter +#include "olap/rowset/segment_v2/page_io.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "olap/segment_loader.h" +#include "olap/short_key_index.h" +#include "olap/tablet_schema.h" +#include "olap/utils.h" +#include "runtime/memory/mem_tracker.h" +#include "service/point_query_executor.h" 
+#include "util/coding.h" +#include "util/crc32c.h" +#include "util/faststring.h" +#include "util/key_util.h" +#include "vec/columns/column_nullable.h" +#include "vec/common/schema_util.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/io/reader_buffer.h" +#include "vec/jsonb/serialize.h" +#include "vec/olap/olap_data_convertor.h" + +namespace doris { +namespace segment_v2 { Review Comment: warning: nested namespaces can be concatenated [modernize-concat-nested-namespaces] ```suggestion namespace doris::segment_v2 { ``` be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp:1006: ```diff - } // namespace segment_v2 - } // namespace doris + } // namespace doris ``` ########## be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp: ########## @@ -0,0 +1,1008 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/rowset/segment_v2/vertical_segment_writer.h" + +#include <gen_cpp/segment_v2.pb.h> +#include <parallel_hashmap/phmap.h> + +#include <algorithm> +#include <cassert> +#include <ostream> +#include <unordered_map> +#include <utility> + +#include "cloud/config.h" +#include "common/compiler_util.h" // IWYU pragma: keep +#include "common/config.h" +#include "common/logging.h" // LOG +#include "gutil/port.h" +#include "io/fs/file_writer.h" +#include "olap/data_dir.h" +#include "olap/key_coder.h" +#include "olap/olap_common.h" +#include "olap/primary_key_index.h" +#include "olap/row_cursor.h" // RowCursor // IWYU pragma: keep +#include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext +#include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter +#include "olap/rowset/segment_v2/page_io.h" +#include "olap/rowset/segment_v2/page_pointer.h" +#include "olap/segment_loader.h" +#include "olap/short_key_index.h" +#include "olap/tablet_schema.h" +#include "olap/utils.h" +#include "runtime/memory/mem_tracker.h" +#include "service/point_query_executor.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/faststring.h" +#include "util/key_util.h" +#include "vec/columns/column_nullable.h" +#include "vec/common/schema_util.h" +#include "vec/core/block.h" +#include "vec/core/column_with_type_and_name.h" +#include "vec/core/types.h" +#include "vec/io/reader_buffer.h" +#include "vec/jsonb/serialize.h" +#include "vec/olap/olap_data_convertor.h" + +namespace doris { +namespace segment_v2 { + +using namespace ErrorCode; + +static const char* k_segment_magic = "D0R1"; +static const uint32_t k_segment_magic_length = 4; + +VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, + TabletSchemaSPtr tablet_schema, BaseTabletSPtr tablet, + DataDir* data_dir, uint32_t max_row_per_segment, + const VerticalSegmentWriterOptions& opts, + std::shared_ptr<MowContext> mow_context) + : _segment_id(segment_id), + 
_tablet_schema(std::move(tablet_schema)), + _tablet(std::move(tablet)), + _data_dir(data_dir), + _opts(opts), + _file_writer(file_writer), + _mem_tracker(std::make_unique<MemTracker>("VerticalSegmentWriter:Segment-" + + std::to_string(segment_id))), + _mow_context(std::move(mow_context)) { + CHECK_NOTNULL(file_writer); + _num_key_columns = _tablet_schema->num_key_columns(); + _num_short_key_columns = _tablet_schema->num_short_key_columns(); + DCHECK(_num_key_columns >= _num_short_key_columns); + for (size_t cid = 0; cid < _num_key_columns; ++cid) { + const auto& column = _tablet_schema->column(cid); + _key_coders.push_back(get_key_coder(column.type())); + _key_index_size.push_back(column.index_length()); + } + // encode the sequence id into the primary key index + if (_tablet_schema->has_sequence_col() && _tablet_schema->keys_type() == UNIQUE_KEYS && + _opts.enable_unique_key_merge_on_write) { + const auto& column = _tablet_schema->column(_tablet_schema->sequence_col_idx()); + _seq_coder = get_key_coder(column.type()); + } +} + +VerticalSegmentWriter::~VerticalSegmentWriter() { + _mem_tracker->release(_mem_tracker->consumption()); +} + +void VerticalSegmentWriter::_init_column_meta(ColumnMetaPB* meta, uint32_t column_id, + const TabletColumn& column) { + meta->set_column_id(column_id); + meta->set_unique_id(column.unique_id()); + meta->set_type(int(column.type())); + meta->set_length(column.length()); + meta->set_encoding(DEFAULT_ENCODING); + meta->set_compression(_opts.compression_type); + meta->set_is_nullable(column.is_nullable()); + for (uint32_t i = 0; i < column.get_subtype_count(); ++i) { + _init_column_meta(meta->add_children_columns(), column_id, column.get_sub_column(i)); + } +} + +Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& column) { + ColumnWriterOptions opts; + opts.meta = _footer.add_columns(); + + _init_column_meta(opts.meta, cid, column); + + // now we create zone map for key columns in AGG_KEYS or all 
column in UNIQUE_KEYS or DUP_KEYS + // and not support zone map for array type and jsonb type. + opts.need_zone_map = (column.is_key() || _tablet_schema->keys_type() != KeysType::AGG_KEYS) && + column.type() != FieldType::OLAP_FIELD_TYPE_OBJECT; + opts.need_bloom_filter = column.is_bf_column(); + auto* tablet_index = _tablet_schema->get_ngram_bf_index(column.unique_id()); + if (tablet_index) { + opts.need_bloom_filter = true; + opts.is_ngram_bf_index = true; + opts.gram_size = tablet_index->get_gram_size(); + opts.gram_bf_size = tablet_index->get_gram_bf_size(); + } + + opts.need_bitmap_index = column.has_bitmap_index(); + bool skip_inverted_index = false; + if (_opts.rowset_ctx != nullptr) { + // skip write inverted index for index compaction + skip_inverted_index = _opts.rowset_ctx->skip_inverted_index.count(column.unique_id()) > 0; + } + // skip write inverted index on load if skip_write_index_on_load is true + if (_opts.write_type == DataWriteType::TYPE_DIRECT && + _tablet_schema->skip_write_index_on_load()) { + skip_inverted_index = true; + } + // indexes for this column + opts.indexes = _tablet_schema->get_indexes_for_column(column.unique_id()); + for (auto index : opts.indexes) { + if (!skip_inverted_index && index && index->index_type() == IndexType::INVERTED) { + opts.inverted_index = index; + // TODO support multiple inverted index + break; + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_STRUCT) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for struct type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for struct type"); + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for array type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for 
array type"); + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_JSONB) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for jsonb type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for jsonb type"); + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_AGG_STATE) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for agg_state type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for agg_state type"); + } + } + if (column.type() == FieldType::OLAP_FIELD_TYPE_MAP) { + opts.need_zone_map = false; + if (opts.need_bloom_filter) { + return Status::NotSupported("Do not support bloom filter for map type"); + } + if (opts.need_bitmap_index) { + return Status::NotSupported("Do not support bitmap index for map type"); + } + } + + if (column.is_row_store_column()) { + // smaller page size for row store column + opts.data_page_size = config::row_column_page_size; + } + + std::unique_ptr<ColumnWriter> writer; + RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); + RETURN_IF_ERROR(writer->init()); + _column_writers.push_back(std::move(writer)); + + _olap_data_convertor->add_column_data_convertor(column); + return Status::OK(); +}; + +Status VerticalSegmentWriter::init() { + DCHECK(_column_writers.empty()); + if (_opts.compression_type == UNKNOWN_COMPRESSION) { + _opts.compression_type = _tablet_schema->compression_type(); + } + _olap_data_convertor = std::make_unique<vectorized::OlapBlockDataConvertor>(); + _olap_data_convertor->reserve(_tablet_schema->num_columns()); + _column_writers.reserve(_tablet_schema->columns().size()); + // we don't need the short key index for unique key merge on write table. 
+ if (_tablet_schema->keys_type() == UNIQUE_KEYS && _opts.enable_unique_key_merge_on_write) { + size_t seq_col_length = 0; + if (_tablet_schema->has_sequence_col()) { + seq_col_length = + _tablet_schema->column(_tablet_schema->sequence_col_idx()).length() + 1; + } + _primary_key_index_builder.reset(new PrimaryKeyIndexBuilder(_file_writer, seq_col_length)); + RETURN_IF_ERROR(_primary_key_index_builder->init()); + } else { + _short_key_index_builder.reset( + new ShortKeyIndexBuilder(_segment_id, _opts.num_rows_per_block)); + } + return Status::OK(); +} + +void VerticalSegmentWriter::_maybe_invalid_row_cache(const std::string& key) const { + // Just invalid row cache for simplicity, since the rowset is not visible at present. + // If we update/insert cache, if load failed rowset will not be visible but cached data + // will be visible, and lead to inconsistency. + if (!config::disable_storage_row_cache && _tablet_schema->store_row_column() && + _opts.write_type == DataWriteType::TYPE_DIRECT) { + // invalidate cache + RowCache::instance()->erase({_opts.rowset_ctx->tablet_id, key}); + } +} + +void VerticalSegmentWriter::_serialize_block_to_row_column(vectorized::Block& block) { Review Comment: warning: method '_serialize_block_to_row_column' can be made static [readability-convert-member-functions-to-static] be/src/olap/rowset/segment_v2/vertical_segment_writer.h:141: ```diff - void _serialize_block_to_row_column(vectorized::Block& block); + static void _serialize_block_to_row_column(vectorized::Block& block); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For queries about this service, please contact Infrastructure at: us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org