gaodayue commented on a change in pull request #1409: Add dict page URL: https://github.com/apache/incubator-doris/pull/1409#discussion_r300385330
########## File path: be/src/olap/rowset/segment_v2/binary_dict_page.h ########## @@ -0,0 +1,127 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <memory> +#include <unordered_map> +#include <functional> +#include <string> + +#include "olap/olap_common.h" +#include "olap/types.h" +#include "olap/rowset/segment_v2/binary_plain_page.h" +#include "olap/rowset/segment_v2/options.h" +#include "olap/rowset/segment_v2/common.h" +#include "runtime/mem_pool.h" +#include "gen_cpp/segment_v2.pb.h" + +namespace doris { +namespace segment_v2 { + +enum { + BINARY_DICT_PAGE_HEADER_SIZE = 4 +}; + +// This type of page uses dictionary encoding for strings. +// There is only one dictionary page for all the data pages within a column. +// +// Layout for dictionary encoded page: +// Either header + embedded codeword page, which can be encoded with any +// int PageBuilder, when mode_ = DICT_ENCODING. +// Or header + embedded BinaryPlainPage, when mode_ = PLAIN_ENCODING. +// Data pages start with mode_ = DICT_ENCODING, when the size of the dictionary +// page goes beyond the option_->dict_page_size, the subsequent data pages will switch +// to string plain page automatically. 
+class BinaryDictPageBuilder : public PageBuilder { +public: + BinaryDictPageBuilder(const PageBuilderOptions& options); + + bool is_page_full() override; + + Status add(const uint8_t* vals, size_t* count) override; + + Slice finish() override; + + void reset() override; + + size_t count() const override; + + Status get_dictionary_page(Slice* dictionary_page) override; + + // this api will release the memory ownership of encoded data + // Note: + // release() should be called after finish + // reset() should be called after this function before reuse the builder + void release() override { + uint8_t* ret = _buffer.release(); + (void)ret; + } + +private: + PageBuilderOptions _options; + bool _finished; + + std::unique_ptr<PageBuilder> _data_page_builder; + + std::unique_ptr<BinaryPlainPageBuilder> _dict_builder; + + EncodingTypePB _encoding_type; + struct HashOfSlice { + size_t operator()(const Slice& slice) const { + return std::hash<std::string>()(std::string(slice.data, slice.size)); + } + }; + // query for dict item -> dict id + std::unordered_map<Slice, uint32_t, HashOfSlice> _dictionary; + // for all dict items by insert sequence Review comment: ```suggestion // used to remember the insertion order of dict keys ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org For additional commands, e-mail: dev-h...@doris.apache.org