yiguolei commented on code in PR #14480: URL: https://github.com/apache/doris/pull/14480#discussion_r1028947971
########## be/src/vec/common/hash_table/partitioned_hash_table.h: ########## @@ -0,0 +1,666 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/HashTable/TwoLevelHashTable.h +// and modified by Doris +#pragma once + +#include "vec/common/hash_table/hash_table.h" + +/** Partitioned hash table. + * Represents 16 (or 1ULL << BITS_FOR_SUB_TABLE) small hash tables (sub table count of the first level). + * To determine which one to use, one of the bytes of the hash function is taken. + * + * Usually works a little slower than a simple hash table. + * However, it has advantages in some cases: + * - if you need to merge two hash tables together, then you can easily parallelize it by sub tables; + * - delay during resizes is amortized, since the small hash tables will be resized separately; + * - in theory, resizes are cache-local in a larger range of sizes. + */ + +template <size_t initial_size_degree = 8> +struct PartitionedHashTableGrower : public HashTableGrowerWithPrecalculation<initial_size_degree> { + /// Increase the size of the hash table. + void increase_size() { this->increase_size_degree(this->size_degree() >= 15 ? 
1 : 2); } +}; + +template <typename Key, typename Cell, typename Hash, typename Grower, typename Allocator, + typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>, + bool ENABLE_PARTITIONED = false, size_t BITS_FOR_SUB_TABLE = 4> +class PartitionedHashTable : private boost::noncopyable, + protected Hash /// empty base optimization +{ +public: + using Impl = ImplTable; + + using key_type = typename Impl::key_type; + using mapped_type = typename Impl::mapped_type; + using value_type = typename Impl::value_type; + using cell_type = typename Impl::cell_type; + + using LookupResult = typename Impl::LookupResult; + using ConstLookupResult = typename Impl::ConstLookupResult; + +protected: + friend class const_iterator; + friend class iterator; + + using HashValue = size_t; + using Self = PartitionedHashTable; + +private: + static constexpr size_t NUM_LEVEL1_SUB_TABLES = 1ULL << BITS_FOR_SUB_TABLE; + static constexpr size_t MAX_SUB_TABLE = NUM_LEVEL1_SUB_TABLES - 1; + + //factor that will trigger growing the hash table on insert. + static constexpr float MAX_SUB_TABLE_OCCUPANCY_FRACTION = 0.5f; + + static const int PARTITIONED_BUCKET_THRESHOLD = 8388608; Review Comment: It should not be a const variable; it should be passed in from a session variable. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org