imay commented on a change in pull request #1756: Encapsulate HLL logic URL: https://github.com/apache/incubator-doris/pull/1756#discussion_r322049774
########## File path: be/src/olap/hll.h ########## @@ -33,14 +33,339 @@ const static int HLL_REGISTERS_COUNT = 16384; // maximum size in byte of serialized HLL: type(1) + registers (2^14) const static int HLL_COLUMN_DEFAULT_LEN = 16385; -struct HllContext { - bool has_value; - bool has_sparse_or_full; - char registers[HLL_REGISTERS_COUNT]; - std::set<uint64_t>* hash64_set = nullptr; +// Hyperloglog distinct estimate algorithm. +// See these papers for more details. +// 1) Hyperloglog: The analysis of a near-optimal cardinality estimation +// algorithm (2007) +// 2) HyperLogLog in Practice (paper from google with some improvements) + +// 通过varchar的变长编码方式实现hll集合 +// 实现hll列中间计算结果的处理 +// empty 空集合 +// explicit 存储64位hash值的集合 +// sparse 存储hll非0的register +// full 存储全部的hll register +// empty -> explicit -> sparse -> full 四种类型的转换方向不可逆 +// 第一个字节存放hll集合的类型 0:empty 1:explicit 2:sparse 3:full +// 已决定后面的数据怎么解析 +class HyperLogLog { +public: + HyperLogLog(): _type(HLL_DATA_EMPTY){ + memset(_registers, 0, HLL_REGISTERS_COUNT); + } + + explicit HyperLogLog(uint64_t hash_value): _type(HLL_DATA_EXPLICIT) { + _hash_set.emplace(hash_value); + } + + explicit HyperLogLog(char* src) { + _type = (HllDataType)src[0]; + memset(_registers, 0, HLL_REGISTERS_COUNT); + char* sparse_data = nullptr; + switch (_type) { + case HLL_DATA_EXPLICIT: + // first byte : type + // second~five byte : hash values's number + // five byte later : hash value + { + auto _explicit_num = (uint8_t) (src[sizeof(SetTypeValueType)]); + auto *_explicit_value = (uint64_t *) (src + sizeof(SetTypeValueType) + sizeof(uint8_t)); + for (int i = 0; i < _explicit_num; ++i) { + _hash_set.insert(_explicit_value[i]); + } + } + break; + case HLL_DATA_SPRASE: + // first byte : type + // second ~(2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero + // 2^HLL_COLUMN_PRECISION)/8 + 1以后value + { + auto* _sparse_count = (SparseLengthValueType*)(src + sizeof (SetTypeValueType)); + sparse_data = src + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType); + std::map<SparseIndexType, SparseValueType> _sparse_map; + for (int i = 0; i < *_sparse_count; i++) { + auto* index = (SparseIndexType*)sparse_data; + sparse_data += sizeof(SparseIndexType); + auto* value = (SparseValueType*)sparse_data; + _sparse_map[*index] = *value; + sparse_data += sizeof(SetTypeValueType); + } + + for (auto iter: _sparse_map) { + _registers[iter.first] = (uint8_t)iter.second; + } + } + break; + case HLL_DATA_FULL: + // first byte : type + // second byte later : hll register value + { + char* _full_value_position = src + sizeof (SetTypeValueType); + memcpy(_registers, _full_value_position, HLL_REGISTERS_COUNT); + } + break; + case HLL_DATA_EMPTY: + break; + default: + break; + } + } + + typedef uint8_t SetTypeValueType; + typedef int32_t SparseLengthValueType; + typedef uint16_t SparseIndexType; + typedef uint8_t SparseValueType; + + static void update_registers(char* registers, uint64_t hash_value) { + // Use the lower bits to index into the number of streams and then + // find the first 1 bit after the index bits. + int idx = hash_value % HLL_REGISTERS_COUNT; + uint8_t first_one_bit = __builtin_ctzl(hash_value >> HLL_COLUMN_PRECISION) + 1; + registers[idx] = std::max((uint8_t)registers[idx], first_one_bit); + } + + static void merge_hash_set_to_registers(char* registers, const std::set<uint64_t>& hash_set) { + for (auto hash_value: hash_set) { + update_registers(registers, hash_value); + } + } + + static void merge_registers(char* registers, const char* other_registers) { + for (int i = 0; i < doris::HLL_REGISTERS_COUNT; ++i) { + registers[i] = std::max(registers[i], other_registers[i]); + } + } + + static int serialize_full(char* result, char* registers) { + result[0] = HLL_DATA_FULL; + memcpy(result + 1, registers, HLL_REGISTERS_COUNT); + return HLL_COLUMN_DEFAULT_LEN; + } + + static int serialize_sparse(char *result, const std::map<int, uint8_t>& index_to_value) { + result[0] = HLL_DATA_SPRASE; + int len = sizeof(SetTypeValueType) + sizeof(SparseLengthValueType); + char* write_value_pos = result + len; + for (auto iter = index_to_value.begin(); + iter != index_to_value.end(); iter++) { + write_value_pos[0] = (char)(iter->first & 0xff); + write_value_pos[1] = (char)(iter->first >> 8 & 0xff); + write_value_pos[2] = iter->second; + write_value_pos += 3; + } + int registers_count = index_to_value.size(); + len += registers_count * (sizeof(SparseIndexType)+ sizeof(SparseValueType)); + *(int*)(result + 1) = registers_count; + return len; + } + + static int serialize_explicit(char* result, const std::set<uint64_t>& hash_value_set) { + result[0] = HLL_DATA_EXPLICIT; + result[1] = (uint8_t)(hash_value_set.size()); + int len = sizeof(SetTypeValueType) + sizeof(uint8_t); + char* write_pos = result + len; + for (auto iter = hash_value_set.begin(); + iter != hash_value_set.end(); iter++) { + uint64_t hash_value = *iter; + *(uint64_t*)write_pos = hash_value; + write_pos += 8; + } + len += sizeof(uint64_t) * hash_value_set.size(); + return len; + } + + // change the _type to HLL_DATA_FULL directly has two reasons: + // 1. keep behavior consistent with before + // 2. make the code logic is simple + void update(const uint64_t hash_value) { + _type = HLL_DATA_FULL; + update_registers(_registers, hash_value); + } + + void merge(const HyperLogLog& other) { Review comment: For large code block, we'd better to write it in cpp/cc files to accelerate our compile time ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services --------------------------------------------------------------------- To unsubscribe, e-mail: dev-unsubscr...@doris.apache.org For additional commands, e-mail: dev-h...@doris.apache.org