This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 6ec1a86201d [Bug](hash) fix wrong HashLen16 implement and add cityhash64 unit test (#46928) 6ec1a86201d is described below commit 6ec1a86201decb7a749d1de1144144c52a36e0ee Author: Pxl <x...@selectdb.com> AuthorDate: Tue Jan 14 11:11:32 2025 +0800 [Bug](hash) fix wrong HashLen16 implement and add cityhash64 unit test (#46928) ### What problem does this PR solve? fix wrtong HashLen16 implement and add cityhash64 unit test HashLen16 was incorrectly modified in #35841, which would cause ngram indexes built in 2.0/2.1 to get incorrect results in 3.0 --- be/src/gutil/hash/city.cc | 10 +- be/src/gutil/hash/city.h | 9 +- .../olap/rowset/segment_v2/ngram_bloom_filter.cpp | 2 - be/src/olap/rowset/segment_v2/ngram_bloom_filter.h | 2 + be/src/vec/common/string_ref.h | 10 - be/test/util/cityhash_test.cpp | 232 +++++++++++++++++++++ 6 files changed, 243 insertions(+), 22 deletions(-) diff --git a/be/src/gutil/hash/city.cc b/be/src/gutil/hash/city.cc index d4d53ab2f6b..4e34959ea08 100644 --- a/be/src/gutil/hash/city.cc +++ b/be/src/gutil/hash/city.cc @@ -73,7 +73,7 @@ uint64 HashLen16(uint64 u, uint64 v) { const uint64 kMul = 0xc6a4a7935bd1e995ULL; uint64 a = (u ^ v) * kMul; a ^= (a >> 47); - uint64 b = (v ^ a) * kMul; + uint64 b = (u ^ a) * kMul; b ^= (b >> 47); b *= kMul; return b; @@ -199,11 +199,11 @@ uint64 CityHash64(const char* s, size_t len) { HashLen16(v.second, w.second) + x); } -uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) { - return CityHash64WithSeeds(s, len, k2, seed); -} - uint64 CityHash64WithSeeds(const char* s, size_t len, uint64 seed0, uint64 seed1) { return HashLen16(CityHash64(s, len) - seed0, seed1); } + +uint64 CityHash64WithSeed(const char* s, size_t len, uint64 seed) { + return CityHash64WithSeeds(s, len, k2, seed); +} } // namespace util_hash diff --git a/be/src/gutil/hash/city.h b/be/src/gutil/hash/city.h index 8e6042b61a8..0617bc442a1 100644 --- a/be/src/gutil/hash/city.h +++ b/be/src/gutil/hash/city.h @@ -24,6 +24,10 @@ #include "gutil/integral_types.h" +// WARNING +// The implementation of cityhash in this file is somewhat different from Google's original version. +// For the same input, there will be different output results. +// Therefore, we should do not to use this special cityhash as possible as we can. namespace util_hash { uint64 HashLen16(uint64 u, uint64 v); @@ -35,9 +39,4 @@ uint64 CityHash64(const char* buf, size_t len); // Hash function for a byte array. For convenience, a 64-bit seed is also // hashed into the result. The mapping may change from time to time. uint64 CityHash64WithSeed(const char* buf, size_t len, uint64 seed); - -// Hash function for a byte array. For convenience, two seeds are also -// hashed into the result. The mapping may change from time to time. -uint64 CityHash64WithSeeds(const char* buf, size_t len, uint64 seed0, uint64 seed1); - } // namespace util_hash diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp index a3227f8e75f..f4d0953a8b9 100644 --- a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp +++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.cpp @@ -26,8 +26,6 @@ namespace doris { namespace segment_v2 { -static constexpr uint64_t SEED_GEN = 217728422; - NGramBloomFilter::NGramBloomFilter(size_t size) : _size(size), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), diff --git a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h index 527f8a6e99e..6418ab30c98 100644 --- a/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/ngram_bloom_filter.h @@ -26,6 +26,8 @@ #include "olap/rowset/segment_v2/bloom_filter.h" namespace doris { +static constexpr uint64_t SEED_GEN = 217728422; + namespace segment_v2 { enum HashStrategyPB : int; diff --git a/be/src/vec/common/string_ref.h b/be/src/vec/common/string_ref.h index e2094ca8f39..bdb9b5f8b99 100644 --- a/be/src/vec/common/string_ref.h +++ b/be/src/vec/common/string_ref.h @@ -343,16 +343,6 @@ inline size_t hash_less_than8(const char* data, size_t size) { return k2; } -inline size_t hash_less_than16(const char* data, size_t size) { - if (size > 8) { - auto a = unaligned_load<doris::vectorized::UInt64>(data); - auto b = unaligned_load<doris::vectorized::UInt64>(data + size - 8); - return hash_len16(a, rotate_by_at_least1(b + size, size)) ^ b; - } - - return hash_less_than8(data, size); -} - inline size_t crc32_hash(const char* pos, size_t size) { if (size == 0) { return 0; diff --git a/be/test/util/cityhash_test.cpp b/be/test/util/cityhash_test.cpp new file mode 100644 index 00000000000..399c7954fa0 --- /dev/null +++ b/be/test/util/cityhash_test.cpp @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest-message.h> +#include <gtest/gtest-test-part.h> + +#include <cstdint> +#include <cstdio> +#include <iostream> + +#include "gtest/gtest_pred_impl.h" +#include "gutil/hash/city.h" +#include "olap/rowset/segment_v2/ngram_bloom_filter.h" +#include "testutil/any_type.h" + +namespace doris { + +void test(uint64_t result, uint64_t result_with_seed, const std::string& str) { + EXPECT_EQ(result, util_hash::CityHash64(str.data(), str.size())); + EXPECT_EQ(result_with_seed, util_hash::CityHash64WithSeed(str.data(), str.size(), SEED_GEN)); +} + +TEST(city_hash, simple) { + test(200641142423602673ULL, 14284967837249502203ULL, "Testing 1, 2, 3"); + test(15001808493323962774ULL, 11424039407453860401ULL, "Moscow"); + test(8034095133228533394ULL, 18132648334508667393ULL, "Moscow is the capital of Russia"); + test(13956251726120965457ULL, 1381326629274895150ULL, + "Moscow is the capital of Russia, and the largest city in the country"); + test(2261632006729419242ULL, 17084064757690202946ULL, + "Moscow is the capital of Russia, and the largest city in the country, with a population " + "of over 12 million people"); + test(6469445502724539237ULL, 11343158389890087495ULL, + "Moscow is the capital of Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in Europe"); + test(4443578057564282258ULL, 8388333080872437400ULL, "one"); +} + +TEST(city_hash, long_text) { + test(2603920979253371939ULL, 17286059257277065140ULL, + "Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, " + "3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, 2, 3Testing 1, " + "2, 3"); + test(3019762420123308560ULL, 17640672461655093518ULL, + "MoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosco" + "wMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosc" + "owMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMos" + "cowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMo" + "scowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowM" + "oscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscow" + "MoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMoscowMosco" + "wMoscowMoscowMoscowMoscowMoscowMoscowMoscow"); + test(15852398506477355940ULL, 7555841253624036420ULL, + "Moscow is the capital of RussiaMoscow is the capital of RussiaMoscow is the capital of " + "RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is the " + "capital of RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is " + "the capital of RussiaMoscow is the capital of RussiaMoscow is the capital of " + "RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is the " + "capital of RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is " + "the capital of RussiaMoscow is the capital of RussiaMoscow is the capital of " + "RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is the " + "capital of RussiaMoscow is the capital of RussiaMoscow is the capital of RussiaMoscow is " + "the capital of RussiaMoscow is the capital of RussiaMoscow is the capital of " + "RussiaMoscow is the capital of RussiaMoscow is the capital of Russia"); + test(1035060799968064901ULL, 16530320359523728196ULL, + "Moscow is the capital of Russia, and the largest city in the countryMoscow is the " + "capital of Russia, and the largest city in the countryMoscow is the capital of Russia, " + "and the largest city in the countryMoscow is the capital of Russia, and the largest city " + "in the countryMoscow is the capital of Russia, and the largest city in the countryMoscow " + "is the capital of Russia, and the largest city in the countryMoscow is the capital of " + "Russia, and the largest city in the countryMoscow is the capital of Russia, and the " + "largest city in the countryMoscow is the capital of Russia, and the largest city in the " + "countryMoscow is the capital of Russia, and the largest city in the countryMoscow is the " + "capital of Russia, and the largest city in the countryMoscow is the capital of Russia, " + "and the largest city in the countryMoscow is the capital of Russia, and the largest city " + "in the countryMoscow is the capital of Russia, and the largest city in the countryMoscow " + "is the capital of Russia, and the largest city in the countryMoscow is the capital of " + "Russia, and the largest city in the countryMoscow is the capital of Russia, and the " + "largest city in the countryMoscow is the capital of Russia, and the largest city in the " + "countryMoscow is the capital of Russia, and the largest city in the countryMoscow is the " + "capital of Russia, and the largest city in the countryMoscow is the capital of Russia, " + "and the largest city in the countryMoscow is the capital of Russia, and the largest city " + "in the countryMoscow is the capital of Russia, and the largest city in the countryMoscow " + "is the capital of Russia, and the largest city in the countryMoscow is the capital of " + "Russia, and the largest city in the countryMoscow is the capital of Russia, and the " + "largest city in the countryMoscow is the capital of Russia, and the largest city in the " + "countryMoscow is the capital of Russia, and the largest city in the country"); + test(6905655253535205836ULL, 5560247534938243804ULL, + "Moscow is the capital of Russia, and the largest city in the country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million peopleMoscow is the capital of Russia, and the largest city in the " + "country, with a population " + "of over 12 million people"); + test(17180533981300533381ULL, 985389469184223326ULL, + "Moscow is the capital of Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in EuropeMoscow is the capital of " + "Russia, and the largest city in the country, with a population " + "of over 12 million people, and the largest city in Europe"); + test(15470213656373493785ULL, 18337102184905563192ULL, + "ooneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneo" + "neoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneone" + "oneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneoneon" + "eoneoneonene"); +} + +TEST(city_hash, short_text8) { + test(2548571168015014692ULL, 284905140919628274ULL, "a"); + test(2070361765919599426ULL, 9025766345180298939ULL, "bb"); + test(14141338468832955152ULL, 10552519853845467035ULL, "ccc"); + test(18138810160329292616ULL, 5334357042905201868ULL, "dddd"); + test(425873312578527799ULL, 17471752968193695125ULL, "aaaaa"); + test(16631617225600905903ULL, 8316778257275536170ULL, "aaaccc"); + test(12719358095561328225ULL, 17872985103380866073ULL, "abcdefh"); + test(17174388733642667936ULL, 11830283950305971712ULL, "abcdeffh"); +} + +TEST(city_hash, short_text16) { + test(5762505351139159907ULL, 5576230494162395002ULL, "a32146547"); + test(5384428512347131045ULL, 15753090810004664388ULL, "b32146547b"); + test(12829545899344666879ULL, 3848967114221885261ULL, "c32146547cc"); + test(16893894660113381599ULL, 5947978601334952997ULL, "dd32146547dd"); + test(17429098434560484596ULL, 6398107568024805321ULL, "aaaa32146547a"); + test(10726875353743957364ULL, 2240715260928806674ULL, "aaa32146547ccc"); + test(6679348617805124815ULL, 9286469086836368089ULL, "abc32146547defh"); + test(6691049816525186727ULL, 7961087717381234556ULL, "abcd32146547effh"); +} + +} // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org