github-actions[bot] commented on code in PR #42488: URL: https://github.com/apache/doris/pull/42488#discussion_r1816351779
########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. 
+ * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. + */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion else { ``` be/src/vec/functions/url/find_symbols.h:346: ```diff - return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); + } ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. 
+ */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (symbols.str.size() >= 5) { ``` be/src/vec/functions/url/find_symbols.h:356: ```diff - else + } else ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' == 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gperf@gnu.org>." 
+#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 Review Comment: warning: macro 'MAX_WORD_LENGTH' defines an integral constant; prefer an enum instead [modernize-macro-to-enum] ```cpp #define MAX_WORD_LENGTH 34 ^ ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. 
+ * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. + */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... 
chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), + symbols); + else +#endif + return find_first_symbols_sse2<positive, return_mode>( + haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); +} + +} // namespace detail + +template <char... symbols> +inline const char* find_first_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end); +} + +/// Returning non const result for non const arguments. +/// It is convenient when you are using this function to iterate through non-const buffer. +template <char... symbols> +inline char* find_first_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end); +} + +template <char... 
symbols> +inline char* find_first_not_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_first_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end); +} + +template <char... symbols> +inline char* find_first_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_not_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... 
symbols> +inline const char* find_last_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +template <char... symbols> +inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. +/// See https://github.com/boostorg/algorithm/issues/63 +/// And https://bugs.llvm.org/show_bug.cgi?id=41141 +template <char... symbols, typename To> +inline To& splitInto(To& to, std::string_view what, bool token_compress = false) { + const char* pos = what.data(); + const char* end = pos + what.size(); + while (pos < end) { + const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end); + + if (!token_compress || pos < delimiter_or_end) to.emplace_back(pos, delimiter_or_end - pos); Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (!token_compress || pos < delimiter_or_end) { to.emplace_back(pos, delimiter_or_end - pos); } ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. 
+ * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. + */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), + symbols); + else +#endif + return find_first_symbols_sse2<positive, return_mode>( + haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); +} + +} // namespace detail + +template <char... symbols> +inline const char* find_first_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end); +} + +/// Returning non const result for non const arguments. +/// It is convenient when you are using this function to iterate through non-const buffer. +template <char... symbols> +inline char* find_first_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end); +} + +template <char... 
symbols> +inline char* find_first_not_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_first_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end); +} + +template <char... symbols> +inline char* find_first_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_not_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... 
symbols> +inline const char* find_last_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +template <char... symbols> +inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. +/// See https://github.com/boostorg/algorithm/issues/63 +/// And https://bugs.llvm.org/show_bug.cgi?id=41141 +template <char... symbols, typename To> +inline To& splitInto(To& to, std::string_view what, bool token_compress = false) { + const char* pos = what.data(); + const char* end = pos + what.size(); + while (pos < end) { + const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end); + + if (!token_compress || pos < delimiter_or_end) to.emplace_back(pos, delimiter_or_end - pos); + + if (delimiter_or_end < end) Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (delimiter_or_end < end) { ``` be/src/vec/functions/url/find_symbols.h:478: ```diff - else + } else ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. 
+ * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. + */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion if (sizeof...(symbols) >= 5) { ``` be/src/vec/functions/url/find_symbols.h:344: ```diff - else + } else ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. 
+ * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. + */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), + symbols); + else Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion else { ``` be/src/vec/functions/url/find_symbols.h:359: ```diff - haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); + haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); + } ``` ########## be/src/vec/functions/url/find_symbols.h: ########## @@ -0,0 +1,484 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/base/base/find_symbols.h +// and modified by Doris + +#pragma once + +#include <array> +#include <cstdint> +#include <string> + +#if defined(__SSE2__) +#include <emmintrin.h> +#endif +#if defined(__SSE4_2__) +#include <nmmintrin.h> +#endif + +/** find_first_symbols<c1, c2, ...>(begin, end): + * + * Allow to search for next character from the set of 'symbols...' in a string. + * It is similar to 'strpbrk', 'strcspn' (and 'strchr', 'memchr' in the case of one symbol and '\0'), + * but with the following differences: + * - works with any memory ranges, including containing zero bytes; + * - doesn't require terminating zero byte: end of memory range is passed explicitly; + * - if not found, returns pointer to end instead of nullptr; + * - maximum number of symbols to search is 16. + * + * Uses SSE 2 in case of small number of symbols for search and SSE 4.2 in the case of large number of symbols, + * that have more than 2x performance advantage over trivial loop + * in the case of parsing tab-separated dump with (probably escaped) string fields. + * In the case of parsing tab separated dump with short strings, there is no performance degradation over trivial loop. + * + * Note: the optimal threshold to choose between SSE 2 and SSE 4.2 may depend on CPU model. + * + * find_last_symbols_or_null<c1, c2, ...>(begin, end): + * + * Allow to search for the last matching character in a string. + * If no such characters, returns nullptr. 
+ */ + +struct SearchSymbols { + static constexpr auto BUFFER_SIZE = 16; + + SearchSymbols() = default; + + explicit SearchSymbols(std::string in) : str(std::move(in)) { +#if defined(__SSE4_2__) + if (str.size() > BUFFER_SIZE) { + throw std::runtime_error("SearchSymbols can contain at most " + + std::to_string(BUFFER_SIZE) + " symbols and " + + std::to_string(str.size()) + " was provided\n"); + } + + char tmp_safety_buffer[BUFFER_SIZE] = {0}; + + memcpy(tmp_safety_buffer, str.data(), str.size()); + + simd_vector = _mm_loadu_si128(reinterpret_cast<const __m128i*>(tmp_safety_buffer)); +#endif + } + +#if defined(__SSE4_2__) + __m128i simd_vector; +#endif + std::string str; +}; + +namespace detail { +template <char... chars> +constexpr bool is_in(char x) { + return ((x == chars) || ...); +} // NOLINT(misc-redundant-expression) + +static bool is_in(char c, const char* symbols, size_t num_chars) { + for (size_t i = 0U; i < num_chars; ++i) { + if (c == symbols[i]) { + return true; + } + } + + return false; +} + +#if defined(__SSE2__) +template <char s0> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + return eq0; +} + +template <char s0, char s1, char... 
tail> +inline __m128i mm_is_in(__m128i bytes) { + __m128i eq0 = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(s0)); + __m128i eq = mm_is_in<s1, tail...>(bytes); + return _mm_or_si128(eq0, eq); +} + +inline __m128i mm_is_in(__m128i bytes, const char* symbols, size_t num_chars) { + __m128i accumulator = _mm_setzero_si128(); + for (size_t i = 0; i < num_chars; ++i) { + __m128i eq = _mm_cmpeq_epi8(bytes, _mm_set1_epi8(symbols[i])); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} + +inline std::array<__m128i, 16u> mm_is_in_prepare(const char* symbols, size_t num_chars) { + std::array<__m128i, 16u> result {}; + + for (size_t i = 0; i < num_chars; ++i) { + result[i] = _mm_set1_epi8(symbols[i]); + } + + return result; +} + +inline __m128i mm_is_in_execute(__m128i bytes, const std::array<__m128i, 16u>& needles) { + __m128i accumulator = _mm_setzero_si128(); + + for (const auto& needle : needles) { + __m128i eq = _mm_cmpeq_epi8(bytes, needle); + accumulator = _mm_or_si128(accumulator, eq); + } + + return accumulator; +} +#endif + +template <bool positive> +constexpr bool maybe_negate(bool x) { + return x == positive; +} + +template <bool positive> +constexpr uint16_t maybe_negate(uint16_t x) { + if constexpr (positive) + return x; + else + return ~x; +} + +enum class ReturnMode : uint8_t { + End, + Nullptr, +}; + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE2__) + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse2(const char* const begin, const char* const end, + const char* symbols, size_t num_chars) { + const char* pos = begin; + +#if defined(__SSE2__) + const auto needles = mm_is_in_prepare(symbols, num_chars); + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + __m128i eq = mm_is_in_execute(bytes, needles); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) return pos + __builtin_ctz(bit_mask); + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols, num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, char... symbols> +inline const char* find_last_symbols_sse2(const char* const begin, const char* const end) { + const char* pos = end; + +#if defined(__SSE2__) + for (; pos - 16 >= begin; + pos -= + 16) /// Assuming the pointer cannot overflow. Assuming we can compare these pointers. 
+ { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos - 16)); + + __m128i eq = mm_is_in<symbols...>(bytes); + + uint16_t bit_mask = maybe_negate<positive>(uint16_t(_mm_movemask_epi8(eq))); + if (bit_mask) + return pos - 1 - + (__builtin_clz(bit_mask) - + 16); /// because __builtin_clz works with mask as uint32. + } +#endif + + --pos; + for (; pos >= begin; --pos) + if (maybe_negate<positive>(is_in<symbols...>(*pos))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +template <bool positive, ReturnMode return_mode, size_t num_chars, char c01, char c02 = 0, + char c03 = 0, char c04 = 0, char c05 = 0, char c06 = 0, char c07 = 0, char c08 = 0, + char c09 = 0, char c10 = 0, char c11 = 0, char c12 = 0, char c13 = 0, char c14 = 0, + char c15 = 0, char c16 = 0> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end) { + const char* pos = begin; + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + __m128i set = _mm_setr_epi8(c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, + c14, c15, c16); + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if ((num_chars == 1 && maybe_negate<positive>(is_in<c01>(*pos))) || + (num_chars == 2 && maybe_negate<positive>(is_in<c01, c02>(*pos))) || + (num_chars == 3 && maybe_negate<positive>(is_in<c01, c02, c03>(*pos))) || + (num_chars == 4 && maybe_negate<positive>(is_in<c01, c02, c03, c04>(*pos))) || + (num_chars == 5 && 
maybe_negate<positive>(is_in<c01, c02, c03, c04, c05>(*pos))) || + (num_chars == 6 && maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06>(*pos))) || + (num_chars == 7 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07>(*pos))) || + (num_chars == 8 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08>(*pos))) || + (num_chars == 9 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09>(*pos))) || + (num_chars == 10 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10>(*pos))) || + (num_chars == 11 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11>(*pos))) || + (num_chars == 12 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12>(*pos))) || + (num_chars == 13 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13>( + *pos))) || + (num_chars == 14 && + maybe_negate<positive>( + is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, c12, c13, c14>( + *pos))) || + (num_chars == 15 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15>(*pos))) || + (num_chars == 16 && + maybe_negate<positive>(is_in<c01, c02, c03, c04, c05, c06, c07, c08, c09, c10, c11, + c12, c13, c14, c15, c16>(*pos)))) + return pos; + return return_mode == ReturnMode::End ? 
end : nullptr; +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_sse42(const char* const begin, const char* const end, + const SearchSymbols& symbols) { + const char* pos = begin; + + const auto num_chars = symbols.str.size(); + +#if defined(__SSE4_2__) + constexpr int mode = _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT; + + const __m128i set = symbols.simd_vector; + + for (; pos + 15 < end; pos += 16) { + __m128i bytes = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pos)); + + if constexpr (positive) { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode)) + return pos + _mm_cmpestri(set, num_chars, bytes, 16, mode); + } else { + if (_mm_cmpestrc(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY)) + return pos + + _mm_cmpestri(set, num_chars, bytes, 16, mode | _SIDD_NEGATIVE_POLARITY); + } + } +#endif + + for (; pos < end; ++pos) + if (maybe_negate<positive>(is_in(*pos, symbols.str.data(), num_chars))) return pos; + + return return_mode == ReturnMode::End ? end : nullptr; +} + +/// NOTE No SSE 4.2 implementation for find_last_symbols_or_null. Not worth to do. + +template <bool positive, ReturnMode return_mode, char... 
symbols> +inline const char* find_first_symbols_dispatch(const char* begin, const char* end) + requires(0 <= sizeof...(symbols) && sizeof...(symbols) <= 16) +{ +#if defined(__SSE4_2__) + if (sizeof...(symbols) >= 5) + return find_first_symbols_sse42<positive, return_mode, sizeof...(symbols), symbols...>( + begin, end); + else +#endif + return find_first_symbols_sse2<positive, return_mode, symbols...>(begin, end); +} + +template <bool positive, ReturnMode return_mode> +inline const char* find_first_symbols_dispatch(const std::string_view haystack, + const SearchSymbols& symbols) { +#if defined(__SSE4_2__) + if (symbols.str.size() >= 5) + return find_first_symbols_sse42<positive, return_mode>(haystack.begin(), haystack.end(), + symbols); + else +#endif + return find_first_symbols_sse2<positive, return_mode>( + haystack.begin(), haystack.end(), symbols.str.data(), symbols.str.size()); +} + +} // namespace detail + +template <char... symbols> +inline const char* find_first_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end); +} + +/// Returning non const result for non const arguments. +/// It is convenient when you are using this function to iterate through non-const buffer. +template <char... symbols> +inline char* find_first_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end); +} + +template <char... 
symbols> +inline char* find_first_not_symbols(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::End, symbols...>(begin, + end)); +} + +inline const char* find_first_not_symbols(std::string_view haystack, const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::End>(haystack, symbols); +} + +template <char... symbols> +inline const char* find_first_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_first_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<true, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... symbols> +inline const char* find_first_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end); +} + +template <char... symbols> +inline char* find_first_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr, symbols...>( + begin, end)); +} + +inline const char* find_first_not_symbols_or_null(std::string_view haystack, + const SearchSymbols& symbols) { + return detail::find_first_symbols_dispatch<false, detail::ReturnMode::Nullptr>(haystack, + symbols); +} + +template <char... 
symbols> +inline const char* find_last_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<true, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +template <char... symbols> +inline const char* find_last_not_symbols_or_null(const char* begin, const char* end) { + return detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end); +} + +template <char... symbols> +inline char* find_last_not_symbols_or_null(char* begin, char* end) { + return const_cast<char*>( + detail::find_last_symbols_sse2<false, detail::ReturnMode::Nullptr, symbols...>(begin, + end)); +} + +/// Slightly resembles boost::split. The drawback of boost::split is that it fires a false positive in clang static analyzer. +/// See https://github.com/boostorg/algorithm/issues/63 +/// And https://bugs.llvm.org/show_bug.cgi?id=41141 +template <char... symbols, typename To> +inline To& splitInto(To& to, std::string_view what, bool token_compress = false) { + const char* pos = what.data(); + const char* end = pos + what.size(); + while (pos < end) { + const char* delimiter_or_end = find_first_symbols<symbols...>(pos, end); + + if (!token_compress || pos < delimiter_or_end) to.emplace_back(pos, delimiter_or_end - pos); + + if (delimiter_or_end < end) + pos = delimiter_or_end + 1; + else + pos = delimiter_or_end; Review Comment: warning: statement should be inside braces [readability-braces-around-statements] ```suggestion else { pos = delimiter_or_end; } ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." +#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 +#define MIN_HASH_VALUE 75 +#define MAX_HASH_VALUE 110600 Review Comment: warning: macro 'MAX_HASH_VALUE' defines an integral constant; prefer an enum instead [modernize-macro-to-enum] ```cpp #define MAX_HASH_VALUE 110600 ^ ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." +#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 Review Comment: warning: macro 'TOTAL_KEYWORDS' defines an integral constant; prefer an enum instead [modernize-macro-to-enum] ```cpp #define TOTAL_KEYWORDS 5045 ^ ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." +#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 +#define MIN_HASH_VALUE 75 Review Comment: warning: macro 'MIN_HASH_VALUE' defines an integral constant; prefer an enum instead [modernize-macro-to-enum] ```cpp #define MIN_HASH_VALUE 75 ^ ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." +#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 Review Comment: warning: macro 'MIN_WORD_LENGTH' defines an integral constant; prefer an enum instead [modernize-macro-to-enum] ```cpp #define MIN_WORD_LENGTH 4 ^ ``` ########## be/src/vec/functions/url/tldLookup.generated.cpp: ########## @@ -0,0 +1,140 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +// This file is copied from +// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/tldLookup.generated.cpp +// and modified by Doris + +// clang-format off +/* C++ code produced by gperf version 3.1 */ +/* Command-line: /usr/bin/gperf --output-file=tldLookup.generated.cpp tldLookup.gperf */ +/* Computed positions: -k'1-11,13-14,17,$' */ + +#if !( \ + (' ' == 32) && ('!' == 33) && ('"' == 34) && ('#' == 35) && ('%' == 37) && ('&' == 38) && ('\'' == 39) && ('(' == 40) && (')' == 41) && ('*' == 42) && ('+' == 43) && (',' == 44) && ('-' == 45) && ('.' == 46) && ('/' == 47) && ('0' == 48) && ('1' == 49) && ('2' == 50) && ('3' == 51) && ('4' == 52) && ('5' == 53) && ('6' == 54) && ('7' == 55) && ('8' == 56) && ('9' == 57) && (':' == 58) && (';' == 59) && ('<' == 60) && ('=' == 61) && ('>' == 62) && ('?' 
== 63) && ('A' == 65) && ('B' == 66) && ('C' == 67) && ('D' == 68) && ('E' == 69) && ('F' == 70) && ('G' == 71) && ('H' == 72) && ('I' == 73) && ('J' == 74) && ('K' == 75) && ('L' == 76) && ('M' == 77) && ('N' == 78) && ('O' == 79) && ('P' == 80) && ('Q' == 81) && ('R' == 82) && ('S' == 83) && ('T' == 84) && ('U' == 85) && ('V' == 86) && ('W' == 87) && ('X' == 88) && ('Y' == 89) && ('Z' == 90) && ('[' == 91) && ('\\' == 92) && (']' == 93) && ('^' == 94) && ('_' == 95) && ('a' == 97) && ('b' == 98) && ('c' == 99) && ('d' == 100) && ('e' == 101) && ('f' == 102) && ('g' == 103) && ('h' == 104) && ('i' == 105) && ('j' == 106) && ('k' == 107) && ('l' == 108) && ('m' == 109) && ('n' == 110) && ('o' == 111) && ('p' == 112) && ('q' == 113) && ('r' == 114) && ('s' == 115) && ('t' == 116) && ('u' == 117) && ('v' == 118) && ('w' == 119) && ('x' == 120) && ('y' == 121) && ('z' == 122) && ('{' == 123) && ('|' == 124) && ('}' == 125) && ('~' == 126)) +/* The character set is not based on ISO-646. */ +#error "gperf generated tables don't work with this execution character set. Please report a bug to <bug-gp...@gnu.org>." +#endif + +#line 7 "tldLookup.gperf" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" +#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma GCC diagnostic ignored "-Wunused-macros" +#include <cstring> + +#define TOTAL_KEYWORDS 5045 +#define MIN_WORD_LENGTH 4 +#define MAX_WORD_LENGTH 34 +#define MIN_HASH_VALUE 75 +#define MAX_HASH_VALUE 110600 +/* maximum key range = 110526, duplicates = 0 */ Review Comment: warning: replace macro with enum [modernize-macro-to-enum] ```suggestion enum { TOTAL_KEYWORDS = 5045, MIN_WORD_LENGTH = 4, MAX_WORD_LENGTH = 34, MIN_HASH_VALUE = 75, MAX_HASH_VALUE = 110600 }; /* maximum key range = 110526, duplicates = 0 */ ``` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org For additional commands, e-mail: commits-help@doris.apache.org