This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push: new 5029ef4 [fix] fix ltrim result may incorrect in some case (#7963) 5029ef4 is described below commit 5029ef46c92fdba10a35136fa62eb7ac2d425d54 Author: Zhengguo Yang <yangz...@gmail.com> AuthorDate: Wed Feb 9 13:06:37 2022 +0800 [fix] fix ltrim result may incorrect in some case (#7963) fix ltrim result may incorrect in some case according to https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html Built-in Function: int __builtin_cl/tz (unsigned int x) If x is 0, the result is undefined. So we handle the case of 0 separately this function return different between gcc and clang when x is 0 --- be/CMakeLists.txt | 3 +- be/src/exprs/v_string_functions.h | 219 ------------------------------- be/src/vec/functions/function_string.cpp | 17 ++- be/test/exprs/string_functions_test.cpp | 56 ++++---- 4 files changed, 39 insertions(+), 256 deletions(-) diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 6bf9813..2d50c67 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -674,7 +674,6 @@ else() ${DORIS_DEPENDENCIES} ${WL_START_GROUP} ${X86_DEPENDENCIES} - ${WL_END_GROUP} ) endif() @@ -690,6 +689,8 @@ if (WITH_MYSQL) ) endif() +set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${WL_END_GROUP}) + message(STATUS "DORIS_DEPENDENCIES is ${DORIS_DEPENDENCIES}") # Add all external dependencies. They should come after the palo libs. diff --git a/be/src/exprs/v_string_functions.h b/be/src/exprs/v_string_functions.h deleted file mode 100644 index 3fd9845..0000000 --- a/be/src/exprs/v_string_functions.h +++ /dev/null @@ -1,219 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#pragma once - -#ifndef BE_V_STRING_FUNCTIONS_H -#define BE_V_STRING_FUNCTIONS_H - -#include <stdint.h> -#include <unistd.h> -#include "runtime/string_value.hpp" - -#ifdef __SSE2__ -#include <emmintrin.h> -#endif - -namespace doris { -class VStringFunctions { -public: -#ifdef __SSE2__ - /// n equals to 16 chars length - static constexpr auto REGISTER_SIZE = sizeof(__m128i); -#endif -public: - static StringVal rtrim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - auto begin = 0; - auto end = str.len - 1; -#ifdef __SSE2__ - char blank = ' '; - const auto pattern = _mm_set1_epi8(blank); - while (end - begin + 1 >= REGISTER_SIZE) { - const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(str.ptr + end + 1 - REGISTER_SIZE)); - const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern); - const auto mask = _mm_movemask_epi8(v_against_pattern); - int offset = __builtin_clz(~(mask << REGISTER_SIZE)); - /// means not found - if (offset == 0) - { - return StringVal(str.ptr + begin, end - begin + 1); - } else { - end -= offset; - } - } -#endif - while (end >= begin && str.ptr[end] == ' ') { - --end; - } - if (end < 0) { - return StringVal(""); - } - return StringVal(str.ptr + begin, end - begin + 1); - } - - static StringVal ltrim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - auto begin = 0; - auto end = str.len - 1; -#ifdef __SSE2__ - char blank = ' '; - const auto pattern = _mm_set1_epi8(blank); - while (end - begin + 1 >= REGISTER_SIZE) { - const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const __m128i *>(str.ptr + begin)); - const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern); - const auto mask = _mm_movemask_epi8(v_against_pattern); - const auto offset = __builtin_ctz(mask ^ 0xffff); - /// means not found - if (offset == 0) - { - return StringVal(str.ptr + begin, end - begin + 1); - } else if (offset > REGISTER_SIZE) { - begin += REGISTER_SIZE; - } else { - begin += offset; - return StringVal(str.ptr + begin, end - begin + 1); - } - } -#endif - while (begin <= end && str.ptr[begin] == ' ') { - ++begin; - } - return StringVal(str.ptr + begin, end - begin + 1); - } - - static StringVal trim(const StringVal& str) { - if (str.is_null || str.len == 0) { - return str; - } - return rtrim(ltrim(str)); - } - - static bool is_ascii(StringVal str) { - #ifdef __SSE2__ - size_t i = 0; - __m128i binary_code = _mm_setzero_si128(); - if (str.len >= REGISTER_SIZE) { - for (; i <= str.len - REGISTER_SIZE; i += REGISTER_SIZE) { - __m128i chars = _mm_loadu_si128((const __m128i*)(str.ptr + i)); - binary_code = _mm_or_si128(binary_code, chars); - } - } - int mask = _mm_movemask_epi8(binary_code); - - char or_code = 0; - for (; i < str.len; i++) { - or_code |= str.ptr[i]; - } - mask |= (or_code & 0x80); - - return !mask; - #else - char or_code = 0; - for (size_t i = 0; i < str.len; i++) { - or_code |= str.ptr[i]; - } - return !(or_code & 0x80); - #endif - } - - static void reverse(const StringVal& str, StringVal dst) { - if (str.is_null) { - dst.ptr = NULL; - return; - } - const bool is_ascii = VStringFunctions::is_ascii(str); - if (is_ascii) { - int64_t begin = 0; - int64_t end = str.len; - int64_t result_end = dst.len; - #if defined(__SSE2__) - const auto shuffle_array = _mm_set_epi64((__m64)0x00'01'02'03'04'05'06'07ull, (__m64)0x08'09'0a'0b'0c'0d'0e'0full); - for (; (begin + REGISTER_SIZE) < end; begin += REGISTER_SIZE) { - result_end -= REGISTER_SIZE; - _mm_storeu_si128((__m128i*)(dst.ptr + result_end), - _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(str.ptr + begin)), shuffle_array)); - } - #endif - for (; begin < end; ++begin) { - --result_end; - dst.ptr[result_end] = str.ptr[begin]; - } - } else { - for (size_t i = 0, char_size = 0; i < str.len; i += char_size) { - char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]); - std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr + str.len - i - char_size); - } - } - } - - static size_t get_utf8_byte_length(unsigned char byte) { - size_t char_size = 0; - if (byte >= 0xFC) { - char_size = 6; - } else if (byte >= 0xF8) { - char_size = 5; - } else if (byte >= 0xF0) { - char_size = 4; - } else if (byte >= 0xE0) { - char_size = 3; - } else if (byte >= 0xC0) { - char_size = 2; - } else { - char_size = 1; - } - return char_size; - } - - static void hex_encode(const unsigned char* src_str, size_t length, char* dst_str) { - static constexpr auto hex_table = "0123456789ABCDEF"; - auto src_str_end = src_str + length; - -#if defined(__SSE2__) - constexpr auto step = sizeof(uint64); - if (src_str + step < src_str_end) { - const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i *>(hex_table)); - const auto mask_map = _mm_set1_epi8(0x0F); - - do { - auto data = _mm_loadu_si64(src_str); - auto hex_loc = _mm_and_si128(_mm_unpacklo_epi8(_mm_srli_epi64(data, 4), data), mask_map); - _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_str), _mm_shuffle_epi8(hex_map, hex_loc)); - - src_str += step; - dst_str += step * 2; - } while (src_str + step < src_str_end); - } -#endif - char res[2]; - // hex(str) str length is n, result must be 2 * n length - for (; src_str < src_str_end; src_str += 1, dst_str += 2) { - // low 4 bits - *(res + 1) = hex_table[src_str[0] & 0x0F]; - // high 4 bits - *res = hex_table[(src_str[0] >> 4)]; - std::copy(res, res + 2, dst_str); - } - } -}; -} - -#endif //BE_V_STRING_FUNCTIONS_H \ No newline at end of file diff --git a/be/src/vec/functions/function_string.cpp b/be/src/vec/functions/function_string.cpp index bdeb7e5..a89edf5 100644 --- a/be/src/vec/functions/function_string.cpp +++ b/be/src/vec/functions/function_string.cpp @@ -23,9 +23,9 @@ #include <cstdlib> #include <string_view> -#include "exprs/v_string_functions.h" #include "runtime/string_search.hpp" #include "util/encryption_util.h" +#include "util/simd/vstring_function.h" #include "util/url_coding.h" #include "vec/common/pod_array_fwd.h" #include "vec/functions/function_string_to_string.h" @@ -258,8 +258,8 @@ struct ReverseImpl { auto src_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); int64_t src_len = offsets[i] - offsets[i - 1] - 1; char dst[src_len]; - VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len), - StringVal((uint8_t*)dst, src_len)); + simd::VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len), + StringVal((uint8_t*)dst, src_len)); StringOP::push_value_string(std::string_view(dst, src_len), i, res_data, res_offsets); } return Status::OK(); @@ -271,9 +271,7 @@ struct HexStringName { }; struct HexStringImpl { - static DataTypes get_variadic_argument_types() { - return {std::make_shared<DataTypeString>()}; - } + static DataTypes get_variadic_argument_types() { return {std::make_shared<DataTypeString>()}; } static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, ColumnString::Chars& dst_data, ColumnString::Offsets& dst_offsets) { @@ -293,7 +291,8 @@ struct HexStringImpl { dst_data_ptr++; offset++; } else { - VStringFunctions::hex_encode(source, srclen, reinterpret_cast<char*>(dst_data_ptr)); + simd::VStringFunctions::hex_encode(source, srclen, + reinterpret_cast<char*>(dst_data_ptr)); dst_data_ptr[srclen * 2] = '\0'; dst_data_ptr += (srclen * 2 + 1); offset += (srclen * 2 + 1); @@ -355,10 +354,10 @@ struct TrimImpl { const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); StringVal str(raw_str); if constexpr (is_ltrim) { - str = VStringFunctions::ltrim(str); + str = simd::VStringFunctions::ltrim(str); } if constexpr (is_rtrim) { - str = VStringFunctions::rtrim(str); + str = simd::VStringFunctions::rtrim(str); } StringOP::push_value_string(std::string_view((char*)str.ptr, str.len), i, res_data, res_offsets); diff --git a/be/test/exprs/string_functions_test.cpp b/be/test/exprs/string_functions_test.cpp index 035e6a1..acae0e8 100644 --- a/be/test/exprs/string_functions_test.cpp +++ b/be/test/exprs/string_functions_test.cpp @@ -16,17 +16,18 @@ // under the License. #include "exprs/string_functions.h" -#include "exprs/v_string_functions.h" +#include <fmt/os.h> #include <gtest/gtest.h> + #include <iostream> #include <string> -#include <fmt/os.h> #include "exprs/anyval_util.h" #include "test_util/test_util.h" #include "testutil/function_utils.h" #include "util/logging.h" +#include "util/simd/vstring_function.h" namespace doris { @@ -47,8 +48,7 @@ private: TEST_F(StringFunctionsTest, do_money_format_for_bigint_bench) { doris_udf::FunctionContext* context = new doris_udf::FunctionContext(); - StringVal expected = - AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); + StringVal expected = AnyValUtil::from_string(ctx, std::string("9,223,372,036,854,775,807.00")); BigIntVal bigIntVal(9223372036854775807); for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) { StringVal result = StringFunctions::money_format(context, bigIntVal); @@ -98,7 +98,7 @@ TEST_F(StringFunctionsTest, money_format_large_int) { value = MIN_INT128; result = StringFunctions::money_format(context, doris_udf::LargeIntVal(value)); expected = AnyValUtil::from_string_temp( - context, std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00")); + context, std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00")); ASSERT_EQ(expected, result); delete context; } @@ -682,80 +682,82 @@ TEST_F(StringFunctionsTest, upper) { TEST_F(StringFunctionsTest, ltrim) { // no blank StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = VStringFunctions::ltrim(src); + StringVal res = simd::VStringFunctions::ltrim(src); ASSERT_EQ(src, res); // empty string StringVal src1(""); - res = VStringFunctions::ltrim(src1); + res = simd::VStringFunctions::ltrim(src1); ASSERT_EQ(src1, res); // null string StringVal src2(StringVal::null()); - res = VStringFunctions::ltrim(src2); + res = simd::VStringFunctions::ltrim(src2); ASSERT_EQ(src2, res); // less than 16 blanks StringVal src3(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = VStringFunctions::ltrim(src3); + res = simd::VStringFunctions::ltrim(src3); ASSERT_EQ(src, res); // more than 16 blanks StringVal src4(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - res = VStringFunctions::ltrim(src4); + res = simd::VStringFunctions::ltrim(src4); ASSERT_EQ(src, res); // all are blanks, less than 16 blanks StringVal src5(" "); - res = VStringFunctions::ltrim(src5); + res = simd::VStringFunctions::ltrim(src5); ASSERT_EQ(StringVal(""), res); // all are blanks, more than 16 blanks StringVal src6(" "); - res = VStringFunctions::ltrim(src6); + res = simd::VStringFunctions::ltrim(src6); ASSERT_EQ(StringVal(""), res); // src less than 16 length StringVal src7(" 12345678910"); - res = VStringFunctions::ltrim(src7); + res = simd::VStringFunctions::ltrim(src7); ASSERT_EQ(StringVal("12345678910"), res); } TEST_F(StringFunctionsTest, rtrim) { // no blank StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"); - StringVal res = VStringFunctions::rtrim(src); + StringVal res = simd::VStringFunctions::rtrim(src); ASSERT_EQ(src, res); // empty string StringVal src1(""); - res = VStringFunctions::rtrim(src1); + res = simd::VStringFunctions::rtrim(src1); ASSERT_EQ(src1, res); // null string StringVal src2(StringVal::null()); - res = VStringFunctions::rtrim(src2); + res = simd::VStringFunctions::rtrim(src2); ASSERT_EQ(src2, res); // less than 16 blanks StringVal src3("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = VStringFunctions::rtrim(src3); + res = simd::VStringFunctions::rtrim(src3); ASSERT_EQ(src, res); // more than 16 blanks StringVal src4("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa "); - res = VStringFunctions::rtrim(src4); + res = simd::VStringFunctions::rtrim(src4); ASSERT_EQ(src, res); // all are blanks, less than 16 blanks StringVal src5(" "); - res = VStringFunctions::rtrim(src5); + res = simd::VStringFunctions::rtrim(src5); ASSERT_EQ(StringVal(""), res); // all are blanks, more than 16 blanks StringVal src6(" "); - res = VStringFunctions::rtrim(src6); + res = simd::VStringFunctions::rtrim(src6); ASSERT_EQ(StringVal(""), res); // src less than 16 length StringVal src7("12345678910 "); - res = VStringFunctions::rtrim(src7); + res = simd::VStringFunctions::rtrim(src7); ASSERT_EQ(StringVal("12345678910"), res); } TEST_F(StringFunctionsTest, is_ascii) { - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123"))); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("运维组123"))); - ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal::null())); - ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal(""))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("hello123"))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii( + StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); + ASSERT_EQ(false, simd::VStringFunctions::is_ascii(StringVal("运维组123"))); + ASSERT_EQ(false, simd::VStringFunctions::is_ascii( + StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe"))); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal::null())); + ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal(""))); } } // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org