This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 5029ef4  [fix] fix ltrim result may incorrect in some case (#7963)
5029ef4 is described below

commit 5029ef46c92fdba10a35136fa62eb7ac2d425d54
Author: Zhengguo Yang <yangz...@gmail.com>
AuthorDate: Wed Feb 9 13:06:37 2022 +0800

    [fix] fix ltrim result may incorrect in some case (#7963)
    
    fix ltrim result may incorrect in some case
    according to https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
    Built-in Function: int __builtin_cl/tz (unsigned int x)
    If x is 0, the result is undefined.
    So we handle the case of 0 separately
    
    this function return different between gcc and clang when x is 0
---
 be/CMakeLists.txt                        |   3 +-
 be/src/exprs/v_string_functions.h        | 219 -------------------------------
 be/src/vec/functions/function_string.cpp |  17 ++-
 be/test/exprs/string_functions_test.cpp  |  56 ++++----
 4 files changed, 39 insertions(+), 256 deletions(-)

diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 6bf9813..2d50c67 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -674,7 +674,6 @@ else()
         ${DORIS_DEPENDENCIES}
         ${WL_START_GROUP}
         ${X86_DEPENDENCIES}
-        ${WL_END_GROUP}
     )
 endif()
 
@@ -690,6 +689,8 @@ if (WITH_MYSQL)
         )
 endif()
 
+set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${WL_END_GROUP})
+
 message(STATUS "DORIS_DEPENDENCIES is ${DORIS_DEPENDENCIES}")
 
 # Add all external dependencies. They should come after the palo libs.
diff --git a/be/src/exprs/v_string_functions.h 
b/be/src/exprs/v_string_functions.h
deleted file mode 100644
index 3fd9845..0000000
--- a/be/src/exprs/v_string_functions.h
+++ /dev/null
@@ -1,219 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements.  See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership.  The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.  You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied.  See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#ifndef BE_V_STRING_FUNCTIONS_H
-#define BE_V_STRING_FUNCTIONS_H
-
-#include <stdint.h>
-#include <unistd.h>
-#include "runtime/string_value.hpp"
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
-namespace doris {
-class VStringFunctions {
-public:
-#ifdef __SSE2__
-    /// n equals to 16 chars length
-    static constexpr auto REGISTER_SIZE = sizeof(__m128i);
-#endif
-public:
-    static StringVal rtrim(const StringVal& str) {
-        if (str.is_null || str.len == 0) {
-            return str;
-        }
-        auto begin = 0;
-        auto end = str.len - 1;
-#ifdef __SSE2__
-        char blank = ' ';
-        const auto pattern =  _mm_set1_epi8(blank);
-        while (end - begin + 1 >= REGISTER_SIZE) {
-            const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i *>(str.ptr + end + 1 - REGISTER_SIZE));
-            const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
-            const auto mask = _mm_movemask_epi8(v_against_pattern);
-            int offset = __builtin_clz(~(mask << REGISTER_SIZE));
-            /// means not found
-            if (offset == 0)
-            {
-                return StringVal(str.ptr + begin, end - begin + 1);
-            } else {
-                end -= offset;
-            }
-        }
-#endif
-        while (end >= begin && str.ptr[end] == ' ') {
-            --end;
-        }
-        if (end < 0) {
-            return StringVal("");
-        }
-        return StringVal(str.ptr + begin, end - begin + 1);
-    }
-
-    static StringVal ltrim(const StringVal& str) {
-        if (str.is_null || str.len == 0) {
-            return str;
-        }
-        auto begin = 0;
-        auto end = str.len - 1;
-#ifdef __SSE2__
-        char blank = ' ';
-        const auto pattern =  _mm_set1_epi8(blank);
-        while (end - begin + 1 >= REGISTER_SIZE) {
-            const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const 
__m128i *>(str.ptr + begin));
-            const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
-            const auto mask = _mm_movemask_epi8(v_against_pattern);
-            const auto offset = __builtin_ctz(mask ^ 0xffff);
-            /// means not found
-            if (offset == 0)
-            {
-                return StringVal(str.ptr + begin, end - begin + 1);
-            } else if (offset > REGISTER_SIZE) {
-                begin += REGISTER_SIZE;
-            } else {
-                begin += offset;
-                return StringVal(str.ptr + begin, end - begin + 1);
-            }
-        }
-#endif
-        while (begin <= end && str.ptr[begin] == ' ') {
-            ++begin;
-        }
-        return StringVal(str.ptr + begin, end - begin + 1);
-    }
-
-    static StringVal trim(const StringVal& str) {
-        if (str.is_null || str.len == 0) {
-            return str;
-        }
-        return rtrim(ltrim(str));
-    }
-
-    static bool is_ascii(StringVal str) {
-    #ifdef __SSE2__
-        size_t i = 0;
-        __m128i binary_code = _mm_setzero_si128();
-        if (str.len >= REGISTER_SIZE) {
-            for (; i <= str.len - REGISTER_SIZE; i += REGISTER_SIZE) {
-                __m128i chars = _mm_loadu_si128((const __m128i*)(str.ptr + i));
-                binary_code = _mm_or_si128(binary_code, chars);
-            }
-        }
-        int mask = _mm_movemask_epi8(binary_code);
-
-        char or_code = 0;
-        for (; i < str.len; i++) {
-            or_code |= str.ptr[i];
-        }
-        mask |= (or_code & 0x80);
-
-        return !mask;
-    #else
-        char or_code = 0;
-        for (size_t i = 0; i < str.len; i++) {
-            or_code |= str.ptr[i];
-        }
-        return !(or_code & 0x80);
-    #endif
-    }
-
-    static void reverse(const StringVal& str, StringVal dst) {
-        if (str.is_null) {
-            dst.ptr = NULL;
-            return;
-        }
-        const bool is_ascii = VStringFunctions::is_ascii(str);
-        if (is_ascii) {
-            int64_t begin = 0;
-            int64_t end = str.len;
-            int64_t result_end = dst.len;
-    #if defined(__SSE2__)
-            const auto shuffle_array = 
_mm_set_epi64((__m64)0x00'01'02'03'04'05'06'07ull, 
(__m64)0x08'09'0a'0b'0c'0d'0e'0full);
-            for (; (begin + REGISTER_SIZE) < end; begin += REGISTER_SIZE) {
-                result_end -= REGISTER_SIZE;
-                _mm_storeu_si128((__m128i*)(dst.ptr + result_end),
-                                 
_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(str.ptr + begin)), shuffle_array));
-            }
-    #endif
-            for (; begin < end; ++begin) {
-                --result_end;
-                dst.ptr[result_end] = str.ptr[begin];
-            }
-        } else {
-            for (size_t i = 0, char_size = 0; i < str.len; i += char_size) {
-                char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]);
-                std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr + 
str.len - i - char_size);
-            }
-        }
-    }
-
-    static size_t get_utf8_byte_length(unsigned char byte) {
-        size_t char_size = 0;
-        if (byte >= 0xFC) {
-            char_size = 6;
-        } else if (byte >= 0xF8) {
-            char_size = 5;
-        } else if (byte >= 0xF0) {
-            char_size = 4;
-        } else if (byte >= 0xE0) {
-            char_size = 3;
-        } else if (byte >= 0xC0) {
-            char_size = 2;
-        } else {
-            char_size = 1;
-        }
-        return char_size;
-    }
-
-    static void hex_encode(const unsigned char* src_str, size_t length, char* 
dst_str) {
-        static constexpr auto hex_table = "0123456789ABCDEF";
-        auto src_str_end = src_str + length;
-
-#if defined(__SSE2__)
-        constexpr auto step = sizeof(uint64);
-        if (src_str + step < src_str_end) {
-            const auto hex_map = _mm_loadu_si128(reinterpret_cast<const 
__m128i *>(hex_table));
-            const auto mask_map = _mm_set1_epi8(0x0F);
-
-            do {
-                auto data = _mm_loadu_si64(src_str);
-                auto hex_loc = 
_mm_and_si128(_mm_unpacklo_epi8(_mm_srli_epi64(data, 4), data), mask_map);
-                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_str), 
_mm_shuffle_epi8(hex_map, hex_loc));
-
-                src_str += step;
-                dst_str += step * 2;
-            } while (src_str + step < src_str_end);
-        }
-#endif
-        char res[2];
-        // hex(str) str length is n, result must be 2 * n length
-        for (; src_str < src_str_end; src_str += 1, dst_str += 2) {
-            // low 4 bits
-            *(res + 1) = hex_table[src_str[0] & 0x0F];
-            // high 4 bits
-            *res = hex_table[(src_str[0] >> 4)];
-            std::copy(res, res + 2, dst_str);
-        }
-    }
-};
-}
-
-#endif //BE_V_STRING_FUNCTIONS_H
\ No newline at end of file
diff --git a/be/src/vec/functions/function_string.cpp 
b/be/src/vec/functions/function_string.cpp
index bdeb7e5..a89edf5 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -23,9 +23,9 @@
 #include <cstdlib>
 #include <string_view>
 
-#include "exprs/v_string_functions.h"
 #include "runtime/string_search.hpp"
 #include "util/encryption_util.h"
+#include "util/simd/vstring_function.h"
 #include "util/url_coding.h"
 #include "vec/common/pod_array_fwd.h"
 #include "vec/functions/function_string_to_string.h"
@@ -258,8 +258,8 @@ struct ReverseImpl {
             auto src_str = reinterpret_cast<const char*>(&data[offsets[i - 
1]]);
             int64_t src_len = offsets[i] - offsets[i - 1] - 1;
             char dst[src_len];
-            VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len),
-                                      StringVal((uint8_t*)dst, src_len));
+            simd::VStringFunctions::reverse(StringVal((uint8_t*)src_str, 
src_len),
+                                            StringVal((uint8_t*)dst, src_len));
             StringOP::push_value_string(std::string_view(dst, src_len), i, 
res_data, res_offsets);
         }
         return Status::OK();
@@ -271,9 +271,7 @@ struct HexStringName {
 };
 
 struct HexStringImpl {
-    static DataTypes get_variadic_argument_types() {
-        return {std::make_shared<DataTypeString>()};
-    }
+    static DataTypes get_variadic_argument_types() { return 
{std::make_shared<DataTypeString>()}; }
 
     static Status vector(const ColumnString::Chars& data, const 
ColumnString::Offsets& offsets,
                          ColumnString::Chars& dst_data, ColumnString::Offsets& 
dst_offsets) {
@@ -293,7 +291,8 @@ struct HexStringImpl {
                 dst_data_ptr++;
                 offset++;
             } else {
-                VStringFunctions::hex_encode(source, srclen, 
reinterpret_cast<char*>(dst_data_ptr));
+                simd::VStringFunctions::hex_encode(source, srclen,
+                                                   
reinterpret_cast<char*>(dst_data_ptr));
                 dst_data_ptr[srclen * 2] = '\0';
                 dst_data_ptr += (srclen * 2 + 1);
                 offset += (srclen * 2 + 1);
@@ -355,10 +354,10 @@ struct TrimImpl {
             const char* raw_str = reinterpret_cast<const 
char*>(&data[offsets[i - 1]]);
             StringVal str(raw_str);
             if constexpr (is_ltrim) {
-                str = VStringFunctions::ltrim(str);
+                str = simd::VStringFunctions::ltrim(str);
             }
             if constexpr (is_rtrim) {
-                str = VStringFunctions::rtrim(str);
+                str = simd::VStringFunctions::rtrim(str);
             }
             StringOP::push_value_string(std::string_view((char*)str.ptr, 
str.len), i, res_data,
                                         res_offsets);
diff --git a/be/test/exprs/string_functions_test.cpp 
b/be/test/exprs/string_functions_test.cpp
index 035e6a1..acae0e8 100644
--- a/be/test/exprs/string_functions_test.cpp
+++ b/be/test/exprs/string_functions_test.cpp
@@ -16,17 +16,18 @@
 // under the License.
 
 #include "exprs/string_functions.h"
-#include "exprs/v_string_functions.h"
 
+#include <fmt/os.h>
 #include <gtest/gtest.h>
+
 #include <iostream>
 #include <string>
-#include <fmt/os.h>
 
 #include "exprs/anyval_util.h"
 #include "test_util/test_util.h"
 #include "testutil/function_utils.h"
 #include "util/logging.h"
+#include "util/simd/vstring_function.h"
 
 namespace doris {
 
@@ -47,8 +48,7 @@ private:
 
 TEST_F(StringFunctionsTest, do_money_format_for_bigint_bench) {
     doris_udf::FunctionContext* context = new doris_udf::FunctionContext();
-    StringVal expected =
-            AnyValUtil::from_string(ctx, 
std::string("9,223,372,036,854,775,807.00"));
+    StringVal expected = AnyValUtil::from_string(ctx, 
std::string("9,223,372,036,854,775,807.00"));
     BigIntVal bigIntVal(9223372036854775807);
     for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) {
         StringVal result = StringFunctions::money_format(context, bigIntVal);
@@ -98,7 +98,7 @@ TEST_F(StringFunctionsTest, money_format_large_int) {
     value = MIN_INT128;
     result = StringFunctions::money_format(context, 
doris_udf::LargeIntVal(value));
     expected = AnyValUtil::from_string_temp(
-                context, 
std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00"));
+            context, 
std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00"));
     ASSERT_EQ(expected, result);
     delete context;
 }
@@ -682,80 +682,82 @@ TEST_F(StringFunctionsTest, upper) {
 TEST_F(StringFunctionsTest, ltrim) {
     // no blank
     StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
-    StringVal res = VStringFunctions::ltrim(src);
+    StringVal res = simd::VStringFunctions::ltrim(src);
     ASSERT_EQ(src, res);
     // empty string
     StringVal src1("");
-    res = VStringFunctions::ltrim(src1);
+    res = simd::VStringFunctions::ltrim(src1);
     ASSERT_EQ(src1, res);
     // null string
     StringVal src2(StringVal::null());
-    res = VStringFunctions::ltrim(src2);
+    res = simd::VStringFunctions::ltrim(src2);
     ASSERT_EQ(src2, res);
     // less than 16 blanks
     StringVal src3("       hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
-    res = VStringFunctions::ltrim(src3);
+    res = simd::VStringFunctions::ltrim(src3);
     ASSERT_EQ(src, res);
     // more than 16 blanks
     StringVal src4("                   hello 
worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
-    res = VStringFunctions::ltrim(src4);
+    res = simd::VStringFunctions::ltrim(src4);
     ASSERT_EQ(src, res);
     // all are blanks, less than 16 blanks
     StringVal src5("       ");
-    res = VStringFunctions::ltrim(src5);
+    res = simd::VStringFunctions::ltrim(src5);
     ASSERT_EQ(StringVal(""), res);
     // all are blanks, more than 16 blanks
     StringVal src6("                  ");
-    res = VStringFunctions::ltrim(src6);
+    res = simd::VStringFunctions::ltrim(src6);
     ASSERT_EQ(StringVal(""), res);
     // src less than 16 length
     StringVal src7(" 12345678910");
-    res = VStringFunctions::ltrim(src7);
+    res = simd::VStringFunctions::ltrim(src7);
     ASSERT_EQ(StringVal("12345678910"), res);
 }
 
 TEST_F(StringFunctionsTest, rtrim) {
     // no blank
     StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
-    StringVal res = VStringFunctions::rtrim(src);
+    StringVal res = simd::VStringFunctions::rtrim(src);
     ASSERT_EQ(src, res);
     // empty string
     StringVal src1("");
-    res = VStringFunctions::rtrim(src1);
+    res = simd::VStringFunctions::rtrim(src1);
     ASSERT_EQ(src1, res);
     // null string
     StringVal src2(StringVal::null());
-    res = VStringFunctions::rtrim(src2);
+    res = simd::VStringFunctions::rtrim(src2);
     ASSERT_EQ(src2, res);
     // less than 16 blanks
     StringVal src3("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa       ");
-    res = VStringFunctions::rtrim(src3);
+    res = simd::VStringFunctions::rtrim(src3);
     ASSERT_EQ(src, res);
     // more than 16 blanks
     StringVal src4("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa                 
     ");
-    res = VStringFunctions::rtrim(src4);
+    res = simd::VStringFunctions::rtrim(src4);
     ASSERT_EQ(src, res);
     // all are blanks, less than 16 blanks
     StringVal src5("       ");
-    res = VStringFunctions::rtrim(src5);
+    res = simd::VStringFunctions::rtrim(src5);
     ASSERT_EQ(StringVal(""), res);
     // all are blanks, more than 16 blanks
     StringVal src6("                  ");
-    res = VStringFunctions::rtrim(src6);
+    res = simd::VStringFunctions::rtrim(src6);
     ASSERT_EQ(StringVal(""), res);
     // src less than 16 length
     StringVal src7("12345678910 ");
-    res = VStringFunctions::rtrim(src7);
+    res = simd::VStringFunctions::rtrim(src7);
     ASSERT_EQ(StringVal("12345678910"), res);
 }
 
 TEST_F(StringFunctionsTest, is_ascii) {
-    ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123")));
-    ASSERT_EQ(true, 
VStringFunctions::is_ascii(StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
-    ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("运维组123")));
-    ASSERT_EQ(false, 
VStringFunctions::is_ascii(StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
-    ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal::null()));
-    ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("")));
+    ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("hello123")));
+    ASSERT_EQ(true, simd::VStringFunctions::is_ascii(
+                            
StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
+    ASSERT_EQ(false, simd::VStringFunctions::is_ascii(StringVal("运维组123")));
+    ASSERT_EQ(false, simd::VStringFunctions::is_ascii(
+                             
StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
+    ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal::null()));
+    ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("")));
 }
 } // namespace doris
 

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to