This is an automated email from the ASF dual-hosted git repository.

panxiaolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new db4810b966a [fix](function) fix error result when input utf8 in 
url_encode, strright, append_trailing_char_if_absent (#49127)
db4810b966a is described below

commit db4810b966a6da2845972b9edabac7d3c709d965
Author: Mryange <[email protected]>
AuthorDate: Wed May 7 15:09:09 2025 +0800

    [fix](function) fix error result when input utf8 in url_encode, strright, 
append_trailing_char_if_absent (#49127)
    
    ### What problem does this PR solve?
    
    The url_encode function previously performed the shift and modulus
    operations on a signed char, so bytes >= 0x80 (as found in multi-byte
    UTF-8 sequences) produced wrong hex digits. Casting the byte to
    unsigned char first fixes the issue.
    ```
    before
    mysql> select url_encode('编码');
    +----------------------+
    | url_encode('编码')   |
    +----------------------+
    | %5.%23%0-%5.%10%/(   |
    +----------------------+
    now
    mysql> select url_encode('编码');
    +----------------------+
    | url_encode('编码')   |
    +----------------------+
    | %E7%BC%96%E7%A0%81   |
    +----------------------+
    ```
    
    The strright function computed the string length in bytes instead of in
    UTF-8 characters, so multi-byte input gave a wrong result.
    ```
    before
    mysql> select strright("你好世界",5);
    +----------------------------+
    | strright("你好世界",5)     |
    +----------------------------+
    |                            |
    +----------------------------+
    now
    
    mysql> select strright("你好世界",5);
    +----------------------------+
    | strright("你好世界",5)     |
    +----------------------------+
    | 你好世界                   |
    +----------------------------+
    ```
    
    The append_trailing_char_if_absent function did not consider the case
    where the trailing character is a multi-byte UTF-8 character.
    ```
    before
    mysql> select append_trailing_char_if_absent('中文', '文');
    +-------------------------------------------------+
    | append_trailing_char_if_absent('中文', '文')    |
    +-------------------------------------------------+
    | NULL                                            |
    +-------------------------------------------------+
    now
    mysql> select append_trailing_char_if_absent('中文', '文');
    +-------------------------------------------------+
    | append_trailing_char_if_absent('中文', '文')    |
    +-------------------------------------------------+
    | 中文                                            |
    +-------------------------------------------------+
    ```
    
    Problem Summary:
    
    ### Release note
    
    None
    
    ### Check List (For Author)
    
    - Test <!-- At least one of them must be included. -->
        - [ ] Regression test
        - [ ] Unit Test
        - [ ] Manual test (add detailed scripts or steps below)
        - [ ] No need to test or manual test. Explain why:
    - [ ] This is a refactor/code format and no logic has been changed.
            - [ ] Previous test can cover this change.
            - [ ] No code files have been changed.
            - [ ] Other reason <!-- Add your reason?  -->
    
    - Behavior changed:
        - [ ] No.
        - [ ] Yes. <!-- Explain the behavior change -->
    
    - Does this need documentation?
        - [ ] No.
    - [ ] Yes. <!-- Add document PR link here. eg:
    https://github.com/apache/doris-website/pull/1214 -->
    
    ### Check List (For Reviewer who merge this PR)
    
    - [ ] Confirm the release note
    - [ ] Confirm test cases
    - [ ] Confirm document
    - [ ] Add branch pick label <!-- Add branch pick label that this PR
    should merge into -->
---
 be/src/util/url_coding.cpp                         |  4 +-
 be/src/vec/functions/function_string.cpp           | 73 +++++++++++++---------
 be/src/vec/functions/function_string.h             |  6 +-
 be/test/vec/function/function_string_test.cpp      | 24 +++++++
 .../fold_constant_string_arithmatic.groovy         |  9 ++-
 5 files changed, 80 insertions(+), 36 deletions(-)

diff --git a/be/src/util/url_coding.cpp b/be/src/util/url_coding.cpp
index 5871b4b9d32..c442dbcb846 100644
--- a/be/src/util/url_coding.cpp
+++ b/be/src/util/url_coding.cpp
@@ -39,7 +39,9 @@ void url_encode(const std::string_view& in, std::string* out) 
{
         } else if (c == ' ') {
             os << '+';
         } else {
-            os << '%' << to_hex(c >> 4) << to_hex(c % 16);
+            ///TODO: In the past, there was an error here involving the 
modulus operation on a char (signed number).
+            // When the char data exceeds 128 (UTF-8 byte), it leads to 
incorrect results. It is actually better to use some third-party libraries here.
+            os << '%' << to_hex((unsigned char)c >> 4) << to_hex((unsigned 
char)c % 16);
         }
     }
 
diff --git a/be/src/vec/functions/function_string.cpp 
b/be/src/vec/functions/function_string.cpp
index a3d932a7b5b..f6311a76ced 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -1151,6 +1151,15 @@ struct StringAppendTrailingCharIfAbsent {
     using Offsets = ColumnString::Offsets;
     using ReturnType = DataTypeString;
     using ColumnType = ColumnString;
+
+    static bool str_end_with(const StringRef& str, const StringRef& end) {
+        if (str.size < end.size) {
+            return false;
+        }
+        // The end_with method of StringRef needs to ensure that the size of 
end is less than or equal to the size of str.
+        return str.end_with(end);
+    }
+
     static void vector_vector(FunctionContext* context, const Chars& ldata, 
const Offsets& loffsets,
                               const Chars& rdata, const Offsets& roffsets, 
Chars& res_data,
                               Offsets& res_offsets, NullMap& null_map_data) {
@@ -1162,36 +1171,39 @@ struct StringAppendTrailingCharIfAbsent {
         for (size_t i = 0; i < input_rows_count; ++i) {
             buffer.clear();
 
-            int l_size = loffsets[i] - loffsets[i - 1];
-            const auto l_raw = reinterpret_cast<const char*>(&ldata[loffsets[i 
- 1]]);
+            StringRef lstr = StringRef(reinterpret_cast<const 
char*>(&ldata[loffsets[i - 1]]),
+                                       loffsets[i] - loffsets[i - 1]);
+            StringRef rstr = StringRef(reinterpret_cast<const 
char*>(&rdata[roffsets[i - 1]]),
+                                       roffsets[i] - roffsets[i - 1]);
+            // The iterate_utf8_with_limit_length function iterates over a 
maximum of two UTF-8 characters.
+            auto [byte_len, char_len] = 
simd::VStringFunctions::iterate_utf8_with_limit_length(
+                    rstr.begin(), rstr.end(), 2);
 
-            int r_size = roffsets[i] - roffsets[i - 1];
-            const auto r_raw = reinterpret_cast<const char*>(&rdata[roffsets[i 
- 1]]);
-
-            if (r_size != 1) {
+            if (char_len != 1) {
                 StringOP::push_null_string(i, res_data, res_offsets, 
null_map_data);
                 continue;
             }
-            if (l_raw[l_size - 1] == r_raw[0]) {
-                StringOP::push_value_string(std::string_view(l_raw, l_size), 
i, res_data,
-                                            res_offsets);
+            if (str_end_with(lstr, rstr)) {
+                StringOP::push_value_string(lstr, i, res_data, res_offsets);
                 continue;
             }
 
-            buffer.append(l_raw, l_raw + l_size);
-            buffer.append(r_raw, r_raw + 1);
+            buffer.append(lstr.begin(), lstr.end());
+            buffer.append(rstr.begin(), rstr.end());
             StringOP::push_value_string(std::string_view(buffer.data(), 
buffer.size()), i, res_data,
                                         res_offsets);
         }
     }
     static void vector_scalar(FunctionContext* context, const Chars& ldata, 
const Offsets& loffsets,
-                              const StringRef& rdata, Chars& res_data, 
Offsets& res_offsets,
+                              const StringRef& rstr, Chars& res_data, Offsets& 
res_offsets,
                               NullMap& null_map_data) {
         size_t input_rows_count = loffsets.size();
         res_offsets.resize(input_rows_count);
         fmt::memory_buffer buffer;
-
-        if (rdata.size != 1) {
+        // The iterate_utf8_with_limit_length function iterates over a maximum 
of two UTF-8 characters.
+        auto [byte_len, char_len] =
+                
simd::VStringFunctions::iterate_utf8_with_limit_length(rstr.begin(), 
rstr.end(), 2);
+        if (char_len != 1) {
             for (size_t i = 0; i < input_rows_count; ++i) {
                 StringOP::push_null_string(i, res_data, res_offsets, 
null_map_data);
             }
@@ -1200,23 +1212,21 @@ struct StringAppendTrailingCharIfAbsent {
 
         for (size_t i = 0; i < input_rows_count; ++i) {
             buffer.clear();
+            StringRef lstr = StringRef(reinterpret_cast<const 
char*>(&ldata[loffsets[i - 1]]),
+                                       loffsets[i] - loffsets[i - 1]);
 
-            int l_size = loffsets[i] - loffsets[i - 1];
-            const auto l_raw = reinterpret_cast<const char*>(&ldata[loffsets[i 
- 1]]);
-
-            if (l_raw[l_size - 1] == rdata.data[0]) {
-                StringOP::push_value_string(std::string_view(l_raw, l_size), 
i, res_data,
-                                            res_offsets);
+            if (str_end_with(lstr, rstr)) {
+                StringOP::push_value_string(lstr, i, res_data, res_offsets);
                 continue;
             }
 
-            buffer.append(l_raw, l_raw + l_size);
-            buffer.append(rdata.begin(), rdata.end());
+            buffer.append(lstr.begin(), lstr.end());
+            buffer.append(rstr.begin(), rstr.end());
             StringOP::push_value_string(std::string_view(buffer.data(), 
buffer.size()), i, res_data,
                                         res_offsets);
         }
     }
-    static void scalar_vector(FunctionContext* context, const StringRef& 
ldata, const Chars& rdata,
+    static void scalar_vector(FunctionContext* context, const StringRef& lstr, 
const Chars& rdata,
                               const Offsets& roffsets, Chars& res_data, 
Offsets& res_offsets,
                               NullMap& null_map_data) {
         size_t input_rows_count = roffsets.size();
@@ -1226,20 +1236,23 @@ struct StringAppendTrailingCharIfAbsent {
         for (size_t i = 0; i < input_rows_count; ++i) {
             buffer.clear();
 
-            int r_size = roffsets[i] - roffsets[i - 1];
-            const auto r_raw = reinterpret_cast<const char*>(&rdata[roffsets[i 
- 1]]);
+            StringRef rstr = StringRef(reinterpret_cast<const 
char*>(&rdata[roffsets[i - 1]]),
+                                       roffsets[i] - roffsets[i - 1]);
+            // The iterate_utf8_with_limit_length function iterates over a 
maximum of two UTF-8 characters.
+            auto [byte_len, char_len] = 
simd::VStringFunctions::iterate_utf8_with_limit_length(
+                    rstr.begin(), rstr.end(), 2);
 
-            if (r_size != 1) {
+            if (char_len != 1) {
                 StringOP::push_null_string(i, res_data, res_offsets, 
null_map_data);
                 continue;
             }
-            if (ldata.size == 0 || ldata.back() == r_raw[0]) {
-                StringOP::push_value_string(ldata.to_string_view(), i, 
res_data, res_offsets);
+            if (str_end_with(lstr, rstr)) {
+                StringOP::push_value_string(lstr, i, res_data, res_offsets);
                 continue;
             }
 
-            buffer.append(ldata.begin(), ldata.end());
-            buffer.append(r_raw, r_raw + 1);
+            buffer.append(lstr.begin(), lstr.end());
+            buffer.append(rstr.begin(), rstr.end());
             StringOP::push_value_string(std::string_view(buffer.data(), 
buffer.size()), i, res_data,
                                         res_offsets);
         }
diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index b7ca76df4ef..b9f701462fb 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -709,14 +709,14 @@ public:
 
         auto str_col =
                 
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
-        const auto& str_offset = assert_cast<const 
ColumnString*>(str_col.get())->get_offsets();
-
+        const auto* str_column = assert_cast<const 
ColumnString*>(str_col.get());
         auto pos_col =
                 
block.get_by_position(arguments[1]).column->convert_to_full_column_if_const();
         const auto& pos_data = assert_cast<const 
ColumnInt32*>(pos_col.get())->get_data();
 
         for (int i = 0; i < input_rows_count; ++i) {
-            strlen_data[i] = str_offset[i] - str_offset[i - 1];
+            auto str = str_column->get_data_at(i);
+            strlen_data[i] = simd::VStringFunctions::get_char_len(str.data, 
str.size);
         }
 
         for (int i = 0; i < input_rows_count; ++i) {
diff --git a/be/test/vec/function/function_string_test.cpp 
b/be/test/vec/function/function_string_test.cpp
index 96820c7c0c0..33e7f8f32fd 100644
--- a/be/test/vec/function/function_string_test.cpp
+++ b/be/test/vec/function/function_string_test.cpp
@@ -405,6 +405,8 @@ TEST(function_string_test, function_string_strright_test) {
                 {{std::string("hah hah"), -1}, std::string("hah hah")},
                 {{std::string("🤣"), -1}, std::string("🤣")},
                 {{std::string("🤣😃😄"), -2}, std::string("😃😄")},
+                {{std::string("🐼abc🐼"), 100}, std::string("🐼abc🐼")},
+                {{std::string("你好世界"), 5}, std::string("你好世界")},
                 {{std::string("12345"), 6}, std::string("12345")},
                 {{std::string("12345"), 12345}, std::string("12345")},
                 {{std::string("-12345"), -1}, std::string("-12345")},
@@ -890,11 +892,33 @@ TEST(function_string_test, 
function_append_trailing_char_if_absent_test) {
                         {{std::string("ABC"), Null()}, Null()},
                         {{Null(), std::string("ABC")}, Null()},
                         {{std::string(""), Null()}, Null()},
+                        {{std::string("中文"), std::string("文")}, 
std::string("中文")},
+                        {{std::string("中"), std::string("文")}, 
std::string("中文")},
+                        {{std::string(""), std::string("文")}, 
std::string("文")},
                         {{Null(), std::string("")}, Null()}};
 
     check_function_all_arg_comb<DataTypeString, true>(func_name, input_types, 
data_set);
 }
 
+TEST(function_string_test, function_url_encode_test) {
+    std::string func_name = "url_encode";
+
+    InputTypeSet input_types = {TypeIndex::String};
+
+    DataSet data_set = {
+            {{std::string("编码")}, std::string("%E7%BC%96%E7%A0%81")},
+            {{std::string("http://www.baidu.com/?a=中文日文韩文俄文希伯来文Emoji";)},
+             std::string(
+                     
"http%3A%2F%2Fwww.baidu.com%2F%3Fa%3D%E4%B8%AD%E6%96%87%E6%97%A5%E6%96%87%E9%"
+                     
"9F%A9%E6%96%87%E4%BF%84%E6%96%87%E5%B8%8C%E4%BC%AF%E6%9D%A5%E6%96%87Emoji")},
+            
{{std::string("http://www.baidu.com?a=http%3A%2F%2Fexample.com%2F😊";)},
+             
std::string("http%3A%2F%2Fwww.baidu.com%3Fa%3Dhttp%253A%252F%252Fexample.com%252F%F0%"
+                         "9F%98%8A")},
+    };
+
+    check_function_all_arg_comb<DataTypeString, true>(func_name, input_types, 
data_set);
+}
+
 TEST(function_string_test, function_starts_with_test) {
     std::string func_name = "starts_with";
 
diff --git 
a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
 
b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
index bcac849c433..ac8a97c705e 100644
--- 
a/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
+++ 
b/regression-test/suites/nereids_p0/expression/fold_constant/fold_constant_string_arithmatic.groovy
@@ -50,7 +50,10 @@ suite("fold_constant_string_arithmatic") {
     testFoldConst("select append_trailing_char_if_absent('こんにちは', '!')")
     testFoldConst("select append_trailing_char_if_absent('\n\t', '\n')")
     testFoldConst("select append_trailing_char_if_absent('こんにちは', 'ちは')")
-
+    testFoldConst("select append_trailing_char_if_absent('中文', '文')")
+    testFoldConst("select append_trailing_char_if_absent('中', '文')")
+    testFoldConst("select append_trailing_char_if_absent('', '文')")
+    
     // ascii
     testFoldConst("select ascii('!')")
     testFoldConst("select ascii('1')")
@@ -768,7 +771,8 @@ suite("fold_constant_string_arithmatic") {
     testFoldConst("select right('Hello World', 5)")
     testFoldConst("select right('Hello World', 0)")
     testFoldConst("select right(NULL, 1)")
-
+    testFoldConst("select right('🐼abc🐼', 100)")
+    testFoldConst("select right('你好世界',5)")
     // rpad
     testFoldConst("select rpad(cast('hi' as string), 1, cast('xy' as string))")
     testFoldConst("select rpad(cast('hi' as string), 5, cast('xy' as string))")
@@ -1315,6 +1319,7 @@ suite("fold_constant_string_arithmatic") {
     testFoldConst("select 
url_decode('http%3A%2F%2Fwww.apache.org%2Flicenses%2FLICENSE-22.0')")
     testFoldConst("select 
url_encode('http://www.apache.org/licenses/LICENSE-2.0')")
     testFoldConst("select url_encode(' 
http://www.apache.org/licenses/LICENSE-2.0 ')")
+    testFoldConst("select url_encode(' 
http://www.baidu.com/?a=中文日文韩文俄文希伯来文Emoji')")
 
     // extract_url_parameter
     testFoldConst("select 
extract_url_parameter('http://user:[email protected]?a=b', 'a')")


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to