[doris] branch master updated: [Fix](inverted index) fix tokenize function coredump (#24896)

kxiao Tue, 26 Sep 2023 02:32:51 -0700

This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 94082ae59c [Fix](inverted index) fix tokenize function coredump (#24896)
94082ae59c is described below

commit 94082ae59c37b44ea92cc451fad02658f20f084c
Author: airborne12 <[email protected]>
AuthorDate: Tue Sep 26 17:31:10 2023 +0800

    [Fix](inverted index) fix tokenize function coredump (#24896)
---
 be/src/vec/functions/function_tokenize.cpp         | 43 ++++++----------------
 .../data/inverted_index_p0/test_tokenize.out       |  3 ++
 .../suites/inverted_index_p0/test_tokenize.groovy  |  2 +
 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 62e0a53bcc..11760a30f5 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -20,6 +20,7 @@
 #include <glog/logging.h>
 
 #include <algorithm>
+#include <regex>
 #include <utility>
 
 #include "CLucene/StdHeader.h"
@@ -29,46 +30,26 @@
 #include "vec/common/string_ref.h"
 #include "vec/core/block.h"
 #include "vec/core/column_with_type_and_name.h"
-#include "vec/core/field.h"
 #include "vec/data_types/data_type_nullable.h"
 #include "vec/data_types/data_type_number.h"
-#include "vec/functions/simple_function_factory.h"
 
 namespace doris::vectorized {
 
 Status parse(const std::string& str, std::map<std::string, std::string>& result) {
-    std::string::size_type start = 0;
+    std::regex pattern(
+            R"delimiter((?:'([^']*)'|"([^"]*)"|([^,]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^,]*)))delimiter");
+    std::smatch matches;
 
-    while (start < str.size()) {
-        std::string::size_type end = str.find(',', start);
-        std::string pair =
-                (end == std::string::npos) ? str.substr(start) : str.substr(start, end - start);
-
-        std::string::size_type eq_pos = pair.find('=');
-        if (eq_pos == std::string::npos) {
-            return Status::InvalidArgument(
-                    fmt::format("invalid params {} for function tokenize", str));
-        }
-        std::string key = pair.substr(0, eq_pos);
-        key = key.substr(key.find_first_not_of(" '\""
-                                               "\t\n\r"),
-                         key.find_last_not_of(" '\""
-                                              "\t\n\r") -
-                                 key.find_first_not_of(" '\""
-                                                       "\t\n\r") +
-                                 1);
-        std::string value = pair.substr(eq_pos + 1);
-        value = value.substr(value.find_first_not_of(" '\""
-                                                     "\t\n\r"),
-                             value.find_last_not_of(" '\""
-                                                    "\t\n\r") -
-                                     value.find_first_not_of(" '\""
-                                                             "\t\n\r") +
-                                     1);
+    std::string::const_iterator searchStart(str.cbegin());
+    while (std::regex_search(searchStart, str.cend(), matches, pattern)) {
+        std::string key =
+                matches[1].length() ? matches[1] : (matches[2].length() ? matches[2] : matches[3]);
+        std::string value =
+                matches[4].length() ? matches[4] : (matches[5].length() ? matches[5] : matches[6]);
 
         result[key] = value;
 
-        start = (end == std::string::npos) ? str.size() : end + 1;
+        searchStart = matches.suffix().first;
     }
 
     return Status::OK();
@@ -169,6 +150,6 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
             return Status::OK();
         }
     }
-    return Status::RuntimeError("unimplements function {}", get_name());
+    return Status::RuntimeError("unimplemented function {}", get_name());
 }
 } // namespace doris::vectorized
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out
index 731ae4249e..c4fa3915b4 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -19,3 +19,6 @@
 ["人民", "得到", "更多", "实惠"]
 ["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座", "手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com", "ip", "information", "created", "automatically"]
 
+-- !tokenize_sql --
+["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
+
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 7780329da0..5b5c4f02a4 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -91,4 +91,6 @@ suite("test_tokenize"){
 
     sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3, '人民可以得到更多实惠'), (4, '陕西省西安市高新区创业大厦A座，我的手机号码是12345678901,邮箱是[email protected]，,ip是1.1.1.1，this information is created automatically.');"
     qt_sql "SELECT TOKENIZE(c, \"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3";
+
+    qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0 test:abc=bcd','"parser"="unicode","char_filter_type" = "char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = " "');"""
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[doris] branch master updated: [Fix](inverted index) fix tokenize function coredump (#24896)

Reply via email to