This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 94082ae59c [Fix](inverted index) fix tokenize function coredump (#24896)
94082ae59c is described below
commit 94082ae59c37b44ea92cc451fad02658f20f084c
Author: airborne12 <[email protected]>
AuthorDate: Tue Sep 26 17:31:10 2023 +0800
[Fix](inverted index) fix tokenize function coredump (#24896)
---
be/src/vec/functions/function_tokenize.cpp | 43 ++++++----------------
.../data/inverted_index_p0/test_tokenize.out | 3 ++
.../suites/inverted_index_p0/test_tokenize.groovy | 2 +
3 files changed, 17 insertions(+), 31 deletions(-)
diff --git a/be/src/vec/functions/function_tokenize.cpp b/be/src/vec/functions/function_tokenize.cpp
index 62e0a53bcc..11760a30f5 100644
--- a/be/src/vec/functions/function_tokenize.cpp
+++ b/be/src/vec/functions/function_tokenize.cpp
@@ -20,6 +20,7 @@
#include <glog/logging.h>
#include <algorithm>
+#include <regex>
#include <utility>
#include "CLucene/StdHeader.h"
@@ -29,46 +30,26 @@
#include "vec/common/string_ref.h"
#include "vec/core/block.h"
#include "vec/core/column_with_type_and_name.h"
-#include "vec/core/field.h"
#include "vec/data_types/data_type_nullable.h"
#include "vec/data_types/data_type_number.h"
-#include "vec/functions/simple_function_factory.h"
namespace doris::vectorized {
Status parse(const std::string& str, std::map<std::string, std::string>& result) {
- std::string::size_type start = 0;
+ std::regex pattern(
+ R"delimiter((?:'([^']*)'|"([^"]*)"|([^,]*))\s*=\s*(?:'([^']*)'|"([^"]*)"|([^,]*)))delimiter");
+ std::smatch matches;
- while (start < str.size()) {
- std::string::size_type end = str.find(',', start);
- std::string pair =
- (end == std::string::npos) ? str.substr(start) :
str.substr(start, end - start);
-
- std::string::size_type eq_pos = pair.find('=');
- if (eq_pos == std::string::npos) {
- return Status::InvalidArgument(
- fmt::format("invalid params {} for function tokenize",
str));
- }
- std::string key = pair.substr(0, eq_pos);
- key = key.substr(key.find_first_not_of(" '\""
- "\t\n\r"),
- key.find_last_not_of(" '\""
- "\t\n\r") -
- key.find_first_not_of(" '\""
- "\t\n\r") +
- 1);
- std::string value = pair.substr(eq_pos + 1);
- value = value.substr(value.find_first_not_of(" '\""
- "\t\n\r"),
- value.find_last_not_of(" '\""
- "\t\n\r") -
- value.find_first_not_of(" '\""
- "\t\n\r") +
- 1);
+ std::string::const_iterator searchStart(str.cbegin());
+ while (std::regex_search(searchStart, str.cend(), matches, pattern)) {
+ std::string key =
+ matches[1].length() ? matches[1] : (matches[2].length() ?
matches[2] : matches[3]);
+ std::string value =
+ matches[4].length() ? matches[4] : (matches[5].length() ?
matches[5] : matches[6]);
result[key] = value;
- start = (end == std::string::npos) ? str.size() : end + 1;
+ searchStart = matches.suffix().first;
}
return Status::OK();
@@ -169,6 +150,6 @@ Status FunctionTokenize::execute_impl(FunctionContext* /*context*/, Block& block
return Status::OK();
}
}
- return Status::RuntimeError("unimplements function {}", get_name());
+ return Status::RuntimeError("unimplemented function {}", get_name());
}
} // namespace doris::vectorized
diff --git a/regression-test/data/inverted_index_p0/test_tokenize.out b/regression-test/data/inverted_index_p0/test_tokenize.out
index 731ae4249e..c4fa3915b4 100644
--- a/regression-test/data/inverted_index_p0/test_tokenize.out
+++ b/regression-test/data/inverted_index_p0/test_tokenize.out
@@ -19,3 +19,6 @@
["人民", "得到", "更多", "实惠"]
["陕西", "陕西省", "西安", "西安市", "高新", "高新区", "新区", "创业", "业大", "大厦", "A", "座",
"手机", "手机号", "手机号码", "机号码", "号码", "12345678901", "邮箱", "12345678", "qq", "com",
"ip", "information", "created", "automatically"]
+-- !tokenize_sql --
+["get", "images", "hm", "bg", "jpg", "http", "1", "0", "test", "abc", "bcd"]
+
diff --git a/regression-test/suites/inverted_index_p0/test_tokenize.groovy b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
index 7780329da0..5b5c4f02a4 100644
--- a/regression-test/suites/inverted_index_p0/test_tokenize.groovy
+++ b/regression-test/suites/inverted_index_p0/test_tokenize.groovy
@@ -91,4 +91,6 @@ suite("test_tokenize"){
sql "INSERT INTO $indexTblName3 VALUES (1, '我来到北京清华大学'), (2, '我爱你中国'), (3,
'人民可以得到更多实惠'), (4,
'陕西省西安市高新区创业大厦A座,我的手机号码是12345678901,邮箱是[email protected],,ip是1.1.1.1,this
information is created automatically.');"
qt_sql "SELECT TOKENIZE(c,
\"'parser'='chinese','parser_mode'='fine_grained'\") FROM $indexTblName3";
+
+ qt_tokenize_sql """SELECT TOKENIZE('GET /images/hm_bg.jpg HTTP/1.0
test:abc=bcd','"parser"="unicode","char_filter_type" =
"char_replace","char_filter_pattern" = "._=:,","char_filter_replacement" = "
"');"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]