This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e6a5d3375e [Feature-WIP](inverted index) add chinese analyzer for
inverted index reader (#15998)
e6a5d3375e is described below
commit e6a5d3375e3587c69fafd53ef3f62416959e3f36
Author: YueW <[email protected]>
AuthorDate: Tue Jan 17 20:20:40 2023 +0800
[Feature-WIP](inverted index) add chinese analyzer for inverted index
reader (#15998)
Add a Chinese analyzer for the inverted index reader.
Depends on PRs: #14211, #15807, #15823
---
.../rowset/segment_v2/inverted_index_reader.cpp | 44 +++++++++++++---------
.../olap/rowset/segment_v2/inverted_index_reader.h | 10 ++---
2 files changed, 32 insertions(+), 22 deletions(-)
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index a11c076df2..5671f268c9 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -17,6 +17,7 @@
#include "olap/rowset/segment_v2/inverted_index_reader.h"
+#include <CLucene/analysis/LanguageBasedAnalyzer.h>
#include <CLucene/search/BooleanQuery.h>
#include <CLucene/search/PhraseQuery.h>
@@ -47,29 +48,41 @@ bool InvertedIndexReader::indexExists(io::Path&
index_file_path) {
return exists;
}
-std::vector<std::string> FullTextIndexReader::get_analyse_result(
- const std::wstring& field_name, const std::wstring& value,
- InvertedIndexQueryType query_type, InvertedIndexParserType
analyser_type) {
- std::vector<std::string> analyse_result;
+std::vector<std::wstring> FullTextIndexReader::get_analyse_result(
+ const std::wstring& field_name, const std::string& value,
InvertedIndexQueryType query_type,
+ InvertedIndexParserType analyser_type) {
+ std::vector<std::wstring> analyse_result;
std::shared_ptr<lucene::analysis::Analyzer> analyzer;
+ std::unique_ptr<lucene::util::Reader> reader;
if (analyser_type == InvertedIndexParserType::PARSER_STANDARD) {
analyzer =
std::make_shared<lucene::analysis::standard::StandardAnalyzer>();
+ reader.reset(
+ (new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
+ } else if (analyser_type == InvertedIndexParserType::PARSER_CHINESE) {
+ auto chinese_analyzer =
+
std::make_shared<lucene::analysis::LanguageBasedAnalyzer>(L"chinese", false);
+ chinese_analyzer->initDict(config::inverted_index_dict_path);
+ analyzer = chinese_analyzer;
+ reader.reset(new lucene::util::SimpleInputStreamReader(
+ new lucene::util::AStringReader(value.c_str()),
+ lucene::util::SimpleInputStreamReader::UTF8));
} else {
// default
analyzer = std::make_shared<lucene::analysis::SimpleAnalyzer<TCHAR>>();
+ reader.reset(
+ (new lucene::util::StringReader(std::wstring(value.begin(),
value.end()).c_str())));
}
- std::unique_ptr<lucene::util::StringReader> reader(
- new lucene::util::StringReader(value.c_str()));
std::unique_ptr<lucene::analysis::TokenStream> token_stream(
analyzer->tokenStream(field_name.c_str(), reader.get()));
lucene::analysis::Token token;
while (token_stream->next(&token)) {
- std::string tk =
- lucene::util::Misc::toString(token.termBuffer<TCHAR>(),
token.termLength<TCHAR>());
- analyse_result.emplace_back(tk);
+ if (token.termLength<TCHAR>() != 0) {
+ analyse_result.emplace_back(
+ std::wstring(token.termBuffer<TCHAR>(),
token.termLength<TCHAR>()));
+ }
}
if (token_stream != nullptr) {
@@ -78,7 +91,7 @@ std::vector<std::string>
FullTextIndexReader::get_analyse_result(
if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
- std::set<std::string> unrepeated_result(analyse_result.begin(),
analyse_result.end());
+ std::set<std::wstring> unrepeated_result(analyse_result.begin(),
analyse_result.end());
analyse_result.assign(unrepeated_result.begin(),
unrepeated_result.end());
}
@@ -100,10 +113,9 @@ Status FullTextIndexReader::query(const std::string&
column_name, const void* qu
<< " begin to load the fulltext index from clucene, query_str="
<< search_str;
std::unique_ptr<lucene::search::Query> query;
std::wstring field_ws = std::wstring(column_name.begin(),
column_name.end());
- std::wstring search_str_ws = std::wstring(search_str.begin(),
search_str.end());
try {
- std::vector<std::string> analyse_result =
- get_analyse_result(field_ws, search_str_ws, query_type,
analyser_type);
+ std::vector<std::wstring> analyse_result =
+ get_analyse_result(field_ws, search_str, query_type,
analyser_type);
if (analyse_result.empty()) {
LOG(WARNING) << "invalid input query_str: " << search_str
@@ -114,8 +126,7 @@ Status FullTextIndexReader::query(const std::string&
column_name, const void* qu
switch (query_type) {
case InvertedIndexQueryType::MATCH_ANY_QUERY: {
query.reset(_CLNEW lucene::search::BooleanQuery());
- for (auto token : analyse_result) {
- std::wstring token_ws = std::wstring(token.begin(),
token.end());
+ for (auto token_ws : analyse_result) {
lucene::index::Term* term =
_CLNEW lucene::index::Term(field_ws.c_str(),
token_ws.c_str());
static_cast<lucene::search::BooleanQuery*>(query.get())
@@ -127,8 +138,7 @@ Status FullTextIndexReader::query(const std::string&
column_name, const void* qu
}
case InvertedIndexQueryType::MATCH_ALL_QUERY: {
query.reset(_CLNEW lucene::search::BooleanQuery());
- for (auto token : analyse_result) {
- std::wstring token_ws = std::wstring(token.begin(),
token.end());
+ for (auto token_ws : analyse_result) {
lucene::index::Term* term =
_CLNEW lucene::index::Term(field_ws.c_str(),
token_ws.c_str());
static_cast<lucene::search::BooleanQuery*>(query.get())
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 70a21f3e77..dca374a9a2 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -75,7 +75,7 @@ public:
virtual InvertedIndexReaderType type() = 0;
bool indexExists(io::Path& index_file_path);
- uint32_t get_index_id() { return _index_id; }
+ uint32_t get_index_id() const { return _index_id; }
protected:
bool _is_match_query(InvertedIndexQueryType query_type);
@@ -103,10 +103,10 @@ public:
}
InvertedIndexReaderType type() override;
- std::vector<std::string> get_analyse_result(const std::wstring& field_name,
- const std::wstring& value,
- InvertedIndexQueryType
query_type,
- InvertedIndexParserType
analyser_type);
+ std::vector<std::wstring> get_analyse_result(const std::wstring&
field_name,
+ const std::string& value,
+ InvertedIndexQueryType
query_type,
+ InvertedIndexParserType
analyser_type);
};
class StringTypeInvertedIndexReader : public InvertedIndexReader {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]