This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 7a78b9d6719 [feature](inverted index) Add a basic tokenizer (#48716) 7a78b9d6719 is described below commit 7a78b9d67198624f7ddcbe3b7f1a8ee6d88e1e70 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Tue Mar 11 14:18:14 2025 +0800 [feature](inverted index) Add a basic tokenizer (#48716) Problem Summary: 1. Implement a basic tokenizer capable of efficiently performing basic segmentation on both Chinese and English text. --- be/src/olap/inverted_index_parser.cpp | 4 + be/src/olap/inverted_index_parser.h | 4 +- .../inverted_index/analyzer/analyzer.cpp | 5 +- .../{icu/ICUAnalyzer.h => basic/basic_analyzer.h} | 25 ++-- .../analyzer/basic/basic_tokenizer.cpp | 108 +++++++++++++++ .../ICUTokenizer.h => basic/basic_tokenizer.h} | 24 ++-- ...ratorWrapper.cpp => break_iterator_wrapper.cpp} | 6 +- ...kIteratorWrapper.h => break_iterator_wrapper.h} | 2 +- ...akIterator.cpp => composite_break_iterator.cpp} | 2 +- ...eBreakIterator.h => composite_break_iterator.h} | 8 +- ...Config.cpp => default_icu_tokenizer_config.cpp} | 2 +- ...izerConfig.h => default_icu_tokenizer_config.h} | 2 +- .../analyzer/icu/{ICUAnalyzer.h => icu_analyzer.h} | 2 +- .../analyzer/icu/{ICUCommon.h => icu_common.h} | 0 .../icu/{ICUTokenizer.cpp => icu_tokenizer.cpp} | 2 +- .../icu/{ICUTokenizer.h => icu_tokenizer.h} | 6 +- ...ICUTokenizerConfig.h => icu_tokenizer_config.h} | 2 +- .../{ScriptIterator.cpp => script_iterator.cpp} | 2 +- .../icu/{ScriptIterator.h => script_iterator.h} | 2 +- ...icu_anzlyzer_test.cpp => icu_analyzer_test.cpp} | 6 +- .../analyzer/simple_analyzer_test.cpp | 147 +++++++++++++++++++++ .../apache/doris/analysis/InvertedIndexUtil.java | 8 +- .../test_basic_analyzer.out} | Bin 371 -> 245 bytes .../{ => analyzer}/test_icu_analyzer.out | Bin .../test_basic_analyzer.groovy} | 6 +- .../{ => analyzer}/test_icu_analyzer.groovy | 0 26 files changed, 317 insertions(+), 58 deletions(-) diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index 44b170617f1..023576da09b 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -36,6 +36,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_ return INVERTED_INDEX_PARSER_CHINESE; case InvertedIndexParserType::PARSER_ICU: return INVERTED_INDEX_PARSER_ICU; + case InvertedIndexParserType::PARSER_BASIC: + return INVERTED_INDEX_PARSER_BASIC; default: return INVERTED_INDEX_PARSER_UNKNOWN; } @@ -55,6 +57,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st return InvertedIndexParserType::PARSER_CHINESE; } else if (parser_str_lower == INVERTED_INDEX_PARSER_ICU) { return InvertedIndexParserType::PARSER_ICU; + } else if (parser_str_lower == INVERTED_INDEX_PARSER_BASIC) { + return InvertedIndexParserType::PARSER_BASIC; } return InvertedIndexParserType::PARSER_UNKNOWN; diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index d70cfa395f4..abc1af5908a 100644 --- a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -38,7 +38,8 @@ enum class InvertedIndexParserType { PARSER_ENGLISH = 3, PARSER_CHINESE = 4, PARSER_UNICODE = 5, - PARSER_ICU = 6 + PARSER_ICU = 6, + PARSER_BASIC = 7 }; using CharFilterMap = std::map<std::string, std::string>; @@ -69,6 +70,7 @@ const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode"; const std::string INVERTED_INDEX_PARSER_ENGLISH = "english"; const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese"; const std::string INVERTED_INDEX_PARSER_ICU = "icu"; +const std::string INVERTED_INDEX_PARSER_BASIC = "basic"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase"; const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true"; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp index 28f68932fe1..6ee2c6bb43f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp @@ -28,7 +28,8 @@ #ifdef __clang__ #pragma clang diagnostic pop #endif -#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h" +#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" namespace doris::segment_v2::inverted_index { @@ -69,6 +70,8 @@ std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) { analyzer = std::make_unique<ICUAnalyzer>(); analyzer->initDict(config::inverted_index_dict_path + "/icu"); + } else if (analyser_type == InvertedIndexParserType::PARSER_BASIC) { + analyzer = std::make_unique<BasicAnalyzer>(); } else { // default analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>(); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h similarity index 66% copy from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h copy to be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h index f3a7554f13f..b9f4f963666 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h @@ -19,43 +19,38 @@ #include <memory> -#include "ICUTokenizer.h" +#include "basic_tokenizer.h" namespace doris::segment_v2 { -class ICUAnalyzer : public Analyzer { +class BasicAnalyzer : public Analyzer { public: - ICUAnalyzer() { + BasicAnalyzer() { _lowercase = true; _ownReader = false; } - ~ICUAnalyzer() override = default; + ~BasicAnalyzer() override = default; bool isSDocOpt() override { return true; } - void initDict(const std::string& dictPath) override { dictPath_ = dictPath; } - TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - auto* tokenizer = _CLNEW ICUTokenizer(_lowercase, _ownReader); - tokenizer->initialize(dictPath_); + auto* tokenizer = _CLNEW BasicTokenizer(_lowercase, _ownReader); tokenizer->reset(reader); return (TokenStream*)tokenizer; } TokenStream* reusableTokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override { - if (tokenizer_ == nullptr) { - tokenizer_ = std::make_unique<ICUTokenizer>(_lowercase, _ownReader); - tokenizer_->initialize(dictPath_); + if (_tokenizer == nullptr) { + _tokenizer = std::make_unique<BasicTokenizer>(_lowercase, _ownReader); } - tokenizer_->reset(reader); - return (TokenStream*)tokenizer_.get(); + _tokenizer->reset(reader); + return (TokenStream*)_tokenizer.get(); }; private: - std::string dictPath_; - std::unique_ptr<ICUTokenizer> tokenizer_; + std::unique_ptr<BasicTokenizer> _tokenizer; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp new file mode 100644 index 00000000000..ece0c559be0 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "basic_tokenizer.h" + +#include <unicode/unistr.h> + +namespace doris::segment_v2 { + +#define IS_IN_RANGE(c, start, end) ((uint32_t)((c) - (start)) <= ((end) - (start))) + +#define IS_CHINESE_CHAR(c) \ + (IS_IN_RANGE(c, 0x4E00, 0x9FFF) || IS_IN_RANGE(c, 0x3400, 0x4DBF) || \ + IS_IN_RANGE(c, 0x20000, 0x2A6DF) || IS_IN_RANGE(c, 0x2A700, 0x2EBEF) || \ + IS_IN_RANGE(c, 0x30000, 0x3134A)) + +BasicTokenizer::BasicTokenizer() { + this->lowercase = false; + this->ownReader = false; +} + +BasicTokenizer::BasicTokenizer(bool lower_case, bool own_reader) : BasicTokenizer() { + this->lowercase = lower_case; + this->ownReader = own_reader; +} + +Token* BasicTokenizer::next(Token* token) { + if (_buffer_index >= _data_len) { + return nullptr; + } + + std::string_view& token_text = _tokens_text[_buffer_index++]; + size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN)); + token->setNoCopy(token_text.data(), 0, size); + return token; +} + +void BasicTokenizer::reset(lucene::util::Reader* reader) { + _buffer_index = 0; + _data_len = 0; + _tokens_text.clear(); + + _buffer.resize(reader->size()); + int32_t numRead = reader->readCopy(_buffer.data(), 0, _buffer.size()); + (void)numRead; + assert(_buffer.size() == numRead); + + cut(); + + _data_len = _tokens_text.size(); +} + +void BasicTokenizer::cut() { + auto* s = (uint8_t*)_buffer.data(); + int32_t length = _buffer.size(); + + for (int32_t i = 0; i < length;) { + uint8_t firstByte = s[i]; + + if (is_alnum(firstByte)) { + int32_t start = i; + while (i < length) { + uint8_t nextByte = s[i]; + if (!is_alnum(nextByte)) { + break; + } + if (this->lowercase) { + s[i] = to_lower(nextByte); + } else { + s[i] = nextByte; + } + i++; + } + std::string_view token((const char*)(s + start), i - start); + _tokens_text.emplace_back(token); + } else { + UChar32 c = U_UNASSIGNED; + const int32_t prev_i = i; + + U8_NEXT(s, i, length, c); + + if (c == U_SENTINEL) { + continue; + } + + if (IS_CHINESE_CHAR(c)) { + const int32_t len = i - prev_i; + _tokens_text.emplace_back(reinterpret_cast<const char*>(s + prev_i), len); + } + } + } +} + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h similarity index 70% copy from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h copy to be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h index f703f677806..e07a5e37d78 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h @@ -21,30 +21,28 @@ #include "CLucene.h" #include "CLucene/analysis/AnalysisHeader.h" -#include "CompositeBreakIterator.h" -#include "DefaultICUTokenizerConfig.h" -#include "ICUCommon.h" +#include "CLucene/analysis/icu/ICUCommon.h" using namespace lucene::analysis; namespace doris::segment_v2 { -class ICUTokenizer : public Tokenizer { +class BasicTokenizer : public Tokenizer { public: - ICUTokenizer(); - ICUTokenizer(bool lowercase, bool ownReader); - ~ICUTokenizer() override = default; + BasicTokenizer(); + BasicTokenizer(bool lowercase, bool ownReader); + ~BasicTokenizer() override = default; - void initialize(const std::string& dictPath); Token* next(Token* token) override; void reset(lucene::util::Reader* reader) override; -private: - std::string utf8Str_; - icu::UnicodeString buffer_; + void cut(); - ICUTokenizerConfigPtr config_; - CompositeBreakIteratorPtr breaker_; +private: + int32_t _buffer_index = 0; + int32_t _data_len = 0; + std::string _buffer; + std::vector<std::string_view> _tokens_text; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp similarity index 96% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp index 094aa93c4e2..50094e54f7b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp @@ -15,15 +15,15 @@ // specific language governing permissions and limitations // under the License. -#include "BreakIteratorWrapper.h" +#include "break_iterator_wrapper.h" #include <unicode/unistr.h> #include <mutex> #include <string> -#include "ICUCommon.h" -#include "ICUTokenizerConfig.h" +#include "icu_common.h" +#include "icu_tokenizer_config.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h similarity index 96% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h index 0bee1be9efa..dea60d1d1f7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h @@ -23,7 +23,7 @@ #include <memory> #include <unordered_set> -#include "ICUCommon.h" +#include "icu_common.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp index 35f7f499cc5..e178ad35c13 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "CompositeBreakIterator.h" +#include "composite_break_iterator.h" #include <unicode/unistr.h> diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h similarity index 91% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h index 251c37b91b2..8599be88dc2 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h @@ -24,10 +24,10 @@ #include <memory> #include <vector> -#include "BreakIteratorWrapper.h" -#include "ICUCommon.h" -#include "ICUTokenizerConfig.h" -#include "ScriptIterator.h" +#include "break_iterator_wrapper.h" +#include "icu_common.h" +#include "icu_tokenizer_config.h" +#include "script_iterator.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp similarity index 98% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp index 7da5d4df377..dfbcf2dcdf6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "DefaultICUTokenizerConfig.h" +#include "default_icu_tokenizer_config.h" #include <atomic> #include <fstream> diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h similarity index 95% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h index e3673cd543e..6500cf230eb 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h @@ -17,7 +17,7 @@ #pragma once -#include "ICUTokenizerConfig.h" +#include "icu_tokenizer_config.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h index f3a7554f13f..072cf85bc7d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h @@ -19,7 +19,7 @@ #include <memory> -#include "ICUTokenizer.h" +#include "icu_tokenizer.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h similarity index 100% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp index 1d1a25dea5a..e10b197d517 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "ICUTokenizer.h" +#include "icu_tokenizer.h" #include <unicode/unistr.h> diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h similarity index 91% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h index f703f677806..d11d0c67ed6 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h @@ -21,9 +21,9 @@ #include "CLucene.h" #include "CLucene/analysis/AnalysisHeader.h" -#include "CompositeBreakIterator.h" -#include "DefaultICUTokenizerConfig.h" -#include "ICUCommon.h" +#include "composite_break_iterator.h" +#include "default_icu_tokenizer_config.h" +#include "icu_common.h" using namespace lucene::analysis; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h similarity index 95% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h index 33accf72c51..dd7b743e74b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h @@ -17,7 +17,7 @@ #pragma once -#include "ICUCommon.h" +#include "icu_common.h" namespace doris::segment_v2 { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp similarity index 98% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp index 5ca81d2a954..7fee3055d3b 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "ScriptIterator.h" +#include "script_iterator.h" #include <unicode/unistr.h> diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h similarity index 97% rename from be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h index 1cc67c4350c..bc93eea8670 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h @@ -23,7 +23,7 @@ #include <memory> #include <vector> -#include "ICUCommon.h" +#include "icu_common.h" namespace doris::segment_v2 { diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp similarity index 99% rename from be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp rename to be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp index 98fa722be2c..4cd6d180a2e 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp @@ -15,14 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h" + #include <gtest/gtest.h> #include <memory> #include <string> #include <vector> -#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h" - using namespace lucene::analysis; namespace doris::segment_v2 { @@ -48,7 +48,7 @@ protected: } } catch (CLuceneError& e) { std::cout << e.what() << std::endl; - throw; + throw e; } } }; diff --git a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp new file mode 100644 index 00000000000..6dba8233a2e --- /dev/null +++ b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp @@ -0,0 +1,147 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include <vector> + +#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h" + +using namespace lucene::analysis; + +namespace doris::segment_v2 { + +std::vector<std::string> tokenize(const std::string& s, bool lowercase = false) { + std::vector<std::string> datas; + try { + BasicAnalyzer analyzer; + analyzer.set_lowercase(lowercase); + + lucene::util::SStringReader<char> reader; + reader.init(s.data(), s.size(), false); + + std::unique_ptr<BasicTokenizer> tokenizer; + tokenizer.reset((BasicTokenizer*)analyzer.tokenStream(L"", &reader)); + + Token t; + while (tokenizer->next(&t)) { + std::string term(t.termBuffer<char>(), t.termLength<char>()); + datas.emplace_back(term); + } + } catch (CLuceneError& e) { + std::cout << e.what() << std::endl; + throw e; + } + return datas; +} + +class BasicTokenizerTest : public ::testing::Test {}; + +TEST(BasicTokenizerTest, EnglishBasic1) { + std::string text = "Hello World! This is a test."; + auto tokens = tokenize(text, false); + + std::vector<std::string> expected = {"Hello", "World", "This", "is", "a", "test"}; + ASSERT_EQ(tokens, expected); +} + +TEST(BasicTokenizerTest, EnglishBasic2) { + std::string text = "Hello World! This is a test."; + auto tokens = tokenize(text, true); + + std::vector<std::string> expected = {"hello", "world", "this", "is", "a", "test"}; + ASSERT_EQ(tokens, expected); +} + +TEST(BasicTokenizerTest, EnglishLowercase) { + std::string text = "Hello World"; + auto tokens = tokenize(text, true); + + std::vector<std::string> expected = {"hello", "world"}; + ASSERT_EQ(tokens, expected); +} + +TEST(BasicTokenizerTest, ChineseBasic) { + std::string text = "你好世界"; + auto tokens = tokenize(text); + + std::vector<std::string> expected = {"你", "好", "世", "界"}; + ASSERT_EQ(tokens, expected); +} + +TEST(BasicTokenizerTest, MixedLanguage) { + std::string text = "Hello你好World世界"; + auto tokens = tokenize(text, true); + + std::vector<std::string> expected = {"hello", "你", "好", "world", "世", "界"}; + ASSERT_EQ(tokens, expected); +} + +TEST(BasicTokenizerTest, LongWordTruncation) { + const int32_t MAX_LEN = 255; + std::string longWord(MAX_LEN + 100, 'A'); + + auto tokens = tokenize(longWord); + ASSERT_EQ(tokens.size(), 1); + ASSERT_EQ(tokens[0].size(), MAX_LEN); +} + +TEST(BasicTokenizerTest, LargeDataset) { + const std::string english = "The quick brown fox jumps over the lazy dog. "; + const std::string chinese = "这是一个用于测试的分词样例。"; + const int32_t REPEAT = 5000; + + std::string largeText; + for (int32_t i = 0; i < REPEAT; ++i) { + largeText += english; + largeText += chinese; + } + + auto tokens = tokenize(largeText); + + const size_t englishPerIteration = 9; + const size_t chinesePerIteration = 13; + const size_t expectedTotal = REPEAT * (englishPerIteration + chinesePerIteration); + + ASSERT_EQ(tokens.size(), expectedTotal); + + ASSERT_EQ(tokens[0], "The"); + ASSERT_EQ(tokens[englishPerIteration], "这"); + ASSERT_EQ(tokens[englishPerIteration + 1], "是"); +} + +TEST(BasicTokenizerTest, InvalidUTF8) { + std::string invalidText = "\x80\x81\xff"; + auto tokens = tokenize(invalidText); + ASSERT_EQ(tokens.size(), 0); +} + +TEST(BasicTokenizerTest, ConsecutiveNumbers) { + const std::string input(300, '1'); + auto tokens = tokenize(input); + EXPECT_EQ(tokens.size(), 1); + + EXPECT_EQ(tokens[0].size(), 255); +} + +TEST(BasicTokenizerTest, EmojiHandling) { + const std::string input = "😊😋"; + auto tokens = tokenize(input); + EXPECT_EQ(tokens.size(), 0); +} + +} // namespace doris::segment_v2 diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java index 88ecc83337a..202ddd9be6a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java @@ -37,6 +37,7 @@ public class InvertedIndexUtil { public static String INVERTED_INDEX_PARSER_ENGLISH = "english"; public static String INVERTED_INDEX_PARSER_CHINESE = "chinese"; public static String INVERTED_INDEX_PARSER_ICU = "icu"; + public static String INVERTED_INDEX_PARSER_BASIC = "basic"; public static String INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode"; public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained"; @@ -143,7 +144,8 @@ public class InvertedIndexUtil { || parser.equals(INVERTED_INDEX_PARSER_UNICODE) || parser.equals(INVERTED_INDEX_PARSER_ENGLISH) || parser.equals(INVERTED_INDEX_PARSER_CHINESE) - || parser.equals(INVERTED_INDEX_PARSER_ICU))) { + || parser.equals(INVERTED_INDEX_PARSER_ICU) + || parser.equals(INVERTED_INDEX_PARSER_BASIC))) { throw new AnalysisException("INVERTED index parser: " + parser + " is invalid for column: " + indexColName + " of type " + colType); } @@ -184,9 +186,9 @@ public class InvertedIndexUtil { String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY); String dictCompression = properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY); - if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu")) { + if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu|basic")) { throw new AnalysisException("Invalid inverted index 'parser' value: " + parser - + ", parser must be none, english, unicode, chinese or icu"); + + ", parser must be none, english, unicode, chinese, icu or basic"); } if (!"chinese".equals(parser) && parserMode != null) { diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out similarity index 66% copy from regression-test/data/inverted_index_p0/test_icu_analyzer.out copy to regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out index 2c5978b17e6..31dd2231521 100644 Binary files a/regression-test/data/inverted_index_p0/test_icu_analyzer.out and b/regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out differ diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out b/regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out similarity index 100% rename from regression-test/data/inverted_index_p0/test_icu_analyzer.out rename to regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy similarity index 92% copy from regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy copy to regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy index 2fa943b9ca9..863c3ddba63 100644 --- a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy +++ b/regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy @@ -16,8 +16,8 @@ // under the License. -suite("test_icu_analyzer", "p0"){ - def indexTbName1 = "test_icu_analyzer" +suite("test_basic_analyzer", "p0"){ + def indexTbName1 = "test_basic_analyzer" sql "DROP TABLE IF EXISTS ${indexTbName1}" @@ -25,7 +25,7 @@ suite("test_icu_analyzer", "p0"){ CREATE TABLE ${indexTbName1} ( `a` int(11) NULL COMMENT "", `b` text NULL COMMENT "", - INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "icu") COMMENT '', + INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "basic") COMMENT '', ) ENGINE=OLAP DUPLICATE KEY(`a`) COMMENT "OLAP" diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy b/regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy similarity index 100% rename from regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy rename to regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org