(doris) branch master updated: [feature](inverted index) Add a basic tokenizer (#48716)

jianliangqi Mon, 10 Mar 2025 23:53:08 -0700

This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 7a78b9d6719 [feature](inverted index) Add a basic tokenizer (#48716)
7a78b9d6719 is described below

commit 7a78b9d67198624f7ddcbe3b7f1a8ee6d88e1e70
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Tue Mar 11 14:18:14 2025 +0800

    [feature](inverted index) Add a basic tokenizer (#48716)
    
    Problem Summary:
    
    1. Implement a basic tokenizer capable of efficiently performing basic
    segmentation on both Chinese and English text.
---
 be/src/olap/inverted_index_parser.cpp              |   4 +
 be/src/olap/inverted_index_parser.h                |   4 +-
 .../inverted_index/analyzer/analyzer.cpp           |   5 +-
 .../{icu/ICUAnalyzer.h => basic/basic_analyzer.h}  |  25 ++--
 .../analyzer/basic/basic_tokenizer.cpp             | 108 +++++++++++++++
 .../ICUTokenizer.h => basic/basic_tokenizer.h}     |  24 ++--
 ...ratorWrapper.cpp => break_iterator_wrapper.cpp} |   6 +-
 ...kIteratorWrapper.h => break_iterator_wrapper.h} |   2 +-
 ...akIterator.cpp => composite_break_iterator.cpp} |   2 +-
 ...eBreakIterator.h => composite_break_iterator.h} |   8 +-
 ...Config.cpp => default_icu_tokenizer_config.cpp} |   2 +-
 ...izerConfig.h => default_icu_tokenizer_config.h} |   2 +-
 .../analyzer/icu/{ICUAnalyzer.h => icu_analyzer.h} |   2 +-
 .../analyzer/icu/{ICUCommon.h => icu_common.h}     |   0
 .../icu/{ICUTokenizer.cpp => icu_tokenizer.cpp}    |   2 +-
 .../icu/{ICUTokenizer.h => icu_tokenizer.h}        |   6 +-
 ...ICUTokenizerConfig.h => icu_tokenizer_config.h} |   2 +-
 .../{ScriptIterator.cpp => script_iterator.cpp}    |   2 +-
 .../icu/{ScriptIterator.h => script_iterator.h}    |   2 +-
 ...icu_anzlyzer_test.cpp => icu_analyzer_test.cpp} |   6 +-
 .../analyzer/simple_analyzer_test.cpp              | 147 +++++++++++++++++++++
 .../apache/doris/analysis/InvertedIndexUtil.java   |   8 +-
 .../test_basic_analyzer.out}                       | Bin 371 -> 245 bytes
 .../{ => analyzer}/test_icu_analyzer.out           | Bin
 .../test_basic_analyzer.groovy}                    |   6 +-
 .../{ => analyzer}/test_icu_analyzer.groovy        |   0
 26 files changed, 317 insertions(+), 58 deletions(-)

diff --git a/be/src/olap/inverted_index_parser.cpp 
b/be/src/olap/inverted_index_parser.cpp
index 44b170617f1..023576da09b 100644
--- a/be/src/olap/inverted_index_parser.cpp
+++ b/be/src/olap/inverted_index_parser.cpp
@@ -36,6 +36,8 @@ std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_
         return INVERTED_INDEX_PARSER_CHINESE;
     case InvertedIndexParserType::PARSER_ICU:
         return INVERTED_INDEX_PARSER_ICU;
+    case InvertedIndexParserType::PARSER_BASIC:
+        return INVERTED_INDEX_PARSER_BASIC;
     default:
         return INVERTED_INDEX_PARSER_UNKNOWN;
     }
@@ -55,6 +57,8 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st
         return InvertedIndexParserType::PARSER_CHINESE;
     } else if (parser_str_lower == INVERTED_INDEX_PARSER_ICU) {
         return InvertedIndexParserType::PARSER_ICU;
+    } else if (parser_str_lower == INVERTED_INDEX_PARSER_BASIC) {
+        return InvertedIndexParserType::PARSER_BASIC;
     }
 
     return InvertedIndexParserType::PARSER_UNKNOWN;
diff --git a/be/src/olap/inverted_index_parser.h 
b/be/src/olap/inverted_index_parser.h
index d70cfa395f4..abc1af5908a 100644
--- a/be/src/olap/inverted_index_parser.h
+++ b/be/src/olap/inverted_index_parser.h
@@ -38,7 +38,8 @@ enum class InvertedIndexParserType {
     PARSER_ENGLISH = 3,
     PARSER_CHINESE = 4,
     PARSER_UNICODE = 5,
-    PARSER_ICU = 6
+    PARSER_ICU = 6,
+    PARSER_BASIC = 7
 };
 
 using CharFilterMap = std::map<std::string, std::string>;
@@ -69,6 +70,7 @@ const std::string INVERTED_INDEX_PARSER_UNICODE = "unicode";
 const std::string INVERTED_INDEX_PARSER_ENGLISH = "english";
 const std::string INVERTED_INDEX_PARSER_CHINESE = "chinese";
 const std::string INVERTED_INDEX_PARSER_ICU = "icu";
+const std::string INVERTED_INDEX_PARSER_BASIC = "basic";
 
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_KEY = "support_phrase";
 const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES = "true";
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
index 28f68932fe1..6ee2c6bb43f 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp
@@ -28,7 +28,8 @@
 #ifdef __clang__
 #pragma clang diagnostic pop
 #endif
-#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h"
+#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h"
 #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
 
 namespace doris::segment_v2::inverted_index {
@@ -69,6 +70,8 @@ std::unique_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz
     } else if (analyser_type == InvertedIndexParserType::PARSER_ICU) {
         analyzer = std::make_unique<ICUAnalyzer>();
         analyzer->initDict(config::inverted_index_dict_path + "/icu");
+    } else if (analyser_type == InvertedIndexParserType::PARSER_BASIC) {
+        analyzer = std::make_unique<BasicAnalyzer>();
     } else {
         // default
         analyzer = std::make_unique<lucene::analysis::SimpleAnalyzer<char>>();
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
similarity index 66%
copy from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
copy to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
index f3a7554f13f..b9f4f963666 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h
@@ -19,43 +19,38 @@
 
 #include <memory>
 
-#include "ICUTokenizer.h"
+#include "basic_tokenizer.h"
 
 namespace doris::segment_v2 {
 
-class ICUAnalyzer : public Analyzer {
+class BasicAnalyzer : public Analyzer {
 public:
-    ICUAnalyzer() {
+    BasicAnalyzer() {
         _lowercase = true;
         _ownReader = false;
     }
 
-    ~ICUAnalyzer() override = default;
+    ~BasicAnalyzer() override = default;
 
     bool isSDocOpt() override { return true; }
 
-    void initDict(const std::string& dictPath) override { dictPath_ = dictPath; }
-
     TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override {
-        auto* tokenizer = _CLNEW ICUTokenizer(_lowercase, _ownReader);
-        tokenizer->initialize(dictPath_);
+        auto* tokenizer = _CLNEW BasicTokenizer(_lowercase, _ownReader);
         tokenizer->reset(reader);
         return (TokenStream*)tokenizer;
     }
 
     TokenStream* reusableTokenStream(const TCHAR* fieldName,
                                      lucene::util::Reader* reader) override {
-        if (tokenizer_ == nullptr) {
-            tokenizer_ = std::make_unique<ICUTokenizer>(_lowercase, _ownReader);
-            tokenizer_->initialize(dictPath_);
+        if (_tokenizer == nullptr) {
+            _tokenizer = std::make_unique<BasicTokenizer>(_lowercase, _ownReader);
         }
-        tokenizer_->reset(reader);
-        return (TokenStream*)tokenizer_.get();
+        _tokenizer->reset(reader);
+        return (TokenStream*)_tokenizer.get();
     };
 
 private:
-    std::string dictPath_;
-    std::unique_ptr<ICUTokenizer> tokenizer_;
+    std::unique_ptr<BasicTokenizer> _tokenizer;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp
new file mode 100644
index 00000000000..ece0c559be0
--- /dev/null
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.cpp
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "basic_tokenizer.h"
+
+#include <unicode/unistr.h>
+
+namespace doris::segment_v2 {
+
+#define IS_IN_RANGE(c, start, end) ((uint32_t)((c) - (start)) <= ((end) - (start)))
+
+#define IS_CHINESE_CHAR(c)                                                   \
+    (IS_IN_RANGE(c, 0x4E00, 0x9FFF) || IS_IN_RANGE(c, 0x3400, 0x4DBF) ||     \
+     IS_IN_RANGE(c, 0x20000, 0x2A6DF) || IS_IN_RANGE(c, 0x2A700, 0x2EBEF) || \
+     IS_IN_RANGE(c, 0x30000, 0x3134A))
+
+BasicTokenizer::BasicTokenizer() {
+    this->lowercase = false;
+    this->ownReader = false;
+}
+
+BasicTokenizer::BasicTokenizer(bool lower_case, bool own_reader) : BasicTokenizer() {
+    this->lowercase = lower_case;
+    this->ownReader = own_reader;
+}
+
+Token* BasicTokenizer::next(Token* token) {
+    if (_buffer_index >= _data_len) {
+        return nullptr;
+    }
+
+    std::string_view& token_text = _tokens_text[_buffer_index++];
+    size_t size = std::min(token_text.size(), static_cast<size_t>(LUCENE_MAX_WORD_LEN));
+    token->setNoCopy(token_text.data(), 0, size);
+    return token;
+}
+
+void BasicTokenizer::reset(lucene::util::Reader* reader) {
+    _buffer_index = 0;
+    _data_len = 0;
+    _tokens_text.clear();
+
+    _buffer.resize(reader->size());
+    int32_t numRead = reader->readCopy(_buffer.data(), 0, _buffer.size());
+    (void)numRead;
+    assert(_buffer.size() == numRead);
+
+    cut();
+
+    _data_len = _tokens_text.size();
+}
+
+void BasicTokenizer::cut() {
+    auto* s = (uint8_t*)_buffer.data();
+    int32_t length = _buffer.size();
+
+    for (int32_t i = 0; i < length;) {
+        uint8_t firstByte = s[i];
+
+        if (is_alnum(firstByte)) {
+            int32_t start = i;
+            while (i < length) {
+                uint8_t nextByte = s[i];
+                if (!is_alnum(nextByte)) {
+                    break;
+                }
+                if (this->lowercase) {
+                    s[i] = to_lower(nextByte);
+                } else {
+                    s[i] = nextByte;
+                }
+                i++;
+            }
+            std::string_view token((const char*)(s + start), i - start);
+            _tokens_text.emplace_back(token);
+        } else {
+            UChar32 c = U_UNASSIGNED;
+            const int32_t prev_i = i;
+
+            U8_NEXT(s, i, length, c);
+
+            if (c == U_SENTINEL) {
+                continue;
+            }
+
+            if (IS_CHINESE_CHAR(c)) {
+                const int32_t len = i - prev_i;
+                _tokens_text.emplace_back(reinterpret_cast<const char*>(s + prev_i), len);
+            }
+        }
+    }
+}
+
+} // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h
similarity index 70%
copy from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
copy to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h
index f703f677806..e07a5e37d78 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_tokenizer.h
@@ -21,30 +21,28 @@
 
 #include "CLucene.h"
 #include "CLucene/analysis/AnalysisHeader.h"
-#include "CompositeBreakIterator.h"
-#include "DefaultICUTokenizerConfig.h"
-#include "ICUCommon.h"
+#include "CLucene/analysis/icu/ICUCommon.h"
 
 using namespace lucene::analysis;
 
 namespace doris::segment_v2 {
 
-class ICUTokenizer : public Tokenizer {
+class BasicTokenizer : public Tokenizer {
 public:
-    ICUTokenizer();
-    ICUTokenizer(bool lowercase, bool ownReader);
-    ~ICUTokenizer() override = default;
+    BasicTokenizer();
+    BasicTokenizer(bool lowercase, bool ownReader);
+    ~BasicTokenizer() override = default;
 
-    void initialize(const std::string& dictPath);
     Token* next(Token* token) override;
     void reset(lucene::util::Reader* reader) override;
 
-private:
-    std::string utf8Str_;
-    icu::UnicodeString buffer_;
+    void cut();
 
-    ICUTokenizerConfigPtr config_;
-    CompositeBreakIteratorPtr breaker_;
+private:
+    int32_t _buffer_index = 0;
+    int32_t _data_len = 0;
+    std::string _buffer;
+    std::vector<std::string_view> _tokens_text;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp
similarity index 96%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp
index 094aa93c4e2..50094e54f7b 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.cpp
@@ -15,15 +15,15 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "BreakIteratorWrapper.h"
+#include "break_iterator_wrapper.h"
 
 #include <unicode/unistr.h>
 
 #include <mutex>
 #include <string>
 
-#include "ICUCommon.h"
-#include "ICUTokenizerConfig.h"
+#include "icu_common.h"
+#include "icu_tokenizer_config.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h
similarity index 96%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h
index 0bee1be9efa..dea60d1d1f7 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/BreakIteratorWrapper.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/break_iterator_wrapper.h
@@ -23,7 +23,7 @@
 #include <memory>
 #include <unordered_set>
 
-#include "ICUCommon.h"
+#include "icu_common.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp
similarity index 97%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp
index 35f7f499cc5..e178ad35c13 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "CompositeBreakIterator.h"
+#include "composite_break_iterator.h"
 
 #include <unicode/unistr.h>
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h
similarity index 91%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h
index 251c37b91b2..8599be88dc2 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/CompositeBreakIterator.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/composite_break_iterator.h
@@ -24,10 +24,10 @@
 #include <memory>
 #include <vector>
 
-#include "BreakIteratorWrapper.h"
-#include "ICUCommon.h"
-#include "ICUTokenizerConfig.h"
-#include "ScriptIterator.h"
+#include "break_iterator_wrapper.h"
+#include "icu_common.h"
+#include "icu_tokenizer_config.h"
+#include "script_iterator.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp
similarity index 98%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp
index 7da5d4df377..dfbcf2dcdf6 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "DefaultICUTokenizerConfig.h"
+#include "default_icu_tokenizer_config.h"
 
 #include <atomic>
 #include <fstream>
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h
similarity index 95%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h
index e3673cd543e..6500cf230eb 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/DefaultICUTokenizerConfig.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/default_icu_tokenizer_config.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include "ICUTokenizerConfig.h"
+#include "icu_tokenizer_config.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h
similarity index 97%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h
index f3a7554f13f..072cf85bc7d 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h
@@ -19,7 +19,7 @@
 
 #include <memory>
 
-#include "ICUTokenizer.h"
+#include "icu_tokenizer.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h
similarity index 100%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUCommon.h
rename to be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_common.h
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp
similarity index 97%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp
index 1d1a25dea5a..e10b197d517 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "ICUTokenizer.h"
+#include "icu_tokenizer.h"
 
 #include <unicode/unistr.h>
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h
similarity index 91%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h
index f703f677806..d11d0c67ed6 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer.h
@@ -21,9 +21,9 @@
 
 #include "CLucene.h"
 #include "CLucene/analysis/AnalysisHeader.h"
-#include "CompositeBreakIterator.h"
-#include "DefaultICUTokenizerConfig.h"
-#include "ICUCommon.h"
+#include "composite_break_iterator.h"
+#include "default_icu_tokenizer_config.h"
+#include "icu_common.h"
 
 using namespace lucene::analysis;
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h
similarity index 95%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h
index 33accf72c51..dd7b743e74b 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUTokenizerConfig.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_tokenizer_config.h
@@ -17,7 +17,7 @@
 
 #pragma once
 
-#include "ICUCommon.h"
+#include "icu_common.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp
similarity index 98%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp
index 5ca81d2a954..7fee3055d3b 100644
--- 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.cpp
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "ScriptIterator.h"
+#include "script_iterator.h"
 
 #include <unicode/unistr.h>
 
diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h
similarity index 97%
rename from 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h
rename to 
be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h
index 1cc67c4350c..bc93eea8670 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/ScriptIterator.h
+++ 
b/be/src/olap/rowset/segment_v2/inverted_index/analyzer/icu/script_iterator.h
@@ -23,7 +23,7 @@
 #include <memory>
 #include <vector>
 
-#include "ICUCommon.h"
+#include "icu_common.h"
 
 namespace doris::segment_v2 {
 
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp
similarity index 99%
rename from 
be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp
rename to 
be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp
index 98fa722be2c..4cd6d180a2e 100644
--- 
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_anzlyzer_test.cpp
+++ 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/icu_analyzer_test.cpp
@@ -15,14 +15,14 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h"
+
 #include <gtest/gtest.h>
 
 #include <memory>
 #include <string>
 #include <vector>
 
-#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/ICUAnalyzer.h"
-
 using namespace lucene::analysis;
 
 namespace doris::segment_v2 {
@@ -48,7 +48,7 @@ protected:
             }
         } catch (CLuceneError& e) {
             std::cout << e.what() << std::endl;
-            throw;
+            throw e;
         }
     }
 };
diff --git 
a/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp
 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp
new file mode 100644
index 00000000000..6dba8233a2e
--- /dev/null
+++ 
b/be/test/olap/rowset/segment_v2/inverted_index/analyzer/simple_analyzer_test.cpp
@@ -0,0 +1,147 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h"
+
+using namespace lucene::analysis;
+
+namespace doris::segment_v2 {
+
+std::vector<std::string> tokenize(const std::string& s, bool lowercase = false) {
+    std::vector<std::string> datas;
+    try {
+        BasicAnalyzer analyzer;
+        analyzer.set_lowercase(lowercase);
+
+        lucene::util::SStringReader<char> reader;
+        reader.init(s.data(), s.size(), false);
+
+        std::unique_ptr<BasicTokenizer> tokenizer;
+        tokenizer.reset((BasicTokenizer*)analyzer.tokenStream(L"", &reader));
+
+        Token t;
+        while (tokenizer->next(&t)) {
+            std::string term(t.termBuffer<char>(), t.termLength<char>());
+            datas.emplace_back(term);
+        }
+    } catch (CLuceneError& e) {
+        std::cout << e.what() << std::endl;
+        throw e;
+    }
+    return datas;
+}
+
+class BasicTokenizerTest : public ::testing::Test {};
+
+TEST(BasicTokenizerTest, EnglishBasic1) {
+    std::string text = "Hello World! This is a test.";
+    auto tokens = tokenize(text, false);
+
+    std::vector<std::string> expected = {"Hello", "World", "This", "is", "a", "test"};
+    ASSERT_EQ(tokens, expected);
+}
+
+TEST(BasicTokenizerTest, EnglishBasic2) {
+    std::string text = "Hello World! This is a test.";
+    auto tokens = tokenize(text, true);
+
+    std::vector<std::string> expected = {"hello", "world", "this", "is", "a", "test"};
+    ASSERT_EQ(tokens, expected);
+}
+
+TEST(BasicTokenizerTest, EnglishLowercase) {
+    std::string text = "Hello World";
+    auto tokens = tokenize(text, true);
+
+    std::vector<std::string> expected = {"hello", "world"};
+    ASSERT_EQ(tokens, expected);
+}
+
+TEST(BasicTokenizerTest, ChineseBasic) {
+    std::string text = "你好世界";
+    auto tokens = tokenize(text);
+
+    std::vector<std::string> expected = {"你", "好", "世", "界"};
+    ASSERT_EQ(tokens, expected);
+}
+
+TEST(BasicTokenizerTest, MixedLanguage) {
+    std::string text = "Hello你好World世界";
+    auto tokens = tokenize(text, true);
+
+    std::vector<std::string> expected = {"hello", "你", "好", "world", "世", "界"};
+    ASSERT_EQ(tokens, expected);
+}
+
+TEST(BasicTokenizerTest, LongWordTruncation) {
+    const int32_t MAX_LEN = 255;
+    std::string longWord(MAX_LEN + 100, 'A');
+
+    auto tokens = tokenize(longWord);
+    ASSERT_EQ(tokens.size(), 1);
+    ASSERT_EQ(tokens[0].size(), MAX_LEN);
+}
+
+TEST(BasicTokenizerTest, LargeDataset) {
+    const std::string english = "The quick brown fox jumps over the lazy dog. ";
+    const std::string chinese = "这是一个用于测试的分词样例。";
+    const int32_t REPEAT = 5000;
+
+    std::string largeText;
+    for (int32_t i = 0; i < REPEAT; ++i) {
+        largeText += english;
+        largeText += chinese;
+    }
+
+    auto tokens = tokenize(largeText);
+
+    const size_t englishPerIteration = 9;
+    const size_t chinesePerIteration = 13;
+    const size_t expectedTotal = REPEAT * (englishPerIteration + chinesePerIteration);
+
+    ASSERT_EQ(tokens.size(), expectedTotal);
+
+    ASSERT_EQ(tokens[0], "The");
+    ASSERT_EQ(tokens[englishPerIteration], "这");
+    ASSERT_EQ(tokens[englishPerIteration + 1], "是");
+}
+
+TEST(BasicTokenizerTest, InvalidUTF8) {
+    std::string invalidText = "\x80\x81\xff";
+    auto tokens = tokenize(invalidText);
+    ASSERT_EQ(tokens.size(), 0);
+}
+
+TEST(BasicTokenizerTest, ConsecutiveNumbers) {
+    const std::string input(300, '1');
+    auto tokens = tokenize(input);
+    EXPECT_EQ(tokens.size(), 1);
+
+    EXPECT_EQ(tokens[0].size(), 255);
+}
+
+TEST(BasicTokenizerTest, EmojiHandling) {
+    const std::string input = "😊😋";
+    auto tokens = tokenize(input);
+    EXPECT_EQ(tokens.size(), 0);
+}
+
+} // namespace doris::segment_v2
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
index 88ecc83337a..202ddd9be6a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/InvertedIndexUtil.java
@@ -37,6 +37,7 @@ public class InvertedIndexUtil {
     public static String INVERTED_INDEX_PARSER_ENGLISH = "english";
     public static String INVERTED_INDEX_PARSER_CHINESE = "chinese";
     public static String INVERTED_INDEX_PARSER_ICU = "icu";
+    public static String INVERTED_INDEX_PARSER_BASIC = "basic";
 
     public static String INVERTED_INDEX_PARSER_MODE_KEY = "parser_mode";
     public static String INVERTED_INDEX_PARSER_FINE_GRANULARITY = "fine_grained";
@@ -143,7 +144,8 @@ public class InvertedIndexUtil {
                         || parser.equals(INVERTED_INDEX_PARSER_UNICODE)
                             || parser.equals(INVERTED_INDEX_PARSER_ENGLISH)
                                 || parser.equals(INVERTED_INDEX_PARSER_CHINESE)
-                                    || parser.equals(INVERTED_INDEX_PARSER_ICU))) {
+                                    || parser.equals(INVERTED_INDEX_PARSER_ICU)
+                                        || parser.equals(INVERTED_INDEX_PARSER_BASIC))) {
                 throw new AnalysisException("INVERTED index parser: " + parser
                     + " is invalid for column: " + indexColName + " of type " 
+ colType);
             }
@@ -184,9 +186,9 @@ public class InvertedIndexUtil {
         String stopWords = properties.get(INVERTED_INDEX_PARSER_STOPWORDS_KEY);
         String dictCompression = properties.get(INVERTED_INDEX_DICT_COMPRESSION_KEY);
 
-        if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu")) {
+        if (parser != null && !parser.matches("none|english|unicode|chinese|standard|icu|basic")) {
             throw new AnalysisException("Invalid inverted index 'parser' value: " + parser
-                    + ", parser must be none, english, unicode, chinese or icu");
+                    + ", parser must be none, english, unicode, chinese, icu or basic");
         }
 
         if (!"chinese".equals(parser) && parserMode != null) {
diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out 
b/regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out
similarity index 66%
copy from regression-test/data/inverted_index_p0/test_icu_analyzer.out
copy to regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out
index 2c5978b17e6..31dd2231521 100644
Binary files a/regression-test/data/inverted_index_p0/test_icu_analyzer.out and 
b/regression-test/data/inverted_index_p0/analyzer/test_basic_analyzer.out differ
diff --git a/regression-test/data/inverted_index_p0/test_icu_analyzer.out 
b/regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out
similarity index 100%
rename from regression-test/data/inverted_index_p0/test_icu_analyzer.out
rename to regression-test/data/inverted_index_p0/analyzer/test_icu_analyzer.out
diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy 
b/regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy
similarity index 92%
copy from regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy
copy to 
regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy
index 2fa943b9ca9..863c3ddba63 100644
--- a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy
+++ 
b/regression-test/suites/inverted_index_p0/analyzer/test_basic_analyzer.groovy
@@ -16,8 +16,8 @@
 // under the License.
 
 
-suite("test_icu_analyzer", "p0"){
-    def indexTbName1 = "test_icu_analyzer"
+suite("test_basic_analyzer", "p0"){
+    def indexTbName1 = "test_basic_analyzer"
 
     sql "DROP TABLE IF EXISTS ${indexTbName1}"
 
@@ -25,7 +25,7 @@ suite("test_icu_analyzer", "p0"){
       CREATE TABLE ${indexTbName1} (
       `a` int(11) NULL COMMENT "",
       `b` text NULL COMMENT "",
-      INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "icu") COMMENT '',
+      INDEX b_idx (`b`) USING INVERTED PROPERTIES("parser" = "basic") COMMENT '',
       ) ENGINE=OLAP
       DUPLICATE KEY(`a`)
       COMMENT "OLAP"
diff --git a/regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy 
b/regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy
similarity index 100%
rename from regression-test/suites/inverted_index_p0/test_icu_analyzer.groovy
rename to 
regression-test/suites/inverted_index_p0/analyzer/test_icu_analyzer.groovy


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

(doris) branch master updated: [feature](inverted index) Add a basic tokenizer (#48716)

Reply via email to