This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new 3236e18d93b [opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283)
3236e18d93b is described below

commit 3236e18d93bf96481493d88c34b6c2515f3b0b75
Author: zzzxl <yangs...@selectdb.com>
AuthorDate: Thu Feb 20 10:28:45 2025 +0800

    [opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283)
---
 CMakeLists.txt                                     |  3 +-
 .../analysis/icu/DefaultICUTokenizerConfig.cpp     | 42 +++++++++++++++-------
 src/core/CMakeLists.txt                            |  7 +---
 src/test/CMakeLists.txt                            |  8 -----
 src/test/tests.cpp                                 |  1 -
 5 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3e39dc56344..317629af74d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -197,8 +197,7 @@ find_package(Roaring REQUIRED)
 
 #zstd
 find_package(Zstd REQUIRED)
-#icu
-find_package(ICU REQUIRED)
+
 #sse2neon
 INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/3rdparty/sse2neon)
 
diff --git a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp 
b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
index b43536f033a..57e8374d804 100644
--- a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
+++ b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
@@ -5,6 +5,7 @@
 #include <mutex>
 #include <sstream>
 #include <string>
+#include <atomic>
 
 namespace lucene::analysis {
 
@@ -18,20 +19,35 @@ DefaultICUTokenizerConfig::DefaultICUTokenizerConfig(bool 
cjkAsWords, bool myanm
 }
 
 void DefaultICUTokenizerConfig::initialize(const std::string& dictPath) {
-    static std::once_flag once_flag;
-    std::call_once(once_flag, [&dictPath]() {
-        UErrorCode status = U_ZERO_ERROR;
-        cjkBreakIterator_.reset(
-                icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), 
status));
-        if (U_FAILURE(status)) {
-            std::string error_msg = "Failed to create CJK BreakIterator: ";
-            error_msg += u_errorName(status);
-            _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
-        }
+    static std::atomic<bool> initialized_(false);
+    if (!initialized_) {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+
+        if (!initialized_) {
+            try {
+                UErrorCode status = U_ZERO_ERROR;
+                cjkBreakIterator_.reset(
+                        
icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status));
+                if (U_FAILURE(status)) {
+                    std::string error_msg = "Failed to create CJK 
BreakIterator: ";
+                    error_msg += u_errorName(status);
+                    _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str());
+                }
+
+                readBreakIterator(defaultBreakIterator_, dictPath + 
"/uax29/Default.txt");
+                readBreakIterator(myanmarSyllableIterator_,
+                                  dictPath + "/uax29/MyanmarSyllable.txt");
 
-        readBreakIterator(defaultBreakIterator_, dictPath + 
"/uax29/Default.txt");
-        readBreakIterator(myanmarSyllableIterator_, dictPath + 
"/uax29/MyanmarSyllable.txt");
-    });
+                initialized_ = true;
+            } catch (...) {
+                cjkBreakIterator_.reset();
+                defaultBreakIterator_.reset();
+                myanmarSyllableIterator_.reset();
+                throw; // Clean up resources and rethrow the original 
exception to the caller
+            }
+        }
+    }
 }
 
 icu::BreakIterator* DefaultICUTokenizerConfig::getBreakIterator(int32_t 
script) {
diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 7fd9cabc60b..0a19a2d278f 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -71,11 +71,6 @@ SET(clucene_core_Files
     ./CLucene/analysis/standard/StandardFilter.cpp
     ./CLucene/analysis/standard/StandardTokenizer.cpp
     ./CLucene/analysis/standard95/StandardTokenizerImpl.cpp
-    ./CLucene/analysis/icu/BreakIteratorWrapper.cpp
-    ./CLucene/analysis/icu/CompositeBreakIterator.cpp
-    ./CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp
-    ./CLucene/analysis/icu/ICUTokenizer.cpp
-    ./CLucene/analysis/icu/ScriptIterator.cpp
     ./CLucene/analysis/Analyzers.cpp
     ./CLucene/analysis/AnalysisHeader.cpp
     ./CLucene/store/MMapInput.cpp
@@ -256,7 +251,7 @@ IF (BUILD_STATIC_LIBRARIES)
     TARGET_LINK_LIBRARIES(clucene-core-static ssl crypto ${BRPC_LIB} 
${GLOG_LIB} ${GFLAG_LIB} ${PROTOBUF_LIB})
   ENDIF (USE_BTHREAD)
   TARGET_INCLUDE_DIRECTORIES(clucene-core-static PUBLIC ${Roaring_INCLUDE_DIR})
-  TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd icu)
+  TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd)
 
   SET_TARGET_PROPERTIES(clucene-core-static PROPERTIES
       VERSION ${CLUCENE_VERSION}
diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt
index f069adc0b2d..c284c5b7e0b 100644
--- a/src/test/CMakeLists.txt
+++ b/src/test/CMakeLists.txt
@@ -38,7 +38,6 @@ SET(test_files ./tests.cpp
         ./analysis/TestAnalysis.cpp
         ./analysis/TestAnalyzers.cpp
         ./analysis/TestStandard95.cpp
-        ./analysis/TestICU.cpp
         ./debug/TestError.cpp
         ./document/TestDateTools.cpp
         ./document/TestDocument.cpp
@@ -294,13 +293,6 @@ IF (BUILD_STATIC_LIBRARIES)
             COMMAND ${CMAKE_COMMAND} -E copy_directory ${DICT_SOURCE_DIR} 
${DICT_TARGET_DIR}
             COMMENT "Copying ${DATA_SOURCE_DIR} to ${DATA_TARGET_DIR}/dict")
 
-    SET(ICU_DICT_SOURCE_DIR 
${CMAKE_SOURCE_DIR}/src/core/CLucene/analysis/icu/data)
-    SET(ICU_DICT_TARGET_DIR "${EXECUTABLE_OUTPUT_PATH}/icu-dict")
-
-     ADD_CUSTOM_COMMAND(TARGET cl_test POST_BUILD
-             COMMAND ${CMAKE_COMMAND} -E copy_directory ${ICU_DICT_SOURCE_DIR} 
${ICU_DICT_TARGET_DIR}
-             COMMENT "Copying ${DATA_SOURCE_DIR} to ${ICU_DICT_TARGET_DIR}")
-
 ENDIF (BUILD_STATIC_LIBRARIES)
 
 ############################
diff --git a/src/test/tests.cpp b/src/test/tests.cpp
index 1c9b91444ff..f4a3609e57f 100644
--- a/src/test/tests.cpp
+++ b/src/test/tests.cpp
@@ -23,7 +23,6 @@ unittest tests[] = {
         {"IndexCompressV3", testIndexCompressV3},
         {"ByteArrayDataInput", testByteArrayDataInputSuite},
         {"GrowableByteArrayDataOutput", testGrowableByteArrayDataOutputSuite},
-        {"testICU", testICU},
 #ifdef TEST_CONTRIB_LIBS
         {"chinese", testchinese},
 #endif


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to