This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch clucene in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/clucene by this push: new 3236e18d93b [opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283) 3236e18d93b is described below commit 3236e18d93bf96481493d88c34b6c2515f3b0b75 Author: zzzxl <yangs...@selectdb.com> AuthorDate: Thu Feb 20 10:28:45 2025 +0800 [opt](inverted index) Refactor ICU tokenizer code location for better organization and maintainability. (#283) --- CMakeLists.txt | 3 +- .../analysis/icu/DefaultICUTokenizerConfig.cpp | 42 +++++++++++++++------- src/core/CMakeLists.txt | 7 +--- src/test/CMakeLists.txt | 8 ----- src/test/tests.cpp | 1 - 5 files changed, 31 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3e39dc56344..317629af74d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -197,8 +197,7 @@ find_package(Roaring REQUIRED) #zstd find_package(Zstd REQUIRED) -#icu -find_package(ICU REQUIRED) + #sse2neon INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/3rdparty/sse2neon) diff --git a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp index b43536f033a..57e8374d804 100644 --- a/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp +++ b/src/core/CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp @@ -5,6 +5,7 @@ #include <mutex> #include <sstream> #include <string> +#include <atomic> namespace lucene::analysis { @@ -18,20 +19,35 @@ DefaultICUTokenizerConfig::DefaultICUTokenizerConfig(bool cjkAsWords, bool myanm } void DefaultICUTokenizerConfig::initialize(const std::string& dictPath) { - static std::once_flag once_flag; - std::call_once(once_flag, [&dictPath]() { - UErrorCode status = U_ZERO_ERROR; - cjkBreakIterator_.reset( - icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status)); - if (U_FAILURE(status)) { - std::string error_msg = "Failed to create CJK BreakIterator: "; - error_msg += u_errorName(status); - _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str()); - } + static std::atomic<bool> initialized_(false); + if (!initialized_) { + static std::mutex mutex; + std::lock_guard<std::mutex> lock(mutex); + + if (!initialized_) { + try { + UErrorCode status = U_ZERO_ERROR; + cjkBreakIterator_.reset( + icu::BreakIterator::createWordInstance(icu::Locale::getRoot(), status)); + if (U_FAILURE(status)) { + std::string error_msg = "Failed to create CJK BreakIterator: "; + error_msg += u_errorName(status); + _CLTHROWT(CL_ERR_IllegalArgument, error_msg.c_str()); + } + + readBreakIterator(defaultBreakIterator_, dictPath + "/uax29/Default.txt"); + readBreakIterator(myanmarSyllableIterator_, + dictPath + "/uax29/MyanmarSyllable.txt"); - readBreakIterator(defaultBreakIterator_, dictPath + "/uax29/Default.txt"); - readBreakIterator(myanmarSyllableIterator_, dictPath + "/uax29/MyanmarSyllable.txt"); - }); + initialized_ = true; + } catch (...) { + cjkBreakIterator_.reset(); + defaultBreakIterator_.reset(); + myanmarSyllableIterator_.reset(); + throw; // Clean up resources and rethrow the original exception to the caller + } + } + } } icu::BreakIterator* DefaultICUTokenizerConfig::getBreakIterator(int32_t script) { diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 7fd9cabc60b..0a19a2d278f 100644 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -71,11 +71,6 @@ SET(clucene_core_Files ./CLucene/analysis/standard/StandardFilter.cpp ./CLucene/analysis/standard/StandardTokenizer.cpp ./CLucene/analysis/standard95/StandardTokenizerImpl.cpp - ./CLucene/analysis/icu/BreakIteratorWrapper.cpp - ./CLucene/analysis/icu/CompositeBreakIterator.cpp - ./CLucene/analysis/icu/DefaultICUTokenizerConfig.cpp - ./CLucene/analysis/icu/ICUTokenizer.cpp - ./CLucene/analysis/icu/ScriptIterator.cpp ./CLucene/analysis/Analyzers.cpp ./CLucene/analysis/AnalysisHeader.cpp ./CLucene/store/MMapInput.cpp @@ -256,7 +251,7 @@ IF (BUILD_STATIC_LIBRARIES) TARGET_LINK_LIBRARIES(clucene-core-static ssl crypto ${BRPC_LIB} ${GLOG_LIB} ${GFLAG_LIB} ${PROTOBUF_LIB}) ENDIF (USE_BTHREAD) TARGET_INCLUDE_DIRECTORIES(clucene-core-static PUBLIC ${Roaring_INCLUDE_DIR}) - TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd icu) + TARGET_LINK_LIBRARIES(clucene-core-static PRIVATE zstd) SET_TARGET_PROPERTIES(clucene-core-static PROPERTIES VERSION ${CLUCENE_VERSION} diff --git a/src/test/CMakeLists.txt b/src/test/CMakeLists.txt index f069adc0b2d..c284c5b7e0b 100644 --- a/src/test/CMakeLists.txt +++ b/src/test/CMakeLists.txt @@ -38,7 +38,6 @@ SET(test_files ./tests.cpp ./analysis/TestAnalysis.cpp ./analysis/TestAnalyzers.cpp ./analysis/TestStandard95.cpp - ./analysis/TestICU.cpp ./debug/TestError.cpp ./document/TestDateTools.cpp ./document/TestDocument.cpp @@ -294,13 +293,6 @@ IF (BUILD_STATIC_LIBRARIES) COMMAND ${CMAKE_COMMAND} -E copy_directory ${DICT_SOURCE_DIR} ${DICT_TARGET_DIR} COMMENT "Copying ${DATA_SOURCE_DIR} to ${DATA_TARGET_DIR}/dict") - SET(ICU_DICT_SOURCE_DIR ${CMAKE_SOURCE_DIR}/src/core/CLucene/analysis/icu/data) - SET(ICU_DICT_TARGET_DIR "${EXECUTABLE_OUTPUT_PATH}/icu-dict") - - ADD_CUSTOM_COMMAND(TARGET cl_test POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_directory ${ICU_DICT_SOURCE_DIR} ${ICU_DICT_TARGET_DIR} - COMMENT "Copying ${DATA_SOURCE_DIR} to ${ICU_DICT_TARGET_DIR}") - ENDIF (BUILD_STATIC_LIBRARIES) ############################ diff --git a/src/test/tests.cpp b/src/test/tests.cpp index 1c9b91444ff..f4a3609e57f 100644 --- a/src/test/tests.cpp +++ b/src/test/tests.cpp @@ -23,7 +23,6 @@ unittest tests[] = { {"IndexCompressV3", testIndexCompressV3}, {"ByteArrayDataInput", testByteArrayDataInputSuite}, {"GrowableByteArrayDataOutput", testGrowableByteArrayDataOutputSuite}, - {"testICU", testICU}, #ifdef TEST_CONTRIB_LIBS {"chinese", testchinese}, #endif --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org