This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git

The following commit(s) were added to refs/heads/clucene by this push:
     new cf210eaaad [opt](chinese) chinese tokenizer lowercase interface (#203)
cf210eaaad is described below

commit cf210eaaadc3ad5d7b27ff2e7b9635ad45cf227b
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Mon Mar 18 17:43:54 2024 +0800

    [opt](chinese) chinese tokenizer lowercase interface (#203)
---
 src/core/CLucene/index/IndexWriter.cpp | 18 +++++++++++++++---
 src/core/CLucene/index/IndexWriter.h   |  4 ++--
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index 0d770182ba..6b52e047f5 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1255,7 +1255,7 @@ void IndexWriter::resetMergeExceptions() {
 void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_dirs,
                                   std::vector<lucene::store::Directory *> dest_dirs,
                                   std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
-                                  std::vector<uint32_t> dest_index_docs) {
+                                  std::vector<uint32_t> dest_index_docs, bool maybe_skip) {
     CND_CONDITION(src_dirs.size() > 0, "Source directory not found.");
     CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
     this->_trans_vec = std::move(trans_vec);
@@ -1387,7 +1387,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
     }

     /// merge terms
-    mergeTerms(hasProx);
+    mergeTerms(hasProx, maybe_skip);

     /// merge null_bitmap
     mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
@@ -1613,7 +1613,7 @@ protected:
 };


-void IndexWriter::mergeTerms(bool hasProx) {
+void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
     auto queue = _CLNEW SegmentMergeQueue(readers.size());
     auto numSrcIndexes = readers.size();
     //std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1664,6 +1664,18 @@ void IndexWriter::mergeTerms(bool hasProx) {
             top = queue->top();
         }

+        if (maybe_skip && smallestTerm) {
+            auto containsUpperCase = [](const std::wstring_view& ws_term) {
+                return std::any_of(ws_term.begin(), ws_term.end(),
+                                   [](wchar_t ch) { return std::iswupper(ch) != 0; });
+            };
+
+            std::wstring_view ws_term(smallestTerm->text(), smallestTerm->textLength());
+            if (containsUpperCase(ws_term)) {
+                _CLTHROWA(CL_ERR_InvalidState, "need rewrite, skip index compaction");
+            }
+        }
+
         std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
         std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
         auto destPostingQueues = _CLNEW postingQueue(matchSize);
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 7cfb67d2ca..0e8d40d8cc 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -317,14 +317,14 @@ public:
     void indexCompaction(std::vector<lucene::store::Directory*>& src_dirs,
                          std::vector<lucene::store::Directory*> dest_dirs,
                          std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
-                         std::vector<uint32_t> dest_index_docs);
+                         std::vector<uint32_t> dest_index_docs, bool maybe_skip = false);

     // create new fields info
     void mergeFields(bool hasProx);
     // write fields info file
     void writeFields(lucene::store::Directory* d, std::string segment);
     // merge terms and write files
-    void mergeTerms(bool hasProx);
+    void mergeTerms(bool hasProx, bool maybe_skip = false);
     // merge null_bitmap
     void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues,
                          std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org