This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch clucene
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git


The following commit(s) were added to refs/heads/clucene by this push:
     new cf210eaaad [opt](chinese) chinese tokenizer lowercase interface (#203)
cf210eaaad is described below

commit cf210eaaadc3ad5d7b27ff2e7b9635ad45cf227b
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Mon Mar 18 17:43:54 2024 +0800

    [opt](chinese) chinese tokenizer lowercase interface (#203)
---
 src/core/CLucene/index/IndexWriter.cpp | 18 +++++++++++++++---
 src/core/CLucene/index/IndexWriter.h   |  4 ++--
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/src/core/CLucene/index/IndexWriter.cpp b/src/core/CLucene/index/IndexWriter.cpp
index 0d770182ba..6b52e047f5 100644
--- a/src/core/CLucene/index/IndexWriter.cpp
+++ b/src/core/CLucene/index/IndexWriter.cpp
@@ -1255,7 +1255,7 @@ void IndexWriter::resetMergeExceptions() {
 void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_dirs,
                                   std::vector<lucene::store::Directory *> dest_dirs,
                                   std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
-                                  std::vector<uint32_t> dest_index_docs) {
+                                  std::vector<uint32_t> dest_index_docs, bool maybe_skip) {
     CND_CONDITION(src_dirs.size() > 0, "Source directory not found.");
     CND_CONDITION(dest_dirs.size() > 0, "Destination directory not found.");
     this->_trans_vec = std::move(trans_vec);
@@ -1387,7 +1387,7 @@ void IndexWriter::indexCompaction(std::vector<lucene::store::Directory *> &src_d
         }
 
         /// merge terms
-        mergeTerms(hasProx);
+        mergeTerms(hasProx, maybe_skip);
 
         /// merge null_bitmap
         mergeNullBitmap(srcNullBitmapValues, nullBitmapIndexOutputList);
@@ -1613,7 +1613,7 @@ protected:
 
 };
 
-void IndexWriter::mergeTerms(bool hasProx) {
+void IndexWriter::mergeTerms(bool hasProx, bool maybe_skip) {
     auto queue = _CLNEW SegmentMergeQueue(readers.size());
     auto numSrcIndexes = readers.size();
     //std::vector<TermPositions *> postingsList(numSrcIndexes);
@@ -1664,6 +1664,18 @@ void IndexWriter::mergeTerms(bool hasProx) {
             top = queue->top();
         }
 
+        if (maybe_skip && smallestTerm) {
+            auto containsUpperCase = [](const std::wstring_view& ws_term) {
+                return std::any_of(ws_term.begin(), ws_term.end(),
+                                   [](wchar_t ch) { return std::iswupper(ch) != 0; });
+            };
+
+            std::wstring_view ws_term(smallestTerm->text(), smallestTerm->textLength());
+            if (containsUpperCase(ws_term)) {
+                _CLTHROWA(CL_ERR_InvalidState, "need rewrite, skip index compaction");
+            }
+        }
+
         std::vector<std::vector<uint32_t>> docDeltaBuffers(numDestIndexes);
         std::vector<std::vector<uint32_t>> freqBuffers(numDestIndexes);
         auto destPostingQueues = _CLNEW postingQueue(matchSize);
diff --git a/src/core/CLucene/index/IndexWriter.h b/src/core/CLucene/index/IndexWriter.h
index 7cfb67d2ca..0e8d40d8cc 100644
--- a/src/core/CLucene/index/IndexWriter.h
+++ b/src/core/CLucene/index/IndexWriter.h
@@ -317,14 +317,14 @@ public:
     void indexCompaction(std::vector<lucene::store::Directory*>& src_dirs,
                             std::vector<lucene::store::Directory*> dest_dirs,
                             std::vector<std::vector<std::pair<uint32_t, uint32_t>>> trans_vec,
-                            std::vector<uint32_t> dest_index_docs);
+                            std::vector<uint32_t> dest_index_docs, bool maybe_skip = false);
 
     // create new fields info
     void mergeFields(bool hasProx);
     // write fields info file
     void writeFields(lucene::store::Directory* d, std::string segment);
     // merge terms and write files
-    void mergeTerms(bool hasProx);
+    void mergeTerms(bool hasProx, bool maybe_skip = false);
     // merge null_bitmap
     void mergeNullBitmap(std::vector<std::vector<uint32_t>> srcBitmapValues, std::vector<lucene::store::IndexOutput *> nullBitmapIndexOutputList);
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to