xiaokang commented on code in PR #23871:
URL: https://github.com/apache/doris/pull/23871#discussion_r1314852179


##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -264,74 +250,26 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, const std::string
             }
         }
 
-        std::unique_ptr<lucene::search::Query> query;
-        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
-
-        auto index_search = [&](bool& null_bitmap_already_read,
-                                std::shared_ptr<roaring::Roaring>& 
term_match_bitmap,
-                                InvertedIndexQueryCache* cache,
-                                InvertedIndexQueryCache::CacheKey& cache_key,
-                                InvertedIndexQueryCacheHandle& cache_handle) {
-            // check index file existence
-            if (!indexExists(index_file_path)) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
-                        "inverted index path: {} not exist.", 
index_file_path.string());
-            }
-
-            InvertedIndexCacheHandle inverted_index_cache_handle;
-            InvertedIndexSearcherCache::instance()->get_index_searcher(
-                    _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
-            auto index_searcher = 
inverted_index_cache_handle.get_index_searcher();
-
-            // try to reuse index_searcher's directory to read null_bitmap to 
cache
-            // to avoid open directory additionally for null_bitmap
-            if (!null_bitmap_already_read) {
-                InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
-                read_null_bitmap(&null_bitmap_cache_handle,
-                                 index_searcher->getReader()->directory());
-                null_bitmap_already_read = true;
-            }
+        // check index file existence
+        if (!indexExists(index_file_path)) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+                    "inverted index path: {} not exist.", 
index_file_path.string());
+        }
 
-            try {
-                if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
-                    query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
-                    query_type == InvertedIndexQueryType::EQUAL_QUERY) {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(query.get(), 
[&term_match_bitmap](DocRange* docRange) {
-                        if (docRange->type_ == DocRangeType::kMany) {
-                            
term_match_bitmap->addMany(docRange->doc_many_size_,
-                                                       
docRange->doc_many.data());
-                        } else {
-                            
term_match_bitmap->addRange(docRange->doc_range.first,
-                                                        
docRange->doc_range.second);
-                        }
-                    });
-                } else {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(
-                            query.get(),
-                            [&term_match_bitmap](const int32_t docid, const 
float_t /*score*/) {
-                                // docid equal to rowid in segment
-                                term_match_bitmap->add(docid);
-                            });
-                }
-            } catch (const CLuceneError& e) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
-                        "CLuceneError occured: {}", e.what());
-            }
+        InvertedIndexCacheHandle inverted_index_cache_handle;
+        InvertedIndexSearcherCache::instance()->get_index_searcher(
+                _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
+        auto index_searcher = inverted_index_cache_handle.get_index_searcher();
 
-            {
-                // add to cache
-                term_match_bitmap->runOptimize();
-                cache->insert(cache_key, term_match_bitmap, &cache_handle);
-            }
-            return Status::OK();
-        };
+        std::unique_ptr<lucene::search::Query> query;
+        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
 
         roaring::Roaring query_match_bitmap;
         bool null_bitmap_already_read = false;
-        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
             std::wstring str_tokens;
+            str_tokens += std::to_wstring(static_cast<int32_t>(query_type));

Review Comment:
   move it into InvertedIndexQueryCache::CacheKey::encode()



##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -264,74 +250,26 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, const std::string
             }
         }
 
-        std::unique_ptr<lucene::search::Query> query;
-        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
-
-        auto index_search = [&](bool& null_bitmap_already_read,
-                                std::shared_ptr<roaring::Roaring>& 
term_match_bitmap,
-                                InvertedIndexQueryCache* cache,
-                                InvertedIndexQueryCache::CacheKey& cache_key,
-                                InvertedIndexQueryCacheHandle& cache_handle) {
-            // check index file existence
-            if (!indexExists(index_file_path)) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
-                        "inverted index path: {} not exist.", 
index_file_path.string());
-            }
-
-            InvertedIndexCacheHandle inverted_index_cache_handle;
-            InvertedIndexSearcherCache::instance()->get_index_searcher(
-                    _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
-            auto index_searcher = 
inverted_index_cache_handle.get_index_searcher();
-
-            // try to reuse index_searcher's directory to read null_bitmap to 
cache
-            // to avoid open directory additionally for null_bitmap
-            if (!null_bitmap_already_read) {
-                InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
-                read_null_bitmap(&null_bitmap_cache_handle,
-                                 index_searcher->getReader()->directory());
-                null_bitmap_already_read = true;
-            }
+        // check index file existence
+        if (!indexExists(index_file_path)) {
+            return Status::Error<ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND>(
+                    "inverted index path: {} not exist.", 
index_file_path.string());
+        }
 
-            try {
-                if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
-                    query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
-                    query_type == InvertedIndexQueryType::EQUAL_QUERY) {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(query.get(), 
[&term_match_bitmap](DocRange* docRange) {
-                        if (docRange->type_ == DocRangeType::kMany) {
-                            
term_match_bitmap->addMany(docRange->doc_many_size_,
-                                                       
docRange->doc_many.data());
-                        } else {
-                            
term_match_bitmap->addRange(docRange->doc_range.first,
-                                                        
docRange->doc_range.second);
-                        }
-                    });
-                } else {
-                    
SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
-                    index_searcher->_search(
-                            query.get(),
-                            [&term_match_bitmap](const int32_t docid, const 
float_t /*score*/) {
-                                // docid equal to rowid in segment
-                                term_match_bitmap->add(docid);
-                            });
-                }
-            } catch (const CLuceneError& e) {
-                return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>(
-                        "CLuceneError occured: {}", e.what());
-            }
+        InvertedIndexCacheHandle inverted_index_cache_handle;
+        InvertedIndexSearcherCache::instance()->get_index_searcher(
+                _fs, index_dir.c_str(), index_file_name, 
&inverted_index_cache_handle, stats);
+        auto index_searcher = inverted_index_cache_handle.get_index_searcher();
 
-            {
-                // add to cache
-                term_match_bitmap->runOptimize();
-                cache->insert(cache_key, term_match_bitmap, &cache_handle);
-            }
-            return Status::OK();
-        };
+        std::unique_ptr<lucene::search::Query> query;
+        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
 
         roaring::Roaring query_match_bitmap;
         bool null_bitmap_already_read = false;
-        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+        if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
+            query_type == InvertedIndexQueryType::MATCH_ALL_QUERY) {
             std::wstring str_tokens;
+            str_tokens += std::to_wstring(static_cast<int32_t>(query_type));
             for (auto& token : analyse_result) {
                 str_tokens += token;

Review Comment:
   add a whitespace separator



##########
be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp:
##########
@@ -32,7 +32,8 @@ Status compact_column(int32_t index_id, int src_segment_num, 
int dest_segment_nu
                       std::vector<uint32_t> dest_segment_num_rows) {
     lucene::store::Directory* dir =
             DorisCompoundDirectory::getDirectory(fs, 
index_writer_path.c_str(), false);
-    auto index_writer = _CLNEW lucene::index::IndexWriter(dir, nullptr, true 
/* create */,
+    lucene::analysis::SimpleAnalyzer<char> analyzer;

Review Comment:
   use the simple analyzer for all cases?



##########
be/src/olap/rowset/segment_v2/inverted_index_reader.cpp:
##########
@@ -436,6 +386,68 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, const std::string
     }
 }
 
+Status FullTextIndexReader::index_search(
+        OlapReaderStatistics* stats, InvertedIndexQueryType query_type,
+        const IndexSearcherPtr& index_searcher, bool& null_bitmap_already_read,
+        const std::unique_ptr<lucene::search::Query>& query,
+        const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
+    check_null_bitmap(index_searcher, null_bitmap_already_read);
+
+    try {
+        SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer);
+        if (query_type == InvertedIndexQueryType::MATCH_ANY_QUERY ||
+            query_type == InvertedIndexQueryType::EQUAL_QUERY) {
+            index_searcher->_search(query.get(), 
[&term_match_bitmap](DocRange* docRange) {

Review Comment:
   use doc_range consistently



##########
be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp:
##########
@@ -0,0 +1,145 @@
+#include "conjunction_query.h"
+
+namespace doris {
+
+ConjunctionQuery::ConjunctionQuery(IndexReader* reader)
+        : _reader(reader), _indexVersion(reader->getIndexVersion()) {}
+
+ConjunctionQuery::~ConjunctionQuery() {
+    for (auto& term : _terms) {
+        if (term) {
+            _CLDELETE(term);
+        }
+    }
+    for (auto& termDoc : _termDocs) {
+        if (termDoc) {
+            _CLDELETE(termDoc);
+        }
+    }
+}
+
+void ConjunctionQuery::add(const std::wstring& fieldName, const 
std::vector<std::wstring>& wterms) {
+    if (wterms.size() < 1) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "ConjunctionQuery::add: terms.size() 
< 1");
+    }
+
+    std::vector<TermIterator> iterators;
+    for (auto& wterm : wterms) {
+        Term* t = _CLNEW Term(fieldName.c_str(), wterm.c_str());
+        _terms.push_back(t);
+        TermDocs* termDoc = _reader->termDocs(t);
+        _termDocs.push_back(termDoc);
+        iterators.emplace_back(termDoc);
+    }
+
+    std::sort(iterators.begin(), iterators.end(), [](const TermIterator& a, 
const TermIterator& b) {
+        return a.docFreq() < b.docFreq();
+    });
+
+    if (iterators.size() == 1) {
+        _lead1 = iterators[0];
+    } else {
+        _lead1 = iterators[0];
+        _lead2 = iterators[1];
+        for (int32_t i = 2; i < _terms.size(); i++) {
+            _others.push_back(iterators[i]);
+        }
+    }
+
+    if (iterators.size() >= 2) {
+        int32_t little = iterators[0].docFreq();
+        int32_t big = iterators[iterators.size() - 1].docFreq();
+        if (little == 0) {
+            _useSkip = true;
+        } else if ((big / little) > 1000) {

Review Comment:
   make it a session variable



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to