This is an automated email from the ASF dual-hosted git repository.

jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 51373428cde [feature](inverted index) add slop functionality to match_phrase (#33225)
51373428cde is described below

commit 51373428cde6f9e2404209aa2f59a7ac7f10f575
Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com>
AuthorDate: Mon Apr 15 15:35:31 2024 +0800

    [feature](inverted index) add slop functionality to match_phrase (#33225)
    
    https://github.com/apache/doris-website/pull/553 doc
---
 .../inverted_index/query/phrase_query.cpp          |  67 +++++++++--
 .../segment_v2/inverted_index/query/phrase_query.h |  10 ++
 .../rowset/segment_v2/inverted_index/query/query.h |  11 ++
 .../rowset/segment_v2/inverted_index_reader.cpp    |  36 +++---
 .../olap/rowset/segment_v2/inverted_index_reader.h |   5 +-
 .../test_index_match_phrase_slop.out               |  75 +++++++++++++
 .../test_index_match_phrase_slop.groovy            | 122 +++++++++++++++++++++
 7 files changed, 300 insertions(+), 26 deletions(-)

diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
index a4b7f7502d1..9d242bce68a 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp
@@ -17,11 +17,13 @@
 
 #include "phrase_query.h"
 
+#include <charconv>
+
 namespace doris::segment_v2 {
 
 PhraseQuery::PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& 
searcher,
                          const TQueryOptions& query_options)
-        : _searcher(searcher) {}
+        : _searcher(searcher), 
_query(std::make_unique<CL_NS(search)::PhraseQuery>()) {}
 
 PhraseQuery::~PhraseQuery() {
     for (auto& term_doc : _term_docs) {
@@ -36,6 +38,25 @@ PhraseQuery::~PhraseQuery() {
     }
 }
 
+void PhraseQuery::add(const InvertedIndexQueryInfo& query_info) {
+    if (query_info.terms.empty()) {
+        _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
+    }
+
+    _slop = query_info.slop;
+    if (_slop <= 0) {
+        add(query_info.field_name, query_info.terms);
+    } else {
+        for (const auto& term : query_info.terms) {
+            std::wstring ws_term = StringUtil::string_to_wstring(term);
+            auto* t = _CLNEW 
lucene::index::Term(query_info.field_name.c_str(), ws_term.c_str());
+            _query->add(t);
+            _CLDECDELETE(t);
+        }
+        _query->setSlop(_slop);
+    }
+}
+
 void PhraseQuery::add(const std::wstring& field_name, const 
std::vector<std::string>& terms) {
     if (terms.empty()) {
         _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty");
@@ -74,14 +95,20 @@ void PhraseQuery::add(const std::wstring& field_name, const 
std::vector<std::str
 }
 
 void PhraseQuery::search(roaring::Roaring& roaring) {
-    if (_lead1.isEmpty()) {
-        return;
-    }
-    if (_lead2.isEmpty()) {
-        search_by_bitmap(roaring);
-        return;
+    if (_slop <= 0) {
+        if (_lead1.isEmpty()) {
+            return;
+        }
+        if (_lead2.isEmpty()) {
+            search_by_bitmap(roaring);
+            return;
+        }
+        search_by_skiplist(roaring);
+    } else {
+        _searcher->_search(_query.get(), [&roaring](const int32_t docid, const 
float_t /*score*/) {
+            roaring.add(docid);
+        });
     }
-    search_by_skiplist(roaring);
 }
 
 void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) {
@@ -202,4 +229,28 @@ void PhraseQuery::reset() {
     }
 }
 
+Status PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& 
query_info) {
+    auto is_digits = [](const std::string_view& str) {
+        return std::all_of(str.begin(), str.end(), [](unsigned char c) { 
return std::isdigit(c); });
+    };
+
+    size_t last_space_pos = query.find_last_of(' ');
+    if (last_space_pos != std::string::npos) {
+        size_t tilde_pos = last_space_pos + 1;
+        if (tilde_pos < query.size() - 1 && query[tilde_pos] == '~') {
+            size_t slop_pos = tilde_pos + 1;
+            std::string_view slop_str(query.data() + slop_pos, query.size() - 
slop_pos);
+            if (is_digits(slop_str)) {
+                auto result = std::from_chars(slop_str.begin(), 
slop_str.end(), query_info.slop);
+                if (result.ec != std::errc()) {
+                    return 
Status::Error<doris::ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
+                            "PhraseQuery parser failed: {}", query);
+                }
+                query = query.substr(0, last_space_pos);
+            }
+        }
+    }
+    return Status::OK();
+}
+
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
index 8f62989d86b..41b5f2d2e97 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h
@@ -17,7 +17,10 @@
 
 #pragma once
 
+// clang-format off
 #include "olap/rowset/segment_v2/inverted_index/query/query.h"
+#include "CLucene/search/PhraseQuery.h"
+// clang-format on
 
 CL_NS_USE(index)
 CL_NS_USE(search)
@@ -30,6 +33,7 @@ public:
                 const TQueryOptions& query_options);
     ~PhraseQuery() override;
 
+    void add(const InvertedIndexQueryInfo& query_info) override;
     void add(const std::wstring& field_name, const std::vector<std::string>& 
terms) override;
     void search(roaring::Roaring& roaring) override;
 
@@ -54,6 +58,9 @@ private:
     bool advance_position(PostingsAndPosition& posting, int32_t target);
     void reset();
 
+public:
+    static Status parser_slop(std::string& query, InvertedIndexQueryInfo& 
query_info);
+
 private:
     std::shared_ptr<lucene::search::IndexSearcher> _searcher;
 
@@ -65,6 +72,9 @@ private:
 
     std::vector<Term*> _terms;
     std::vector<TermDocs*> _term_docs;
+
+    std::unique_ptr<CL_NS(search)::PhraseQuery> _query;
+    int32_t _slop = 0;
 };
 
 } // namespace doris::segment_v2
\ No newline at end of file
diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
index 091ba7d3958..011229aa667 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h
@@ -26,6 +26,7 @@
 
 #include <memory>
 
+#include "common/status.h"
 #include "roaring/roaring.hh"
 
 CL_NS_USE(index)
@@ -34,10 +35,20 @@ CL_NS_USE(util)
 
 namespace doris::segment_v2 {
 
+struct InvertedIndexQueryInfo {
+    std::wstring field_name;
+    std::vector<std::string> terms;
+    int32_t slop = 0;
+};
+
 class Query {
 public:
     virtual ~Query() = default;
 
+    virtual void add(const InvertedIndexQueryInfo& query_info) {
+        add(query_info.field_name, query_info.terms);
+    }
+
     // a unified data preparation interface that provides the field names to 
be queried and the terms for the query.
     // @param field_name The name of the field within the data source to 
search against.
     // @param terms a vector of tokenized strings that represent the search 
terms.
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
index 11c53bbabc0..068d36bc8d0 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp
@@ -38,8 +38,11 @@
 #include <ostream>
 #include <roaring/roaring.hh>
 #include <set>
+#include <string>
 
+#include "gutil/integral_types.h"
 #include "inverted_index_query_type.h"
+#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h"
 
 #ifdef __clang__
 #pragma clang diagnostic push
@@ -261,14 +264,18 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
               << search_str << "]";
 
     try {
-        std::vector<std::string> analyse_result;
+        InvertedIndexQueryInfo query_info;
         InvertedIndexQueryCache::CacheKey cache_key;
         auto index_file_key = 
_inverted_index_file_reader->get_index_file_key(&_index_meta);
 
         if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) {
             cache_key = {index_file_key, column_name, query_type, search_str};
-            analyse_result.emplace_back(search_str);
+            query_info.terms.emplace_back(search_str);
         } else {
+            if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+                RETURN_IF_ERROR(PhraseQuery::parser_slop(search_str, 
query_info));
+            }
+
             InvertedIndexCtxSPtr inverted_index_ctx = 
std::make_shared<InvertedIndexCtx>(
                     get_inverted_index_parser_type_from_string(
                             
get_parser_string_from_properties(_index_meta.properties())),
@@ -283,10 +290,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             }
             inverted_index_ctx->analyzer = analyzer.get();
             auto reader = create_reader(inverted_index_ctx.get(), search_str);
-            get_analyse_result(analyse_result, reader.get(), analyzer.get(), 
column_name,
+            get_analyse_result(query_info.terms, reader.get(), analyzer.get(), 
column_name,
                                query_type);
         }
-        if (analyse_result.empty()) {
+        if (query_info.terms.empty()) {
             auto msg = fmt::format(
                     "token parser result is empty for query, "
                     "please check your query: '{}' and index parser: '{}'",
@@ -300,7 +307,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
         }
 
         std::unique_ptr<lucene::search::Query> query;
-        std::wstring field_ws = std::wstring(column_name.begin(), 
column_name.end());
+        query_info.field_name = std::wstring(column_name.begin(), 
column_name.end());
 
         if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY ||
@@ -308,8 +315,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
             query_type == InvertedIndexQueryType::MATCH_ALL_QUERY ||
             query_type == InvertedIndexQueryType::EQUAL_QUERY ||
             query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) {
-            std::string str_tokens = join(analyse_result, " ");
-
+            std::string str_tokens = join(query_info.terms, " ");
+            if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) {
+                str_tokens += " " + std::to_string(query_info.slop);
+            }
             cache_key = {index_file_key, column_name, query_type, str_tokens};
         }
         auto* cache = InvertedIndexQueryCache::instance();
@@ -329,12 +338,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
         searcher_ptr = 
std::get_if<FulltextIndexSearcherPtr>(&searcher_variant);
         if (searcher_ptr != nullptr) {
             term_match_bitmap = std::make_shared<roaring::Roaring>();
-
-            Status res = match_index_search(stats, runtime_state, query_type, 
field_ws,
-                                            analyse_result, *searcher_ptr, 
term_match_bitmap);
-            if (!res.ok()) {
-                return res;
-            }
+            RETURN_IF_ERROR(match_index_search(stats, runtime_state, 
query_type, query_info,
+                                               *searcher_ptr, 
term_match_bitmap));
             term_match_bitmap->runOptimize();
             cache->insert(cache_key, term_match_bitmap, &cache_handler);
             bit_map = term_match_bitmap;
@@ -348,8 +353,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* 
stats, RuntimeState* run
 
 Status FullTextIndexReader::match_index_search(
         OlapReaderStatistics* stats, RuntimeState* runtime_state, 
InvertedIndexQueryType query_type,
-        const std::wstring& field_ws, const std::vector<std::string>& 
analyse_result,
-        const FulltextIndexSearcherPtr& index_searcher,
+        const InvertedIndexQueryInfo& query_info, const 
FulltextIndexSearcherPtr& index_searcher,
         const std::shared_ptr<roaring::Roaring>& term_match_bitmap) {
     TQueryOptions queryOptions = runtime_state->query_options();
     try {
@@ -359,7 +363,7 @@ Status FullTextIndexReader::match_index_search(
             return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                     "query type " + query_type_to_string(query_type) + ", 
query is nullptr");
         }
-        query->add(field_ws, analyse_result);
+        query->add(query_info);
         query->search(*term_match_bitmap);
     } catch (const CLuceneError& e) {
         return 
Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: 
{}",
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
index 63002da5c92..ffc8c7c75fd 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h
@@ -71,6 +71,7 @@ namespace segment_v2 {
 class InvertedIndexIterator;
 class InvertedIndexQueryCacheHandle;
 class InvertedIndexFileReader;
+struct InvertedIndexQueryInfo;
 
 class InvertedIndexReader : public 
std::enable_shared_from_this<InvertedIndexReader> {
 public:
@@ -173,8 +174,8 @@ public:
 
 private:
     Status match_index_search(OlapReaderStatistics* stats, RuntimeState* 
runtime_state,
-                              InvertedIndexQueryType query_type, const 
std::wstring& field_ws,
-                              const std::vector<std::string>& analyse_result,
+                              InvertedIndexQueryType query_type,
+                              const InvertedIndexQueryInfo& query_info,
                               const FulltextIndexSearcherPtr& index_searcher,
                               const std::shared_ptr<roaring::Roaring>& 
term_match_bitmap);
 };
diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out
new file mode 100644
index 00000000000..404921dd401
--- /dev/null
+++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out
@@ -0,0 +1,75 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !sql --
+0
+
+-- !sql --
+21
+
+-- !sql --
+21
+
+-- !sql --
+1
+
+-- !sql --
+125
+
+-- !sql --
+125
+
+-- !sql --
+0
+
+-- !sql --
+137
+
+-- !sql --
+137
+
+-- !sql --
+0
+
+-- !sql --
+80
+
+-- !sql --
+80
+
+-- !sql --
+12
+
+-- !sql --
+823
+
+-- !sql --
+823
+
+-- !sql --
+1      127.0.0.1       I'm glad I kept my fingers crossed ~4   1       1
+
+-- !sql --
+1      127.0.0.1       I'm glad I kept my fingers crossed ~4   1       1
+
+-- !sql --
+1      127.0.0.1       I'm glad I kept my fingers crossed ~4   1       1
+
+-- !sql --
+1      127.0.0.1       I'm glad I kept my fingers crossed ~4   1       1
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+
+-- !sql --
+1      127.0.0.1       I'm glad I kept my fingers crossed ~4   1       1
+
diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy
new file mode 100644
index 00000000000..a8454878e61
--- /dev/null
+++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy
@@ -0,0 +1,122 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+
+suite("test_index_match_phrase_slop", "p0"){
+    def indexTbName1 = "test_index_match_phrase_slop"
+
+    sql "DROP TABLE IF EXISTS ${indexTbName1}"
+
+    sql """
+      CREATE TABLE ${indexTbName1} (
+        `@timestamp` int(11) NULL COMMENT "",
+        `clientip` varchar(20) NULL COMMENT "",
+        `request` text NULL COMMENT "",
+        `status` int(11) NULL COMMENT "",
+        `size` int(11) NULL COMMENT "",
+        INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = 
"english", "support_phrase" = "true") COMMENT ''
+      ) ENGINE=OLAP
+      DUPLICATE KEY(`@timestamp`)
+      COMMENT "OLAP"
+      DISTRIBUTED BY RANDOM BUCKETS 1
+      PROPERTIES (
+        "replication_allocation" = "tag.location.default: 1",
+        "disable_auto_compaction" = "true"
+      );
+    """
+
+    def load_httplogs_data = {table_name, label, read_flag, format_flag, 
file_name, ignore_failure=false,
+                        expected_succ_rows = -1, load_to_single_tablet = 
'true' ->
+        
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+            
+            // set http request header params
+            set 'label', label + "_" + UUID.randomUUID().toString()
+            set 'read_json_by_line', read_flag
+            set 'format', format_flag
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+            if (expected_succ_rows >= 0) {
+                set 'max_filter_ratio', '1'
+            }
+
+            // if declared a check callback, the default check condition will 
ignore.
+            // So you must check all condition
+            check { result, exception, startTime, endTime ->
+                       if (ignore_failure && expected_succ_rows < 0) { return }
+                    if (exception != null) {
+                        throw exception
+                    }
+                    log.info("Stream load result: ${result}".toString())
+                    def json = parseJson(result)
+                    assertEquals("success", json.Status.toLowerCase())
+                    if (expected_succ_rows >= 0) {
+                        assertEquals(json.NumberLoadedRows, expected_succ_rows)
+                    } else {
+                        assertEquals(json.NumberTotalRows, 
json.NumberLoadedRows + json.NumberUnselectedRows)
+                        assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes 
> 0)
+                }
+            }
+        }
+    }
+
+    try {
+        load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_slop', 
'true', 'json', 'documents-1000.json')
+
+        sql """ INSERT INTO ${indexTbName1} VALUES (1, "127.0.0.1", "I'm glad 
I kept my fingers crossed ~4", 1, 1); """
+
+        sql "sync"
+
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get jpg'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get jpg  ~2'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get jpg ~2'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images bg'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images bg  ~1'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images bg ~1'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images jpg'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images jpg  ~2'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'images jpg ~2'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'french gif'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'french gif  ~4'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'french gif ~4'; """
+
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get http'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get http  ~6'; """
+        qt_sql """ select count() from ${indexTbName1} where request 
match_phrase 'get http ~6'; """
+
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'crossed~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'crossed  ~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed ~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed  ~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed ~4 '; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed ~4.'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad~4crossed~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad ~4 crossed~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad ~4 crossed ~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed \\\\~4'; """
+        qt_sql """ select * from ${indexTbName1} where request match_phrase 
'glad crossed \\~4'; """
+
+    } finally {
+        //try_sql("DROP TABLE IF EXISTS ${testTable}")
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to