This is an automated email from the ASF dual-hosted git repository. jianliangqi pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 51373428cde [feature](inverted index) add slop functionality to match_phrase (#33225) 51373428cde is described below commit 51373428cde6f9e2404209aa2f59a7ac7f10f575 Author: zzzxl <33418555+zzzxl1...@users.noreply.github.com> AuthorDate: Mon Apr 15 15:35:31 2024 +0800 [feature](inverted index) add slop functionality to match_phrase (#33225) https://github.com/apache/doris-website/pull/553 doc --- .../inverted_index/query/phrase_query.cpp | 67 +++++++++-- .../segment_v2/inverted_index/query/phrase_query.h | 10 ++ .../rowset/segment_v2/inverted_index/query/query.h | 11 ++ .../rowset/segment_v2/inverted_index_reader.cpp | 36 +++--- .../olap/rowset/segment_v2/inverted_index_reader.h | 5 +- .../test_index_match_phrase_slop.out | 75 +++++++++++++ .../test_index_match_phrase_slop.groovy | 122 +++++++++++++++++++++ 7 files changed, 300 insertions(+), 26 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index a4b7f7502d1..9d242bce68a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -17,11 +17,13 @@ #include "phrase_query.h" +#include <charconv> + namespace doris::segment_v2 { PhraseQuery::PhraseQuery(const std::shared_ptr<lucene::search::IndexSearcher>& searcher, const TQueryOptions& query_options) - : _searcher(searcher) {} + : _searcher(searcher), _query(std::make_unique<CL_NS(search)::PhraseQuery>()) {} PhraseQuery::~PhraseQuery() { for (auto& term_doc : _term_docs) { @@ -36,6 +38,25 @@ PhraseQuery::~PhraseQuery() { } } +void PhraseQuery::add(const InvertedIndexQueryInfo& query_info) { + if (query_info.terms.empty()) { + _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty"); + } + + _slop = query_info.slop; + if (_slop <= 0) { + add(query_info.field_name, query_info.terms); + } else { + for (const auto& term : query_info.terms) { + std::wstring ws_term = StringUtil::string_to_wstring(term); + auto* t = _CLNEW lucene::index::Term(query_info.field_name.c_str(), ws_term.c_str()); + _query->add(t); + _CLDECDELETE(t); + } + _query->setSlop(_slop); + } +} + void PhraseQuery::add(const std::wstring& field_name, const std::vector<std::string>& terms) { if (terms.empty()) { _CLTHROWA(CL_ERR_IllegalArgument, "PhraseQuery::add: terms empty"); @@ -74,14 +95,20 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vector<std::str } void PhraseQuery::search(roaring::Roaring& roaring) { - if (_lead1.isEmpty()) { - return; - } - if (_lead2.isEmpty()) { - search_by_bitmap(roaring); - return; + if (_slop <= 0) { + if (_lead1.isEmpty()) { + return; + } + if (_lead2.isEmpty()) { + search_by_bitmap(roaring); + return; + } + search_by_skiplist(roaring); + } else { + _searcher->_search(_query.get(), [&roaring](const int32_t docid, const float_t /*score*/) { + roaring.add(docid); + }); } - search_by_skiplist(roaring); } void PhraseQuery::search_by_bitmap(roaring::Roaring& roaring) { @@ -202,4 +229,28 @@ void PhraseQuery::reset() { } } +Status PhraseQuery::parser_slop(std::string& query, InvertedIndexQueryInfo& query_info) { + auto is_digits = [](const std::string_view& str) { + return std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c); }); + }; + + size_t last_space_pos = query.find_last_of(' '); + if (last_space_pos != std::string::npos) { + size_t tilde_pos = last_space_pos + 1; + if (tilde_pos < query.size() - 1 && query[tilde_pos] == '~') { + size_t slop_pos = tilde_pos + 1; + std::string_view slop_str(query.data() + slop_pos, query.size() - slop_pos); + if (is_digits(slop_str)) { + auto result = std::from_chars(slop_str.begin(), slop_str.end(), query_info.slop); + if (result.ec != std::errc()) { + return Status::Error<doris::ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( + "PhraseQuery parser failed: {}", query); + } + query = query.substr(0, last_space_pos); + } + } + } + return Status::OK(); +} + } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 8f62989d86b..41b5f2d2e97 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -17,7 +17,10 @@ #pragma once +// clang-format off #include "olap/rowset/segment_v2/inverted_index/query/query.h" +#include "CLucene/search/PhraseQuery.h" +// clang-format on CL_NS_USE(index) CL_NS_USE(search) @@ -30,6 +33,7 @@ public: const TQueryOptions& query_options); ~PhraseQuery() override; + void add(const InvertedIndexQueryInfo& query_info) override; void add(const std::wstring& field_name, const std::vector<std::string>& terms) override; void search(roaring::Roaring& roaring) override; @@ -54,6 +58,9 @@ private: bool advance_position(PostingsAndPosition& posting, int32_t target); void reset(); +public: + static Status parser_slop(std::string& query, InvertedIndexQueryInfo& query_info); + private: std::shared_ptr<lucene::search::IndexSearcher> _searcher; @@ -65,6 +72,9 @@ private: std::vector<Term*> _terms; std::vector<TermDocs*> _term_docs; + + std::unique_ptr<CL_NS(search)::PhraseQuery> _query; + int32_t _slop = 0; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index 091ba7d3958..011229aa667 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -26,6 +26,7 @@ #include <memory> +#include "common/status.h" #include "roaring/roaring.hh" CL_NS_USE(index) @@ -34,10 +35,20 @@ CL_NS_USE(util) namespace doris::segment_v2 { +struct InvertedIndexQueryInfo { + std::wstring field_name; + std::vector<std::string> terms; + int32_t slop = 0; +}; + class Query { public: virtual ~Query() = default; + virtual void add(const InvertedIndexQueryInfo& query_info) { + add(query_info.field_name, query_info.terms); + } + // a unified data preparation interface that provides the field names to be queried and the terms for the query. // @param field_name The name of the field within the data source to search against. // @param terms a vector of tokenized strings that represent the search terms. diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 11c53bbabc0..068d36bc8d0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -38,8 +38,11 @@ #include <ostream> #include <roaring/roaring.hh> #include <set> +#include <string> +#include "gutil/integral_types.h" #include "inverted_index_query_type.h" +#include "olap/rowset/segment_v2/inverted_index/query/phrase_query.h" #ifdef __clang__ #pragma clang diagnostic push @@ -261,14 +264,18 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run << search_str << "]"; try { - std::vector<std::string> analyse_result; + InvertedIndexQueryInfo query_info; InvertedIndexQueryCache::CacheKey cache_key; auto index_file_key = _inverted_index_file_reader->get_index_file_key(&_index_meta); if (query_type == InvertedIndexQueryType::MATCH_REGEXP_QUERY) { cache_key = {index_file_key, column_name, query_type, search_str}; - analyse_result.emplace_back(search_str); + query_info.terms.emplace_back(search_str); } else { + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + RETURN_IF_ERROR(PhraseQuery::parser_slop(search_str, query_info)); + } + InvertedIndexCtxSPtr inverted_index_ctx = std::make_shared<InvertedIndexCtx>( get_inverted_index_parser_type_from_string( get_parser_string_from_properties(_index_meta.properties())), @@ -283,10 +290,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } inverted_index_ctx->analyzer = analyzer.get(); auto reader = create_reader(inverted_index_ctx.get(), search_str); - get_analyse_result(analyse_result, reader.get(), analyzer.get(), column_name, + get_analyse_result(query_info.terms, reader.get(), analyzer.get(), column_name, query_type); } - if (analyse_result.empty()) { + if (query_info.terms.empty()) { auto msg = fmt::format( "token parser result is empty for query, " "please check your query: '{}' and index parser: '{}'", @@ -300,7 +307,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } std::unique_ptr<lucene::search::Query> query; - std::wstring field_ws = std::wstring(column_name.begin(), column_name.end()); + query_info.field_name = std::wstring(column_name.begin(), column_name.end()); if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY || query_type == InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY || @@ -308,8 +315,10 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run query_type == InvertedIndexQueryType::MATCH_ALL_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY || query_type == InvertedIndexQueryType::MATCH_ANY_QUERY) { - std::string str_tokens = join(analyse_result, " "); - + std::string str_tokens = join(query_info.terms, " "); + if (query_type == InvertedIndexQueryType::MATCH_PHRASE_QUERY) { + str_tokens += " " + std::to_string(query_info.slop); + } cache_key = {index_file_key, column_name, query_type, str_tokens}; } auto* cache = InvertedIndexQueryCache::instance(); @@ -329,12 +338,8 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run searcher_ptr = std::get_if<FulltextIndexSearcherPtr>(&searcher_variant); if (searcher_ptr != nullptr) { term_match_bitmap = std::make_shared<roaring::Roaring>(); - - Status res = match_index_search(stats, runtime_state, query_type, field_ws, - analyse_result, *searcher_ptr, term_match_bitmap); - if (!res.ok()) { - return res; - } + RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + *searcher_ptr, term_match_bitmap)); term_match_bitmap->runOptimize(); cache->insert(cache_key, term_match_bitmap, &cache_handler); bit_map = term_match_bitmap; @@ -348,8 +353,7 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run Status FullTextIndexReader::match_index_search( OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const std::wstring& field_ws, const std::vector<std::string>& analyse_result, - const FulltextIndexSearcherPtr& index_searcher, + const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { @@ -359,7 +363,7 @@ Status FullTextIndexReader::match_index_search( return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>( "query type " + query_type_to_string(query_type) + ", query is nullptr"); } - query->add(field_ws, analyse_result); + query->add(query_info); query->search(*term_match_bitmap); } catch (const CLuceneError& e) { return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>("CLuceneError occured: {}", diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index 63002da5c92..ffc8c7c75fd 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -71,6 +71,7 @@ namespace segment_v2 { class InvertedIndexIterator; class InvertedIndexQueryCacheHandle; class InvertedIndexFileReader; +struct InvertedIndexQueryInfo; class InvertedIndexReader : public std::enable_shared_from_this<InvertedIndexReader> { public: @@ -173,8 +174,8 @@ public: private: Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, - InvertedIndexQueryType query_type, const std::wstring& field_ws, - const std::vector<std::string>& analyse_result, + InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr<roaring::Roaring>& term_match_bitmap); }; diff --git a/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out new file mode 100644 index 00000000000..404921dd401 --- /dev/null +++ b/regression-test/data/inverted_index_p0/test_index_match_phrase_slop.out @@ -0,0 +1,75 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +0 + +-- !sql -- +21 + +-- !sql -- +21 + +-- !sql -- +1 + +-- !sql -- +125 + +-- !sql -- +125 + +-- !sql -- +0 + +-- !sql -- +137 + +-- !sql -- +137 + +-- !sql -- +0 + +-- !sql -- +80 + +-- !sql -- +80 + +-- !sql -- +12 + +-- !sql -- +823 + +-- !sql -- +823 + +-- !sql -- +1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1 + +-- !sql -- +1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1 + +-- !sql -- +1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1 + +-- !sql -- +1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1 + +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- + +-- !sql -- +1 127.0.0.1 I'm glad I kept my fingers crossed ~4 1 1 + diff --git a/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy new file mode 100644 index 00000000000..a8454878e61 --- /dev/null +++ b/regression-test/suites/inverted_index_p0/test_index_match_phrase_slop.groovy @@ -0,0 +1,122 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + + +suite("test_index_match_phrase_slop", "p0"){ + def indexTbName1 = "test_index_match_phrase_slop" + + sql "DROP TABLE IF EXISTS ${indexTbName1}" + + sql """ + CREATE TABLE ${indexTbName1} ( + `@timestamp` int(11) NULL COMMENT "", + `clientip` varchar(20) NULL COMMENT "", + `request` text NULL COMMENT "", + `status` int(11) NULL COMMENT "", + `size` int(11) NULL COMMENT "", + INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '' + ) ENGINE=OLAP + DUPLICATE KEY(`@timestamp`) + COMMENT "OLAP" + DISTRIBUTED BY RANDOM BUCKETS 1 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1", + "disable_auto_compaction" = "true" + ); + """ + + def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false, + expected_succ_rows = -1, load_to_single_tablet = 'true' -> + + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'label', label + "_" + UUID.randomUUID().toString() + set 'read_json_by_line', read_flag + set 'format', format_flag + file file_name // import json file + time 10000 // limit inflight 10s + if (expected_succ_rows >= 0) { + set 'max_filter_ratio', '1' + } + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + check { result, exception, startTime, endTime -> + if (ignore_failure && expected_succ_rows < 0) { return } + if (exception != null) { + throw exception + } + log.info("Stream load result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + if (expected_succ_rows >= 0) { + assertEquals(json.NumberLoadedRows, expected_succ_rows) + } else { + assertEquals(json.NumberTotalRows, json.NumberLoadedRows + json.NumberUnselectedRows) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + } + + try { + load_httplogs_data.call(indexTbName1, 'test_index_match_phrase_slop', 'true', 'json', 'documents-1000.json') + + sql """ INSERT INTO ${indexTbName1} VALUES (1, "127.0.0.1", "I'm glad I kept my fingers crossed ~4", 1, 1); """ + + sql "sync" + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get jpg'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get jpg ~2'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get jpg ~2'; """ + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images bg'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images bg ~1'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images bg ~1'; """ + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images jpg'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images jpg ~2'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'images jpg ~2'; """ + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'french gif'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'french gif ~4'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'french gif ~4'; """ + + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get http'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get http ~6'; """ + qt_sql """ select count() from ${indexTbName1} where request match_phrase 'get http ~6'; """ + + qt_sql """ select * from ${indexTbName1} where request match_phrase 'crossed~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'crossed ~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed ~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed ~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed ~4 '; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed ~4.'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad~4crossed~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad ~4 crossed~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad ~4 crossed ~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed \\\\~4'; """ + qt_sql """ select * from ${indexTbName1} where request match_phrase 'glad crossed \\~4'; """ + + } finally { + //try_sql("DROP TABLE IF EXISTS ${testTable}") + } +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org