This is an automated email from the ASF dual-hosted git repository.
jianliangqi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new b62c5a70c7 [fix](match query) fix array column match query failed
without inverted index (#20344)
b62c5a70c7 is described below
commit b62c5a70c7400dc3e0e4e5776e9ab8854b7c333b
Author: YueW <[email protected]>
AuthorDate: Fri Jun 2 21:10:12 2023 +0800
[fix](match query) fix array column match query failed without inverted
index (#20344)
---
be/src/vec/functions/match.cpp | 111 ++++++++++++++++-----
be/src/vec/functions/match.h | 42 ++++++--
.../data/inverted_index_p0/test_array_index.out | 12 +++
.../inverted_index_p0/test_array_index.groovy | 6 ++
4 files changed, 138 insertions(+), 33 deletions(-)
diff --git a/be/src/vec/functions/match.cpp b/be/src/vec/functions/match.cpp
index fe55bf9a5a..77de502107 100644
--- a/be/src/vec/functions/match.cpp
+++ b/be/src/vec/functions/match.cpp
@@ -17,7 +17,6 @@
#include "vec/functions/match.h"
-#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "runtime/query_context.h"
#include "runtime/runtime_state.h"
@@ -36,19 +35,45 @@ Status FunctionMatchBase::execute_impl(FunctionContext*
context, Block& block,
InvertedIndexCtx* inverted_index_ctx =
reinterpret_cast<InvertedIndexCtx*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
- const auto values_col =
+ const ColumnPtr source_col =
block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
- const auto* values =
check_and_get_column<ColumnString>(values_col.get());
+ const auto* values =
check_and_get_column<ColumnString>(source_col.get());
+ const ColumnArray* array_col = nullptr;
+ if (source_col->is_column_array()) {
+ array_col = check_and_get_column<ColumnArray>(source_col.get());
+ if (array_col && !array_col->get_data().is_column_string()) {
+ return Status::NotSupported(
+ fmt::format("unsupported nested array of type {} for
function {}",
+ is_column_nullable(array_col->get_data())
+ ? array_col->get_data().get_name()
+ :
array_col->get_data().get_family_name(),
+ get_name()));
+ }
+
+ if (is_column_nullable(array_col->get_data())) {
+ const auto& array_nested_null_column =
+ reinterpret_cast<const
ColumnNullable&>(array_col->get_data());
+ values = check_and_get_column<ColumnString>(
+ *(array_nested_null_column.get_nested_column_ptr()));
+ } else {
+ values =
check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
+ }
+ } else if (auto* nullable =
check_and_get_column<ColumnNullable>(source_col.get())) {
+ values =
check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
+ }
+
if (!values) {
- return Status::InternalError("Not supported input arguments
types");
+ LOG(WARNING) << "Illegal column " << source_col->get_name();
+ return Status::InternalError("Not supported input column types");
}
// result column
auto res = ColumnUInt8::create();
ColumnUInt8::Container& vec_res = res->get_data();
// set default value to 0, and match functions only need to set 1/true
vec_res.resize_fill(input_rows_count);
- RETURN_IF_ERROR(execute_match(column_name, match_query_str,
input_rows_count, values,
- inverted_index_ctx, vec_res));
+ RETURN_IF_ERROR(execute_match(
+ column_name, match_query_str, input_rows_count, values,
inverted_index_ctx,
+ (array_col ? &(array_col->get_offsets()) : nullptr), vec_res));
block.replace_by_position(result, std::move(res));
} else {
auto match_pred_column =
@@ -59,10 +84,46 @@ Status FunctionMatchBase::execute_impl(FunctionContext*
context, Block& block,
return Status::OK();
}
+inline doris::segment_v2::InvertedIndexQueryType
FunctionMatchBase::get_query_type_from_fn_name() {
+ std::string fn_name = get_name();
+ if (fn_name == MATCH_ANY_FUNCTION) {
+ return doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY;
+ } else if (fn_name == MATCH_ALL_FUNCTION) {
+ return doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY;
+ } else if (fn_name == MATCH_PHRASE_FUNCTION) {
+ return doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY;
+ }
+ return doris::segment_v2::InvertedIndexQueryType::UNKNOWN_QUERY;
+}
+
+inline std::vector<std::wstring> FunctionMatchBase::analyse_data_token(
+ const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
+ const ColumnString* string_col, int32_t current_block_row_idx,
+ const ColumnArray::Offsets64* array_offsets, int32_t&
current_src_array_offset) {
+ std::vector<std::wstring> data_tokens;
+ auto query_type = get_query_type_from_fn_name();
+ if (array_offsets) {
+ for (auto next_src_array_offset =
(*array_offsets)[current_block_row_idx];
+ current_src_array_offset < next_src_array_offset;
++current_src_array_offset) {
+ const auto& str_ref =
string_col->get_data_at(current_src_array_offset);
+ std::vector<std::wstring> element_tokens =
+ doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ column_name, str_ref.to_string(), query_type,
inverted_index_ctx);
+ data_tokens.insert(data_tokens.end(), element_tokens.begin(),
element_tokens.end());
+ }
+ } else {
+ const auto& str_ref = string_col->get_data_at(current_block_row_idx);
+ data_tokens =
doris::segment_v2::InvertedIndexReader::get_analyse_result(
+ column_name, str_ref.to_string(), query_type,
inverted_index_ctx);
+ }
+ return data_tokens;
+}
+
Status FunctionMatchAny::execute_match(const std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count,
- const ColumnString* datas,
+ const ColumnString* string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result) {
doris::InvertedIndexParserType parser_type =
doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
@@ -79,13 +140,13 @@ Status FunctionMatchAny::execute_match(const std::string&
column_name,
<< ", please check your query sql";
return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>();
}
+
+ auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- const auto& str_ref = datas->get_data_at(i);
std::vector<std::wstring> data_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- column_name, str_ref.to_string(),
-
doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY,
- inverted_index_ctx);
+ analyse_data_token(column_name, inverted_index_ctx,
string_col, i, array_offsets,
+ current_src_array_offset);
+
// TODO: more efficient impl
for (auto& token : query_tokens) {
auto it = std::find(data_tokens.begin(), data_tokens.end(), token);
@@ -101,8 +162,9 @@ Status FunctionMatchAny::execute_match(const std::string&
column_name,
Status FunctionMatchAll::execute_match(const std::string& column_name,
const std::string& match_query_str,
size_t input_rows_count,
- const ColumnString* datas,
+ const ColumnString* string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result) {
doris::InvertedIndexParserType parser_type =
doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
@@ -119,13 +181,13 @@ Status FunctionMatchAll::execute_match(const std::string&
column_name,
<< ", please check your query sql";
return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>();
}
+
+ auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- const auto& str_ref = datas->get_data_at(i);
std::vector<std::wstring> data_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- column_name, str_ref.to_string(),
-
doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY,
- inverted_index_ctx);
+ analyse_data_token(column_name, inverted_index_ctx,
string_col, i, array_offsets,
+ current_src_array_offset);
+
// TODO: more efficient impl
auto find_count = 0;
for (auto& token : query_tokens) {
@@ -147,8 +209,9 @@ Status FunctionMatchAll::execute_match(const std::string&
column_name,
Status FunctionMatchPhrase::execute_match(const std::string& column_name,
const std::string& match_query_str,
- size_t input_rows_count, const
ColumnString* datas,
+ size_t input_rows_count, const
ColumnString* string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64*
array_offsets,
ColumnUInt8::Container& result) {
doris::InvertedIndexParserType parser_type =
doris::InvertedIndexParserType::PARSER_UNKNOWN;
if (inverted_index_ctx) {
@@ -166,13 +229,13 @@ Status FunctionMatchPhrase::execute_match(const
std::string& column_name,
<< ", please check your query sql";
return Status::Error<ErrorCode::INVERTED_INDEX_NO_TERMS>();
}
+
+ auto current_src_array_offset = 0;
for (int i = 0; i < input_rows_count; i++) {
- const auto& str_ref = datas->get_data_at(i);
std::vector<std::wstring> data_tokens =
- doris::segment_v2::InvertedIndexReader::get_analyse_result(
- column_name, str_ref.to_string(),
-
doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY,
- inverted_index_ctx);
+ analyse_data_token(column_name, inverted_index_ctx,
string_col, i, array_offsets,
+ current_src_array_offset);
+
// TODO: more efficient impl
bool matched = false;
auto it = data_tokens.begin();
diff --git a/be/src/vec/functions/match.h b/be/src/vec/functions/match.h
index 3fcce4ebf1..dda00bb56c 100644
--- a/be/src/vec/functions/match.h
+++ b/be/src/vec/functions/match.h
@@ -31,8 +31,10 @@
#include "common/logging.h"
#include "common/status.h"
#include "olap/inverted_index_parser.h"
+#include "olap/rowset/segment_v2/inverted_index_reader.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column.h"
+#include "vec/columns/column_array.h"
#include "vec/core/block.h"
#include "vec/core/column_numbers.h"
#include "vec/core/column_with_type_and_name.h"
@@ -48,6 +50,10 @@ class FunctionContext;
namespace doris::vectorized {
+const std::string MATCH_ANY_FUNCTION = "match_any";
+const std::string MATCH_ALL_FUNCTION = "match_all";
+const std::string MATCH_PHRASE_FUNCTION = "match_phrase";
+
class FunctionMatchBase : public IFunction {
public:
size_t get_number_of_arguments() const override { return 2; }
@@ -63,9 +69,19 @@ public:
size_t result, size_t input_rows_count) override;
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) = 0;
+
+ doris::segment_v2::InvertedIndexQueryType get_query_type_from_fn_name();
+
+ std::vector<std::wstring> analyse_data_token(const std::string&
column_name,
+ InvertedIndexCtx*
inverted_index_ctx,
+ const ColumnString*
string_col,
+ int32_t current_block_row_idx,
+ const ColumnArray::Offsets64*
array_offsets,
+ int32_t&
current_src_array_offset);
};
class FunctionMatchAny : public FunctionMatchBase {
@@ -76,8 +92,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override;
};
@@ -89,8 +106,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override;
};
@@ -102,8 +120,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override;
};
@@ -115,8 +134,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
}
@@ -130,8 +150,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
}
@@ -145,8 +166,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
}
@@ -160,8 +182,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
}
@@ -175,8 +198,9 @@ public:
String get_name() const override { return name; }
virtual Status execute_match(const std::string& column_name, const
std::string& match_query_str,
- size_t input_rows_count, const ColumnString*
datas,
+ size_t input_rows_count, const ColumnString*
string_col,
InvertedIndexCtx* inverted_index_ctx,
+ const ColumnArray::Offsets64* array_offsets,
ColumnUInt8::Container& result) override {
return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>();
}
diff --git a/regression-test/data/inverted_index_p0/test_array_index.out
b/regression-test/data/inverted_index_p0/test_array_index.out
index 4fd98fbd3d..0d7529c8b6 100644
--- a/regression-test/data/inverted_index_p0/test_array_index.out
+++ b/regression-test/data/inverted_index_p0/test_array_index.out
@@ -56,3 +56,15 @@
-- !sql --
4 [40, 50, 60] \N
+-- !sql --
+1 [10, 20, 30] ["i", "love", "china"]
+
+-- !sql --
+1 [10, 20, 30] ["i", "love", "china"]
+2 [20, 30, 40] ["i", "love", "north korea"]
+
+-- !sql --
+2 [20, 30, 40] ["i", "love", "north korea"]
+
+-- !sql --
+2 [20, 30, 40] ["i", "love", "north korea"]
diff --git a/regression-test/suites/inverted_index_p0/test_array_index.groovy
b/regression-test/suites/inverted_index_p0/test_array_index.groovy
index 1811a44b22..7fd95f5ad1 100644
--- a/regression-test/suites/inverted_index_p0/test_array_index.groovy
+++ b/regression-test/suites/inverted_index_p0/test_array_index.groovy
@@ -64,4 +64,10 @@ suite("test_array_index"){
qt_sql "SELECT * FROM $indexTblName WHERE int_array element_eq 40 ORDER BY
id;"
qt_sql "SELECT * FROM $indexTblName WHERE int_array element_eq 50 ORDER BY
id;"
qt_sql "SELECT * FROM $indexTblName WHERE int_array element_eq 60 ORDER BY
id;"
+
+ sql " ALTER TABLE $indexTblName drop index c_array_idx; "
+ qt_sql "SELECT * FROM $indexTblName WHERE c_array MATCH 'china' ORDER BY
id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c_array MATCH 'love' ORDER BY
id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c_array MATCH 'north' ORDER BY
id;"
+ qt_sql "SELECT * FROM $indexTblName WHERE c_array MATCH 'korea' ORDER BY
id;"
}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@doris.apache.org
For additional commands, e-mail: commits-help@doris.apache.org