Tanya-W commented on code in PR #19936: URL: https://github.com/apache/doris/pull/19936#discussion_r1209074639
########## be/src/vec/functions/match.cpp: ########## @@ -15,134 +15,167 @@ // specific language governing permissions and limitations // under the License. -#include <stddef.h> - -#include <algorithm> -#include <boost/iterator/iterator_facade.hpp> -#include <memory> -#include <ostream> -#include <string> -#include <utility> - -#include "common/config.h" -#include "common/consts.h" -#include "common/logging.h" -#include "common/status.h" -#include "vec/aggregate_functions/aggregate_function.h" -#include "vec/columns/column.h" -#include "vec/core/block.h" -#include "vec/core/column_numbers.h" -#include "vec/core/column_with_type_and_name.h" -#include "vec/core/types.h" -#include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" -#include "vec/functions/simple_function_factory.h" - -namespace doris { -class FunctionContext; -} // namespace doris +#include "vec/functions/match.h" -namespace doris::vectorized { - -class FunctionMatchBase : public IFunction { -public: - size_t get_number_of_arguments() const override { return 2; } - - String get_name() const override { return "match"; } +#include "olap/rowset/segment_v2/inverted_index_reader.h" +#include "runtime/query_context.h" +#include "runtime/runtime_state.h" - /// Get result types by argument types. If the function does not apply to these arguments, throw an exception. - DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared<DataTypeUInt8>(); - } +namespace doris::vectorized { - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - size_t result, size_t input_rows_count) override { - auto match_query_str = block.get_by_position(arguments[1]).to_string(0); - std::string column_name = block.get_by_position(arguments[0]).name; - auto match_pred_column_name = - BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; - if (!block.has(match_pred_column_name)) { - if (!config::enable_index_apply_preds_except_leafnode_of_andnode) { - return Status::Cancelled( - "please check whether turn on the configuration " - "'enable_index_apply_preds_except_leafnode_of_andnode'"); - } - LOG(WARNING) << "execute match query meet error, block no column: " - << match_pred_column_name; - return Status::InternalError( - "match query meet error, no match predicate evaluate result column in block."); +Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + size_t result, size_t input_rows_count) { + auto match_query_str = block.get_by_position(arguments[1]).to_string(0); + std::string column_name = block.get_by_position(arguments[0]).name; + auto match_pred_column_name = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str; + if (!block.has(match_pred_column_name)) { + LOG(INFO) << "begin to execute match directly, column_name=" << column_name + << ", match_query_str=" << match_query_str; + InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>( + context->get_function_state(FunctionContext::THREAD_LOCAL)); + + const auto values_col = + block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); + const auto* values = check_and_get_column<ColumnString>(values_col.get()); + if (!values) { + return Status::InternalError("Not supported input arguments types"); } + // result column + auto res = ColumnUInt8::create(); + ColumnUInt8::Container& vec_res = res->get_data(); + // set default value to 0, and match functions only need to set 1/true + vec_res.resize_fill(input_rows_count); + RETURN_IF_ERROR(execute_match(column_name, match_query_str, + input_rows_count, values, inverted_index_ctx, vec_res)); + block.replace_by_position(result, std::move(res)); + } else { auto match_pred_column = block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const(); - block.replace_by_position(result, std::move(match_pred_column)); - return Status::OK(); } -}; - -class FunctionMatchAny : public FunctionMatchBase { -public: - static constexpr auto name = "match_any"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAny>(); } - - String get_name() const override { return name; } -}; -class FunctionMatchAll : public FunctionMatchBase { -public: - static constexpr auto name = "match_all"; - static FunctionPtr create() { return std::make_shared<FunctionMatchAll>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchPhrase : public FunctionMatchBase { -public: - static constexpr auto name = "match_phrase"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } - - String get_name() const override { return name; } -}; - -class FunctionMatchElementEQ : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_eq"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAny::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAny::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ANY_QUERY, parser_type); + // TODO: more efficient impl + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + result[i] = true; + break; + } + } + } -class FunctionMatchElementLT : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_lt"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + return Status::OK(); +} - String get_name() const override { return name; } -}; +Status FunctionMatchAll::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchAll::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_ALL_QUERY, parser_type); + // TODO: more efficient impl + auto find_count = 0; + for (auto& token : tokens) { + auto it = std::find(values.begin(), values.end(), token); + if (it != values.end()) { + ++find_count; + } + } -class FunctionMatchElementGT : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_gt"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } + if (find_count == tokens.size()) { + result[i] = true; + } + } - String get_name() const override { return name; } -}; + return Status::OK(); +} -class FunctionMatchElementLE : public FunctionMatchBase { -public: - static constexpr auto name = "match_element_le"; - static FunctionPtr create() { return std::make_shared<FunctionMatchPhrase>(); } +Status FunctionMatchPhrase::execute_match(const std::string& column_name, + const std::string& match_query_str, + size_t input_rows_count, + const ColumnString* query_values, + InvertedIndexCtx* inverted_index_ctx, + ColumnUInt8::Container& result) { + doris::InvertedIndexParserType parser_type = doris::InvertedIndexParserType::PARSER_UNKNOWN; + if (inverted_index_ctx) { + parser_type = get_inverted_index_parser_type_from_string(inverted_index_ctx->_parser_type); + } + LOG(INFO) << "begin to run FunctionMatchPhrase::execute_match, parser_type: " + << inverted_index_parser_type_to_string(parser_type); + std::vector<std::string> tokens = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, match_query_str, doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, parser_type); + + for (int i = 0; i < input_rows_count; i++) { + const auto& str_ref = query_values->get_data_at(i); + std::vector<std::string> values = + doris::segment_v2::InvertedIndexReader::get_analyse_result( + column_name, str_ref.to_string(), doris::segment_v2::InvertedIndexQueryType::MATCH_PHRASE_QUERY, parser_type); + // TODO: more efficient impl + std::vector<int> token_positions; + for (auto& token : tokens) { Review Comment: updated -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org