HappenLee commented on a change in pull request #7972:
URL: https://github.com/apache/incubator-doris/pull/7972#discussion_r801326895
##########
File path: be/src/vec/exec/join/vhash_join_node.cpp
##########
@@ -505,8 +604,8 @@ struct ProcessHashTableProbe {

 private:
     HashJoinNode* _join_node;
-    const DataTypes& _left_table_data_types;
-    const DataTypes& _right_table_data_types;
+    const int right_col_idx;

Review comment:
   Rename to `_right_col_idx`, matching the naming convention for private members.

##########
File path: be/src/vec/exec/join/vhash_join_node.cpp
##########
@@ -26,9 +26,18 @@
 #include "vec/exprs/vexpr_context.h"
 #include "vec/functions/simple_function_factory.h"
 #include "vec/utils/util.hpp"
-
+#include <variant>

Review comment:
   Move this include up to line 20 and add a blank line after it.

##########
File path: be/src/vec/exec/join/vhash_join_node.cpp
##########
@@ -178,36 +188,133 @@ struct ProcessHashTableProbe {
     std::vector<uint32_t> items_counts(_probe_rows);
     auto& mcol = mutable_block.mutable_columns();
-    int right_col_idx = _join_node->_is_right_semi_anti ? 0 : _left_table_data_types.size();
-    int right_col_len = _right_table_data_types.size();
     int current_offset = 0;
-    for (; _probe_index < _probe_rows;) {
-        // ignore null rows
-        if constexpr (ignore_null) {
-            if ((*null_map)[_probe_index]) {
-                items_counts[_probe_index++] = 0;
-                continue;
+    if constexpr (is_inner_join) {
+        for (; _probe_index < _probe_rows;) {
+            // ignore null rows
+            if constexpr (ignore_null) {
+                if ((*null_map)[_probe_index]) {
+                    items_counts[_probe_index++] = 0;
+                    continue;
+                }
+            }
+            int repeat_count = 0;
+            if (!(*null_map)[_probe_index]) {
+                auto find_result = key_getter.find_key(hash_table_ctx.hash_table, _probe_index, _arena);
+
+                if (find_result.is_found()) {
+                    auto& mapped = find_result.get_mapped();
+                    // TODO: Iterators are currently considered to be a heavy operation and have a certain impact on performance.
+                    // We should rethink whether to use this iterator mode in the future. Now just opt the one row case
+                    if (mapped.get_row_count() == 1) {
+                        mapped.visited = true;
+                        // right semi/anti join should dispose the data in hash table
+                        // after probe data eof
+                        ++repeat_count;
+                        for (size_t j = 0; j < right_col_len; ++j) {
+                            auto& column = *mapped.block->get_by_position(j).column;
+                            mcol[j + right_col_idx]->insert_from(column, mapped.row_num);
+                        }
+                    } else {
+                        if (_probe_index + 2 < _probe_rows)
+                            key_getter.prefetch(hash_table_ctx.hash_table, _probe_index + 2, _arena);
+                        for (auto it = mapped.begin(); it.ok(); ++it) {
+                            // right semi/anti join should dispose the data in hash table
+                            // after probe data eof
+                            ++repeat_count;
+                            for (size_t j = 0; j < right_col_len; ++j) {
+                                auto& column = *it->block->get_by_position(j).column;
+                                // TODO: interface insert from cause serious performance problems
+                                // when column is nullable. Try to make more effective way
+                                mcol[j + right_col_idx]->insert_from(column, it->row_num);
+                            }
+                            it->visited = true;
+                        }
+                    }
+                }
+            }
+
+            items_counts[_probe_index++] = repeat_count;
+            current_offset += repeat_count;
+
+            if (current_offset >= _batch_size) {
+                break;
             }
         }
+    }
+    else if constexpr (is_left_anti_join) {
+        for (; _probe_index < _probe_rows;) {
+            // check ignore null
+            if constexpr (ignore_null) {

Review comment:
   You need more abstraction here: each join type repeats the same loop skeleton, so factor the shared body into a macro or a lambda expression instead of copying it per join type.
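A minimal sketch of what the lambda version could look like, outside the Doris codebase (names such as `probe_loop` and `emit_rows` are illustrative, not the PR's identifiers): the join-type branches share the null-skipping, per-row counting, and batch cut-off logic and differ only in how a probed row is emitted, so the skeleton can be one template that takes the join-specific step as a callable.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

// Shared probe skeleton: walks the probe rows, optionally skips null keys,
// records how many output rows each probe row produced, and stops once a
// batch is full. The join-specific work lives in the `emit_rows` callable.
template <bool ignore_null, typename EmitRows>
int probe_loop(std::size_t& probe_index, std::size_t probe_rows,
               const std::vector<uint8_t>& null_map,
               std::vector<uint32_t>& items_counts, int batch_size,
               EmitRows&& emit_rows) {
    int current_offset = 0;
    while (probe_index < probe_rows) {
        if constexpr (ignore_null) {
            if (null_map[probe_index]) {
                items_counts[probe_index++] = 0;
                continue;
            }
        }
        // Join-specific part: returns how many rows this probe row emits.
        int repeat_count = emit_rows(probe_index);
        items_counts[probe_index++] = repeat_count;
        current_offset += repeat_count;
        if (current_offset >= batch_size) break;
    }
    return current_offset;
}

int main() {
    std::vector<uint8_t> null_map = {0, 1, 0, 0};
    std::vector<uint32_t> items_counts(null_map.size());
    std::size_t probe_index = 0;

    // "Inner join" flavor: pretend every non-null probe row matches one build row.
    int produced = probe_loop<true>(probe_index, null_map.size(), null_map,
                                    items_counts, /*batch_size=*/1024,
                                    [](std::size_t) { return 1; });
    std::printf("produced %d rows\n", produced); // produced 3 rows
}
```

Each `if constexpr (is_xxx_join)` branch would then collapse to a single `probe_loop` call with its own lambda; because the callable is a template parameter, the compiler should inline it as well as the hand-written loops, which is presumably why the reviewer offers it as an alternative to a macro.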
##########
File path: be/src/vec/exec/join/vhash_join_node.cpp
##########
@@ -178,36 +188,133 @@ struct ProcessHashTableProbe {
     std::vector<uint32_t> items_counts(_probe_rows);
     auto& mcol = mutable_block.mutable_columns();
-    int right_col_idx = _join_node->_is_right_semi_anti ? 0 : _left_table_data_types.size();
-    int right_col_len = _right_table_data_types.size();
     int current_offset = 0;
-    for (; _probe_index < _probe_rows;) {
-        // ignore null rows
-        if constexpr (ignore_null) {
-            if ((*null_map)[_probe_index]) {
-                items_counts[_probe_index++] = 0;
-                continue;
+    if constexpr (is_inner_join) {
+        for (; _probe_index < _probe_rows;) {
+            // ignore null rows
+            if constexpr (ignore_null) {
+                if ((*null_map)[_probe_index]) {
+                    items_counts[_probe_index++] = 0;
+                    continue;
+                }
+            }
+            int repeat_count = 0;
+            if (!(*null_map)[_probe_index]) {
+                auto find_result = key_getter.find_key(hash_table_ctx.hash_table, _probe_index, _arena);
+
+                if (find_result.is_found()) {
+                    auto& mapped = find_result.get_mapped();
+                    // TODO: Iterators are currently considered to be a heavy operation and have a certain impact on performance.
+                    // We should rethink whether to use this iterator mode in the future. Now just opt the one row case
+                    if (mapped.get_row_count() == 1) {
+                        mapped.visited = true;
+                        // right semi/anti join should dispose the data in hash table
+                        // after probe data eof
+                        ++repeat_count;
+                        for (size_t j = 0; j < right_col_len; ++j) {
+                            auto& column = *mapped.block->get_by_position(j).column;
+                            mcol[j + right_col_idx]->insert_from(column, mapped.row_num);
+                        }
+                    } else {
+                        if (_probe_index + 2 < _probe_rows)
+                            key_getter.prefetch(hash_table_ctx.hash_table, _probe_index + 2, _arena);
+                        for (auto it = mapped.begin(); it.ok(); ++it) {
+                            // right semi/anti join should dispose the data in hash table
+                            // after probe data eof
+                            ++repeat_count;
+                            for (size_t j = 0; j < right_col_len; ++j) {
+                                auto& column = *it->block->get_by_position(j).column;
+                                // TODO: interface insert from cause serious performance problems
+                                // when column is nullable. Try to make more effective way
+                                mcol[j + right_col_idx]->insert_from(column, it->row_num);
+                            }
+                            it->visited = true;
+                        }
+                    }
+                }
+            }
+
+            items_counts[_probe_index++] = repeat_count;
+            current_offset += repeat_count;
+
+            if (current_offset >= _batch_size) {
+                break;
             }
         }
+    }

Review comment:
   Put the `else` on the same line as the closing brace: `} else if constexpr (is_left_anti_join) {`.

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org