This is an automated email from the ASF dual-hosted git repository. panxiaolei pushed a commit to branch new_join2 in repository https://gitbox.apache.org/repos/asf/doris.git
commit a7092a7755d20058c2e6bb8ffac588c3fa1d3eb7
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Tue Nov 21 12:04:02 2023 +0800

    fix outer join and other join conjuncts (#27319)

    * update some fix on join

    * save code

    ---------

    Co-authored-by: BiteTheDDDDt <pxl...@qq.com>
---
 be/src/vec/common/hash_table/hash_map.h | 26 +++++++++++++---------
 be/src/vec/exec/join/process_hash_table_probe.h | 1 +
 .../vec/exec/join/process_hash_table_probe_impl.h | 2 +-
 3 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/be/src/vec/common/hash_table/hash_map.h b/be/src/vec/common/hash_table/hash_map.h
index 2f81fc27978..80ff3481544 100644
--- a/be/src/vec/common/hash_table/hash_map.h
+++ b/be/src/vec/common/hash_table/hash_map.h
@@ -260,7 +260,7 @@ public:
     template <int JoinOpType, bool with_other_conjuncts, bool is_mark_join, bool need_judge_null>
     auto find_batch(const Key* __restrict keys, const uint32_t* __restrict bucket_nums,
                     int probe_idx, uint32_t build_idx, int probe_rows,
-                    uint32_t* __restrict probe_idxs, uint32_t* __restrict build_idxs,
+                    uint32_t* __restrict probe_idxs, bool& probe_visited, uint32_t* __restrict build_idxs,
                     doris::vectorized::ColumnFilterHelper* mark_column) {
         if constexpr (is_mark_join) {
             return _find_batch_mark<JoinOpType>(keys, bucket_nums, probe_idx, probe_rows,
@@ -277,7 +277,7 @@ public:
                       JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN ||
                       JoinOpType == doris::TJoinOp::RIGHT_OUTER_JOIN) {
             return _find_batch_inner_outer_join<JoinOpType>(keys, bucket_nums, probe_idx, build_idx,
-                                                            probe_rows, probe_idxs, build_idxs);
+                                                            probe_rows, probe_idxs, probe_visited, build_idxs);
         }
         if constexpr (JoinOpType == doris::TJoinOp::LEFT_ANTI_JOIN ||
                       JoinOpType == doris::TJoinOp::LEFT_SEMI_JOIN ||
@@ -431,7 +431,7 @@ private:
                         build_idxs[matched_cnt++] = build_idx;
                     }
                 } else {
-                    build_idxs[matched_cnt++] = build_idx;
+                    build_idxs[matched_cnt] = build_idx;
                     matched_cnt += keys[probe_idx] == build_keys[build_idx];
                 }
                 build_idx = next[build_idx];
@@ -443,6 +443,7 @@ private:
             if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN ||
                           JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) {
+                // may over batch_size when emplace 0 into build_idxs
                 if (!build_idx) {
                     probe_idxs[matched_cnt] = probe_idx;
                     build_idxs[matched_cnt] = 0;
@@ -462,9 +463,7 @@ private:
             do_the_probe();
         }
-        probe_idx -=
-                (matched_cnt >= batch_size &&
-                 build_idx); // FULL_OUTER_JOIN may over batch_size when emplace 0 into build_idxs
+        probe_idx -= (build_idx != 0);
         return std::tuple {probe_idx, build_idx, matched_cnt};
     }
@@ -473,6 +472,7 @@ private:
                                       const uint32_t* __restrict bucket_nums, int probe_idx,
                                       uint32_t build_idx, int probe_rows,
                                       uint32_t* __restrict probe_idxs,
+                                      bool& probe_visited,
                                       uint32_t* __restrict build_idxs) {
         auto matched_cnt = 0;
         const auto batch_size = max_batch_size;
@@ -496,10 +496,14 @@ private:
             if constexpr (JoinOpType == doris::TJoinOp::LEFT_OUTER_JOIN ||
                           JoinOpType == doris::TJoinOp::FULL_OUTER_JOIN) {
                 // `(!matched_cnt || probe_idxs[matched_cnt - 1] != probe_idx)` means not match one build side
-                if (!matched_cnt || probe_idxs[matched_cnt - 1] != probe_idx) {
-                    probe_idxs[matched_cnt] = probe_idx;
-                    build_idxs[matched_cnt] = 0;
-                    matched_cnt++;
+                probe_visited |= (matched_cnt && probe_idxs[matched_cnt - 1] == probe_idx);
+                if (!build_idx) {
+                    if (!probe_visited) {
+                        probe_idxs[matched_cnt] = probe_idx;
+                        build_idxs[matched_cnt] = 0;
+                        matched_cnt++;
+                    }
+                    probe_visited = false;
                 }
             }
             probe_idx++;
@@ -514,7 +518,7 @@ private:
             do_the_probe();
         }
-        probe_idx -= (matched_cnt == batch_size && build_idx);
+        probe_idx -= (build_idx != 0);
         return std::tuple {probe_idx, build_idx, matched_cnt};
     }

diff --git a/be/src/vec/exec/join/process_hash_table_probe.h b/be/src/vec/exec/join/process_hash_table_probe.h
index ed7d0c6443b..bfc628914b1 100644
--- a/be/src/vec/exec/join/process_hash_table_probe.h
+++ b/be/src/vec/exec/join/process_hash_table_probe.h
@@ -88,6 +88,7 @@ struct ProcessHashTableProbe {
     std::vector<StringRef> _probe_keys;
     std::vector<uint32_t> _probe_indexs;
+    bool _probe_visited = false;
     std::vector<uint32_t> _build_indexs;
     std::vector<int> _build_blocks_locs;
     // only need set the tuple is null in RIGHT_OUTER_JOIN and FULL_OUTER_JOIN

diff --git a/be/src/vec/exec/join/process_hash_table_probe_impl.h b/be/src/vec/exec/join/process_hash_table_probe_impl.h
index 8cb5bd8cb8f..4cd79510d4b 100644
--- a/be/src/vec/exec/join/process_hash_table_probe_impl.h
+++ b/be/src/vec/exec/join/process_hash_table_probe_impl.h
@@ -179,7 +179,7 @@ Status ProcessHashTableProbe<JoinOpType, Parent>::do_process(HashTableType& hash
                     with_other_conjuncts, is_mark_join,
                     need_null_map_for_probe &&
                             ignore_null > (hash_table_ctx.keys, hash_table_ctx.bucket_nums.data(),
-                                           probe_index, build_index, probe_rows, _probe_indexs.data(),
+                                           probe_index, build_index, probe_rows, _probe_indexs.data(), _probe_visited,
                                            _build_indexs.data(), mark_column.get());
             probe_index = new_probe_idx;
             build_index = new_build_idx;

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org