[ https://issues.apache.org/jira/browse/HIVE-23716?focusedWorklogId=461840&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-461840 ]
ASF GitHub Bot logged work on HIVE-23716: ----------------------------------------- Author: ASF GitHub Bot Created on: 22/Jul/20 03:19 Start Date: 22/Jul/20 03:19 Worklog Time Spent: 10m Work Description: jcamachor commented on a change in pull request #1147: URL: https://github.com/apache/hive/pull/1147#discussion_r457831207 ########## File path: parser/src/java/org/apache/hadoop/hive/ql/parse/FromClauseParser.g ########## @@ -145,6 +145,7 @@ joinToken | KW_RIGHT (KW_OUTER)? KW_JOIN -> TOK_RIGHTOUTERJOIN | KW_FULL (KW_OUTER)? KW_JOIN -> TOK_FULLOUTERJOIN | KW_LEFT KW_SEMI KW_JOIN -> TOK_LEFTSEMIJOIN + | KW_ANTI KW_JOIN -> TOK_ANTIJOIN Review comment: Since we are exposing this and to prevent any ambiguity, should we use: `KW_LEFT KW_ANTI KW_JOIN -> TOK_LEFTANTISEMIJOIN` ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java ########## @@ -509,11 +513,17 @@ protected void addToAliasFilterTags(byte alias, List<Object> object, boolean isN } } + private void createForwardJoinObjectForAntiJoin(boolean[] skip) throws HiveException { + boolean forward = fillFwdCache(skip); Review comment: nit. Fwd -> Forward ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptUtil.java ########## @@ -747,7 +747,7 @@ public static RewritablePKFKJoinInfo isRewritablePKFKJoin(Join join, final RelNode nonFkInput = leftInputPotentialFK ? join.getRight() : join.getLeft(); final RewritablePKFKJoinInfo nonRewritable = RewritablePKFKJoinInfo.of(false, null); - if (joinType != JoinRelType.INNER && !join.isSemiJoin()) { + if (joinType != JoinRelType.INNER && !join.isSemiJoin() && joinType != JoinRelType.ANTI) { Review comment: This is interesting. An antijoin of a PK-FK join returns no rows? Can we create a JIRA for such optimization based on integrity constraints? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java ########## @@ -100,7 +100,8 @@ public void onMatch(RelOptRuleCall call) { // These boolean values represent corresponding left, right input which is potential FK boolean leftInputPotentialFK = topRefs.intersects(leftBits); boolean rightInputPotentialFK = topRefs.intersects(rightBits); - if (leftInputPotentialFK && rightInputPotentialFK && (joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI)) { + if (leftInputPotentialFK && rightInputPotentialFK && + (joinType == JoinRelType.INNER || joinType == JoinRelType.SEMI || joinType == JoinRelType.ANTI)) { Review comment: This is not correct and needs further thinking. If we have a PK-FK join that is only appending columns to the FK side, it basically means it is not filtering anything (everything is matching). If that is the case, then ANTIJOIN result would be empty? We could detect this at planning time and trigger the rewriting. Could we bail out from the rule if it is an ANTIJOIN and create a follow-up JIRA to tackle this and introduce further tests? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { + protected static final Logger LOG = LoggerFactory.getLogger(HiveJoinWithFilterToAntiJoinRule.class); + public static final HiveJoinWithFilterToAntiJoinRule INSTANCE = new HiveJoinWithFilterToAntiJoinRule(); + + // HiveProject(fld=[$0]) + // HiveFilter(condition=[IS NULL($1)]) + // HiveJoin(condition=[=($0, $1)], joinType=[left], algorithm=[none], cost=[not available]) + // + // TO + // + // HiveProject(fld_tbl=[$0]) + // HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + // + public HiveJoinWithFilterToAntiJoinRule() { + super(operand(Project.class, operand(Filter.class, operand(Join.class, RelOptRule.any()))), + "HiveJoinWithFilterToAntiJoinRule:filter"); + } + + // is null filter over a left join. + public void onMatch(final RelOptRuleCall call) { + final Project project = call.rel(0); + final Filter filter = call.rel(1); + final Join join = call.rel(2); + perform(call, project, filter, join); + } + + protected void perform(RelOptRuleCall call, Project project, Filter filter, Join join) { + LOG.debug("Matched HiveAntiJoinRule"); + + if (join.getCondition().isAlwaysTrue()) { + return; + } + + //We support conversion from left outer join only. + if (join.getJoinType() != JoinRelType.LEFT) { + return; + } + + assert (filter != null); + + List<RexNode> aboveFilters = RelOptUtil.conjunctions(filter.getCondition()); + boolean hasIsNull = false; + + // Get all filter condition and check if any of them is a "is null" kind. + for (RexNode filterNode : aboveFilters) { + if (filterNode.getKind() == SqlKind.IS_NULL && + isFilterFromRightSide(join, filterNode, join.getJoinType())) { + hasIsNull = true; + break; + } + } + + // Is null should be on a key from right side of the join. + if (!hasIsNull) { + return; + } + + // Build anti join with same left, right child and condition as original left outer join. 
+ Join anti = join.copy(join.getTraitSet(), join.getCondition(), + join.getLeft(), join.getRight(), JoinRelType.ANTI, false); + + //TODO : Do we really need it + call.getPlanner().onCopy(join, anti); + + RelNode newProject = getNewProjectNode(project, anti); + if (newProject != null) { + call.getPlanner().onCopy(project, newProject); + call.transformTo(newProject); + } + } + + protected RelNode getNewProjectNode(Project oldProject, Join newJoin) { + List<RelDataTypeField> newJoinFiledList = newJoin.getRowType().getFieldList(); + List<RexNode> newProjectExpr = new ArrayList<>(); + for (RexNode field : oldProject.getProjects()) { + if (!(field instanceof RexInputRef)) { + return null; + } + int idx = ((RexInputRef)field).getIndex(); + if (idx > newJoinFiledList.size()) { + LOG.debug(" Project filed " + ((RexInputRef) field).getName() + + " is from right side of join. Can not convert to anti join."); + return null; + } + + final RexInputRef ref = newJoin.getCluster().getRexBuilder() + .makeInputRef(field.getType(), idx); + newProjectExpr.add(ref); + } + return oldProject.copy(oldProject.getTraitSet(), newJoin, newProjectExpr, oldProject.getRowType()); + } + + private boolean isFilterFromRightSide(RelNode joinRel, RexNode filter, JoinRelType joinType) { + List<RelDataTypeField> joinFields = joinRel.getRowType().getFieldList(); + int nTotalFields = joinFields.size(); + + List<RelDataTypeField> leftFields = (joinRel.getInputs().get(0)).getRowType().getFieldList(); + int nFieldsLeft = leftFields.size(); + List<RelDataTypeField> rightFields = (joinRel.getInputs().get(1)).getRowType().getFieldList(); + int nFieldsRight = rightFields.size(); + assert nTotalFields == (!joinType.projectsRight() ? nFieldsLeft : nFieldsLeft + nFieldsRight); Review comment: `projectsRight()` is always true for LEFT join, this condition can be simplified. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { + protected static final Logger LOG = LoggerFactory.getLogger(HiveJoinWithFilterToAntiJoinRule.class); + public static final HiveJoinWithFilterToAntiJoinRule INSTANCE = new HiveJoinWithFilterToAntiJoinRule(); + + // HiveProject(fld=[$0]) + // HiveFilter(condition=[IS NULL($1)]) + // HiveJoin(condition=[=($0, $1)], joinType=[left], algorithm=[none], cost=[not available]) + // + // TO + // + // HiveProject(fld_tbl=[$0]) + // HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + // + public HiveJoinWithFilterToAntiJoinRule() { + super(operand(Project.class, operand(Filter.class, operand(Join.class, RelOptRule.any()))), + "HiveJoinWithFilterToAntiJoinRule:filter"); + } + + // is null filter over a left join. + public void onMatch(final RelOptRuleCall call) { + final Project project = call.rel(0); + final Filter filter = call.rel(1); + final Join join = call.rel(2); + perform(call, project, filter, join); + } + + protected void perform(RelOptRuleCall call, Project project, Filter filter, Join join) { + LOG.debug("Matched HiveAntiJoinRule"); + + if (join.getCondition().isAlwaysTrue()) { + return; + } + + //We support conversion from left outer join only. + if (join.getJoinType() != JoinRelType.LEFT) { + return; + } + + assert (filter != null); + + List<RexNode> aboveFilters = RelOptUtil.conjunctions(filter.getCondition()); + boolean hasIsNull = false; + + // Get all filter condition and check if any of them is a "is null" kind. + for (RexNode filterNode : aboveFilters) { + if (filterNode.getKind() == SqlKind.IS_NULL && + isFilterFromRightSide(join, filterNode, join.getJoinType())) { + hasIsNull = true; + break; + } + } + + // Is null should be on a key from right side of the join. + if (!hasIsNull) { + return; + } + + // Build anti join with same left, right child and condition as original left outer join. + Join anti = join.copy(join.getTraitSet(), join.getCondition(), + join.getLeft(), join.getRight(), JoinRelType.ANTI, false); + + //TODO : Do we really need it + call.getPlanner().onCopy(join, anti); + + RelNode newProject = getNewProjectNode(project, anti); + if (newProject != null) { + call.getPlanner().onCopy(project, newProject); Review comment: This call is not necessary either. 
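Since the comment above flags `onCopy(project, newProject)` as unnecessary as well, here is a minimal sketch of the tail of `perform` with both `onCopy` bookkeeping calls dropped (illustrative only; the names are the ones already used in the patch):

```java
// Build the anti join with the same inputs and condition as the original left outer join,
// then hand the rewritten subtree to the planner; no onCopy() bookkeeping is required.
Join anti = join.copy(join.getTraitSet(), join.getCondition(),
    join.getLeft(), join.getRight(), JoinRelType.ANTI, false);

RelNode newProject = getNewProjectNode(project, anti);
if (newProject != null) {
  call.transformTo(newProject);
}
```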
########## File path: ql/src/test/results/clientpositive/perf/tez/constraints/cbo_query94_anti_join.q.out ########## @@ -0,0 +1,94 @@ +PREHOOK: query: explain cbo +select + count(distinct ws_order_number) as `order count` + ,sum(ws_ext_ship_cost) as `total shipping cost` + ,sum(ws_net_profit) as `total net profit` +from + web_sales ws1 + ,date_dim + ,customer_address + ,web_site +where + d_date between '1999-5-01' and + (cast('1999-5-01' as date) + 60 days) +and ws1.ws_ship_date_sk = d_date_sk +and ws1.ws_ship_addr_sk = ca_address_sk +and ca_state = 'TX' +and ws1.ws_web_site_sk = web_site_sk +and web_company_name = 'pri' +and exists (select * + from web_sales ws2 + where ws1.ws_order_number = ws2.ws_order_number + and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) +and not exists(select * + from web_returns wr1 + where ws1.ws_order_number = wr1.wr_order_number) +order by count(distinct ws_order_number) +limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@customer_address +PREHOOK: Input: default@date_dim +PREHOOK: Input: default@web_returns +PREHOOK: Input: default@web_sales +PREHOOK: Input: default@web_site +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain cbo +select + count(distinct ws_order_number) as `order count` + ,sum(ws_ext_ship_cost) as `total shipping cost` + ,sum(ws_net_profit) as `total net profit` +from + web_sales ws1 + ,date_dim + ,customer_address + ,web_site +where + d_date between '1999-5-01' and + (cast('1999-5-01' as date) + 60 days) +and ws1.ws_ship_date_sk = d_date_sk +and ws1.ws_ship_addr_sk = ca_address_sk +and ca_state = 'TX' +and ws1.ws_web_site_sk = web_site_sk +and web_company_name = 'pri' +and exists (select * + from web_sales ws2 + where ws1.ws_order_number = ws2.ws_order_number + and ws1.ws_warehouse_sk <> ws2.ws_warehouse_sk) +and not exists(select * + from web_returns wr1 + where ws1.ws_order_number = wr1.wr_order_number) +order by count(distinct ws_order_number) +limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer_address +POSTHOOK: Input: default@date_dim +POSTHOOK: Input: default@web_returns +POSTHOOK: Input: default@web_sales +POSTHOOK: Input: default@web_site +POSTHOOK: Output: hdfs://### HDFS PATH ### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count(DISTINCT $4)], agg#1=[sum($5)], agg#2=[sum($6)]) + HiveJoin(condition=[=($4, $14)], joinType=[anti], algorithm=[none], cost=[not available]) Review comment: Same as q16. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/CommonJoinOperator.java ########## @@ -153,6 +153,8 @@ transient boolean hasLeftSemiJoin = false; + transient boolean hasAntiJoin = false; Review comment: `hasAntiJoin` -> `hasLeftAntiSemiJoin` Accordingly in other places. This fits well with other logic where you use `isSemiJoin` boolean... Now it is less ambiguous that it applies to both. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAntiJoin.java ########## @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelTraitSet; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexNode; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry; + +import java.util.ArrayList; +import java.util.List; + +public class HiveAntiJoin extends Join implements HiveRelNode { Review comment: I wonder whether we really need all these operator variants at this stage. In Calcite, it all seems to be based on a single Join class in newer releases. Can we create a follow-up JIRA to explore whether we could merge HiveJoin, HiveSemiJoin, and HiveAntiSemiJoin? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/exec/vector/mapjoin/VectorMapJoinAntiJoinGenerateResultOperator.java ########## @@ -0,0 +1,218 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec.vector.mapjoin; + +import org.apache.hadoop.hive.ql.CompilationOpContext; +import org.apache.hadoop.hive.ql.exec.JoinUtil; +import org.apache.hadoop.hive.ql.exec.vector.VectorizationContext; +import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch; +import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSet; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashSetResult; +import org.apache.hadoop.hive.ql.exec.vector.mapjoin.hashtable.VectorMapJoinHashTableResult; +import org.apache.hadoop.hive.ql.metadata.HiveException; +import org.apache.hadoop.hive.ql.plan.OperatorDesc; +import org.apache.hadoop.hive.ql.plan.VectorDesc; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; + +// TODO : This class is duplicate of semi join. Need to do a refactoring to merge it with semi join. 
Review comment: Can we create a JIRA for this and link it to HIVE-23716? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/plan/VectorMapJoinDesc.java ########## @@ -89,7 +89,8 @@ public PrimitiveTypeInfo getPrimitiveTypeInfo() { INNER_BIG_ONLY, LEFT_SEMI, OUTER, - FULL_OUTER + FULL_OUTER, + ANTI Review comment: LEFT_ANTI_SEMI ? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelFactories.java ########## @@ -188,6 +193,20 @@ public RelNode createSemiJoin(RelNode left, RelNode right, } } + /** + * Implementation of {@link AntiJoinFactory} that returns + * {@link org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAntiJoin} + * . + */ + private static class HiveAntiJoinFactoryImpl implements SemiJoinFactory { Review comment: Why is `HiveAntiJoinFactoryImpl` extending `SemiJoinFactory`? I think it is not used... Can we remove it? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java ########## @@ -74,7 +78,14 @@ public HiveJoinAddNotNullRule(Class<? extends Join> clazz, @Override public void onMatch(RelOptRuleCall call) { Join join = call.rel(0); - if (join.getJoinType() == JoinRelType.FULL || join.getCondition().isAlwaysTrue()) { + + // For anti join case add the not null on right side if the condition is + // always true. This is done because during execution, anti join expect the right side to + // be empty and if we dont put null check on right, for null only right side table and condition + // always true, execution will produce 0 records. + // eg select * from left_tbl where (select 1 from all_null_right limit 1) is null + if (join.getJoinType() == JoinRelType.FULL || + (join.getJoinType() != JoinRelType.ANTI && join.getCondition().isAlwaysTrue())) { Review comment: This condition does not seem correct (I think the example you provided in the comment is different case). - When `join.getCondition().isAlwaysTrue()`, we always bail out because we cannot introduce is not null condition on any key column (how can you know on which keys you would be filtering?). - For full outer join, we bail out because even non-matching rows from any of the inputs should still produce output rows. - For left anti join, as you did below, we introduce is not null filter on the right side. This only happens if condition is not always true. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelOptMaterializationValidator.java ########## @@ -253,6 +256,14 @@ private RelNode visit(HiveSemiJoin semiJoin) { return visitChildren(semiJoin); } + // Note: Not currently part of the HiveRelNode interface + private RelNode visit(HiveAntiJoin antiJoin) { Review comment: `Not currently part of the HiveRelNode interface`? What does that mean? HiveAntiJoin is implementing HiveRelNode. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveAntiJoin.java ########## @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; +import org.apache.calcite.plan.RelOptCluster; +import org.apache.calcite.plan.RelTraitSet; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexNode; +import org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelOptUtil; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry; + +import java.util.ArrayList; +import java.util.List; + +public class HiveAntiJoin extends Join implements HiveRelNode { + + private final RexNode joinFilter; Review comment: Can we add a comment explaining what `joinFilter` holds? Is this an aux data structure? Will condition in Join hold the full condition? I am asking because it is important that digest contains the full condition so Calcite does not think that two operators are equivalent when they are not. ########## File path: ql/src/test/results/clientpositive/perf/tez/cbo_query16_anti_join.q.out ########## @@ -0,0 +1,99 @@ +PREHOOK: query: explain cbo +select + count(distinct cs_order_number) as `order count` + ,sum(cs_ext_ship_cost) as `total shipping cost` + ,sum(cs_net_profit) as `total net profit` +from + catalog_sales cs1 + ,date_dim + ,customer_address + ,call_center +where + d_date between '2001-4-01' and + (cast('2001-4-01' as date) + 60 days) +and cs1.cs_ship_date_sk = d_date_sk +and cs1.cs_ship_addr_sk = ca_address_sk +and ca_state = 'NY' +and cs1.cs_call_center_sk = cc_call_center_sk +and cc_county in ('Ziebach County','Levy County','Huron County','Franklin Parish', + 'Daviess County' +) +and exists (select * + from catalog_sales cs2 + where cs1.cs_order_number = cs2.cs_order_number + and cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) +and not exists(select * + from catalog_returns cr1 + where cs1.cs_order_number = cr1.cr_order_number) +order by count(distinct cs_order_number) +limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@call_center +PREHOOK: Input: default@catalog_returns +PREHOOK: Input: default@catalog_sales +PREHOOK: Input: default@customer_address +PREHOOK: Input: default@date_dim +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain cbo +select + count(distinct cs_order_number) as `order count` + ,sum(cs_ext_ship_cost) as `total shipping cost` + ,sum(cs_net_profit) as `total net profit` +from + catalog_sales cs1 + ,date_dim + ,customer_address + ,call_center +where + d_date between '2001-4-01' and + (cast('2001-4-01' as date) + 60 days) +and cs1.cs_ship_date_sk = d_date_sk +and cs1.cs_ship_addr_sk = ca_address_sk +and ca_state = 'NY' +and cs1.cs_call_center_sk = cc_call_center_sk +and cc_county in ('Ziebach County','Levy County','Huron County','Franklin Parish', + 'Daviess County' +) +and exists (select * + from catalog_sales cs2 + where cs1.cs_order_number = cs2.cs_order_number + and 
cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) +and not exists(select * + from catalog_returns cr1 + where cs1.cs_order_number = cr1.cr_order_number) +order by count(distinct cs_order_number) +limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@call_center +POSTHOOK: Input: default@catalog_returns +POSTHOOK: Input: default@catalog_sales +POSTHOOK: Input: default@customer_address +POSTHOOK: Input: default@date_dim +POSTHOOK: Output: hdfs://### HDFS PATH ### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count(DISTINCT $4)], agg#1=[sum($5)], agg#2=[sum($6)]) + HiveJoin(condition=[=($4, $14)], joinType=[anti], algorithm=[none], cost=[not available]) + HiveSemiJoin(condition=[AND(<>($3, $13), =($4, $14))], joinType=[semi]) Review comment: Confused by something here, though it may be OK. It seems we have a `HiveSemiJoin` operator that is used when `joinType` is `SEMI`. You also created a `HiveAntiJoin` operator but it is not used for `joinType` is `ANTI` (at least in this plan). What is the reason for that? Are we creating the operator correctly? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveSubQRemoveRelBuilder.java ########## @@ -1112,7 +1112,7 @@ public RexNode field(RexNode e, String name) { } public HiveSubQRemoveRelBuilder join(JoinRelType joinType, RexNode condition, - Set<CorrelationId> variablesSet, boolean createSemiJoin) { + Set<CorrelationId> variablesSet, JoinRelType semiJoinType) { Review comment: Should we add precondition for `semiJoinType` that is either SEMI or ANTI? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinAddNotNullRule.java ########## @@ -56,6 +57,9 @@ public static final HiveJoinAddNotNullRule INSTANCE_SEMIJOIN = new HiveJoinAddNotNullRule(HiveSemiJoin.class, HiveRelFactories.HIVE_FILTER_FACTORY); + public static final HiveJoinAddNotNullRule INSTANCE_ANTIJOIN = + new HiveJoinAddNotNullRule(HiveAntiJoin.class, HiveRelFactories.HIVE_FILTER_FACTORY); Review comment: nit. spacing ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinProjectTransposeRule.java ########## @@ -133,6 +135,10 @@ private HiveJoinProjectTransposeRuleBase( public void onMatch(RelOptRuleCall call) { //TODO: this can be removed once CALCITE-3824 is released + Join joinRel = call.rel(0); + if (joinRel.getJoinType() == JoinRelType.ANTI) { Review comment: Why is this rule skipped if it is an ANTI join? It seems this rule could be straightforward. Should we create a follow-up JIRA? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java ########## @@ -1901,6 +1905,11 @@ public RelNode apply(RelOptCluster cluster, RelOptSchema relOptSchema, SchemaPlu calcitePreCboPlan = applyPreJoinOrderingTransforms(calciteGenPlan, mdProvider.getMetadataProvider(), executorProvider); + if (conf.getBoolVar(ConfVars.HIVE_CONVERT_ANTI_JOIN)) { Review comment: Can we make this block part of the `applyPostJoinOrderingTransform`, in particular before/after steps 3 and 4 in that method (those steps relate to semijoin conversion)? 
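For reference, a minimal, illustrative sketch of that conversion step expressed as its own Hep phase, gated on the flag from the patch. Only `HIVE_CONVERT_ANTI_JOIN`, `calcitePreCboPlan` and the rule's `INSTANCE` come from the PR; the planner wiring below is an assumption about how the block could be written, whether it stays standalone or moves into `applyPostJoinOrderingTransform` as suggested:

```java
// Run the left-join + IS NULL filter -> anti-join rewrite as one HepPlanner pass,
// only when the new configuration flag is enabled (sketch, not the exact patch code).
if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_ANTI_JOIN)) {
  HepProgramBuilder program = new HepProgramBuilder();
  program.addMatchOrder(HepMatchOrder.BOTTOM_UP);
  program.addRuleInstance(HiveJoinWithFilterToAntiJoinRule.INSTANCE);

  HepPlanner antiJoinPlanner = new HepPlanner(program.build());
  antiJoinPlanner.setRoot(calcitePreCboPlan);
  calcitePreCboPlan = antiJoinPlanner.findBestExp();
}
```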
########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java ########## @@ -414,6 +416,13 @@ private RexNode rewriteInExists(RexSubQuery e, Set<CorrelationId> variablesSet, // null keys we do not need to generate count(*), count(c) if (e.getKind() == SqlKind.EXISTS) { logic = RelOptUtil.Logic.TRUE_FALSE; + if (conf.getBoolVar(HiveConf.ConfVars.HIVE_CONVERT_ANTI_JOIN)) { + //TODO : As of now anti join is first converted to left outer join Review comment: This does not seem to be the case anymore? Should we remove? Otherwise, please create a follow-up JIRA. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { + protected static final Logger LOG = LoggerFactory.getLogger(HiveJoinWithFilterToAntiJoinRule.class); + public static final HiveJoinWithFilterToAntiJoinRule INSTANCE = new HiveJoinWithFilterToAntiJoinRule(); + + // HiveProject(fld=[$0]) + // HiveFilter(condition=[IS NULL($1)]) + // HiveJoin(condition=[=($0, $1)], joinType=[left], algorithm=[none], cost=[not available]) + // + // TO + // + // HiveProject(fld_tbl=[$0]) + // HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + // + public HiveJoinWithFilterToAntiJoinRule() { + super(operand(Project.class, operand(Filter.class, operand(Join.class, RelOptRule.any()))), + "HiveJoinWithFilterToAntiJoinRule:filter"); + } + + // is null filter over a left join. 
+ public void onMatch(final RelOptRuleCall call) { + final Project project = call.rel(0); + final Filter filter = call.rel(1); + final Join join = call.rel(2); + perform(call, project, filter, join); + } + + protected void perform(RelOptRuleCall call, Project project, Filter filter, Join join) { + LOG.debug("Matched HiveAntiJoinRule"); + + if (join.getCondition().isAlwaysTrue()) { + return; + } + + //We support conversion from left outer join only. + if (join.getJoinType() != JoinRelType.LEFT) { + return; + } + + assert (filter != null); + + List<RexNode> aboveFilters = RelOptUtil.conjunctions(filter.getCondition()); + boolean hasIsNull = false; + + // Get all filter condition and check if any of them is a "is null" kind. + for (RexNode filterNode : aboveFilters) { + if (filterNode.getKind() == SqlKind.IS_NULL && + isFilterFromRightSide(join, filterNode, join.getJoinType())) { + hasIsNull = true; + break; + } + } + + // Is null should be on a key from right side of the join. + if (!hasIsNull) { + return; + } + + // Build anti join with same left, right child and condition as original left outer join. + Join anti = join.copy(join.getTraitSet(), join.getCondition(), + join.getLeft(), join.getRight(), JoinRelType.ANTI, false); + + //TODO : Do we really need it + call.getPlanner().onCopy(join, anti); Review comment: No need for this line indeed. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java ########## @@ -2606,6 +2607,17 @@ private long computeFinalRowCount(List<Long> rowCountParents, long interimRowCou // max # of rows = rows from left side result = Math.min(rowCountParents.get(joinCond.getLeft()), result); break; + case JoinDesc.ANTI_JOIN: + long leftRowCount = rowCountParents.get(joinCond.getLeft()); + if (leftRowCount < result) { + // Ideally the inner join count should be less than the left row count. but if its not calculated + // properly then we can assume whole of left table will be selected. + result = leftRowCount; Review comment: Shouldn't we use `result=0`? If we estimated that the result of the inner join is going to be larger than the left input, it seems we should assume that nothing is going to come out of the anti join? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java ########## @@ -118,6 +119,15 @@ public Double getRowCount(HiveJoin join, RelMetadataQuery mq) { } public Double getRowCount(HiveSemiJoin rel, RelMetadataQuery mq) { + return getRowCountInt(rel, mq); + } + + public Double getRowCount(HiveAntiJoin rel, RelMetadataQuery mq) { + return getRowCountInt(rel, mq); + } + + private Double getRowCountInt(Join rel, RelMetadataQuery mq) { Review comment: Logic for antijoin would be slightly different? ``` return pkfk.fkInfo.rowCount * (1-selectivity); ``` You had similar change below. Does that make sense? In addition, does `super.getRowCount(rel, mq)` handle antijoin correctly? ########## File path: ql/src/test/results/clientpositive/perf/tez/cbo_query16_anti_join.q.out ########## @@ -0,0 +1,99 @@ +PREHOOK: query: explain cbo Review comment: No need to add this perf tests if enabled by default. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { + protected static final Logger LOG = LoggerFactory.getLogger(HiveJoinWithFilterToAntiJoinRule.class); + public static final HiveJoinWithFilterToAntiJoinRule INSTANCE = new HiveJoinWithFilterToAntiJoinRule(); + + // HiveProject(fld=[$0]) + // HiveFilter(condition=[IS NULL($1)]) + // HiveJoin(condition=[=($0, $1)], joinType=[left], algorithm=[none], cost=[not available]) + // + // TO + // + // HiveProject(fld_tbl=[$0]) + // HiveAntiJoin(condition=[=($0, $1)], joinType=[anti]) + // + public HiveJoinWithFilterToAntiJoinRule() { + super(operand(Project.class, operand(Filter.class, operand(Join.class, RelOptRule.any()))), + "HiveJoinWithFilterToAntiJoinRule:filter"); + } + + // is null filter over a left join. + public void onMatch(final RelOptRuleCall call) { + final Project project = call.rel(0); + final Filter filter = call.rel(1); + final Join join = call.rel(2); + perform(call, project, filter, join); + } + + protected void perform(RelOptRuleCall call, Project project, Filter filter, Join join) { + LOG.debug("Matched HiveAntiJoinRule"); + + if (join.getCondition().isAlwaysTrue()) { + return; + } + + //We support conversion from left outer join only. + if (join.getJoinType() != JoinRelType.LEFT) { + return; + } + + assert (filter != null); + + List<RexNode> aboveFilters = RelOptUtil.conjunctions(filter.getCondition()); + boolean hasIsNull = false; + + // Get all filter condition and check if any of them is a "is null" kind. + for (RexNode filterNode : aboveFilters) { + if (filterNode.getKind() == SqlKind.IS_NULL && + isFilterFromRightSide(join, filterNode, join.getJoinType())) { + hasIsNull = true; + break; + } + } + + // Is null should be on a key from right side of the join. + if (!hasIsNull) { + return; + } + + // Build anti join with same left, right child and condition as original left outer join. 
+ Join anti = join.copy(join.getTraitSet(), join.getCondition(), + join.getLeft(), join.getRight(), JoinRelType.ANTI, false); + + //TODO : Do we really need it + call.getPlanner().onCopy(join, anti); + + RelNode newProject = getNewProjectNode(project, anti); + if (newProject != null) { + call.getPlanner().onCopy(project, newProject); + call.transformTo(newProject); + } + } + + protected RelNode getNewProjectNode(Project oldProject, Join newJoin) { Review comment: We should simply use `project.copy` and remove this method (it seems we are also limiting it to `RexInputRef`... why is that done?). You can verify that all fields are coming from left input using `isFilterFromRightSide`; in fact, you can rename it to `isExprFromRightSide` and pass a list of `RexNode` as second parameter. If they do not, just bail out similarly to what you do with the filter. Please, check whether such method already exists (e.g., in `HiveCalciteUtil` or `RexUtil`) so we do not re-implement it if it does. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinWithFilterToAntiJoinRule.java ########## @@ -0,0 +1,149 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.plan.RelOptUtil; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Filter; +import org.apache.calcite.rel.core.Join; +import org.apache.calcite.rel.core.JoinRelType; +import org.apache.calcite.rel.core.Project; +import org.apache.calcite.rel.type.RelDataTypeField; +import org.apache.calcite.rex.RexInputRef; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.util.ImmutableBitSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.List; + +/** + * Planner rule that converts a join plus filter to anti join. + */ +public class HiveJoinWithFilterToAntiJoinRule extends RelOptRule { Review comment: Rename `HiveJoinWithFilterToAntiJoinRule` to `HiveAntiSemiJoinRule` to follow naming convention for other rules, e.g., `HiveSemiJoinRule`. 
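On the `project.copy` suggestion a few comments above, a rough sketch of the simplified shape inside `perform` (variable names follow the patch; using `RelOptUtil.InputFinder`/`ImmutableBitSet` for the left-side check is an assumption, and an existing helper in `HiveCalciteUtil` or `RexUtil` may already cover it):

```java
// Bail out if any projected expression references the right input; otherwise reuse
// Project.copy() instead of rebuilding the projection by hand.
ImmutableBitSet leftFields =
    ImmutableBitSet.range(0, join.getLeft().getRowType().getFieldCount());
ImmutableBitSet usedFields = ImmutableBitSet.of();
for (RexNode expr : project.getProjects()) {
  usedFields = usedFields.union(RelOptUtil.InputFinder.bits(expr));
}
if (!leftFields.contains(usedFields)) {
  return;   // a projected column comes from the right side; cannot convert to anti join
}

RelNode newProject = project.copy(project.getTraitSet(), anti,
    project.getProjects(), project.getRowType());
call.transformTo(newProject);
```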
########## File path: ql/src/test/results/clientpositive/perf/tez/cbo_query16_anti_join.q.out ########## @@ -0,0 +1,99 @@ +PREHOOK: query: explain cbo +select + count(distinct cs_order_number) as `order count` + ,sum(cs_ext_ship_cost) as `total shipping cost` + ,sum(cs_net_profit) as `total net profit` +from + catalog_sales cs1 + ,date_dim + ,customer_address + ,call_center +where + d_date between '2001-4-01' and + (cast('2001-4-01' as date) + 60 days) +and cs1.cs_ship_date_sk = d_date_sk +and cs1.cs_ship_addr_sk = ca_address_sk +and ca_state = 'NY' +and cs1.cs_call_center_sk = cc_call_center_sk +and cc_county in ('Ziebach County','Levy County','Huron County','Franklin Parish', + 'Daviess County' +) +and exists (select * + from catalog_sales cs2 + where cs1.cs_order_number = cs2.cs_order_number + and cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) +and not exists(select * + from catalog_returns cr1 + where cs1.cs_order_number = cr1.cr_order_number) +order by count(distinct cs_order_number) +limit 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@call_center +PREHOOK: Input: default@catalog_returns +PREHOOK: Input: default@catalog_sales +PREHOOK: Input: default@customer_address +PREHOOK: Input: default@date_dim +PREHOOK: Output: hdfs://### HDFS PATH ### +POSTHOOK: query: explain cbo +select + count(distinct cs_order_number) as `order count` + ,sum(cs_ext_ship_cost) as `total shipping cost` + ,sum(cs_net_profit) as `total net profit` +from + catalog_sales cs1 + ,date_dim + ,customer_address + ,call_center +where + d_date between '2001-4-01' and + (cast('2001-4-01' as date) + 60 days) +and cs1.cs_ship_date_sk = d_date_sk +and cs1.cs_ship_addr_sk = ca_address_sk +and ca_state = 'NY' +and cs1.cs_call_center_sk = cc_call_center_sk +and cc_county in ('Ziebach County','Levy County','Huron County','Franklin Parish', + 'Daviess County' +) +and exists (select * + from catalog_sales cs2 + where cs1.cs_order_number = cs2.cs_order_number + and cs1.cs_warehouse_sk <> cs2.cs_warehouse_sk) +and not exists(select * + from catalog_returns cr1 + where cs1.cs_order_number = cr1.cr_order_number) +order by count(distinct cs_order_number) +limit 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@call_center +POSTHOOK: Input: default@catalog_returns +POSTHOOK: Input: default@catalog_sales +POSTHOOK: Input: default@customer_address +POSTHOOK: Input: default@date_dim +POSTHOOK: Output: hdfs://### HDFS PATH ### +CBO PLAN: +HiveAggregate(group=[{}], agg#0=[count(DISTINCT $4)], agg#1=[sum($5)], agg#2=[sum($6)]) + HiveJoin(condition=[=($4, $14)], joinType=[anti], algorithm=[none], cost=[not available]) Review comment: Is this correct? Note that the condition in the SemiJoin below and the Antijoin have a common conjunct `=($4, $14)`. The semijoin implies that `=($4, $14)` is `true` for all rows that passed through it. That means that the condition in the antijoin always evaluates to `true`, which means that the query would not produce any output rows? If it is not correct, we need to fix it. Nevertheless, we should create a follow-up JIRA to implement a rule that rewrites this cases so we do not execute the subplan rooted at the antijoin. ########## File path: ql/src/java/org/apache/hadoop/hive/ql/ppd/SyntheticJoinPredicate.java ########## @@ -339,6 +339,12 @@ String getFuncText(String funcText, final int srcPos) { vector.add(right, left); break; case JoinDesc.LEFT_OUTER_JOIN: + case JoinDesc.ANTI_JOIN: + //TODO : In case of anti join, bloom filter can be created on left side also ("IN (keylist right table)"). 
+ // But the filter should be "not-in" ("NOT IN (keylist right table)") as we want to select the records from + // left side which are not present in the right side. But it may cause wrong result as + // bloom filter may have false positive and thus simply adding not is not correct, + // special handling is required for "NOT IN". Review comment: Could we create a follow-up JIRA to explore this? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRemoveGBYSemiJoinRule.java ########## @@ -41,17 +41,19 @@ public HiveRemoveGBYSemiJoinRule() { super( - operand(HiveSemiJoin.class, + operand(Join.class, some( operand(RelNode.class, any()), operand(Aggregate.class, any()))), HiveRelFactories.HIVE_BUILDER, "HiveRemoveGBYSemiJoinRule"); } @Override public void onMatch(RelOptRuleCall call) { - final HiveSemiJoin semijoin= call.rel(0); + final Join join= call.rel(0); Review comment: nit. space before = ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdSelectivity.java ########## @@ -142,7 +146,7 @@ private Double computeInnerJoinSelectivity(Join j, RelMetadataQuery mq, RexNode ndvEstimate = exponentialBackoff(peLst, colStatMap); } - if (j.isSemiJoin()) { + if (j.isSemiJoin() || (j instanceof HiveJoin && j.getJoinType().equals(JoinRelType.ANTI))) { Review comment: We can probably remove `j instanceof HiveJoin`? ########## File path: ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdDistinctRowCount.java ########## @@ -79,6 +80,11 @@ public Double getDistinctRowCount(HiveSemiJoin rel, RelMetadataQuery mq, Immutab return super.getDistinctRowCount(rel, mq, groupKey, predicate); } + public Double getDistinctRowCount(HiveAntiJoin rel, RelMetadataQuery mq, ImmutableBitSet groupKey, + RexNode predicate) { + return super.getDistinctRowCount(rel, mq, groupKey, predicate); Review comment: Did you verify whether `super` method handles ANTI join? Not a blocker but we may need to create a follow-up JIRA if it does not. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 461840) Time Spent: 5h 20m (was: 5h 10m) > Support Anti Join in Hive > -------------------------- > > Key: HIVE-23716 > URL: https://issues.apache.org/jira/browse/HIVE-23716 > Project: Hive > Issue Type: Bug > Reporter: mahesh kumar behera > Assignee: mahesh kumar behera > Priority: Major > Labels: pull-request-available > Attachments: HIVE-23716.01.patch > > Time Spent: 5h 20m > Remaining Estimate: 0h > > Currently hive does not support Anti join. The query for anti join is > converted to left outer join and null filter on right side join key is added > to get the desired result. This is causing > # Extra computation — The left outer join projects the redundant columns > from right side. Along with that, filtering is done to remove the redundant > rows. This is can be avoided in case of anti join as anti join will project > only the required columns and rows from the left side table. > # Extra shuffle — In case of anti join the duplicate records moved to join > node can be avoided from the child node. This can reduce significant amount > of data movement if the number of distinct rows( join keys) is significant. 
> # Extra Memory Usage - In case of a map-based anti join, a hash set is
> sufficient, as just the key is required to check whether a record matches the
> join condition. In case of a left join, we need the key and the non-key
> columns as well, and thus a hash table is required.
> For a query like
> {code:java}
> select wr_order_number FROM web_returns LEFT JOIN web_sales ON
> wr_order_number = ws_order_number WHERE ws_order_number IS NULL;{code}
> The number of distinct ws_order_number values in the web_sales table in a
> typical 10TB TPCDS setup is just 10% of the total records. So when we convert
> this query to anti join, instead of 7 billion rows, only 600 million rows are
> moved to the join node.
> In the current patch, just one conversion is done. The pattern of
> project->filter->left-join is converted to project->anti-join. This takes
> care of subqueries with a “not exists” clause. Queries with “not exists” are
> first converted to filter + left-join and then converted to anti join.
> Queries with “not in” are not handled in the current patch.
> On the execution side, both merge join and map join with vectorized execution
> are supported for anti join.

--
This message was sent by Atlassian Jira
(v8.3.4#803005)