[ https://issues.apache.org/jira/browse/HIVE-26737?focusedWorklogId=829968&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-829968 ]
ASF GitHub Bot logged work on HIVE-26737: ----------------------------------------- Author: ASF GitHub Bot Created on: 30/Nov/22 07:54 Start Date: 30/Nov/22 07:54 Worklog Time Spent: 10m Work Description: kasakrisz commented on code in PR #3761: URL: https://github.com/apache/hive/pull/3761#discussion_r1035633744 ########## ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveSubQueryRemoveRule.java: ########## @@ -104,44 +106,43 @@ private HiveSubQueryRemoveRule(RelOptRuleOperand operand, String description, Hi // if subquery is in FILTER if (relNode instanceof HiveFilter) { final HiveFilter filter = call.rel(0); - final RexSubQuery e = RexUtil.SubQueryFinder.find(filter.getCondition()); - assert e != null; - - final RelOptUtil.Logic logic = - LogicVisitor.find(RelOptUtil.Logic.TRUE, ImmutableList.of(filter.getCondition()), e); + // Since there is a RexSubQuery, there should be a HiveCorrelationInfo + // in the RelNode + Preconditions.checkState(filter.getCorrelationInfos().size() > 0); + HiveCorrelationInfo correlationInfo = filter.getCorrelationInfos().get(0); + final RelOptUtil.Logic logic = LogicVisitor.find(RelOptUtil.Logic.TRUE, + ImmutableList.of(filter.getCondition()), correlationInfo.rexSubQuery); builder.push(filter.getInput()); final int fieldCount = builder.peek().getRowType().getFieldCount(); - SubqueryConf subqueryConfig = filter.getCluster().getPlanner().getContext().unwrap(SubqueryConf.class); - boolean isCorrScalarQuery = subqueryConfig.getCorrScalarRexSQWithAgg().contains(e.rel); + boolean isCorrScalarQuery = correlationInfo.isCorrScalarQuery(); final RexNode target = - apply(call.getMetadataQuery(), e, HiveFilter.getVariablesSet(e), logic, builder, 1, - fieldCount, isCorrScalarQuery); - final RexShuttle shuttle = new ReplaceSubQueryShuttle(e, target); + apply(call.getMetadataQuery(), correlationInfo.rexSubQuery, + correlationInfo.correlationIds, logic, builder, 1, fieldCount, isCorrScalarQuery); + final RexShuttle shuttle = new ReplaceSubQueryShuttle(correlationInfo.rexSubQuery, target); builder.filter(shuttle.apply(filter.getCondition())); builder.project(fields(builder, filter.getRowType().getFieldCount())); RelNode newRel = builder.build(); call.transformTo(newRel); } else if (relNode instanceof HiveProject) { - // if subquery is in PROJECT final HiveProject project = call.rel(0); - final RexSubQuery e = RexUtil.SubQueryFinder.find(project.getProjects()); - assert e != null; + // Since there is a RexSubQuery, there should be a HiveCorrelationInfo + // in the RelNode + Preconditions.checkState(project.getCorrelationInfos().size() > 0); + HiveCorrelationInfo correlationInfo = project.getCorrelationInfos().get(0); Review Comment: I managed to have more than one `CorrelationInfo`: ``` explain cbo select year in (select year from t_test2 where t_test2.id = t_test1.id), month in (select month from t_test2 where t_test2.id = t_test1.id) from t_test1; ``` but here we always get the 1st element of the list and `project.getCorrelationInfos()` is nowhere else is used or I'm missing something. Could you please check why do we have to collect all `CorrelationInfo` objects Issue Time Tracking ------------------- Worklog Id: (was: 829968) Time Spent: 2h 20m (was: 2h 10m) > Subquery returning wrong results when database has materialized views > --------------------------------------------------------------------- > > Key: HIVE-26737 > URL: https://issues.apache.org/jira/browse/HIVE-26737 > Project: Hive > Issue Type: Bug > Components: HiveServer2 > Reporter: Steve Carlin > Assignee: Steve Carlin > Priority: Major > Labels: pull-request-available > Time Spent: 2h 20m > Remaining Estimate: 0h > > When HS2 has materialized views in its registry, subqueries with correlated > variables may return wrong results. > An example of this: > > {code:java} > CREATE TABLE t_test1( > id int, > int_col int, > year int, > month int > ); > CREATE TABLE t_test2( > id int, > int_col int, > year int, > month int > ); > CREATE TABLE dummy ( > id int > ) stored as orc TBLPROPERTIES ('transactional'='true'); > CREATE MATERIALIZED VIEW need_a_mat_view_in_registry AS > SELECT * FROM dummy where id > 5; > INSERT INTO t_test1 VALUES (1, 1, 2009, 1), (10,0, 2009, 1); > INSERT INTO t_test2 VALUES (1, 1, 2009, 1); > select id, int_col, year, month from t_test1 s where s.int_col = (select > count(*) from t_test2 t where s.id = t.id) order by id; > {code} > The select statement should produce 2 rows, but it is only producing one. > The CBO plan produced has an inner join instead of a left join. > {code:java} > HiveSortLimit(sort0=[$0], dir0=[ASC]) > HiveProject(id=[$0], int_col=[$1], year=[$2], month=[$3]) > HiveJoin(condition=[AND(=($0, $5), =($4, $6))], joinType=[inner], > algorithm=[none], cost=[not available]) > HiveProject(id=[$0], int_col=[$1], year=[$2], month=[$3], > CAST=[CAST($1):BIGINT]) > HiveFilter(condition=[AND(IS NOT NULL($0), IS NOT > NULL(CAST($1):BIGINT))]) > HiveTableScan(table=[[default, t_test1]], table:alias=[s]) > HiveProject(id=[$0], $f1=[$1]) > HiveFilter(condition=[IS NOT NULL($1)]) > HiveAggregate(group=[{0}], agg#0=[count()]) > HiveFilter(condition=[IS NOT NULL($0)]) > HiveTableScan(table=[[default, t_test2]], table:alias=[t]){code} > -- This message was sent by Atlassian Jira (v8.20.10#820010)