Jesus Camacho Rodriguez created HIVE-10686:
----------------------------------------------

             Summary: java.lang.IndexOutOfBoundsException for query with rank() 
over(partition ...) on CBO
                 Key: HIVE-10686
                 URL: https://issues.apache.org/jira/browse/HIVE-10686
             Project: Hive
          Issue Type: Bug
            Reporter: Jesus Camacho Rodriguez
            Assignee: Jesus Camacho Rodriguez


CBO throws Index out of bound exception for TPC-DS Q70.

Query 
{code}

explain
select
    sum(ss_net_profit) as total_sum
   ,s_state
   ,s_county
   ,grouping__id as lochierarchy
   , rank() over(partition by grouping__id, case when grouping__id == 2 then 
s_state end order by sum(ss_net_profit)) as rank_within_parent
from
    store_sales ss join date_dim d1 on d1.d_date_sk = ss.ss_sold_date_sk
    join store s on s.s_store_sk  = ss.ss_store_sk
 where
    d1.d_month_seq between 1193 and 1193+11
 and s.s_state in
             ( select s_state
               from  (select s_state as s_state, sum(ss_net_profit),
                             rank() over ( partition by s_state order by 
sum(ss_net_profit) desc) as ranking
                      from   store_sales, store, date_dim
                      where  d_month_seq between 1193 and 1193+11
                            and date_dim.d_date_sk = store_sales.ss_sold_date_sk
                            and store.s_store_sk  = store_sales.ss_store_sk
                      group by s_state
                     ) tmp1
               where ranking <= 5
             )
 group by s_state,s_county with rollup
order by
   lochierarchy desc
  ,case when lochierarchy = 0 then s_state end
  ,rank_within_parent
 limit 100
{code}

Original plan (correct)
{code}
 HiveSort(fetch=[100])
  HiveSort(sort0=[$3], sort1=[$5], sort2=[$4], dir0=[DESC], dir1=[ASC], 
dir2=[ASC])
    HiveProject(total_sum=[$4], s_state=[$0], s_county=[$1], lochierarchy=[$5], 
rank_within_parent=[rank() OVER (PARTITION BY $5, when(==($5, 2), $0) ORDER BY 
$4 ROWS BETWEEN 2147483647 FOLLOWING AND 2147483647 PRECEDING)], (tok_function 
when (= (tok_table_or_col lochierarchy) 0) (tok_table_or_col 
s_state))=[when(=($5, 0), $0)])
      HiveAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}, {}]], 
indicator=[true], agg#0=[sum($2)], GROUPING__ID=[GROUPING__ID()])
        HiveProject($f0=[$7], $f1=[$6], $f2=[$1])
          HiveJoin(condition=[=($5, $2)], joinType=[inner], algorithm=[none], 
cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
            HiveJoin(condition=[=($3, $0)], joinType=[inner], algorithm=[none], 
cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
              HiveProject(ss_sold_date_sk=[$0], ss_net_profit=[$21], 
ss_store_sk=[$22])
                HiveTableScan(table=[[tpcds.store_sales]])
              HiveProject(d_date_sk=[$0], d_month_seq=[$3])
                HiveFilter(condition=[between(false, $3, 1193, +(1193, 11))])
                  HiveTableScan(table=[[tpcds.date_dim]])
            HiveProject(s_store_sk=[$0], s_county=[$1], s_state=[$2])
              SemiJoin(condition=[=($2, $3)], joinType=[inner])
                HiveProject(s_store_sk=[$0], s_county=[$23], s_state=[$24])
                  HiveTableScan(table=[[tpcds.store]])
                HiveProject(s_state=[$0])
                  HiveFilter(condition=[<=($1, 5)])
                    HiveProject((tok_table_or_col s_state)=[$0], 
rank_window_0=[rank() OVER (PARTITION BY $0 ORDER BY $1 DESC ROWS BETWEEN 
2147483647 FOLLOWING AND 2147483647 PRECEDING)])
                      HiveAggregate(group=[{0}], agg#0=[sum($1)])
                        HiveProject($f0=[$6], $f1=[$1])
                          HiveJoin(condition=[=($5, $2)], joinType=[inner], 
algorithm=[none], cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
                            HiveJoin(condition=[=($3, $0)], joinType=[inner], 
algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
                              HiveProject(ss_sold_date_sk=[$0], 
ss_net_profit=[$21], ss_store_sk=[$22])
                                HiveTableScan(table=[[tpcds.store_sales]])
                              HiveProject(d_date_sk=[$0], d_month_seq=[$3])
                                HiveFilter(condition=[between(false, $3, 1193, 
+(1193, 11))])
                                  HiveTableScan(table=[[tpcds.date_dim]])
                            HiveProject(s_store_sk=[$0], s_state=[$24])
                              HiveTableScan(table=[[tpcds.store]])
{code}

Plan after fixTopOBSchema (incorrect)

{code}
 HiveSort(fetch=[100])
  HiveSort(sort0=[$3], sort1=[$5], sort2=[$4], dir0=[DESC], dir1=[ASC], 
dir2=[ASC])
    HiveProject(total_sum=[$4], s_state=[$0], s_county=[$1], lochierarchy=[$5], 
rank_within_parent=[rank() OVER (PARTITION BY $5, when(==($5, 2), $0) ORDER BY 
$4 ROWS BETWEEN 2147483647 FOLLOWING AND 2147483647 PRECEDING)])
      HiveAggregate(group=[{0, 1}], groups=[[{0, 1}, {0}, {}]], 
indicator=[true], agg#0=[sum($2)], GROUPING__ID=[GROUPING__ID()])
        HiveProject($f0=[$7], $f1=[$6], $f2=[$1])
          HiveJoin(condition=[=($5, $2)], joinType=[inner], algorithm=[none], 
cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
            HiveJoin(condition=[=($3, $0)], joinType=[inner], algorithm=[none], 
cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
              HiveProject(ss_sold_date_sk=[$0], ss_net_profit=[$21], 
ss_store_sk=[$22])
                HiveTableScan(table=[[tpcds.store_sales]])
              HiveProject(d_date_sk=[$0], d_month_seq=[$3])
                HiveFilter(condition=[between(false, $3, 1193, +(1193, 11))])
                  HiveTableScan(table=[[tpcds.date_dim]])
            HiveProject(s_store_sk=[$0], s_county=[$1], s_state=[$2])
              SemiJoin(condition=[=($2, $3)], joinType=[inner])
                HiveProject(s_store_sk=[$0], s_county=[$23], s_state=[$24])
                  HiveTableScan(table=[[tpcds.store]])
                HiveProject(s_state=[$0])
                  HiveFilter(condition=[<=($1, 5)])
                    HiveProject((tok_table_or_col s_state)=[$0], 
rank_window_0=[rank() OVER (PARTITION BY $0 ORDER BY $1 DESC ROWS BETWEEN 
2147483647 FOLLOWING AND 2147483647 PRECEDING)])
                      HiveAggregate(group=[{0}], agg#0=[sum($1)])
                        HiveProject($f0=[$6], $f1=[$1])
                          HiveJoin(condition=[=($5, $2)], joinType=[inner], 
algorithm=[none], cost=[{1177.2086187101072 rows, 0.0 cpu, 0.0 io}])
                            HiveJoin(condition=[=($3, $0)], joinType=[inner], 
algorithm=[none], cost=[{2880430.428726483 rows, 0.0 cpu, 0.0 io}])
                              HiveProject(ss_sold_date_sk=[$0], 
ss_net_profit=[$21], ss_store_sk=[$22])
                                HiveTableScan(table=[[tpcds.store_sales]])
                              HiveProject(d_date_sk=[$0], d_month_seq=[$3])
                                HiveFilter(condition=[between(false, $3, 1193, 
+(1193, 11))])
                                  HiveTableScan(table=[[tpcds.date_dim]])
                            HiveProject(s_store_sk=[$0], s_state=[$24])
                              HiveTableScan(table=[[tpcds.store]])
{code}

Exception 
{code}
15/04/14 02:42:52 [main]: ERROR parse.CalcitePlanner: CBO failed, skipping CBO.
java.lang.IndexOutOfBoundsException: Index: 5, Size: 5
        at java.util.ArrayList.rangeCheck(ArrayList.java:635)
        at java.util.ArrayList.get(ArrayList.java:411)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:395)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitInputRef(ASTConverter.java:372)
        at org.apache.calcite.rex.RexInputRef.accept(RexInputRef.java:112)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
        at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:543)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter$RexVisitor.visitCall(ASTConverter.java:372)
        at org.apache.calcite.rex.RexCall.accept(RexCall.java:107)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convertOBToASTNode(ASTConverter.java:252)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:208)
        at 
org.apache.hadoop.hive.ql.optimizer.calcite.translator.ASTConverter.convert(ASTConverter.java:98)
        at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.getOptimizedAST(CalcitePlanner.java:607)
        at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.genOPTree(CalcitePlanner.java:239)
        at 
org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.analyzeInternal(SemanticAnalyzer.java:10003)
        at 
org.apache.hadoop.hive.ql.parse.CalcitePlanner.analyzeInternal(CalcitePlanner.java:202)
        at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
        at 
org.apache.hadoop.hive.ql.parse.ExplainSemanticAnalyzer.analyzeInternal(ExplainSemanticAnalyzer.java:74)
        at 
org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.analyze(BaseSemanticAnalyzer.java:224)
        at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:424)
        at org.apache.hadoop.hive.ql.Driver.compile(Driver.java:308)
        at org.apache.hadoop.hive.ql.Driver.compileInternal(Driver.java:1122)
        at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:1170)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1059)
        at org.apache.hadoop.hive.ql.Driver.run(Driver.java:1049)
        at 
org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:213)
        at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:165)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:376)
        at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:311)
        at 
org.apache.hadoop.hive.cli.CliDriver.processReader(CliDriver.java:409)
        at org.apache.hadoop.hive.cli.CliDriver.processFile(CliDriver.java:425)
        at 
org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:714)
        at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
        at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
        at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
        at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
        at java.lang.reflect.Method.invoke(Method.java:606)
        at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
        at org.apache.hadoop.util.RunJar.main(RunJar.java:136)
{code}




--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to