Mostafa Mokhtar created HIVE-10153:
--------------------------------------

             Summary: CBO (Calcite Return Path): TPC-DS Q15 in-efficient join 
order 
                 Key: HIVE-10153
                 URL: https://issues.apache.org/jira/browse/HIVE-10153
             Project: Hive
          Issue Type: Bug
          Components: CBO
    Affects Versions: cbo-branch
            Reporter: Mostafa Mokhtar
            Assignee: Laljo John Pullokkaran
             Fix For: cbo-branch


TPC-DS Q15 joins catalog_sales with date_dim last where it should be the first 
join.

Query 
{code}
select  ca_zip
       ,sum(cs_sales_price)
 from catalog_sales
     ,customer
     ,customer_address
     ,date_dim
 where catalog_sales.cs_bill_customer_sk = customer.c_customer_sk
  and customer.c_current_addr_sk = customer_address.ca_address_sk 
  and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475',
                                   '85392', '85460', '80348', '81792')
       or customer_address.ca_state in ('CA','WA','GA')
       or catalog_sales.cs_sales_price > 500)
  and catalog_sales.cs_sold_date_sk = date_dim.d_date_sk
  and date_dim.d_qoy = 2 and date_dim.d_year = 2000
 group by ca_zip
 order by ca_zip
 limit 100;
{code}

Logical plan 
{code}

HiveSort(fetch=[100]): rowcount = 7171.0, cumulative cost = 
{7.507729983730065E8 rows, 7.553113550983669E8 cpu, 9.08546638062188E10 io}, id 
= 2207
  HiveSort(sort0=[$0], dir0=[ASC]): rowcount = 7171.0, cumulative cost = 
{7.502636967200102E8 rows, 7.553041840983669E8 cpu, 9.08546638062188E10 io}, id 
= 2205
    HiveAggregate(group=[{0}], agg#0=[sum($1)]): rowcount = 7171.0, cumulative 
cost = {7.497543950670139E8 rows, 7.552970130983669E8 cpu, 9.08546638062188E10 
io}, id = 2203
      HiveProject($f0=[$7], $f1=[$1]): rowcount = 272862.9537571146, cumulative 
cost = {7.494815321132567E8 rows, 7.518816625578996E8 cpu, 8.75951724E10 io}, 
id = 2201
        HiveJoin(condition=[=($2, $8)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{1.36661031991844E8 rows, 1.3666116243648687E8 
cpu, 0.0 io}]): rowcount = 272862.9537571146, cumulative cost = 
{7.494815321132567E8 rows, 7.518816625578996E8 cpu, 8.75951724E10 io}, id = 2242
          HiveFilter(condition=[OR(in(substr($7, 1, 5), '85669', '86197', 
'88274', '83405', '86475', '85392', '85460', '80348', '81792'), in($6, 'CA', 
'WA', 'GA'), >($1, 5E2))]): rowcount = 1.3666090154720113E8, cumulative cost = 
{6.128205001214128E8 rows, 6.152205001214128E8 cpu, 8.75951724E10 io}, id = 2195
            HiveJoin(condition=[=($4, $5)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{3.246707731214128E8 rows, 3.254707731214128E8 
cpu, 4.91951724E10 io}]): rowcount = 3.6605287632468826E8, cumulative cost = 
{6.128205001214128E8 rows, 6.152205001214128E8 cpu, 8.75951724E10 io}, id = 2238
              HiveJoin(condition=[=($0, $3)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{2.88149727E8 rows, 2.89749727E8 cpu, 3.84E10 
io}]): rowcount = 3.238707731214128E8, cumulative cost = {2.88149727E8 rows, 
2.89749727E8 cpu, 3.84E10 io}, id  = 2222
                
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.catalog_sales]]): 
rowcount = 2.86549727E8, cumulative cost = {0}, id = 2134
                
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer]]): rowcount = 
1600000.0, cumulative cost = {0}, id = 2135
              
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer_address]]): 
rowcount = 800000.0, cumulative cost = {0}, id = 2137
          HiveFilter(condition=[AND(=($2, 2), =($1, 2000))]): rowcount = 
130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 2197
            HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.date_dim]]): 
rowcount = 73049.0, cumulative cost = {0}, id = 2140
{code}

— Re-write 
{code}
with cs as 
 ( select cs_sales_price,cs_bill_customer_sk
 from catalog_sales
     ,date_dim
where      
  cs_sold_date_sk = d_date_sk
  and date_dim.d_qoy = 2 and d_year = 2000)
  select  ca_zip
       ,sum(cs_sales_price)
 from cs
     ,customer
     ,customer_address
 where cs.cs_bill_customer_sk = customer.c_customer_sk
  and customer.c_current_addr_sk = customer_address.ca_address_sk 
  and ( substr(ca_zip,1,5) in ('85669', '86197','88274','83405','86475',
                                   '85392', '85460', '80348', '81792')
       or customer_address.ca_state in ('CA','WA','GA')
       or cs.cs_sales_price > 500)
 group by ca_zip
 order by ca_zip
 limit 100
 {code}

— plan for re-write 
{code}
HiveSort(fetch=[100]): rowcount = 7171.0, cumulative cost = 
{2.9146011517152977E8 rows, 2.949706092384584E8 cpu, 3.261369809075945E9 io}, 
id = 1990
  HiveSort(sort0=[$0], dir0=[ASC]): rowcount = 7171.0, cumulative cost = 
{2.909508135185335E8 rows, 2.949634382384584E8 cpu, 3.261369809075945E9 io}, id 
= 1988
    HiveAggregate(group=[{0}], agg#0=[sum($1)]): rowcount = 7171.0, cumulative 
cost = {2.904415118655373E8 rows, 2.949562672384584E8 cpu, 3.261369809075945E9 
io}, id = 1986
      HiveProject($f0=[$6], $f1=[$0]): rowcount = 272862.9537571146, cumulative 
cost = {2.901686489117802E8 rows, 2.915409166979911E8 cpu, 1878402.8571428573 
io}, id = 1984
        HiveFilter(condition=[OR(in(substr($6, 1, 5), '85669', '86197', 
'88274', '83405', '86475', '85392', '85460', '80348', '81792'), in($5, 'CA', 
'WA', 'GA'), >($0, 5E2))]): rowcount = 272862.9537571146, cumulative cost = 
{2.901686489117802E8 rows, 2.915409166979911E8 cpu, 1878402.8571428573 io}, id 
= 1982
          HiveProject(cs_sales_price=[$5], cs_bill_customer_sk=[$6], 
c_customer_sk=[$3], c_current_addr_sk=[$4], ca_address_sk=[$0], ca_state=[$1], 
ca_zip=[$2]): rowcount = 730876.7023664336, cumulative cost = 
{2.901686489117802E8 rows, 2.915409166979911E8 cpu, 1878402.8571428573 io}, id 
= 2030
            HiveJoin(condition=[=($4, $0)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{1446654.1255692376 rows, 2246654.1255692374 
cpu, 0.0 io}]): rowcount = 730876.7023664336, cumulative cost = 
{2.901686489117802E8 rows, 2.915409166979911E8 cpu, 1878402.8571428573 io}, id 
= 2028
              
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer_address]]): 
rowcount = 800000.0, cumulative cost = {0}, id = 1917
              HiveJoin(condition=[=($3, $0)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{2172137.341568095 rows, 2744274.6831361903 
cpu, 0.0 io}]): rowcount = 646654.1255692376, cumulative cost = 
{2.8872199478621095E8 rows, 2.8929426257242185E8 cpu, 1878402.8571428573 io}, 
id = 2012
                
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.customer]]): rowcount = 
1600000.0, cumulative cost = {0}, id = 1915
                HiveProject(cs_sales_price=[$1], cs_bill_customer_sk=[$0]): 
rowcount = 572137.341568095, cumulative cost = {2.8654985744464284E8 rows, 
2.865499878892857E8 cpu, 1878402.8571428573 io}, id = 1976
                  HiveJoin(condition=[=($2, $3)], joinType=[inner], 
joinAlgorithm=[map_join], cost=[{2.8654985744464284E8 rows, 2.865499878892857E8 
cpu, 1878402.8571428573 io}]): rowcount = 572137.341568095, cumulative cost = 
{2.8654985744464284E8 rows, 2.865499878892857E8 cpu, 1878402.8571428573 io}, id 
= 2005
                    
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.catalog_sales]]): 
rowcount = 2.86549727E8, cumulative cost = {0}, id = 1910
                    HiveFilter(condition=[AND(=($2, 2), =($1, 2000))]): 
rowcount = 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 1972
                      
HiveTableScan(table=[[tpcds_bin_partitioned_orc_200_1.date_dim]]): rowcount = 
73049.0, cumulative cost = {0}, id = 1911
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to