[ 
https://issues.apache.org/jira/browse/HIVE-20456?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Aditya Shah updated HIVE-20456:
-------------------------------
    Description: 
When skew join is enabled and auto convert join is disabled, the query fails 
with a file-not-found exception. The following query reproduces the error:

 
{code:java}
-- Session settings for the reproduction: skew-join optimization enabled,
-- automatic join conversion disabled, and MapReduce as the execution engine.
-- The report states the query succeeds on tez and with other combinations of
-- the skew join / auto convert join settings.
-- NOTE(review): the report does not say whether hive.on.master or
-- hive.groupby.orderby.position.alias are needed to trigger the failure;
-- the query in this script groups by column names, not positions — confirm.
set hive.optimize.skewjoin = true;
set hive.auto.convert.join = false;
set hive.groupby.orderby.position.alias = true;
set hive.on.master=true;
set hive.execution.engine=mr;

-- Recreate the database named test from scratch (cascade also drops any
-- objects it contains) and make it the current database.
drop database if exists test cascade;
create database if not exists test;
use test;

-- External table with three int columns, partitioned by int column d, using
-- the ORC serde and ORC input/output formats.
-- Note: the reproducing query in this script does not reference this table.
CREATE EXTERNAL TABLE test_table1
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`d` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
;

-- External text-format table with three int columns, partitioned by int
-- column d. Uses LazySimpleSerDe with no explicit serde properties.
CREATE EXTERNAL TABLE test_table2
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`d` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

-- External text-format table with three int columns, partitioned by int
-- column e. Uses LazySimpleSerDe with the field delimiter and serialization
-- format both set explicitly to \u0001.
CREATE EXTERNAL TABLE test_table3
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`e` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\u0001',
'serialization.format'='\u0001')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

-- External text-format table with three int columns, partitioned by string
-- column e (test_table3 uses an int partition column of the same name).
-- Same serde, delimiter settings and input/output formats as test_table3.
CREATE EXTERNAL TABLE test_table4 (`a` int , `b` int, `c` int)
PARTITIONED BY (
`e` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\u0001',
'serialization.format'='\u0001')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';

-- Reproducing query. Shape of the CTEs:
--   temp1  : test_table2 inner-joined to test_table4 on a, then to
--            test_table3 on b; no filter and no grouping.
--   temp2  : rows of test_table4 where a = 2.
--   temp21 : temp2 inner-joined to test_table3 on b, then to test_table2 on c,
--            grouped by the three selected columns.
--   stack  : union all of temp1 and temp21.
-- The final statement selects every column of stack.
with
temp1 as (
select
g.a,
n.b,
u.c
from
test_table2 g
inner join test_table4 u on g.a = u.a
inner join test_table3 n on u.b = n.b
),
temp2 as (
select * from test_table4 where a = 2
),
temp21 as (
select
g.b,
n.c,
u.a
from
temp2 g
inner join test_table3 u on g.b = u.b
inner join test_table2 n on u.c = n.c
group by g.b, n.c, u.a
),
stack as (
select * from temp1
union all
select * from temp21
)
select * from stack;


{code}
The query runs fine when tez is used, or with any other combination of the 
skew join and auto convert join settings. Diagnosis showed that when a 
conditional task resolves its tasks, it puts each resolved task directly into 
the runnable state without checking the task's parent dependencies or whether 
the task is already queued.

  was:
When skew join is enabled and auto convert join is disabled the query fails 
with file not found exception. The following query reproduces the error:



set hive.optimize.skewjoin = true;
set hive.auto.convert.join = false;
set hive.groupby.orderby.position.alias = true;
set hive.on.master=true;
set hive.execution.engine=mr;

drop database if exists test cascade;
create database if not exists test;
use test;

CREATE EXTERNAL TABLE test_table1
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`d` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
;

CREATE EXTERNAL TABLE test_table2
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`d` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';


CREATE EXTERNAL TABLE test_table3
( `a` int , `b` int, `c` int)
PARTITIONED BY (
`e` int)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\u0001',
'serialization.format'='\u0001')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';


CREATE EXTERNAL TABLE test_table4 (`a` int , `b` int, `c` int)
PARTITIONED BY (
`e` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'='\u0001',
'serialization.format'='\u0001')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';


with
temp1 as (
select
g.a,
n.b,
u.c
from
test_table2 g
inner join test_table4 u on g.a = u.a
inner join test_table3 n on u.b = n.b
),
temp2 as (
select * from test_table4 where a = 2
),
temp21 as (
select
g.b,
n.c,
u.a
from
temp2 g
inner join test_table3 u on g.b = u.b
inner join test_table2 n on u.c = n.c
group by g.b, n.c, u.a
),
stack as (
select * from temp1
union all
select * from temp21
)
select * from stack;



The query runs perfectly fine when tez is used or other combinations of skew 
join and auto convert join are set. On diagnosing the issue, the problem was 
when a conditional task resolves tasks it puts the resolved task directly in 
the runnable state without checking the parental dependencies as well as 
whether the task is already queued.


> Query fails with FNFException using MR with skewjoin enabled and auto convert 
> join disabled
> -------------------------------------------------------------------------------------------
>
>                 Key: HIVE-20456
>                 URL: https://issues.apache.org/jira/browse/HIVE-20456
>             Project: Hive
>          Issue Type: Bug
>          Components: Hive
>    Affects Versions: 1.2.0, 2.1.1, 3.1.0
>            Reporter: Aditya Shah
>            Assignee: Aditya Shah
>            Priority: Major
>
> When skew join is enabled and auto convert join is disabled, the query fails 
> with a file-not-found exception. The following query reproduces the error:
>  
> {code:java}
> set hive.optimize.skewjoin = true;
> set hive.auto.convert.join = false;
> set hive.groupby.orderby.position.alias = true;
> set hive.on.master=true;
> set hive.execution.engine=mr;
> drop database if exists test cascade;
> create database if not exists test;
> use test;
> CREATE EXTERNAL TABLE test_table1
> ( `a` int , `b` int, `c` int)
> PARTITIONED BY (
> `d` int)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> ;
> CREATE EXTERNAL TABLE test_table2
> ( `a` int , `b` int, `c` int)
> PARTITIONED BY (
> `d` int)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
> CREATE EXTERNAL TABLE test_table3
> ( `a` int , `b` int, `c` int)
> PARTITIONED BY (
> `e` int)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> WITH SERDEPROPERTIES (
> 'field.delim'='\u0001',
> 'serialization.format'='\u0001')
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
> CREATE EXTERNAL TABLE test_table4 (`a` int , `b` int, `c` int)
> PARTITIONED BY (
> `e` string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> WITH SERDEPROPERTIES (
> 'field.delim'='\u0001',
> 'serialization.format'='\u0001')
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
> with
> temp1 as (
> select
> g.a,
> n.b,
> u.c
> from
> test_table2 g
> inner join test_table4 u on g.a = u.a
> inner join test_table3 n on u.b = n.b
> ),
> temp2 as (
> select * from test_table4 where a = 2
> ),
> temp21 as (
> select
> g.b,
> n.c,
> u.a
> from
> temp2 g
> inner join test_table3 u on g.b = u.b
> inner join test_table2 n on u.c = n.c
> group by g.b, n.c, u.a
> ),
> stack as (
> select * from temp1
> union all
> select * from temp21
> )
> select * from stack;
> {code}
> The query runs fine when tez is used, or with any other combination of the 
> skew join and auto convert join settings. Diagnosis showed that when a 
> conditional task resolves its tasks, it puts each resolved task directly into 
> the runnable state without checking the task's parent dependencies or whether 
> the task is already queued.



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to