[ https://issues.apache.org/jira/browse/HIVE-21574?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Panda Song updated HIVE-21574: ------------------------------ Description: when I use a table instead of the sub select,I get the right result,much more rows are joined together(metrics old_uv is bigger!!!) Is there some bugs here? Please help me ,thanks a lot!!!!!! {code:java} select a.event_date, count(distinct a.device_id) as uv, count(distinct case when b.device_id is not null then b.device_id end) as old_uv, count(distinct a.device_id) - count(distinct case when b.device_id is not null then b.device_id end) as new_uv from ( select event_date, device_id, qingting_id from datacenter.bl_page_chain_day where event_date = '2019-03-31' and (current_content like 'https://a.qingting.fm/membership5%' or current_content like 'https://m.qingting.fm/vips/members%' or current_content like 'https://sss.qingting.fm/vips/members/v2/%') )a left join (select b.device_id from lzq_test.first_buy_vip a inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id where a.first_buy < '2019-03-31' group by b.device_id )b on a.device_id = b.device_id group by a.event_date; {code} plan: {code:java} Plan optimized by CBO. | Vertex dependency in root stage Map 1 <- Map 3 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 Reducer 6 File Output Operator [FS_26] Select Operator [SEL_25] (rows=35527639 width=349) Output:["_col0","_col1","_col2","_col3"] Group By Operator [GBY_24] (rows=35527639 width=349) Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 <-Reducer 5 [SIMPLE_EDGE] SHUFFLE [RS_23] PartitionCols:_col0 Group By Operator [GBY_22] (rows=71055278 width=349) Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 Select Operator [SEL_20] (rows=71055278 width=349) Output:["_col1","_col2"] Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right Outer),Output:["_col0","_col1"] <-Reducer 2 [ONE_TO_ONE_EDGE] FORWARD [RS_17] PartitionCols:_col0 Group By Operator [GBY_12] (rows=21738609 width=235) Output:["_col0"],keys:KEY._col0 <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_11] PartitionCols:_col0 Group By Operator [GBY_10] (rows=43477219 width=235) Output:["_col0"],keys:_col0 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] <-Map 3 [BROADCAST_EDGE] BROADCAST [RS_7] PartitionCols:_col0 Select Operator [SEL_5] (rows=301013 width=228) Output:["_col0"] Filter Operator [FIL_32] (rows=301013 width=228) predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) TableScan [TS_3] (rows=1062401 width=228) lzq_test@first_buy_vip,a, transactional table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] <-Select Operator [SEL_2] (rows=39524744 width=235) Output:["_col0","_col1"] Filter Operator [FIL_31] (rows=39524744 width=235) predicate:qingting_id is not null TableScan [TS_0] (rows=39524744 width=235) datacenter@device_qingting,b, ACID table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] <-Map 4 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_18] PartitionCols:_col0 Select Operator [SEL_16] (rows=64595706 width=349) Output:["_col0"] Filter Operator [FIL_33] (rows=64595706 width=349) predicate:((current_content like 'https://a.qingting.fm/membership5%') or (current_content like 'https://m.qingting.fm/vips/members%') or (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) TableScan [TS_14] (rows=64595706 width=349) datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] {code} was: when I use a table instead of the sub select,I get the right result,much more rows are joined together(metrics old_uv is bigger!!!) Is there some bugs here? Please help me ,thanks a lot!!!!!! select {{a.event_date,}} {{count(distinct a.device_id) as uv,}} {{count(distinct case when b.device_id is not null then b.device_id end) as old_uv,}} {{count(distinct a.device_id) - count(distinct case when b.device_id is not null then b.device_id end) as new_uv}} {{from}} {{(}} {{select}} {{event_date,}} {{device_id,}} {{qingting_id}} {{from datacenter.bl_page_chain_day}} {{where event_date = '2019-03-31'}} {{and (current_content like 'https://a.qingting.fm/membership5%'}} {{or current_content like 'https://m.qingting.fm/vips/members%'}} {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}} {{)a}} {{left join}} {{(select}} b.device_id {{from}} {{lzq_test.first_buy_vip a}} {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}} {{where a.first_buy < '2019-03-31'}} {{group by b.device_id}} {{)b}} {{on a.device_id = b.device_id}} {{group by a.event_date;}} plan: {quote}Plan optimized by CBO. | Vertex dependency in root stage Map 1 <- Map 3 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 Reducer 6 File Output Operator [FS_26] Select Operator [SEL_25] (rows=35527639 width=349) Output:["_col0","_col1","_col2","_col3"] Group By Operator [GBY_24] (rows=35527639 width=349) Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 <-Reducer 5 [SIMPLE_EDGE] SHUFFLE [RS_23] PartitionCols:_col0 Group By Operator [GBY_22] (rows=71055278 width=349) Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 Select Operator [SEL_20] (rows=71055278 width=349) Output:["_col1","_col2"] Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right Outer),Output:["_col0","_col1"] <-Reducer 2 [ONE_TO_ONE_EDGE] FORWARD [RS_17] PartitionCols:_col0 Group By Operator [GBY_12] (rows=21738609 width=235) Output:["_col0"],keys:KEY._col0 <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_11] PartitionCols:_col0 Group By Operator [GBY_10] (rows=43477219 width=235) Output:["_col0"],keys:_col0 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] <-Map 3 [BROADCAST_EDGE] BROADCAST [RS_7] PartitionCols:_col0 Select Operator [SEL_5] (rows=301013 width=228) Output:["_col0"] Filter Operator [FIL_32] (rows=301013 width=228) predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) TableScan [TS_3] (rows=1062401 width=228) lzq_test@first_buy_vip,a, transactional table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] <-Select Operator [SEL_2] (rows=39524744 width=235) Output:["_col0","_col1"] Filter Operator [FIL_31] (rows=39524744 width=235) predicate:qingting_id is not null TableScan [TS_0] (rows=39524744 width=235) datacenter@device_qingting,b, ACID table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] <-Map 4 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_18] PartitionCols:_col0 Select Operator [SEL_16] (rows=64595706 width=349) Output:["_col0"] Filter Operator [FIL_33] (rows=64595706 width=349) predicate:((current_content like 'https://a.qingting.fm/membership5%') or (current_content like 'https://m.qingting.fm/vips/members%') or (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) TableScan [TS_14] (rows=64595706 width=349) datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] {quote} > return wrong result when excuting left join sql > ----------------------------------------------- > > Key: HIVE-21574 > URL: https://issues.apache.org/jira/browse/HIVE-21574 > Project: Hive > Issue Type: Bug > Affects Versions: 3.1.0 > Environment: hive 3.1.0 hdfs 3.1.1 > Reporter: Panda Song > Priority: Blocker > > when I use a table instead of the sub select,I get the right result,much more > rows are joined together(metrics old_uv is bigger!!!) > Is there some bugs here? > Please help me ,thanks a lot!!!!!! > {code:java} > select > a.event_date, > count(distinct a.device_id) as uv, > count(distinct case when b.device_id is not null then b.device_id end) as > old_uv, > count(distinct a.device_id) - count(distinct case when b.device_id is not > null then b.device_id end) as new_uv > from > ( > select > event_date, > device_id, > qingting_id > from datacenter.bl_page_chain_day > where event_date = '2019-03-31' > and (current_content like 'https://a.qingting.fm/membership5%' > or current_content like 'https://m.qingting.fm/vips/members%' > or current_content like 'https://sss.qingting.fm/vips/members/v2/%') > )a > left join > (select > b.device_id > from > lzq_test.first_buy_vip a > inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id > where a.first_buy < '2019-03-31' > group by b.device_id > )b > on a.device_id = b.device_id > group by a.event_date; > {code} > plan: > {code:java} > Plan optimized by CBO. | > > Vertex dependency in root stage > Map 1 <- Map 3 (BROADCAST_EDGE) > Reducer 2 <- Map 1 (SIMPLE_EDGE) > Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) > Reducer 6 <- Reducer 5 (SIMPLE_EDGE) > > Stage-0 > Fetch Operator > limit:-1 > Stage-1 > Reducer 6 > File Output Operator [FS_26] > Select Operator [SEL_25] (rows=35527639 width=349) > Output:["_col0","_col1","_col2","_col3"] > Group By Operator [GBY_24] (rows=35527639 width=349) > Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT > KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 > <-Reducer 5 [SIMPLE_EDGE] > SHUFFLE [RS_23] > PartitionCols:_col0 > Group By Operator [GBY_22] (rows=71055278 width=349) > > Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT > _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 > Select Operator [SEL_20] (rows=71055278 width=349) > Output:["_col1","_col2"] > Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) > > Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right > Outer),Output:["_col0","_col1"] > <-Reducer 2 [ONE_TO_ONE_EDGE] > FORWARD [RS_17] > PartitionCols:_col0 > Group By Operator [GBY_12] (rows=21738609 width=235) > Output:["_col0"],keys:KEY._col0 > <-Map 1 [SIMPLE_EDGE] > SHUFFLE [RS_11] > PartitionCols:_col0 > Group By Operator [GBY_10] (rows=43477219 > width=235) > Output:["_col0"],keys:_col0 > Map Join Operator [MAPJOIN_44] (rows=43477219 > width=235) > > Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] > <-Map 3 [BROADCAST_EDGE] > BROADCAST [RS_7] > PartitionCols:_col0 > Select Operator [SEL_5] (rows=301013 > width=228) > Output:["_col0"] > Filter Operator [FIL_32] (rows=301013 > width=228) > predicate:((first_buy < > DATE'2019-03-31') and qingting_id is not null) > TableScan [TS_3] (rows=1062401 > width=228) > lzq_test@first_buy_vip,a, > transactional table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] > <-Select Operator [SEL_2] (rows=39524744 > width=235) > Output:["_col0","_col1"] > Filter Operator [FIL_31] (rows=39524744 > width=235) > predicate:qingting_id is not null > TableScan [TS_0] (rows=39524744 width=235) > datacenter@device_qingting,b, ACID > table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] > <-Map 4 [CUSTOM_SIMPLE_EDGE] > PARTITION_ONLY_SHUFFLE [RS_18] > PartitionCols:_col0 > Select Operator [SEL_16] (rows=64595706 width=349) > Output:["_col0"] > Filter Operator [FIL_33] (rows=64595706 width=349) > predicate:((current_content like > 'https://a.qingting.fm/membership5%') or (current_content like > 'https://m.qingting.fm/vips/members%') or (current_content like > 'https://sss.qingting.fm/vips/members/v2/%')) > TableScan [TS_14] (rows=64595706 width=349) > > datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] > > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005)