[ https://issues.apache.org/jira/browse/HIVE-21574?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Panda Song updated HIVE-21574: ------------------------------ Description: when I use a table instead of the sub select,I get the right result,much more rows are joined together(metrics old_uv is bigger!!!) Is there some bugs here? Please help me ,thanks a lot!!!!!! {quote}select {{a.event_date,}} {{count(distinct a.device_id) as uv,}} {{count(distinct case when b.device_id is not null then b.device_id end) as old_uv,}} {{count(distinct a.device_id) - count(distinct case when b.device_id is not null then b.device_id end) as new_uv}} {{from}} {{(}} {{select}} {{event_date,}} {{device_id,}} {{qingting_id}} {{from datacenter.bl_page_chain_day}} {{where event_date = '2019-03-31'}} {{and (current_content like 'https://a.qingting.fm/membership5%'}} {{or current_content like 'https://m.qingting.fm/vips/members%'}} {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}} {{)a}} {{left join}} {{(select}} b.device_id {{from}} {{lzq_test.first_buy_vip a}} {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}} {{where a.first_buy < '2019-03-31'}} {{group by b.device_id}} {{)b}} {{on a.device_id = b.device_id}} {{group by a.event_date;}} {quote} plan: {quote}Plan optimized by CBO. | Vertex dependency in root stage Map 1 <- Map 3 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 Reducer 6 File Output Operator [FS_26] Select Operator [SEL_25] (rows=35527639 width=349) Output:["_col0","_col1","_col2","_col3"] Group By Operator [GBY_24] (rows=35527639 width=349) Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 <-Reducer 5 [SIMPLE_EDGE] SHUFFLE [RS_23] PartitionCols:_col0 Group By Operator [GBY_22] (rows=71055278 width=349) Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 Select Operator [SEL_20] (rows=71055278 width=349) Output:["_col1","_col2"] Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right Outer),Output:["_col0","_col1"] <-Reducer 2 [ONE_TO_ONE_EDGE] FORWARD [RS_17] PartitionCols:_col0 Group By Operator [GBY_12] (rows=21738609 width=235) Output:["_col0"],keys:KEY._col0 <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_11] PartitionCols:_col0 Group By Operator [GBY_10] (rows=43477219 width=235) Output:["_col0"],keys:_col0 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] <-Map 3 [BROADCAST_EDGE] BROADCAST [RS_7] PartitionCols:_col0 Select Operator [SEL_5] (rows=301013 width=228) Output:["_col0"] Filter Operator [FIL_32] (rows=301013 width=228) predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) TableScan [TS_3] (rows=1062401 width=228) lzq_test@first_buy_vip,a, transactional table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] <-Select Operator [SEL_2] (rows=39524744 width=235) Output:["_col0","_col1"] Filter Operator [FIL_31] (rows=39524744 width=235) predicate:qingting_id is not null TableScan [TS_0] (rows=39524744 width=235) datacenter@device_qingting,b, ACID table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] <-Map 4 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_18] PartitionCols:_col0 Select Operator [SEL_16] (rows=64595706 width=349) Output:["_col0"] Filter Operator [FIL_33] (rows=64595706 width=349) predicate:((current_content like 'https://a.qingting.fm/membership5%') or (current_content like 'https://m.qingting.fm/vips/members%') or (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) TableScan [TS_14] (rows=64595706 width=349) datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] {quote} was: when I use a table instead of the sub select,I get the right result,much more rows are joined together(metrics old_uv is bigger!!!) Is there some bugs here? Please help me ,thanks a lot!!!!!! {quote}{{select }} {{a.event_date,}} {{count(distinct a.device_id) as uv,}} {{count(distinct case when b.device_id is not null then b.device_id end) as old_uv,}} {{count(distinct a.device_id) - count(distinct case when b.device_id is not null then b.device_id end) as new_uv}} {{from}} {{(}} {{select}} {{event_date,}} {{device_id,}} {{qingting_id}} {{from datacenter.bl_page_chain_day}} {{where event_date = '2019-03-31'}} {{and (current_content like 'https://a.qingting.fm/membership5%'}} {{or current_content like 'https://m.qingting.fm/vips/members%'}} {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}} {{)a}} {{left join}} {{(select}} {{ b.device_id}} {{from}} {{lzq_test.first_buy_vip a}} {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}} {{where a.first_buy < '2019-03-31'}} {{group by b.device_id}} {{)b}} {{on a.device_id = b.device_id}} {{group by a.event_date;}}{quote} plan: {quote}Plan optimized by CBO. | Vertex dependency in root stage Map 1 <- Map 3 (BROADCAST_EDGE) Reducer 2 <- Map 1 (SIMPLE_EDGE) Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) Reducer 6 <- Reducer 5 (SIMPLE_EDGE) Stage-0 Fetch Operator limit:-1 Stage-1 Reducer 6 File Output Operator [FS_26] Select Operator [SEL_25] (rows=35527639 width=349) Output:["_col0","_col1","_col2","_col3"] Group By Operator [GBY_24] (rows=35527639 width=349) Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 <-Reducer 5 [SIMPLE_EDGE] SHUFFLE [RS_23] PartitionCols:_col0 Group By Operator [GBY_22] (rows=71055278 width=349) Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 Select Operator [SEL_20] (rows=71055278 width=349) Output:["_col1","_col2"] Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right Outer),Output:["_col0","_col1"] <-Reducer 2 [ONE_TO_ONE_EDGE] FORWARD [RS_17] PartitionCols:_col0 Group By Operator [GBY_12] (rows=21738609 width=235) Output:["_col0"],keys:KEY._col0 <-Map 1 [SIMPLE_EDGE] SHUFFLE [RS_11] PartitionCols:_col0 Group By Operator [GBY_10] (rows=43477219 width=235) Output:["_col0"],keys:_col0 Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] <-Map 3 [BROADCAST_EDGE] BROADCAST [RS_7] PartitionCols:_col0 Select Operator [SEL_5] (rows=301013 width=228) Output:["_col0"] Filter Operator [FIL_32] (rows=301013 width=228) predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) TableScan [TS_3] (rows=1062401 width=228) lzq_test@first_buy_vip,a, transactional table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] <-Select Operator [SEL_2] (rows=39524744 width=235) Output:["_col0","_col1"] Filter Operator [FIL_31] (rows=39524744 width=235) predicate:qingting_id is not null TableScan [TS_0] (rows=39524744 width=235) datacenter@device_qingting,b, ACID table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] <-Map 4 [CUSTOM_SIMPLE_EDGE] PARTITION_ONLY_SHUFFLE [RS_18] PartitionCols:_col0 Select Operator [SEL_16] (rows=64595706 width=349) Output:["_col0"] Filter Operator [FIL_33] (rows=64595706 width=349) predicate:((current_content like 'https://a.qingting.fm/membership5%') or (current_content like 'https://m.qingting.fm/vips/members%') or (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) TableScan [TS_14] (rows=64595706 width=349) datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] {quote} > return wrong result when excuting left join sql > ----------------------------------------------- > > Key: HIVE-21574 > URL: https://issues.apache.org/jira/browse/HIVE-21574 > Project: Hive > Issue Type: Bug > Affects Versions: 3.1.0 > Environment: hive 3.1.0 hdfs 3.1.1 > Reporter: Panda Song > Priority: Major > > when I use a table instead of the sub select,I get the right result,much more > rows are joined together(metrics old_uv is bigger!!!) > Is there some bugs here? > Please help me ,thanks a lot!!!!!! > {quote}select > {{a.event_date,}} > {{count(distinct a.device_id) as uv,}} > {{count(distinct case when b.device_id is not null then b.device_id end) as > old_uv,}} > {{count(distinct a.device_id) - count(distinct case when b.device_id is not > null then b.device_id end) as new_uv}} > {{from}} > {{(}} > {{select}} > {{event_date,}} > {{device_id,}} > {{qingting_id}} > {{from datacenter.bl_page_chain_day}} > {{where event_date = '2019-03-31'}} > {{and (current_content like 'https://a.qingting.fm/membership5%'}} > {{or current_content like 'https://m.qingting.fm/vips/members%'}} > {{or current_content like 'https://sss.qingting.fm/vips/members/v2/%')}} > {{)a}} > {{left join}} > {{(select}} > b.device_id > {{from}} > {{lzq_test.first_buy_vip a}} > {{inner join datacenter.device_qingting b on a.qingting_id = b.qingting_id}} > {{where a.first_buy < '2019-03-31'}} > {{group by b.device_id}} > {{)b}} > {{on a.device_id = b.device_id}} > {{group by a.event_date;}} > {quote} > plan: > {quote}Plan optimized by CBO. | > Vertex dependency in root stage > Map 1 <- Map 3 (BROADCAST_EDGE) > Reducer 2 <- Map 1 (SIMPLE_EDGE) > Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE), Reducer 2 (ONE_TO_ONE_EDGE) > Reducer 6 <- Reducer 5 (SIMPLE_EDGE) > Stage-0 > Fetch Operator > limit:-1 > Stage-1 > Reducer 6 > File Output Operator [FS_26] > Select Operator [SEL_25] (rows=35527639 width=349) > Output:["_col0","_col1","_col2","_col3"] > Group By Operator [GBY_24] (rows=35527639 width=349) > Output:["_col0","_col1","_col2"],aggregations:["count(DISTINCT > KEY._col1:0._col0)","count(DISTINCT KEY._col1:1._col0)"],keys:KEY._col0 > <-Reducer 5 [SIMPLE_EDGE] > SHUFFLE [RS_23] > PartitionCols:_col0 > Group By Operator [GBY_22] (rows=71055278 width=349) > > Output:["_col0","_col1","_col2","_col3","_col4"],aggregations:["count(DISTINCT > _col1)","count(DISTINCT _col2)"],keys:true, _col1, _col2 > Select Operator [SEL_20] (rows=71055278 width=349) > Output:["_col1","_col2"] > Map Join Operator [MAPJOIN_45] (rows=71055278 width=349) > Conds:RS_17.KEY.reducesinkkey0=RS_18.KEY.reducesinkkey0(Right > Outer),Output:["_col0","_col1"] > <-Reducer 2 [ONE_TO_ONE_EDGE] > FORWARD [RS_17] > PartitionCols:_col0 > Group By Operator [GBY_12] (rows=21738609 width=235) > Output:["_col0"],keys:KEY._col0 > <-Map 1 [SIMPLE_EDGE] > SHUFFLE [RS_11] > PartitionCols:_col0 > Group By Operator [GBY_10] (rows=43477219 width=235) > Output:["_col0"],keys:_col0 > Map Join Operator [MAPJOIN_44] (rows=43477219 width=235) > Conds:SEL_2._col1=RS_7._col0(Inner),Output:["_col0"] > <-Map 3 [BROADCAST_EDGE] > BROADCAST [RS_7] > PartitionCols:_col0 > Select Operator [SEL_5] (rows=301013 width=228) > Output:["_col0"] > Filter Operator [FIL_32] (rows=301013 width=228) > predicate:((first_buy < DATE'2019-03-31') and qingting_id is not null) > TableScan [TS_3] (rows=1062401 width=228) > lzq_test@first_buy_vip,a, transactional > table,Tbl:COMPLETE,Col:NONE,Output:["qingting_id","first_buy"] > <-Select Operator [SEL_2] (rows=39524744 width=235) > Output:["_col0","_col1"] > Filter Operator [FIL_31] (rows=39524744 width=235) > predicate:qingting_id is not null > TableScan [TS_0] (rows=39524744 width=235) > datacenter@device_qingting,b, ACID > table,Tbl:COMPLETE,Col:COMPLETE,Output:["device_id","qingting_id"] > <-Map 4 [CUSTOM_SIMPLE_EDGE] > PARTITION_ONLY_SHUFFLE [RS_18] > PartitionCols:_col0 > Select Operator [SEL_16] (rows=64595706 width=349) > Output:["_col0"] > Filter Operator [FIL_33] (rows=64595706 width=349) > predicate:((current_content like 'https://a.qingting.fm/membership5%') or > (current_content like 'https://m.qingting.fm/vips/members%') or > (current_content like 'https://sss.qingting.fm/vips/members/v2/%')) > TableScan [TS_14] (rows=64595706 width=349) > > datacenter@bl_page_chain_day,bl_page_chain_day,Tbl:COMPLETE,Col:NONE,Output:["device_id","current_content"] > {quote} -- This message was sent by Atlassian JIRA (v7.6.3#76005)