[ https://issues.apache.org/jira/browse/HIVE-27303?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Mahesh Raju Somalaraju updated HIVE-27303: ------------------------------------------ Description: select query result is different when enable/disable mapjoin with UNION ALL Below are the reproduce steps. As per query when map.join is disabled it should not give rows(duplicate). Same is working fine with map.join=true. Expected result: Empty rows. Problem: returning duplicate rows. Steps: ---------- SET hive.server2.tez.queue.access.check=true; SET tez.queue.name=default SET hive.query.results.cache.enabled=false; SET hive.fetch.task.conversion=none; SET hive.execution.engine=tez; SET hive.stats.autogather=true; SET hive.server2.enable.doAs=false; SET hive.auto.convert.join=false; drop table if exists hive1_tbl_data; drop table if exists hive2_tbl_data; drop table if exists hive3_tbl_data; drop table if exists hive4_tbl_data; CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); insert into table hive1_tbl_data select '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','4000-10000'; insert into table hive1_tbl_data select '00002','john','doe','j...@hotmail.com','2014-01-01 12:01:02','4000-10000';insert into table hive2_tbl_data select '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; insert into table hive2_tbl_data select '00002','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; select t.COLUMID from ( select distinct t.COLUMID as COLUMID from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM hive1_tbl_data) t ) t left join ( select distinct t.COLUMID from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM hive2_tbl_data) t ) t1 on t.COLUMID = t1.COLUMID where t1.COLUMID is null; was: select query result is different when enable/disable mapjoin with UNION ALL Below are the reproduce steps. As per query when map.join is disabled it should not give rows(duplicate). Same is working fine with map.join=true. Expected result: Empty rows. Problem: returning duplicate rows. Steps: ---------- SET hive.server2.tez.queue.access.check=true; SET tez.queue.name=default SET hive.query.results.cache.enabled=false; SET hive.fetch.task.conversion=none; SET hive.execution.engine=tez; SET hive.stats.autogather=true; SET hive.server2.enable.doAs=false; SET hive.auto.convert.join=true; drop table if exists hive1_tbl_data; drop table if exists hive2_tbl_data; drop table if exists hive3_tbl_data; drop table if exists hive4_tbl_data; CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM string) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' TBLPROPERTIES ( 'TRANSLATED_TO_EXTERNAL'='true', 'bucketing_version'='2', 'external.table.purge'='true', 'parquet.compression'='SNAPPY'); insert into table hive1_tbl_data select '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','4000-10000'; insert into table hive1_tbl_data select '00002','john','doe','j...@hotmail.com','2014-01-01 12:01:02','4000-10000';insert into table hive2_tbl_data select '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; insert into table hive2_tbl_data select '00002','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; select t.COLUMID from ( select distinct t.COLUMID as COLUMID from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM hive1_tbl_data) t ) t left join ( select distinct t.COLUMID from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM hive2_tbl_data) t ) t1 on t.COLUMID = t1.COLUMID where t1.COLUMID is null; > select query result is different when enable/disable mapjoin with UNION ALL > --------------------------------------------------------------------------- > > Key: HIVE-27303 > URL: https://issues.apache.org/jira/browse/HIVE-27303 > Project: Hive > Issue Type: Bug > Reporter: Mahesh Raju Somalaraju > Assignee: Mahesh Raju Somalaraju > Priority: Major > > select query result is different when enable/disable mapjoin with UNION ALL > Below are the reproduce steps. > As per query when map.join is disabled it should not give rows(duplicate). > Same is working fine with map.join=true. > Expected result: Empty rows. > Problem: returning duplicate rows. > Steps: > ---------- > SET hive.server2.tez.queue.access.check=true; > SET tez.queue.name=default > SET hive.query.results.cache.enabled=false; > SET hive.fetch.task.conversion=none; > SET hive.execution.engine=tez; > SET hive.stats.autogather=true; > SET hive.server2.enable.doAs=false; > SET hive.auto.convert.join=false; > drop table if exists hive1_tbl_data; > drop table if exists hive2_tbl_data; > drop table if exists hive3_tbl_data; > drop table if exists hive4_tbl_data; > CREATE EXTERNAL TABLE hive1_tbl_data (COLUMID string,COLUMN_FN > string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM > string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' > TBLPROPERTIES ( > 'TRANSLATED_TO_EXTERNAL'='true', > 'bucketing_version'='2', > 'external.table.purge'='true', > 'parquet.compression'='SNAPPY'); > CREATE EXTERNAL TABLE hive2_tbl_data (COLUMID string,COLUMN_FN > string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM > string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' > TBLPROPERTIES ( > 'TRANSLATED_TO_EXTERNAL'='true', > 'bucketing_version'='2', > 'external.table.purge'='true', > 'parquet.compression'='SNAPPY'); > CREATE EXTERNAL TABLE hive3_tbl_data (COLUMID string,COLUMN_FN > string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM > string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' > TBLPROPERTIES ( > 'TRANSLATED_TO_EXTERNAL'='true', > 'bucketing_version'='2', > 'external.table.purge'='true', > 'parquet.compression'='SNAPPY'); > CREATE EXTERNAL TABLE hive4_tbl_data (COLUMID string,COLUMN_FN > string,COLUMN_LN string,EMAIL string,COL_UPDATED_DATE timestamp, PK_COLUM > string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat' > TBLPROPERTIES ( > 'TRANSLATED_TO_EXTERNAL'='true', > 'bucketing_version'='2', > 'external.table.purge'='true', > 'parquet.compression'='SNAPPY'); > > insert into table hive1_tbl_data select > '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','4000-10000'; > insert into table hive1_tbl_data select > '00002','john','doe','j...@hotmail.com','2014-01-01 > 12:01:02','4000-10000';insert into table hive2_tbl_data select > '00001','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; > insert into table hive2_tbl_data select > '00002','john','doe','j...@hotmail.com','2014-01-01 12:01:02','00001'; > > select > t.COLUMID > from ( > select distinct > t.COLUMID as COLUMID > from (SELECT COLUMID FROM hive3_tbl_data UNION ALL SELECT COLUMID FROM > hive1_tbl_data) t > ) t > left join ( > select > distinct t.COLUMID > from (SELECT COLUMID FROM hive4_tbl_data UNION ALL SELECT COLUMID FROM > hive2_tbl_data) t > ) t1 on t.COLUMID = t1.COLUMID > where t1.COLUMID is null; > -- This message was sent by Atlassian Jira (v8.20.10#820010)