[ https://issues.apache.org/jira/browse/HIVE-25863?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
zengxl updated HIVE-25863: -------------------------- Attachment: (was: test_sds_2021_12_21_shuffle_1_new) > join result is null > ------------------- > > Key: HIVE-25863 > URL: https://issues.apache.org/jira/browse/HIVE-25863 > Project: Hive > Issue Type: Bug > Components: Hive > Affects Versions: 3.1.2 > Environment: hadoop 3.2.1 > hive 3.1.2 > Reporter: zengxl > Priority: Blocker > > When I change the number of Reduce, the query result will change.Either inner > join or left join will appear.Partial join results for the > {color:#de350b}third table{color}(pdwd.test_sds_2021_12_21_shuffle_1_new) are > {color:#de350b}null{color}.When there is only one Reduce, the results are all > correct. > when set hive.exec.reducers.bytes.per.reducer=256000 only one reduce; > when set hive.exec.reducers.bytes.per.reducer=2560 has four reduce > Here is my SQL and data > {code:java} > CREATE TABLE pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112( > tbl_id bigint COMMENT 'TBL_ID', > tbl_create_time bigint COMMENT 'TBL_CREATE_TIME', > db_id bigint COMMENT 'DB_ID', > tbl_last_access_time bigint COMMENT 'TBL_LAST_ACCESS_TIME', > owner string COMMENT 'OWNER', > retention bigint COMMENT 'RETENTION', > sd_id bigint COMMENT 'SD_ID', > tbl_name string COMMENT 'TBL_NAME', > tbl_type string COMMENT 'TBL_TYPE', > view_expanded_text string COMMENT 'VIEW_EXPANDED_TEXT', > view_original_text string COMMENT 'VIEW_ORIGINAL_TEXT', > is_rewrite_enabled bigint COMMENT 'IS_REWRITE_ENABLED', > tbl_owner_type string COMMENT 'TBL_OWNER_TYPE', > cd_id bigint COMMENT 'CD_ID', > input_format string COMMENT 'INPUT_FORMAT', > is_compressed bigint COMMENT 'IS_COMPRESSED', > is_storedassubdirectories bigint COMMENT 'IS_STOREDASSUBDIRECTORIES', > tbl_or_part_location string COMMENT 'tbl_or_part_location', > num_buckets bigint COMMENT 'NUM_BUCKETS', > output_format string COMMENT 'OUTPUT_FORMAT', > serde_id bigint COMMENT 'SERDE_ID', > part_id bigint COMMENT 'PART_ID', > part_create_time bigint COMMENT 'PART_CREATE_TIME', > part_last_access_time bigint COMMENT 'PART_LAST_ACCESS_TIME', > part_name string COMMENT 'PART_NAME') > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; > > CREATE TABLE pdwd.test_partitions_2021_12_21_shuffle_1( > part_id bigint, > create_time bigint, > last_access_time bigint, > part_name string, > sd_id bigint, > tbl_id bigint) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; > > CREATE TABLE pdwd.test_tbls_2021_12_21_shuffle( > tbl_id bigint, > create_time bigint, > db_id bigint, > last_access_time bigint, > owner string, > retention bigint, > sd_id bigint, > tbl_name string, > tbl_type string, > view_expanded_text string, > view_original_text string, > is_rewrite_enabled bigint, > owner_type string) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; > > CREATE TABLE pdwd.test_sds_2021_12_21_shuffle_1_new( > sd_id bigint, > cd_id bigint, > input_format string, > is_compressed bigint, > is_storedassubdirectories bigint, > _c5 string, > num_buckets bigint, > output_format string, > serde_id bigint) > ROW FORMAT SERDE > 'org.apache.hadoop.hive.ql.io.orc.OrcSerde' > STORED AS INPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat' > OUTPUTFORMAT > 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'; > set hive.stats.column.autogather=false; > set hive.exec.reducers.bytes.per.reducer=2560; > set hive.auto.convert.join=false; > insert overwrite table > pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > select > a.tbl_id, > b.create_time as tbl_create_time, > b.db_id, > b.last_access_time as tbl_last_access_time, > b.owner, > b.retention, > a.sd_id, > b.tbl_name, > b.tbl_type, > b.view_expanded_text, > b.view_original_text, > b.is_rewrite_enabled, > b.owner_type as tbl_owner_type, > d.cd_id, > d.input_format, > d.is_compressed, > d.is_storedassubdirectories, > d.tbl_location, > d.num_buckets, > d.output_format, > d.serde_id, > a.part_id, > a.create_time as part_create_time, > a.last_access_time as part_last_access_time, > a.part_name > from pdwd.test_partitions_2021_12_21_shuffle_1 a > left join pdwd.test_tbls_2021_12_21_shuffle b on a.tbl_id=b.tbl_id > left join pdwd.test_sds_2021_12_21_shuffle_1_new d on a.sd_id=d.sd_id; {code} > Execution error result: > {code:java} > select count(*) from pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > where output_format is null; > Query ID = op_20220112153733_5c3793c8-c0e0-4dda-9212-239b5bd66f19 > Total jobs = 1 > Launching Job 1 out of 1 > Number of reduce tasks determined at compile time: 1 > In order to change the average load for a reducer (in bytes): > set hive.exec.reducers.bytes.per.reducer=<number> > In order to limit the maximum number of reducers: > set hive.exec.reducers.max=<number> > In order to set a constant number of reducers: > set mapreduce.job.reduces=<number> > 2022-01-12 15:37:35,019 | INFO | > org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider | Failing > over to rm2 > Starting Job = job_1609738754049_4759167, Tracking URL = > http://nm-bigdata-030049043.ctc.local:8088/proxy/application_1609738754049_4759167/ > Kill Command = /usr/local/hadoop3/bin/mapred job -kill > job_1609738754049_4759167 > Hadoop job information for Stage-1: number of mappers: 1; number of reducers: > 1 > 2022-01-12 15:37:46,636 Stage-1 map = 0%, reduce = 0% > 2022-01-12 15:37:56,921 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 5.29 > sec > 2022-01-12 15:38:06,183 Stage-1 map = 100%, reduce = 100%, Cumulative CPU > 8.74 sec > MapReduce Total cumulative CPU time: 8 seconds 740 msec > Ended Job = job_1609738754049_4759167 > MapReduce Jobs Launched: > Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 8.74 sec HDFS Read: > 35145 HDFS Write: 196 SUCCESS > Total MapReduce CPU Time Spent: 8 seconds 740 msec > OK > 70 {code} > Changing the Number of Reduce tasks set > hive.exec.reducers.bytes.per.reducer=256000; > {code:java} > set hive.stats.column.autogather=false; > set hive.exec.reducers.bytes.per.reducer=256000; > set hive.auto.convert.join=false; > insert overwrite table > pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > select > a.tbl_id, > b.create_time as tbl_create_time, > b.db_id, > b.last_access_time as tbl_last_access_time, > b.owner, > b.retention, > a.sd_id, > b.tbl_name, > b.tbl_type, > b.view_expanded_text, > b.view_original_text, > b.is_rewrite_enabled, > b.owner_type as tbl_owner_type, > d.cd_id, > d.input_format, > d.is_compressed, > d.is_storedassubdirectories, > d.tbl_location, > d.num_buckets, > d.output_format, > d.serde_id, > a.part_id, > a.create_time as part_create_time, > a.last_access_time as part_last_access_time, > a.part_name > from pdwd.test_partitions_2021_12_21_shuffle_1 a > left join pdwd.test_tbls_2021_12_21_shuffle b on a.tbl_id=b.tbl_id > left join pdwd.test_sds_2021_12_21_shuffle_1_new d on a.sd_id=d.sd_id; {code} > Execution result is correct > > {code:java} > select count(*) from pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > where output_format is null; > Query ID = op_20220112154536_a607937d-3457-44a2-9b27-a955d67dfec7 > Total jobs = 1 > Launching Job 1 out of 1 > Number of reduce tasks determined at compile time: 1 > In order to change the average load for a reducer (in bytes): > set hive.exec.reducers.bytes.per.reducer=<number> > In order to limit the maximum number of reducers: > set hive.exec.reducers.max=<number> > In order to set a constant number of reducers: > set mapreduce.job.reduces=<number> > 2022-01-12 15:45:40,045 | INFO | > org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider | Failing > over to rm2 > Starting Job = job_1609738754049_4759214, Tracking URL = > http://nm-bigdata-030049043.ctc.local:8088/proxy/application_1609738754049_4759214/ > Kill Command = /usr/local/hadoop3/bin/mapred job -kill > job_1609738754049_4759214 > Hadoop job information for Stage-1: number of mappers: 1; number of reducers: > 1 > 2022-01-12 15:46:02,957 Stage-1 map = 0%, reduce = 0% > 2022-01-12 15:46:14,343 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 4.3 > sec > 2022-01-12 15:46:23,618 Stage-1 map = 100%, reduce = 100%, Cumulative CPU > 7.55 sec > MapReduce Total cumulative CPU time: 7 seconds 550 msec > Ended Job = job_1609738754049_4759214 > MapReduce Jobs Launched: > Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 7.55 sec HDFS Read: > 27631 HDFS Write: 195 SUCCESS > Total MapReduce CPU Time Spent: 7 seconds 550 msec > OK > 0 {code} > when join pdwd.test_sds_2021_12_21_shuffle_1_new change {color:#de350b}left > {color}join to {color:#de350b}inner {color}join,execution error result: > > > {code:java} > set hive.stats.column.autogather=false; > set hive.exec.reducers.bytes.per.reducer=2560; > set hive.auto.convert.join=false; > insert overwrite table > pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > select > a.tbl_id, > b.create_time as tbl_create_time, > b.db_id, > b.last_access_time as tbl_last_access_time, > b.owner, > b.retention, > a.sd_id, > b.tbl_name, > b.tbl_type, > b.view_expanded_text, > b.view_original_text, > b.is_rewrite_enabled, > b.owner_type as tbl_owner_type, > d.cd_id, > d.input_format, > d.is_compressed, > d.is_storedassubdirectories, > d.tbl_location, > d.num_buckets, > d.output_format, > d.serde_id, > a.part_id, > a.create_time as part_create_time, > a.last_access_time as part_last_access_time, > a.part_name > from pdwd.test_partitions_2021_12_21_shuffle_1 a > left join pdwd.test_tbls_2021_12_21_shuffle b on a.tbl_id=b.tbl_id > inner join pdwd.test_sds_2021_12_21_shuffle_1_new d on a.sd_id=d.sd_id; {code} > {color:#de350b}inner {color}join error result: > > {code:java} > set hive.compute.query.using.stats=false; > select count(*) from pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112; > Query ID = op_20220112155120_f65d0552-df18-4221-9c31-e1ac21e551f0 > Total jobs = 1 > Launching Job 1 out of 1 > Number of reduce tasks determined at compile time: 1 > In order to change the average load for a reducer (in bytes): > set hive.exec.reducers.bytes.per.reducer=<number> > In order to limit the maximum number of reducers: > set hive.exec.reducers.max=<number> > In order to set a constant number of reducers: > set mapreduce.job.reduces=<number> > 2022-01-12 15:51:21,254 | INFO | > org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider | Failing > over to rm2 > Starting Job = job_1609738754049_4759248, Tracking URL = > http://nm-bigdata-030049043.ctc.local:8088/proxy/application_1609738754049_4759248/ > Kill Command = /usr/local/hadoop3/bin/mapred job -kill > job_1609738754049_4759248 > Hadoop job information for Stage-1: number of mappers: 1; number of reducers: > 1 > 2022-01-12 15:51:29,530 Stage-1 map = 0%, reduce = 0% > 2022-01-12 15:51:38,743 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.69 > sec > 2022-01-12 15:51:46,950 Stage-1 map = 100%, reduce = 100%, Cumulative CPU > 6.51 sec > MapReduce Total cumulative CPU time: 6 seconds 510 msec > Ended Job = job_1609738754049_4759248 > MapReduce Jobs Launched: > Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 6.51 sec HDFS Read: > 30277 HDFS Write: 196 SUCCESS > Total MapReduce CPU Time Spent: 6 seconds 510 msec > OK > 23{code} > {color:#de350b}inner join {color}:Changing the Number of Reduce tasks set > hive.exec.reducers.bytes.per.reducer=256000; > {code:java} > set hive.stats.column.autogather=false; > set hive.exec.reducers.bytes.per.reducer=256000; > set hive.auto.convert.join=false; > insert overwrite table > pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112 > select > a.tbl_id, > b.create_time as tbl_create_time, > b.db_id, > b.last_access_time as tbl_last_access_time, > b.owner, > b.retention, > a.sd_id, > b.tbl_name, > b.tbl_type, > b.view_expanded_text, > b.view_original_text, > b.is_rewrite_enabled, > b.owner_type as tbl_owner_type, > d.cd_id, > d.input_format, > d.is_compressed, > d.is_storedassubdirectories, > d.tbl_location, > d.num_buckets, > d.output_format, > d.serde_id, > a.part_id, > a.create_time as part_create_time, > a.last_access_time as part_last_access_time, > a.part_name > from pdwd.test_partitions_2021_12_21_shuffle_1 a > left join pdwd.test_tbls_2021_12_21_shuffle b on a.tbl_id=b.tbl_id > inner join pdwd.test_sds_2021_12_21_shuffle_1_new d on a.sd_id=d.sd_id; {code} > result is correct: > {code:java} > set hive.compute.query.using.stats=false; select count(*) from > pdwd.hive_ah3_metastore_tbl_partitions_sds_test_20220112; > Query ID = op_20220112155608_d146f24e-5515-42c0-a4b7-3217e5de9f47 > Total jobs = 1 > Launching Job 1 out of 1 > Number of reduce tasks determined at compile time: 1 > In order to change the average load for a reducer (in bytes): > set hive.exec.reducers.bytes.per.reducer=<number> > In order to limit the maximum number of reducers: > set hive.exec.reducers.max=<number> > In order to set a constant number of reducers: > set mapreduce.job.reduces=<number> > 2022-01-12 15:56:09,652 | INFO | > org.apache.hadoop.yarn.client.ConfiguredRMFailoverProxyProvider | Failing > over to rm2 > Starting Job = job_1609738754049_4759266, Tracking URL = > http://nm-bigdata-030049043.ctc.local:8088/proxy/application_1609738754049_4759266/ > Kill Command = /usr/local/hadoop3/bin/mapred job -kill > job_1609738754049_4759266 > Hadoop job information for Stage-1: number of mappers: 1; number of reducers: > 1 > 2022-01-12 15:56:17,838 Stage-1 map = 0%, reduce = 0% > 2022-01-12 15:56:30,150 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 6.08 > sec > 2022-01-12 15:56:39,391 Stage-1 map = 100%, reduce = 100%, Cumulative CPU > 9.0 sec > MapReduce Total cumulative CPU time: 9 seconds 0 msec > Ended Job = job_1609738754049_4759266 > MapReduce Jobs Launched: > Stage-Stage-1: Map: 1 Reduce: 1 Cumulative CPU: 9.0 sec HDFS Read: 26377 > HDFS Write: 196 SUCCESS > Total MapReduce CPU Time Spent: 9 seconds 0 msec > OK > 93{code} > Either inner join or left join .By changing the number of reduces, I found > that about a fraction of the results were correct -- This message was sent by Atlassian Jira (v8.20.1#820001)