[ https://issues.apache.org/jira/browse/HIVE-16780?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16027342#comment-16027342 ]
liyunzhang_intel edited comment on HIVE-16780 at 5/27/17 4:17 PM:
------------------------------------------------------------------
[~csun]: I found that if I disable "hive.optimize.index.filter", the case passes; if I enable "hive.optimize.index.filter", the case fails. The exception is:
{code}
17/05/27 23:39:45 DEBUG Executor task launch worker-0 PerfLogger: </PERFLOG method=SparkInitializeOperators start=1495899585574 end=1495899585933 duration=359 from=org.apache.hadoop.hive.ql.exec.spark.SparkRecordHandler> 17/05/27 23:39:45 INFO Executor task launch worker-0 Utilities: PLAN PATH = hdfs://bdpe41:8020/tmp/hive/root/029a2d8a-c6e5-4ea9-adea-ef8fbea3cde2/hive_2017-05-27_23-39-06_464_5915518562441677640-1/-mr-10007/617d9dd6-9f9a-4786-8131-a7b98e8abc3e/map.xml 17/05/27 23:39:45 DEBUG Executor task launch worker-0 Utilities: Found plan in cache for name: map.xml 17/05/27 23:39:45 DEBUG Executor task launch worker-0 DFSClient: Connecting to datanode 10.239.47.162:50010 17/05/27 23:39:45 DEBUG Executor task launch worker-0 MapOperator: Processing alias(es) srcpart_hour for file hdfs://bdpe41:8020/user/hive/warehouse/srcpart_hour/000008_0 17/05/27 23:39:45 DEBUG Executor task launch worker-0 ObjectCache: Creating root_20170527233906_ac2934e1-2e58-4116-9f0d-35dee302d689_DynamicValueRegistry 17/05/27 23:39:45 ERROR Executor task launch worker-0 SparkMapRecordHandler: Error processing row: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"hr":"11","hour":"11"} org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"hr":"11","hour":"11"} at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:562) at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:136) at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:48) at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:27) at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) at org.apache.spark.scheduler.Task.run(Task.scala:85) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: java.lang.IllegalStateException: Failed to retrieve dynamic value for RS_7_srcpart__col3_min at org.apache.hadoop.hive.ql.plan.DynamicValue.getValue(DynamicValue.java:126) at org.apache.hadoop.hive.ql.plan.DynamicValue.getWritableValue(DynamicValue.java:101) at org.apache.hadoop.hive.ql.exec.ExprNodeDynamicValueEvaluator._evaluate(ExprNodeDynamicValueEvaluator.java:51) at
org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan.evaluate(GenericUDFOPEqualOrGreaterThan.java:108) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween.evaluate(GenericUDFBetween.java:57) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator._evaluate(ExprNodeGenericFuncEvaluator.java:187) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd.evaluate(GenericUDFOPAnd.java:63) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator._evaluate(ExprNodeGenericFuncEvaluator.java:187) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd.evaluate(GenericUDFOPAnd.java:63) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator._evaluate(ExprNodeGenericFuncEvaluator.java:187) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluatorHead._evaluate(ExprNodeEvaluatorHead.java:44) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:68) at org.apache.hadoop.hive.ql.exec.FilterOperator.process(FilterOperator.java:112) at org.apache.hadoop.hive.ql.exec.Operator.forward(Operator.java:897) at org.apache.hadoop.hive.ql.exec.TableScanOperator.process(TableScanOperator.java:130) at org.apache.hadoop.hive.ql.exec.MapOperator$MapOpCtx.forward(MapOperator.java:148) at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:547) ... 17 more Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: java.lang.NullPointerException at org.apache.hadoop.hive.ql.exec.mr.ObjectCache.retrieve(ObjectCache.java:62) at org.apache.hadoop.hive.ql.exec.mr.ObjectCache.retrieve(ObjectCache.java:51) at org.apache.hadoop.hive.ql.exec.ObjectCacheWrapper.retrieve(ObjectCacheWrapper.java:40) at org.apache.hadoop.hive.ql.plan.DynamicValue.getValue(DynamicValue.java:119) ... 41 more Caused by: java.lang.NullPointerException at org.apache.hadoop.hive.ql.exec.mr.ObjectCache.retrieve(ObjectCache.java:60) ... 
44 more 17/05/27 23:39:45 ERROR Executor task launch worker-0 Executor: Exception in task 1.0 in stage 0.0 (TID 1) java.lang.RuntimeException: Error processing row: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"hr":"11","hour":"11"} at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:149) at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:48) at org.apache.hadoop.hive.ql.exec.spark.HiveMapFunctionResultList.processNextRecord(HiveMapFunctionResultList.java:27) at org.apache.hadoop.hive.ql.exec.spark.HiveBaseFunctionResultList.hasNext(HiveBaseFunctionResultList.java:85) at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:42) at scala.collection.Iterator$class.foreach(Iterator.scala:893) at scala.collection.AbstractIterator.foreach(Iterator.scala:1336) at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) at org.apache.spark.rdd.AsyncRDDActions$$anonfun$foreachAsync$1$$anonfun$apply$12.apply(AsyncRDDActions.scala:127) at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) at org.apache.spark.SparkContext$$anonfun$33.apply(SparkContext.scala:1974) at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:70) at org.apache.spark.scheduler.Task.run(Task.scala:85) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:274) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: Hive Runtime Error while processing row {"hr":"11","hour":"11"} at org.apache.hadoop.hive.ql.exec.MapOperator.process(MapOperator.java:562) at org.apache.hadoop.hive.ql.exec.spark.SparkMapRecordHandler.processRow(SparkMapRecordHandler.java:136) ... 
16 more Caused by: java.lang.IllegalStateException: Failed to retrieve dynamic value for RS_7_srcpart__col3_min at org.apache.hadoop.hive.ql.plan.DynamicValue.getValue(DynamicValue.java:126) at org.apache.hadoop.hive.ql.plan.DynamicValue.getWritableValue(DynamicValue.java:101) at org.apache.hadoop.hive.ql.exec.ExprNodeDynamicValueEvaluator._evaluate(ExprNodeDynamicValueEvaluator.java:51) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan.evaluate(GenericUDFOPEqualOrGreaterThan.java:108) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween.evaluate(GenericUDFBetween.java:57) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator._evaluate(ExprNodeGenericFuncEvaluator.java:187) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPAnd.evaluate(GenericUDFOPAnd.java:63) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator._evaluate(ExprNodeGenericFuncEvaluator.java:187) at org.apache.hadoop.hive.ql.exec.ExprNodeEvaluator.evaluate(ExprNodeEvaluator.java:80) at org.apache.hadoop.hive.ql.exec.ExprNodeGenericFuncEvaluator$DeferredExprObject.get(ExprNodeGenericFuncEvaluator.java:88) at org.apache.hadoop.hive {code}
Can you help verify whether this passes or not in your environment? In my environment: Hive version 54dbca69c9ea630b9cccd5550bdb455b9bbc240c, Spark 2.0.0.

was (Author: kellyzly): The explain plan of the above script is
{code}
STAGE PLANS: Stage: Stage-2 Spark DagName: root_20170526150838_2e0b1aeb-104a-4a50-8638-79dbf7bd0d28:4 Vertices: Map 4 Map Operator Tree: TableScan alias: srcpart_date filterExpr: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Spark HashTable Sink Operator keys: 0 ds (type: string) 1 ds (type: string) Position of Big Table: 0 Select Operator expressions: ds (type: string) outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Group By Operator keys: _col0 (type: string) mode: hash outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Spark Partition Pruning Sink Operator partition key expr: ds tmp Path: hdfs://bdpe41:8020/tmp/hive/root/574c5c5b-1a07-4c4f-9056-35482b189871/hive_2017-05-26_15-08-38_396_2133309677947979513-1/-mr-10004/1/4 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE target column name: ds target work: Map 1 Local Work: Map Reduce Local Work Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date [srcpart_date] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date Partition base file name: srcpart_date input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat
org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart_date name: default.srcpart_date Truncated Path -> Alias: /srcpart_date [srcpart_date] Stage: Stage-3 Spark DagName: root_20170526150838_2e0b1aeb-104a-4a50-8638-79dbf7bd0d28:5 Vertices: Map 4 Map Operator Tree: TableScan alias: srcpart_date filterExpr: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Spark HashTable Sink Operator keys: 0 ds (type: string) 1 ds (type: string) Position of Big Table: 0 Select Operator expressions: ds (type: string) outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Group By Operator keys: _col0 (type: string) mode: hash outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Spark Partition Pruning Sink Operator partition key expr: ds tmp Path: hdfs://bdpe41:8020/tmp/hive/root/574c5c5b-1a07-4c4f-9056-35482b189871/hive_2017-05-26_15-08-38_396_2133309677947979513-1/-mr-10004/1/4 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE target column name: ds target work: Map 1 Local Work: Map Reduce Local Work Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date [srcpart_date] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date Partition base file name: srcpart_date input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 
transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart_date name: default.srcpart_date Truncated Path -> Alias: /srcpart_date [srcpart_date] Stage: Stage-1 Spark Edges: Reducer 2 <- Map 6 (GROUP, 1) Reducer 3 <- Map 7 (GROUP, 1) DagName: root_20170526150838_2e0b1aeb-104a-4a50-8638-79dbf7bd0d28:3 Vertices: Map 6 Map Operator Tree: TableScan alias: srcpart Statistics: Num rows: 1 Data size: 23248 Basic stats: PARTIAL Column stats: NONE GatherStats: false Map Join Operator condition map: Inner Join 0 to 1 keys: 0 ds (type: string) 1 ds (type: string) outputColumnNames: _col3 input vertices: 1 Map 4 Position of Big Table: 0 Statistics: Num rows: 2 Data size: 46 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: _col3 (type: string) outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 46 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1000000) mode: hash outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator null sort order: sort order: Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE tag: -1 value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) auto parallelism: false quick start: true Local Work: Map Reduce Local Work Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 [srcpart] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [srcpart] /srcpart/ds=2008-04-08/hr=12 [srcpart] /srcpart/ds=2008-04-09/hr=11 [srcpart] /srcpart/ds=2008-04-09/hr=12 [srcpart] Map 7 Map Operator Tree: TableScan alias: srcpart Statistics: Num rows: 1 Data size: 23248 Basic stats: PARTIAL Column stats: NONE GatherStats: false Map Join Operator condition map: Inner Join 0 to 1 keys: 0 ds (type: string) 1 ds (type: string) outputColumnNames: _col3 input vertices: 1 Map 4 Position of Big Table: 0 Statistics: Num rows: 2 Data size: 46 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: 0 _col3 (type: string) 1 hr (type: string) input vertices: 1 Map 5 Position of Big Table: 0 Statistics: Num rows: 2 
Data size: 50 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count() mode: hash outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator null sort order: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE tag: -1 value expressions: _col0 (type: bigint) auto parallelism: false Local Work: Map Reduce Local Work Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 [srcpart] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [srcpart] /srcpart/ds=2008-04-08/hr=12 [srcpart] /srcpart/ds=2008-04-09/hr=11 [srcpart] /srcpart/ds=2008-04-09/hr=12 [srcpart] Reducer 2 Needs Tagging: false Reduce Operator Tree: Group By Operator aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1000000) mode: final outputColumnNames: _col0, _col1, _col2 Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator null sort order: sort order: Statistics: Num rows: 1 Data size: 552 Basic stats: COMPLETE Column stats: NONE tag: -1 value expressions: _col0 (type: string), _col1 (type: string), _col2 (type: binary) auto parallelism: false Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false GlobalTableId: 0 directory: hdfs://bdpe41:8020/tmp/hive/root/574c5c5b-1a07-4c4f-9056-35482b189871/hive_2017-05-26_15-08-38_396_2133309677947979513-1/-mr-10001/.hive-staging_hive_2017-05-26_15-08-38_396_2133309677947979513-1/-ext-10002 NumFilesPerFileSink: 1 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Stats Publishing Key Prefix: hdfs://bdpe41:8020/tmp/hive/root/574c5c5b-1a07-4c4f-9056-35482b189871/hive_2017-05-26_15-08-38_396_2133309677947979513-1/-mr-10001/.hive-staging_hive_2017-05-26_15-08-38_396_2133309677947979513-1/-ext-10002/ table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat properties: columns _col0 columns.types bigint escape.delim \ hive.serialization.extend.additional.nesting.levels true serialization.escape.crlf true serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe TotalFiles: 1 GatherStats: false MultiFileSpray: false Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: ListSink {code}
Map 5 is missing in this explain plan, which may be why the exception is thrown. If map join is disabled, the script succeeds; a minimal sketch of the relevant settings is shown below.
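For reference, a minimal sketch of the two workarounds described above. That map join is disabled via hive.auto.convert.join (Hive's automatic map-join conversion switch) is an assumption; the other settings and the query are taken from the issue description.
{code}
-- Workaround 1 (per the observation above): the case passes with index
-- filtering disabled.
set hive.optimize.index.filter=false;

-- Workaround 2 (assumed setting): disable automatic map-join conversion so
-- the joins stay common (shuffle) joins.
set hive.auto.convert.join=false;

-- multiple sources, single key (query from the issue description)
select count(*)
from srcpart
join srcpart_date on (srcpart.ds = srcpart_date.ds)
join srcpart_hour on (srcpart.hr = srcpart_hour.hr);
{code}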
The explain plan in that case is
{code}
Stage-2 is a root stage [MAPRED] Stage-3 depends on stages: Stage-2 [MAPRED] Stage-1 depends on stages: Stage-3 [MAPRED] Stage-0 depends on stages: Stage-1 [FETCH] STAGE PLANS: Stage: Stage-2 Spark DagName: root_20170527150735_53c5290f-9082-40d5-9e7c-866b1e9bafdd:2 Vertices: Map 6 Map Operator Tree: TableScan alias: srcpart_date Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Select Operator expressions: ds (type: string) outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Group By Operator keys: _col0 (type: string) mode: hash outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Spark Partition Pruning Sink Operator partition key expr: ds tmp Path: hdfs://bdpe41:8020/tmp/hive/root/063af4a7-c3e0-421d-a12c-592af6e84059/hive_2017-05-27_15-07-35_887_2057692265398433754-1/-mr-10004/1/6 Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE target column name: ds target work: Map 1 Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date [srcpart_date] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date Partition base file name: srcpart_date input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart_date name: default.srcpart_date Truncated Path -> Alias: /srcpart_date [srcpart_date] Stage: Stage-3 Spark DagName: root_20170527150735_53c5290f-9082-40d5-9e7c-866b1e9bafdd:3 Vertices: Map 5 Map Operator Tree: TableScan alias: srcpart_hour Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: hr is not null (type: boolean) Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE
Column stats: NONE Spark HashTable Sink Operator keys: 0 _col3 (type: string) 1 hr (type: string) Position of Big Table: 0 Select Operator expressions: hr (type: string) outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE Group By Operator keys: _col0 (type: string) mode: hash outputColumnNames: _col0 Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE Spark Partition Pruning Sink Operator partition key expr: hr tmp Path: hdfs://bdpe41:8020/tmp/hive/root/063af4a7-c3e0-421d-a12c-592af6e84059/hive_2017-05-27_15-07-35_887_2057692265398433754-1/-mr-10004/1/5 Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE Column stats: NONE target column name: hr target work: Map 1 Local Work: Map Reduce Local Work Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_hour [srcpart_hour] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_hour Partition base file name: srcpart_hour input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns hr,hour columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_hour name default.srcpart_hour numFiles 12 numRows 2 rawDataSize 10 serialization.ddl struct srcpart_hour { string hr, string hour} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 12 transient_lastDdlTime 1495782477 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns hr,hour columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_hour name default.srcpart_hour numFiles 12 numRows 2 rawDataSize 10 serialization.ddl struct srcpart_hour { string hr, string hour} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 12 transient_lastDdlTime 1495782477 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart_hour name: default.srcpart_hour Truncated Path -> Alias: /srcpart_hour [srcpart_hour] Stage: Stage-1 Spark Edges: Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 12), Map 4 (PARTITION-LEVEL SORT, 12) Reducer 3 <- Reducer 2 (GROUP, 1) DagName: root_20170527150735_53c5290f-9082-40d5-9e7c-866b1e9bafdd:1 Vertices: Map 1 Map Operator Tree: TableScan alias: srcpart Statistics: Num rows: 1 Data size: 23248 Basic stats: PARTIAL Column stats: NONE GatherStats: false Reduce Output Operator key expressions: ds (type: string) null sort order: a sort order: + Map-reduce partition columns: ds (type: string) Statistics: Num rows: 1 Data size: 23248 Basic stats: PARTIAL Column stats: NONE tag: 0 value expressions: hr (type: string) auto parallelism: false Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 [srcpart] 
hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 [srcpart] hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 [srcpart] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-08 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-08/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782438 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { 
string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 Partition base file name: hr=11 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 11 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=11 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 Partition base file name: hr=12 input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat partition values: ds 2008-04-09 hr 12 properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart/ds=2008-04-09/hr=12 name default.srcpart numFiles 1 numRows 0 partition_columns ds/hr partition_columns.types string:string rawDataSize 0 serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 5812 transient_lastDdlTime 1495782439 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: bucket_count -1 column.name.delimiter , columns key,value columns.comments 'default','default' columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart name default.srcpart partition_columns 
ds/hr partition_columns.types string:string serialization.ddl struct srcpart { string key, string value} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe transient_lastDdlTime 1495782437 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart name: default.srcpart Truncated Path -> Alias: /srcpart/ds=2008-04-08/hr=11 [srcpart] /srcpart/ds=2008-04-08/hr=12 [srcpart] /srcpart/ds=2008-04-09/hr=11 [srcpart] /srcpart/ds=2008-04-09/hr=12 [srcpart] Map 4 Map Operator Tree: TableScan alias: srcpart_date Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE GatherStats: false Filter Operator isSamplingPred: false predicate: ds is not null (type: boolean) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: ds (type: string) null sort order: a sort order: + Map-reduce partition columns: ds (type: string) Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE tag: 1 auto parallelism: false Path -> Alias: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date [srcpart_date] Path -> Partition: hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date Partition base file name: srcpart_date input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe input format: org.apache.hadoop.mapred.TextInputFormat output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat properties: COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"} bucket_count -1 column.name.delimiter , columns ds,date columns.comments columns.types string:string file.inputformat org.apache.hadoop.mapred.TextInputFormat file.outputformat org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat location hdfs://bdpe41:8020/user/hive/warehouse/srcpart_date name default.srcpart_date numFiles 12 numRows 2 rawDataSize 42 serialization.ddl struct srcpart_date { string ds, string date} serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe totalSize 44 transient_lastDdlTime 1495782474 serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe name: default.srcpart_date name: default.srcpart_date Truncated Path -> Alias: /srcpart_date [srcpart_date] Reducer 2 Local Work: Map Reduce Local Work Needs Tagging: true Reduce Operator Tree: Join Operator condition map: Inner Join 0 to 1 keys: 0 ds (type: string) 1 ds (type: string) outputColumnNames: _col3 Statistics: Num rows: 2 Data size: 46 Basic stats: COMPLETE Column stats: NONE Map Join Operator condition map: Inner Join 0 to 1 keys: 0 _col3 (type: string) 1 hr (type: string) input vertices: 1 Map 5 Position of Big Table: 0 Statistics: Num rows: 2 Data size: 50 Basic stats: COMPLETE Column stats: NONE Group By Operator aggregations: count() mode: hash 
outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator null sort order: sort order: Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE tag: -1 value expressions: _col0 (type: bigint) auto parallelism: false Reducer 3 Needs Tagging: false Reduce Operator Tree: Group By Operator aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false GlobalTableId: 0 directory: hdfs://bdpe41:8020/tmp/hive/root/063af4a7-c3e0-421d-a12c-592af6e84059/hive_2017-05-27_15-07-35_887_2057692265398433754-1/-mr-10001/.hive-staging_hive_2017-05-27_15-07-35_887_2057692265398433754-1/-ext-10002 NumFilesPerFileSink: 1 Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE Stats Publishing Key Prefix: hdfs://bdpe41:8020/tmp/hive/root/063af4a7-c3e0-421d-a12c-592af6e84059/hive_2017-05-27_15-07-35_887_2057692265398433754-1/-mr-10001/.hive-staging_hive_2017-05-27_15-07-35_887_2057692265398433754-1/-ext-10002/ table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat properties: columns _col0 columns.types bigint escape.delim \ hive.serialization.extend.additional.nesting.levels true serialization.escape.crlf true serialization.format 1 serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe TotalFiles: 1 GatherStats: false MultiFileSpray: false Stage: Stage-0 Fetch Operator limit: -1 Processor Tree: ListSink {code} > Case "multiple sources, single key" in spark_dynamic_pruning.q fails > --------------------------------------------------------------------- > > Key: HIVE-16780 > URL: https://issues.apache.org/jira/browse/HIVE-16780 > Project: Hive > Issue Type: Bug > Reporter: liyunzhang_intel > Assignee: liyunzhang_intel > > script.q > {code} > set hive.optimize.ppd=true; > set hive.ppd.remove.duplicatefilters=true; > set hive.spark.dynamic.partition.pruning=true; > set hive.optimize.metadataonly=false; > set hive.optimize.index.filter=true; > set hive.strict.checks.cartesian.product=false; > set hive.spark.dynamic.partition.pruning=true; > -- multiple sources, single key > select count(*) from srcpart join srcpart_date on (srcpart.ds = > srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) > {code} > exception > {code} > job failed with java.io.FileNotFoundException: File > hdfs://bdpe41:8020/tmp/hive/root/de80d82a-b910-4b87-940c-6be3ea37ba25/hive_2017-05-27_14-55-30_114_8497388836256415979-1/-mr-10004/1/5 > does not exist. > FAILED: Execution Error, return code 3 from > org.apache.hadoop.hive.ql.exec.spark.SparkTask. java.lang.RuntimeException: > org.apache.hadoop.hive.ql.metadata.HiveException: > java.io.FileNotFoundException: File > hdfs://bdpe41:8020/tmp/hive/root/de80d82a-b910-4b87-940c-6be3ea37ba25/hive_2017-05-27_14-55-30_114_8497388836256415979-1/-mr-10004/1/5 > does not exist. 
> at > org.apache.hadoop.hive.ql.io.HiveInputFormat.init(HiveInputFormat.java:404) > at > org.apache.hadoop.hive.ql.io.CombineHiveInputFormat.getSplits(CombineHiveInputFormat.java:498) > at org.apache.spark.rdd.HadoopRDD.getPartitions(HadoopRDD.scala:200) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:246) > at > org.apache.spark.rdd.MapPartitionsRDD.getPartitions(MapPartitionsRDD.scala:35) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:248) > at org.apache.spark.rdd.RDD$$anonfun$partitions$2.apply(RDD.scala:246) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.partitions(RDD.scala:246) > at org.apache.spark.ShuffleDependency.<init>(Dependency.scala:91) > at org.apache.spark.rdd.ShuffledRDD.getDependencies(ShuffledRDD.scala:91) > at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:235) > at org.apache.spark.rdd.RDD$$anonfun$dependencies$2.apply(RDD.scala:233) > at scala.Option.getOrElse(Option.scala:121) > at org.apache.spark.rdd.RDD.dependencies(RDD.scala:233) > at > org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.rddToString(SparkUtilities.java:144) > at > org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.rddToString(SparkUtilities.java:149) > at > org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.rddToString(SparkUtilities.java:149) > at > org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.rddToString(SparkUtilities.java:149) > at > org.apache.hadoop.hive.ql.exec.spark.SparkUtilities.rddGraphToString(SparkUtilities.java:134) > at > org.apache.hadoop.hive.ql.exec.spark.SparkPlan.generateGraph(SparkPlan.java:93) > at > org.apache.hadoop.hive.ql.exec.spark.RemoteHiveSparkClient$JobStatusJob.call(RemoteHiveSparkClient.java:349) > at > org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:358) > at > org.apache.hive.spark.client.RemoteDriver$JobWrapper.call(RemoteDriver.java:323) > at java.util.concurrent.FutureTask.run(FutureTask.java:266) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > Caused by: org.apache.hadoop.hive.ql.metadata.HiveException: > java.io.FileNotFoundException: File > hdfs://bdpe41:8020/tmp/hive/root/de80d82a-b910-4b87-940c-6be3ea37ba25/hive_2017-05-27_14-55-30_114_8497388836256415979-1/-mr-10004/1/5 > does not exist. > at > org.apache.hadoop.hive.ql.exec.spark.SparkDynamicPartitionPruner.processFiles(SparkDynamicPartitionPruner.java:147) > at > org.apache.hadoop.hive.ql.exec.spark.SparkDynamicPartitionPruner.prune(SparkDynamicPartitionPruner.java:76) > at > org.apache.hadoop.hive.ql.io.HiveInputFormat.init(HiveInputFormat.java:402) > ... 30 more > Caused by: java.io.FileNotFoundException: File > hdfs://bdpe41:8020/tmp/hive/root/de80d82a-b910-4b87-940c-6be3ea37ba25/hive_2017-05-27_14-55-30_114_8497388836256415979-1/-mr-10004/1/5 > does not exist. 
> at > org.apache.hadoop.hdfs.DistributedFileSystem.listStatusInternal(DistributedFileSystem.java:795) > at > org.apache.hadoop.hdfs.DistributedFileSystem.access$700(DistributedFileSystem.java:106) > at > org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:853) > at > org.apache.hadoop.hdfs.DistributedFileSystem$18.doCall(DistributedFileSystem.java:849) > at > org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81) > at > org.apache.hadoop.hdfs.DistributedFileSystem.listStatus(DistributedFileSystem.java:860) > at > org.apache.hadoop.hive.ql.exec.spark.SparkDynamicPartitionPruner.processFiles(SparkDynamicPartitionPruner.java:119) > ... 32 more > {code} -- This message was sent by Atlassian JIRA (v6.3.15#6346)