I'm using Spark SQL to query one partition at a time of Hive external table
that sits atop .gzip data, and then I'm saving that partition to a new HDFS
location as a set of parquet snappy files using .saveAsParquetFile()
The query completes successfully, but then I get a vague error message I
think saying one of the temporary files is missing. Any idea what the issue
is here?
--------------
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<ipython-input-22-39049a9c70f8> in <module>()
8 status, output = commands.getstatusoutput("hdfs dfs -rmr " +
save_path)
9 partitions = int((df.count() // 750000)+1)
---> 10 df.repartition(partitions).saveAsParquetFile(save_path + '/' +
str(dt1)[:10])
11
12 dt1 += timedelta(days=1)
/opt/cloudera/parcels/CDH/lib/spark/python/pyspark/sql/dataframe.py in
saveAsParquetFile(self, path)
119 True
120 """
--> 121 self._jdf.saveAsParquetFile(path)
122
123 def registerTempTable(self, name):
/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/java_gateway.py
in __call__(self, *args)
536 answer = self.gateway_client.send_command(command)
537 return_value = get_return_value(answer, self.gateway_client,
--> 538 self.target_id, self.name)
539
540 for temp_arg in temp_args:
/opt/cloudera/parcels/CDH/lib/spark/python/lib/py4j-0.8.2.1-src.zip/py4j/protocol.py
in get_return_value(answer, gateway_client, target_id, name)
298 raise Py4JJavaError(
299 'An error occurred while calling {0}{1}{2}.\n'.
--> 300 format(target_id, '.', name), value)
301 else:
302 raise Py4JError(
Py4JJavaError: An error occurred while calling o206.saveAsParquetFile.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 72
in stage 28.1 failed 4 times, most recent failure: Lost task 72.3 in stage
28.1 (TID 603, phd4.xxx.com): java.io.FileNotFoundException:
/hdata/1/yarn/nm/usercache/me/appcache/application_1446009923448_20507/blockmgr-20140dd7-7f36-4971-afe7-0278241f4439/22/temp_shuffle_c274830e-f626-4ae6-8923-ad72c561a84e
(No such file or directory)
at java.io.FileOutputStream.open(Native Method)
at java.io.FileOutputStream.<init>(FileOutputStream.java:221)
at
org.apache.spark.storage.DiskBlockObjectWriter.open(BlockObjectWriter.scala:130)
at
org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:360)
at
org.apache.spark.util.collection.ExternalSorter$$anonfun$spillToPartitionFiles$1.apply(ExternalSorter.scala:355)
at scala.Array$.fill(Array.scala:267)
at
org.apache.spark.util.collection.ExternalSorter.spillToPartitionFiles(ExternalSorter.scala:355)
at
org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:211)
at
org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:68)
at
org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:64)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:203)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at
org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1203)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1192)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1191)
at
scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at
org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1191)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at
org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:693)
at scala.Option.foreach(Option.scala:236)
at
org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:693)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1393)
at
org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1354)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
--
View this message in context:
http://apache-spark-user-list.1001560.n3.nabble.com/Vague-Spark-SQL-error-message-with-saveAsParquetFile-tp25265.html
Sent from the Apache Spark User List mailing list archive at Nabble.com.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]