Is there anyone who can shed some light on this issue? Thanks, Martin
2017-07-21 18:58 GMT-07:00 Martin Peng <wei...@gmail.com>: > Hi, > > I have several Spark jobs including both batch job and Stream jobs to > process the system log and analyze them. We are using Kafka as the pipeline > to connect each jobs. > > Once upgrade to Spark 2.1.0 + Spark Kafka Streaming 010, I found some of > the jobs(both batch or streaming) are thrown below exceptions > randomly(either after several hours run or just run in 20 mins). Can anyone > give me some suggestions about how to figure out the real root cause? > (Looks like google result is not very useful...) > > Thanks, > Martin > > 00:30:04,510 WARN - 17/07/22 00:30:04 WARN TaskSetManager: Lost task 60.0 > in stage 1518490.0 (TID 338070, 10.133.96.21, executor 0): > java.io.FileNotFoundException: /mnt/mesos/work_dir/slaves/ > 20160924-021501-274760970-5050-7646-S2/frameworks/40aeb8e5-e82a-4df9-b034- > 8815a7a7564b-2543/executors/0/runs/fd15c15d-2511-4f37-a106- > 27431f583153/blockmgr-a0e0e673-f88b-4d12-a802- > c35643e6c6b2/33/shuffle_2090_60_0.index.b66235be-79be-4455-9759-1c7ba70f91f6 > (No such file or directory) > 00:30:04,510 WARN - at java.io.FileOutputStream.open0(Native Method) > 00:30:04,510 WARN - at java.io.FileOutputStream.open( > FileOutputStream.java:270) > 00:30:04,510 WARN - at java.io.FileOutputStream.< > init>(FileOutputStream.java:213) > 00:30:04,510 WARN - at java.io.FileOutputStream.< > init>(FileOutputStream.java:162) > 00:30:04,510 WARN - at org.apache.spark.shuffle. > IndexShuffleBlockResolver.writeIndexFileAndCommit( > IndexShuffleBlockResolver.scala:144) > 00:30:04,510 WARN - at org.apache.spark.shuffle.sort. > BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:128) > 00:30:04,510 WARN - at org.apache.spark.scheduler. > ShuffleMapTask.runTask(ShuffleMapTask.scala:96) > 00:30:04,510 WARN - at org.apache.spark.scheduler. > ShuffleMapTask.runTask(ShuffleMapTask.scala:53) > 00:30:04,510 WARN - at org.apache.spark.scheduler. 
> Task.run(Task.scala:99) > 00:30:04,510 WARN - at org.apache.spark.executor. > Executor$TaskRunner.run(Executor.scala:282) > 00:30:04,510 WARN - at java.util.concurrent. > ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) > 00:30:04,510 WARN - at java.util.concurrent. > ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > 00:30:04,510 WARN - at java.lang.Thread.run(Thread.java:748) > > 00:30:04,580 INFO - Driver stacktrace: > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.org > $apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages( > DAGScheduler.scala:1435) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$ > abortStage$1.apply(DAGScheduler.scala:1423) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$ > abortStage$1.apply(DAGScheduler.scala:1422) > 00:30:04,580 INFO - scala.collection.mutable. > ResizableArray$class.foreach(ResizableArray.scala:59) > 00:30:04,580 INFO - scala.collection.mutable.ArrayBuffer.foreach( > ArrayBuffer.scala:48) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.abortStage( > DAGScheduler.scala:1422) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$ > handleTaskSetFailed$1.apply(DAGScheduler.scala:802) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler$$anonfun$ > handleTaskSetFailed$1.apply(DAGScheduler.scala:802) > 00:30:04,580 INFO - scala.Option.foreach(Option.scala:257) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler. > handleTaskSetFailed(DAGScheduler.scala:802) > 00:30:04,580 INFO - org.apache.spark.scheduler. > DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650) > 00:30:04,580 INFO - org.apache.spark.scheduler. > DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605) > 00:30:04,580 INFO - org.apache.spark.scheduler. 
> DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594) > 00:30:04,580 INFO - org.apache.spark.util.EventLoop$$anon$1.run( > EventLoop.scala:48) > 00:30:04,580 INFO - org.apache.spark.scheduler.DAGScheduler.runJob( > DAGScheduler.scala:628) > 00:30:04,580 INFO - org.apache.spark.SparkContext. > runJob(SparkContext.scala:1918) > 00:30:04,580 INFO - org.apache.spark.SparkContext. > runJob(SparkContext.scala:1931) > 00:30:04,580 INFO - org.apache.spark.SparkContext. > runJob(SparkContext.scala:1944) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD. > scala:1353) > 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope( > RDDOperationScope.scala:151) > 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope( > RDDOperationScope.scala:112) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD.take(RDD.scala:1326) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$ > anonfun$isEmpty$1.apply$mcZ$sp(RDD.scala:1461) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$ > anonfun$isEmpty$1.apply(RDD.scala:1461) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD$$ > anonfun$isEmpty$1.apply(RDD.scala:1461) > 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope( > RDDOperationScope.scala:151) > 00:30:04,580 INFO - org.apache.spark.rdd.RDDOperationScope$.withScope( > RDDOperationScope.scala:112) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD.withScope(RDD.scala:362) > 00:30:04,580 INFO - org.apache.spark.rdd.RDD.isEmpty(RDD.scala:1460) > 00:30:04,580 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment. > ClientLogsEnrichmentJob$.executeIteration(ClientLogsEnrichmentJob.scala: > 133) > 00:30:04,580 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment. > ClientLogsEnrichmentJob$.runIteration(ClientLogsEnrichmentJob.scala:76) > 00:30:04,581 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment. 
> ClientLogsEnrichmentJob$.runBatch(ClientLogsEnrichmentJob.scala:59) > 00:30:04,581 INFO - com.ericsson.mediafirst.sparkutils.jobtemplates. > BatchJob.main(BatchJob.scala:35) > 00:30:04,581 INFO - com.ericsson.mediafirst.spark.clientlogsenrichment. > ClientLogsEnrichmentJob.main(ClientLogsEnrichmentJob.scala) > 00:30:04,581 INFO - sun.reflect.NativeMethodAccessorImpl.invoke0(Native > Method) > 00:30:04,581 INFO - sun.reflect.NativeMethodAccessorImpl.invoke( > NativeMethodAccessorImpl.java:62) > 00:30:04,581 INFO - sun.reflect.DelegatingMethodAccessorImpl.invoke( > DelegatingMethodAccessorImpl.java:43) > 00:30:04,581 INFO - java.lang.reflect.Method.invoke(Method.java:498) > 00:30:04,581 INFO - org.apache.spark.deploy. > SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain( > SparkSubmit.scala:738) > 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit$.doRunMain$1( > SparkSubmit.scala:187) > 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit$.submit( > SparkSubmit.scala:212) > 00:30:04,581 INFO - org.apache.spark.deploy. > SparkSubmit$.main(SparkSubmit.scala:126) > 00:30:04,581 INFO - org.apache.spark.deploy.SparkSubmit.main(SparkSubmit. > scala) > 00:30:04,581 WARN - 17/07/22 00:30:04 WARN JobProgressListener: Task > start for unknown stage 1518491 > 00:30:04,670 WARN - 17/07/22 00:30:04 ERROR LiveListenerBus: > SparkListenerBus has already stopped! Dropping event > SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0, > 10.133.96.21, 45377, None),rdd_15721_0,StorageLevel(memory, deserialized, > 1 replicas),12024,0)) > 00:30:04,673 WARN - 17/07/22 00:30:04 ERROR LiveListenerBus: > SparkListenerBus has already stopped! Dropping event > SparkListenerBlockUpdated(BlockUpdatedInfo(BlockManagerId(0, > 10.133.96.21, 45377, None),rdd_15721_1,StorageLevel(memory, deserialized, > 1 replicas),13736,0)) > 00:30:04,679 WARN - 17/07/22 00:30:04 ERROR TransportRequestHandler: > Error while invoking RpcHandler#receive() for one-way message. 
> 00:30:04,679 WARN - org.apache.spark.SparkException: Could not find > CoarseGrainedScheduler. > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > Dispatcher.postMessage(Dispatcher.scala:154) > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > Dispatcher.postOneWayMessage(Dispatcher.scala:134) > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > NettyRpcHandler.receive(NettyRpcEnv.scala:570) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportRequestHandler.processOneWayMessage(TransportRequestHandler.java: > 180) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportRequestHandler.handle(TransportRequestHandler.java:109) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportChannelHandler.channelRead0(TransportChannelHandler.java:119) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportChannelHandler.channelRead0(TransportChannelHandler.java:51) > 00:30:04,679 WARN - at io.netty.channel.SimpleChannelInboundHandler. > channelRead(SimpleChannelInboundHandler.java:105) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.handler.timeout. > IdleStateHandler.channelRead(IdleStateHandler.java:266) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. 
> AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.handler.codec. > MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at org.apache.spark.network.util. > TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline$ > HeadContext.channelRead(DefaultChannelPipeline.java:1294) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline. > fireChannelRead(DefaultChannelPipeline.java:911) > 00:30:04,679 WARN - at io.netty.channel.nio.AbstractNioByteChannel$ > NioByteUnsafe.read(AbstractNioByteChannel.java:131) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. > processSelectedKey(NioEventLoop.java:652) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. 
> processSelectedKeysOptimized(NioEventLoop.java:575) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. > processSelectedKeys(NioEventLoop.java:489) > 00:30:04,679 WARN - at io.netty.channel.nio. > NioEventLoop.run(NioEventLoop.java:451) > 00:30:04,679 WARN - at io.netty.util.concurrent. > SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140) > 00:30:04,679 WARN - at io.netty.util.concurrent.DefaultThreadFactory$ > DefaultRunnableDecorator.run(DefaultThreadFactory.java:144) > 00:30:04,679 WARN - at java.lang.Thread.run(Thread.java:748) > 00:30:04,679 WARN - 17/07/22 00:30:04 ERROR TransportRequestHandler: > Error while invoking RpcHandler#receive() for one-way message. > 00:30:04,679 WARN - org.apache.spark.SparkException: Could not find > CoarseGrainedScheduler. > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > Dispatcher.postMessage(Dispatcher.scala:154) > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > Dispatcher.postOneWayMessage(Dispatcher.scala:134) > 00:30:04,679 WARN - at org.apache.spark.rpc.netty. > NettyRpcHandler.receive(NettyRpcEnv.scala:570) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportRequestHandler.processOneWayMessage(TransportRequestHandler.java: > 180) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportRequestHandler.handle(TransportRequestHandler.java:109) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportChannelHandler.channelRead0(TransportChannelHandler.java:119) > 00:30:04,679 WARN - at org.apache.spark.network.server. > TransportChannelHandler.channelRead0(TransportChannelHandler.java:51) > 00:30:04,679 WARN - at io.netty.channel.SimpleChannelInboundHandler. > channelRead(SimpleChannelInboundHandler.java:105) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. 
> AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.handler.timeout. > IdleStateHandler.channelRead(IdleStateHandler.java:266) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.handler.codec. > MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at org.apache.spark.network.util. > TransportFrameDecoder.channelRead(TransportFrameDecoder.java:85) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.fireChannelRead( > AbstractChannelHandlerContext.java:346) > 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline$ > HeadContext.channelRead(DefaultChannelPipeline.java:1294) > 00:30:04,679 WARN - at io.netty.channel. 
> AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:367) > 00:30:04,679 WARN - at io.netty.channel. > AbstractChannelHandlerContext.invokeChannelRead( > AbstractChannelHandlerContext.java:353) > 00:30:04,679 WARN - at io.netty.channel.DefaultChannelPipeline. > fireChannelRead(DefaultChannelPipeline.java:911) > 00:30:04,679 WARN - at io.netty.channel.nio.AbstractNioByteChannel$ > NioByteUnsafe.read(AbstractNioByteChannel.java:131) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. > processSelectedKey(NioEventLoop.java:652) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. > processSelectedKeysOptimized(NioEventLoop.java:575) > 00:30:04,679 WARN - at io.netty.channel.nio.NioEventLoop. > processSelectedKeys(NioEventLoop.java:489) > 00:30:04,679 WARN - at io.netty.channel.nio. > NioEventLoop.run(NioEventLoop.java:451) > 00:30:04,679 WARN - at io.netty.util.concurrent. > SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:140) > 00:30:04,679 WARN - at io.netty.util.concurrent.DefaultThreadFactory$ > DefaultRunnableDecorator.run(DefaultThreadFactory.java:144) > 00:30:04,679 WARN - at java.lang.Thread.run(Thread.java:748) > 00:30:11,318 WARN - I0722 00:30:11.318724 2921 sched.cpp:2021] Asked to > stop the driver > 00:30:11,318 WARN - I0722 00:30:11.318838 2988 sched.cpp:1203] Stopping > framework 40aeb8e5-e82a-4df9-b034-8815a7a7564b-2543 >