[ https://issues.apache.org/jira/browse/FLINK-4632?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15714684#comment-15714684 ]
刘喆 edited comment on FLINK-4632 at 12/2/16 10:10 AM: ----------------------------------------------------- It appears again. This time, there is no container lost. This time I get the logs. On JobManager: 2016-12-02 16:27:17,275 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (1/10) (ca18e99f339595bf987913f543ef3f54) switched from DEPLOYING to RUNNING 2016-12-02 16:27:17,292 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (5/10) (733ebe58c797aa6e6ed9e85004d6c201) switched from DEPLOYING to RUNNING 2016-12-02 16:27:48,523 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (9/10) (fa903c45717171026fae9997ec22b594) switched from RUNNING to FAILED 2016-12-02 16:27:48,523 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Job Flink Streaming Java API Skeleton (c7b96c59cb04fd2cd096b23f85cebdb4) switched from state RUNNING to FAILING. org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: Connection timed out at org.apache.flink.runtime.io.network.netty.PartitionRequestClientHandler.exceptionCaught(PartitionRequestClientHandler.java:152) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at 
io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelHandlerAdapter.exceptionCaught(ChannelHandlerAdapter.java:79) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:835) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.handleReadException(AbstractNioByteChannel.java:87) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:162) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) at java.lang.Thread.run(Thread.java:745) Caused by: java.io.IOException: Connection timed out at sun.nio.ch.FileDispatcherImpl.read0(Native Method) at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39) at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223) at sun.nio.ch.IOUtil.read(IOUtil.java:192) at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380) at io.netty.buffer.PooledUnsafeDirectByteBuf.setBytes(PooledUnsafeDirectByteBuf.java:311) at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:881) at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:241) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:119) ... 
6 more 2016-12-02 16:27:48,524 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Source: Custom Source -> Map (1/10) (d99ba0f0a16c31e717c8044e92e0e8c4) switched from RUNNING to CANCELING 2016-12-02 16:27:48,524 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Source: Custom Source -> Map (2/10) (294b46febec5e49a8a859d98ceca2a9e) switched from RUNNING to CANCELING on TaskManager: 2016-12-02 16:27:18,177 INFO org.apache.zookeeper.ClientCnxn - Socket connection established to 10.10.10.203/10.10.10.203:2181, initiating session 2016-12-02 16:27:18,183 INFO org.apache.zookeeper.ClientCnxn - Session establishment complete on server 10.10.10.203/10.10.10.203:2181, sessionid = 0x358be99d7b900a1, negotiated timeout = 40000 2016-12-02 16:27:18,184 INFO org.apache.flink.shaded.org.apache.curator.framework.state.ConnectionStateManager - State change: CONNECTED 2016-12-02 16:27:18,191 INFO org.apache.flink.streaming.connectors.kafka.internals.ZookeeperOffsetHandler - Offset for TopicPartition pview:8 was set to 68844710011 in ZooKeeper. Seeking fetcher to that position. 
2016-12-02 16:27:18,191 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Starting periodic offset committer, with commit interval of 60000ms 2016-12-02 16:27:18,192 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Assigning 1 partitions to broker threads 2016-12-02 16:27:18,192 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Refreshing leader information for partitions [Partition: KafkaTopicPartition{topic='pview', partition=8}, KafkaPartitionHandle=[pview,8], offset=68844710010] 2016-12-02 16:27:18,216 INFO org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase - Trying to get topic metadata from broker 10.10.10.203:9092 in try 0/3 2016-12-02 16:27:18,221 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Starting thread SimpleConsumer - Source: Custom Source -> Map - broker-3 (wx-kafka-08:9092) 2016-12-02 16:27:18,221 INFO org.apache.flink.streaming.connectors.kafka.internals.SimpleConsumerThread - Starting to fetch from [Partition: KafkaTopicPartition{topic='pview', partition=8}, KafkaPartitionHandle=[pview,8], offset=68844710010] 2016-12-02 16:27:48,472 ERROR org.apache.flink.runtime.taskmanager.Task - Task execution failed. 
org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: Connection timed out at org.apache.flink.runtime.io.network.netty.PartitionRequestClientHandler.exceptionCaught(PartitionRequestClientHandler.java:152) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) ...... ( as JobManager logs above) was (Author: liuzhe): It appears again. This time I get the logs. On JobManager: 2016-12-02 16:27:17,275 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (1/10) (ca18e99f339595bf987913f543ef3f54) switched from DEPLOYING to RUNNING 2016-12-02 16:27:17,292 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (5/10) (733ebe58c797aa6e6ed9e85004d6c201) switched from DEPLOYING to RUNNING 2016-12-02 16:27:48,523 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Sink: Unnamed (9/10) (fa903c45717171026fae9997ec22b594) switched from RUNNING to FAILED 2016-12-02 16:27:48,523 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Job Flink Streaming Java API Skeleton (c7b96c59cb04fd2cd096b23f85cebdb4) switched from state RUNNING to FAILING. 
org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: Connection timed out at org.apache.flink.runtime.io.network.netty.PartitionRequestClientHandler.exceptionCaught(PartitionRequestClientHandler.java:152) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelHandlerAdapter.exceptionCaught(ChannelHandlerAdapter.java:79) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:835) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.handleReadException(AbstractNioByteChannel.java:87) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:162) at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511) at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) at 
io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) at java.lang.Thread.run(Thread.java:745) Caused by: java.io.IOException: Connection timed out at sun.nio.ch.FileDispatcherImpl.read0(Native Method) at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39) at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223) at sun.nio.ch.IOUtil.read(IOUtil.java:192) at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:380) at io.netty.buffer.PooledUnsafeDirectByteBuf.setBytes(PooledUnsafeDirectByteBuf.java:311) at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:881) at io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:241) at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:119) ... 6 more 2016-12-02 16:27:48,524 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Source: Custom Source -> Map (1/10) (d99ba0f0a16c31e717c8044e92e0e8c4) switched from RUNNING to CANCELING 2016-12-02 16:27:48,524 INFO org.apache.flink.runtime.executiongraph.ExecutionGraph - Source: Custom Source -> Map (2/10) (294b46febec5e49a8a859d98ceca2a9e) switched from RUNNING to CANCELING on TaskManager: 2016-12-02 16:27:18,177 INFO org.apache.zookeeper.ClientCnxn - Socket connection established to 10.10.10.203/10.10.10.203:2181, initiating session 2016-12-02 16:27:18,183 INFO org.apache.zookeeper.ClientCnxn - Session establishment complete on server 10.10.10.203/10.10.10.203:2181, sessionid = 0x358be99d7b900a1, negotiated timeout = 40000 2016-12-02 16:27:18,184 INFO org.apache.flink.shaded.org.apache.curator.framework.state.ConnectionStateManager - State change: CONNECTED 2016-12-02 16:27:18,191 INFO org.apache.flink.streaming.connectors.kafka.internals.ZookeeperOffsetHandler - Offset for TopicPartition pview:8 was set to 68844710011 in ZooKeeper. Seeking fetcher to that position. 
2016-12-02 16:27:18,191 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Starting periodic offset committer, with commit interval of 60000ms 2016-12-02 16:27:18,192 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Assigning 1 partitions to broker threads 2016-12-02 16:27:18,192 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Refreshing leader information for partitions [Partition: KafkaTopicPartition{topic='pview', partition=8}, KafkaPartitionHandle=[pview,8], offset=68844710010] 2016-12-02 16:27:18,216 INFO org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumerBase - Trying to get topic metadata from broker 10.10.10.203:9092 in try 0/3 2016-12-02 16:27:18,221 INFO org.apache.flink.streaming.connectors.kafka.internals.Kafka08Fetcher - Starting thread SimpleConsumer - Source: Custom Source -> Map - broker-3 (wx-kafka-08:9092) 2016-12-02 16:27:18,221 INFO org.apache.flink.streaming.connectors.kafka.internals.SimpleConsumerThread - Starting to fetch from [Partition: KafkaTopicPartition{topic='pview', partition=8}, KafkaPartitionHandle=[pview,8], offset=68844710010] 2016-12-02 16:27:48,472 ERROR org.apache.flink.runtime.taskmanager.Task - Task execution failed. 
org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: Connection timed out at org.apache.flink.runtime.io.network.netty.PartitionRequestClientHandler.exceptionCaught(PartitionRequestClientHandler.java:152) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) at io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) at io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) at io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) ...... ( as JobManager logs above) > when yarn nodemanager lost, flink hung > --------------------------------------- > > Key: FLINK-4632 > URL: https://issues.apache.org/jira/browse/FLINK-4632 > Project: Flink > Issue Type: Bug > Components: Distributed Coordination, Streaming > Affects Versions: 1.2.0, 1.1.2 > Environment: cdh5.5.1 jdk1.7 flink1.1.2 1.2-snapshot kafka0.8.2 > Reporter: 刘喆 > Priority: Blocker > > When run flink streaming on yarn, using kafka as source, it runs well when > start. 
But after long run, for example 8 hours, dealing 60,000,000+ > messages, it hung: no messages consumed, one taskmanager is CANCELING, the > exception show: > org.apache.flink.runtime.io.network.netty.exception.LocalTransportException: > connection timeout > at > org.apache.flink.runtime.io.network.netty.PartitionRequestClientHandler.exceptionCaught(PartitionRequestClientHandler.java:152) > at > io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) > at > io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) > at > io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) > at > io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) > at > io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) > at > io.netty.channel.ChannelInboundHandlerAdapter.exceptionCaught(ChannelInboundHandlerAdapter.java:131) > at > io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) > at > io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) > at > io.netty.channel.ChannelHandlerAdapter.exceptionCaught(ChannelHandlerAdapter.java:79) > at > io.netty.channel.AbstractChannelHandlerContext.invokeExceptionCaught(AbstractChannelHandlerContext.java:275) > at > io.netty.channel.AbstractChannelHandlerContext.fireExceptionCaught(AbstractChannelHandlerContext.java:253) > at > io.netty.channel.DefaultChannelPipeline.fireExceptionCaught(DefaultChannelPipeline.java:835) > at > io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.handleReadException(AbstractNioByteChannel.java:87) > at > io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:162) > at > 
io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:511) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:468) > at > io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:382) > at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:354) > at > io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) > at java.lang.Thread.run(Thread.java:745) > Caused by: java.io.IOException: 连接超时 > at sun.nio.ch.FileDispatcherImpl.read0(Native Method) > at sun.nio.ch.SocketDispatcher.read(SocketDispatcher.java:39) > at sun.nio.ch.IOUtil.readIntoNativeBuffer(IOUtil.java:223) > at sun.nio.ch.IOUtil.read(IOUtil.java:192) > at sun.nio.ch.SocketChannelImpl.read(SocketChannelImpl.java:379) > at > io.netty.buffer.PooledUnsafeDirectByteBuf.setBytes(PooledUnsafeDirectByteBuf.java:311) > at io.netty.buffer.AbstractByteBuf.writeBytes(AbstractByteBuf.java:881) > at > io.netty.channel.socket.nio.NioSocketChannel.doReadBytes(NioSocketChannel.java:241) > at > io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:119) > ... 
6 more > after apply https://issues.apache.org/jira/browse/FLINK-4625 > it show: > java.lang.Exception: TaskManager was lost/killed: > ResourceID{resourceId='container_1471620986643_744852_01_001400'} @ > 38.slave.adh (dataPort=45349) > at > org.apache.flink.runtime.instance.SimpleSlot.releaseSlot(SimpleSlot.java:162) > at > org.apache.flink.runtime.instance.SlotSharingGroupAssignment.releaseSharedSlot(SlotSharingGroupAssignment.java:533) > at > org.apache.flink.runtime.instance.SharedSlot.releaseSlot(SharedSlot.java:138) > at > org.apache.flink.runtime.instance.Instance.markDead(Instance.java:167) > at > org.apache.flink.runtime.instance.InstanceManager.unregisterTaskManager(InstanceManager.java:224) > at > org.apache.flink.runtime.jobmanager.JobManager.org$apache$flink$runtime$jobmanager$JobManager$$handleTaskManagerTerminated(JobManager.scala:1054) > at > org.apache.flink.runtime.jobmanager.JobManager$$anonfun$handleMessage$1.applyOrElse(JobManager.scala:458) > at > scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36) > at > org.apache.flink.runtime.clusterframework.ContaineredJobManager$$anonfun$handleContainerMessage$1.applyOrElse(ContaineredJobManager.scala:100) > at scala.PartialFunction$OrElse.apply(PartialFunction.scala:167) > at > org.apache.flink.runtime.LeaderSessionMessageFilter$$anonfun$receive$1.applyOrElse(LeaderSessionMessageFilter.scala:36) > at > scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:36) > at > org.apache.flink.runtime.LogMessages$$anon$1.apply(LogMessages.scala:33) > at > org.apache.flink.runtime.LogMessages$$anon$1.apply(LogMessages.scala:28) > at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123) > at > org.apache.flink.runtime.LogMessages$$anon$1.applyOrElse(LogMessages.scala:28) > at akka.actor.Actor$class.aroundReceive(Actor.scala:465) > at > org.apache.flink.runtime.jobmanager.JobManager.aroundReceive(JobManager.scala:118) > at 
akka.actor.ActorCell.receiveMessage(ActorCell.scala:516) > at akka.actor.ActorCell.invoke(ActorCell.scala:487) > at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:254) > at akka.dispatch.Mailbox.run(Mailbox.scala:221) > at akka.dispatch.Mailbox.exec(Mailbox.scala:231) > at scala.concurrent.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260) > at > scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.pollAndExecAll(ForkJoinPool.java:1253) > at > scala.concurrent.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1346) > at > scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979) > at > scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107) -- This message was sent by Atlassian JIRA (v6.3.4#6332)