[ https://issues.apache.org/jira/browse/KAFKA-3172?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15203901#comment-15203901 ]
Mikael Sundberg commented on KAFKA-3172: ---------------------------------------- I'm seeing the same thing. A machine can suddenly stop consuming one or several partitions, with no errors shown in the log. A thread dump gives, for example: "pool-4-thread-1" #29 prio=5 os_prio=0 tid=0x00007f8d35a5e000 nid=0x21 runnable [0x00007f8d022cc000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000eeaebf50> (a sun.nio.ch.Util$2) - locked <0x00000000eeaebf60> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000eeaebf08> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at org.apache.kafka.common.network.Selector.select(Selector.java:425) at org.apache.kafka.common.network.Selector.poll(Selector.java:254) at org.apache.kafka.clients.NetworkClient.poll(NetworkClient.java:270) at org.apache.kafka.clients.consumer.internals.ConsumerNetworkClient.clientPoll(ConsumerNetworkClient.java:303) at org.apache.kafka.clients.consumer.internals.ConsumerNetworkClient.poll(ConsumerNetworkClient.java:197) at org.apache.kafka.clients.consumer.internals.ConsumerNetworkClient.poll(ConsumerNetworkClient.java:187) at org.apache.kafka.clients.consumer.KafkaConsumer.pollOnce(KafkaConsumer.java:877) at org.apache.kafka.clients.consumer.KafkaConsumer.poll(KafkaConsumer.java:829) at com.klarna.ordermanagement.messaging.kafka.KafkaMessageConsumer.consumeRecords(KafkaMessageConsumer.java:63) at com.klarna.ordermanagement.messaging.kafka.KafkaMessageConsumer$$Lambda$87/2015198349.run(Unknown Source) at com.klarna.ordermanagement.commons.LambdaUtils.repeat(LambdaUtils.java:81) at com.klarna.ordermanagement.messaging.kafka.KafkaMessageConsumer.run(KafkaMessageConsumer.java:79) at 
java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511) at java.util.concurrent.FutureTask.run(FutureTask.java:266) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) > Consumer threads stay in 'Waiting' status and are blocked at consumer poll > method > ---------------------------------------------------------------------------------- > > Key: KAFKA-3172 > URL: https://issues.apache.org/jira/browse/KAFKA-3172 > Project: Kafka > Issue Type: Bug > Components: consumer > Affects Versions: 0.9.0.0 > Environment: linux > Reporter: Dany Benjamin > Assignee: Neha Narkhede > Priority: Critical > Fix For: 0.9.0.0 > > Attachments: jmx_info.png, jstack.png, lagSample.png > > > When running multiple consumers in the same group (400 - for a 400 partitioned > topic), the application for all threads blocks at the consumer.poll() method. The > timeout parameter sent in is 1. 
> Stack dump: > "pool-1-thread-198" #424 prio=5 os_prio=0 tid=0x00007f6bb6d53800 nid=0xc349 > waiting on condition [0x00007f63df8f7000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x0000000605812710> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) > at java.lang.Thread.run(Thread.java:745) > "kafka-producer-network-thread | producer-198" #423 daemon prio=5 os_prio=0 > tid=0x00007f6bb6d52000 nid=0xc348 runnable [0x00007f63df9f8000] > java.lang.Thread.State: RUNNABLE > at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) > at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) > at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79) > at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) > - locked <0x00000006058283e8> (a sun.nio.ch.Util$2) > - locked <0x00000006058283d8> (a > java.util.Collections$UnmodifiableSet) > - locked <0x0000000605828390> (a sun.nio.ch.EPollSelectorImpl) > at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) > at org.apache.kafka.common.network.Selector.select(Selector.java:425) > at org.apache.kafka.common.network.Selector.poll(Selector.java:254) > at org.apache.kafka.clients.NetworkClient.poll(NetworkClient.java:270) > at > org.apache.kafka.clients.producer.internals.Sender.run(Sender.java:216) > at > org.apache.kafka.clients.producer.internals.Sender.run(Sender.java:128) > at 
java.lang.Thread.run(Thread.java:745) -- This message was sent by Atlassian JIRA (v6.3.4#6332)