Abhi created KAFKA-7812: --------------------------- Summary: Deadlock in SaslServerAuthenticator related threads Key: KAFKA-7812 URL: https://issues.apache.org/jira/browse/KAFKA-7812 Project: Kafka Issue Type: Bug Components: core Affects Versions: 2.0.0 Reporter: Abhi Attachments: threaddump-100cpu-tbd-broker-5
I am encountering a deadlock situation in SaslServerAuthenticator related code path where one thread is waiting for a monitor object locked by another thread. +*Thread 1:*+ "kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-0" #66 prio=5 os_prio=0 tid=0x00007fe131e17000 nid=0x78f7 runnable [{color:#d04437}*0x00007fde287ed000*{color}] java.lang.Thread.State: RUNNABLE at java.util.HashMap$TreeNode.find(HashMap.java:1865) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.find(HashMap.java:1861) at java.util.HashMap$TreeNode.putTreeVal(HashMap.java:1979) at java.util.HashMap.putVal(HashMap.java:637) at java.util.HashMap.put(HashMap.java:611) at java.util.HashSet.add(HashSet.java:219) at javax.security.auth.Subject$ClassSet.populateSet(Subject.java:1418) at javax.security.auth.Subject$ClassSet.<init>(Subject.java:1372) - *{color:#f79232}locked <0x000000068893aae8>{color}* (a java.util.Collections$SynchronizedSet) at javax.security.auth.Subject.getPrivateCredentials(Subject.java:767) at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:343) at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:335) at java.security.AccessController.doPrivileged(Native Method) at sun.security.jgss.GSSUtil.searchSubject(GSSUtil.java:335) at sun.security.jgss.wrapper.NativeGSSFactory.getCredFromSubject(NativeGSSFactory.java:53) at sun.security.jgss.wrapper.NativeGSSFactory.getCredentialElement(NativeGSSFactory.java:116) at sun.security.jgss.GSSManagerImpl.getCredentialElement(GSSManagerImpl.java:193) at sun.security.jgss.GSSCredentialImpl.add(GSSCredentialImpl.java:427) at sun.security.jgss.GSSCredentialImpl.<init>(GSSCredentialImpl.java:62) at sun.security.jgss.GSSManagerImpl.createCredential(GSSManagerImpl.java:154) at com.sun.security.sasl.gsskerb.GssKrb5Server.<init>(GssKrb5Server.java:108) at com.sun.security.sasl.gsskerb.FactoryImpl.createSaslServer(FactoryImpl.java:85) at javax.security.sasl.Sasl.createSaslServer(Sasl.java:524) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:215) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:213) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:213) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253) at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127) at org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487) at org.apache.kafka.common.network.Selector.poll(Selector.java:425) at kafka.network.Processor.poll(SocketServer.scala:678) at kafka.network.Processor.run(SocketServer.scala:583) at java.lang.Thread.run(Thread.java:745) Locked ownable synchronizers: - None *+Thread 2:+* "kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-2" #68 prio=5 os_prio=0 tid=0x00007fe131e1a800 nid=0x78f9 waiting for monitor entry [{color:#d04437}*0x00007fde277ed000*{color}] java.lang.Thread.State: BLOCKED (on object monitor) at java.util.Collections$SynchronizedCollection.add(Collections.java:2035) - {color:#f79232}*waiting to lock <0x000000068893aae8>*{color} (a java.util.Collections$SynchronizedSet) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:206) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253) at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127) at org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487) at org.apache.kafka.common.network.Selector.poll(Selector.java:425) at kafka.network.Processor.poll(SocketServer.scala:678) at kafka.network.Processor.run(SocketServer.scala:583) at java.lang.Thread.run(Thread.java:745) Locked ownable synchronizers: - None +*Thread 3:*+ "kafka-network-thread-5-ListenerName(SASL_PLAINTEXT)-SASL_PLAINTEXT-1" #67 prio=5 os_prio=0 tid=0x00007fe131e18800 nid=0x78f8 waiting for monitor entry [{color:#59afe1}0x00007fde27fec000{color}] java.lang.Thread.State: BLOCKED (on object monitor) at javax.security.auth.Subject$ClassSet.<init>(Subject.java:1372) - *{color:#f79232}waiting to lock <0x000000068893aae8>{color}* (a java.util.Collections$SynchronizedSet) at javax.security.auth.Subject.getPrivateCredentials(Subject.java:767) at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:343) at sun.security.jgss.GSSUtil$1.run(GSSUtil.java:335) at java.security.AccessController.doPrivileged(Native Method) at sun.security.jgss.GSSUtil.searchSubject(GSSUtil.java:335) at sun.security.jgss.wrapper.NativeGSSFactory.getCredFromSubject(NativeGSSFactory.java:53) at sun.security.jgss.wrapper.NativeGSSFactory.getCredentialElement(NativeGSSFactory.java:116) at sun.security.jgss.GSSManagerImpl.getCredentialElement(GSSManagerImpl.java:193) at sun.security.jgss.GSSCredentialImpl.add(GSSCredentialImpl.java:427) at sun.security.jgss.GSSCredentialImpl.<init>(GSSCredentialImpl.java:62) at sun.security.jgss.GSSManagerImpl.createCredential(GSSManagerImpl.java:154) at com.sun.security.sasl.gsskerb.GssKrb5Server.<init>(GssKrb5Server.java:108) at com.sun.security.sasl.gsskerb.FactoryImpl.createSaslServer(FactoryImpl.java:85) at javax.security.sasl.Sasl.createSaslServer(Sasl.java:524) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:215) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator$2.run(SaslServerAuthenticator.java:213) at java.security.AccessController.doPrivileged(Native Method) at javax.security.auth.Subject.doAs(Subject.java:422) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslKerberosServer(SaslServerAuthenticator.java:213) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.createSaslServer(SaslServerAuthenticator.java:162) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.handleKafkaRequest(SaslServerAuthenticator.java:443) at org.apache.kafka.common.security.authenticator.SaslServerAuthenticator.authenticate(SaslServerAuthenticator.java:253) at org.apache.kafka.common.network.KafkaChannel.prepare(KafkaChannel.java:127) at org.apache.kafka.common.network.Selector.pollSelectionKeys(Selector.java:487) at org.apache.kafka.common.network.Selector.poll(Selector.java:425) at kafka.network.Processor.poll(SocketServer.scala:678) at kafka.network.Processor.run(SocketServer.scala:583) at java.lang.Thread.run(Thread.java:745) Locked ownable synchronizers: - None This deadlock is causing the kafka broker to run at 100% causing exceptions in other brokers and affecting replication and other functionalities. +output of top:+ PID USER PR NI VIRT RES SHR S *%CPU* %MEM TIME+ COMMAND 30857 kafkatst 20 0 14.9g 3.9g 17600 S *100.0* 12.7 2727:53 java The connections with other brokers are in stuck in CLOSE_WAIT state: afka...@kafkatest-01.tbd[nj2]:/local/kafka> lsof -p 30857 | grep TCP java 30857 kafkatst 88u IPv4 63730863 0t0 TCP *:34982 (LISTEN) java 30857 kafkatst 89u IPv4 63730864 0t0 TCP *:mwkafka_staging_jmx (LISTEN) java 30857 kafkatst 108u IPv4 67204682 0t0 TCP kafkatest-01.tbd.xxxx.com:44540->mwkafka-zk-test-02.nyc.xxxx.com:eforward (ESTABLISHED) java 30857 kafkatst 288u IPv4 63728109 0t0 TCP *:XmlIpcRegSvc (LISTEN) java 30857 kafkatst 298u IPv4 68617055 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.dr.xxxx.com:34318 (ESTABLISHED) java 30857 kafkatst 305u IPv4 63722102 0t0 TCP kafkatest-01.tbd.xxxx.com:41354->mwkafka-zk-test-05.tbd.xxxx.com:eforward (ESTABLISHED) java 30857 kafkatst 308u IPv4 68616940 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.dr.xxxx.com:37050 (CLOSE_WAIT) java 30857 kafkatst 318u IPv4 63728144 0t0 TCP kafkatest-01.tbd.xxxx.com:m3ap->kafkatest-01.dr.xxxx.com:XmlIpcRegSvc (ESTABLISHED) java 30857 kafkatst 319u IPv4 63722104 0t0 TCP kafkatest-01.tbd.xxxx.com:50960->kafkatest-01.nyc.xxxx.com:XmlIpcRegSvc (ESTABLISHED) java 30857 kafkatst 320u IPv4 63722109 0t0 TCP kafkatest-01.tbd.xxxx.com:42892->kafkatest-02.nyc.xxxx.com:XmlIpcRegSvc (ESTABLISHED) java 30857 kafkatst 321u IPv4 68616946 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.nyc.xxxx.com:54892 (CLOSE_WAIT) java 30857 kafkatst 322u IPv4 68617070 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.nyc.xxxx.com:47308 (ESTABLISHED) java 30857 kafkatst 323u IPv4 68617081 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.dr.xxxx.com:37060 (ESTABLISHED) java 30857 kafkatst 324u IPv4 68617095 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-02.nyc.xxxx.com:54904 (ESTABLISHED) java 30857 kafkatst 327u IPv4 68616885 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.dr.xxxx.com:34298 (CLOSE_WAIT) java 30857 kafkatst 328u IPv4 68616894 0t0 TCP kafkatest-01.tbd.xxxx.com:XmlIpcRegSvc->kafkatest-01.nyc.xxxx.com:47290 (CLOSE_WAIT) I have attached a full thread dump here for reference. Please check this. -- This message was sent by Atlassian JIRA (v7.6.3#76005)