Oleksiy Sayankin created HIVE-19587:
---------------------------------------
Summary: HeartBeat thread uses cancelled delegation token while
connecting to meta on KERBEROS cluster
Key: HIVE-19587
URL: https://issues.apache.org/jira/browse/HIVE-19587
Project: Hive
Issue Type: Bug
Reporter: Oleksiy Sayankin
Assignee: Oleksiy Sayankin
*STEP 1. Create test data*
{code}
create table t1 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
create table t2 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ",";
{code}
Generate 10 000 000 lines of random data
{code}
package com.test.app;
import java.io.FileNotFoundException;
import java.io.PrintWriter;
import java.util.concurrent.ThreadLocalRandom;
public class App {
public static void main(String[] args) throws FileNotFoundException {
try (PrintWriter out = new PrintWriter("table.data");) {
int min = 0;
int max = 10_000;
int numRows = 10_000_000;
for (int i = 0; i <= numRows - 1; i++){
int randomNum = ThreadLocalRandom.current().nextInt(min, max + 1);
out.println(randomNum);
}
}
}
}
{code}
Upload data to Hive tables
{code}
load data local inpath '/home/myuser/table.data' into table t1;
load data local inpath '/home/myuser/table.data' into table t2;
{code}
*STEP 2. Configure transactions in hive-site.xml*
{code}
<!-- Transactions -->
<property>
<name>hive.exec.dynamic.partition.mode</name>
<value>nonstrict</value>
</property>
<property>
<name>hive.support.concurrency</name>
<value>true</value>
</property>
<property>
<name>hive.enforce.bucketing</name>
<value>true</value>
</property>
<property>
<name>hive.txn.manager</name>
<value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
</property>
<property>
<name>hive.compactor.initiator.on</name>
<value>true</value>
</property>
<property>
<name>hive.compactor.worker.threads</name>
<value>1</value>
</property>
{code}
*STEP 3. Configure hive.txn.timeout in hive-site.xml*
{code}
<!-- Delegation token issue -->
<property>
<name>hive.txn.timeout</name>
<value>10s</value>
</property>
{code}
*STEP 4. Connect via beeline to HS2 with KERBEROS*
{code}
!connect
jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
{code}
{code}
select count(*) from t1;
{code}
*STEP 5. Close connection and reconnect*
{code}
!close
{code}
{code}
!connect
jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
{code}
*STEP 6. Perform long playing query*
This query lasts about 600s
{code}
select count(*) from t1 join t2 on t1.id = t2.id;
{code}
*EXPECTED RESULT*
Query finishes successfully
*ACTUAL RESULT*
{code}
2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: SASL
negotiation failure
javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
at
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
at
com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
at
org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
at
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
at
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
at
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:360)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
at
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token
expired or does not exist: owner=myuser, renewer=myuser, realUser=,
issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, masterKeyId=1
at
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
at
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:56)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.getPassword(HadoopThriftAuthBridge.java:472)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.handle(HadoopThriftAuthBridge.java:503)
at
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:589)
... 15 more
2018-05-17T13:54:54,966 ERROR [pool-7-thread-10] server.TThreadPoolServer:
Error occurred during processing of message.
java.lang.RuntimeException: org.apache.thrift.transport.TTransportException:
DIGEST-MD5: IO error acquiring password
at
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:219)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:360)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
at
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.thrift.transport.TTransportException: DIGEST-MD5: IO
error acquiring password
at
org.apache.thrift.transport.TSaslTransport.sendAndThrowMessage(TSaslTransport.java:232)
at
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:316)
at
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
at
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
... 10 more
{code}
*ROOT-CAUSE*
HS2 uses cancelled delegation token to connect to metastore. From logs
1. Metastore creates delegation token (issueDate=1526565229297)
{code}
2018-05-17T13:53:49,297 INFO [pool-7-thread-3]
delegation.AbstractDelegationTokenSecretManager: Creating password for
identifier: owner=myuser, renewer=myuser, realUser=, issueDate=1526565229297, m$
2018-05-17T13:53:50,789 INFO [Thread-18] txn.TxnHandler: 'Cleaner' locked by
'node8.cluster'
2018-05-17T13:53:50,793 INFO [Thread-18] txn.TxnHandler: 'Cleaner' unlocked by
'node8.cluster'
2018-05-17T13:53:54,840 INFO [pool-7-thread-4] metastore.HiveMetaStore: 3:
Opening raw store with implementation
class:org.apache.hadoop.hive.metastore.ObjectStore
{code}
2. Metastore cancels delegation token (issueDate=1526565229297)
{code}
2018-05-17T13:54:32,380 INFO [pool-7-thread-4]
thrift.TokenStoreDelegationTokenSecretManager: Token cancelation requested for
identifier: owner=mapr, renewer=mapr, realUser=, issueDate=1526565229297,
maxDate=1527170029297,$
2018-05-17T13:54:32,416 INFO [pool-7-thread-4] metastore.HiveMetaStore: 3:
Cleaning up thread local RawStore...
{code}
3. HS2 uses canceled delegation token and crashes (issueDate=1526565229297):
{code}
2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: SASL
negotiation failure
javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
at
com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
at
com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
at
org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
at
org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
at
org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
at
org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:360)
at
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
at
org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
at
org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
at
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token
expired or does not exist: owner=myuser, renewer=myuser, realUser=,
issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, masterKeyId=1
at
org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
{code}
*WORKAROUND*
>From Hive sources:
{code}
heartBeatInterval = HIVE_TXN_TIMEOUT / 2.
{code}
So to skip HB during job one needs to set
{code}
HIVE_TXN_TIMEOUT = (max_job_duration * 2) * 1.25.
{code}
Here I added 1.25 factor to be sure there is no HB. In my case max_job_duration
= ~600s, then
{code}
HIVE_TXN_TIMEOUT = 600s * 2 * 1.25 = 1500s
{code}
When I {{set hive.txn.timeout}} = 1500s issue was gone on my local env.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)