[ 
https://issues.apache.org/jira/browse/HIVE-19587?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17084335#comment-17084335
 ] 

Nick Coish commented on HIVE-19587:
-----------------------------------

Checking in on the patch here: The issue is still labelled "unresolved". Are 
there plans to integrate this into Hive proper? Or is this fix already in Hive 
2.x?

If so, in which version was the fix introduced? Thanks!

> HeartBeat thread uses cancelled delegation token while connecting to meta on 
> KERBEROS cluster
> ---------------------------------------------------------------------------------------------
>
>                 Key: HIVE-19587
>                 URL: https://issues.apache.org/jira/browse/HIVE-19587
>             Project: Hive
>          Issue Type: Bug
>          Components: Transactions
>    Affects Versions: 3.0.0, 2.3.2, 3.1.0
>            Reporter: Oleksiy Sayankin
>            Assignee: Oleksiy Sayankin
>            Priority: Blocker
>         Attachments: HIVE-19587.1.patch
>
>
> *STEP 1. Create test data*
> {code}
> create table t1 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY  ",";
> create table t2 (id int) ROW FORMAT DELIMITED FIELDS TERMINATED BY  ",";
> {code}
> Generate 10 000 000 lines of random data
> {code}
> package com.test.app;
> import java.io.FileNotFoundException;
> import java.io.PrintWriter;
> import java.util.concurrent.ThreadLocalRandom;
> public class App {
>   public static void main(String[] args) throws FileNotFoundException {
>     try (PrintWriter out = new PrintWriter("table.data");) {
>       int min = 0;
>       int max = 10_000;
>       int numRows = 10_000_000;
>       for (int i = 0; i <= numRows - 1; i++){
>         int randomNum = ThreadLocalRandom.current().nextInt(min, max + 1);
>         out.println(randomNum);
>       }
>     }
>   }
> }
> {code}
> Upload data to Hive tables
> {code}
> load data local inpath '/home/myuser/table.data' into table t1;
> load data local inpath '/home/myuser/table.data' into table t2;
> {code}
> *STEP 2. Configure transactions in hive-site.xml*
> {code}
> <!-- Transactions -->
>    <property>
>       <name>hive.exec.dynamic.partition.mode</name>
>       <value>nonstrict</value>
>    </property>
>    <property>
>       <name>hive.support.concurrency</name>
>       <value>true</value>
>    </property>
>    <property>
>       <name>hive.enforce.bucketing</name>
>       <value>true</value>
>    </property>
>    <property>
>       <name>hive.txn.manager</name>
>       <value>org.apache.hadoop.hive.ql.lockmgr.DbTxnManager</value>
>    </property>
>    <property>
>       <name>hive.compactor.initiator.on</name>
>       <value>true</value>
>    </property>
>    <property>
>       <name>hive.compactor.worker.threads</name>
>       <value>1</value>
>    </property>
> {code}
> *STEP 3. Configure hive.txn.timeout in hive-site.xml*
> {code}
> <!-- Delegation token issue -->
>    <property>
>       <name>hive.txn.timeout</name>
>       <value>10s</value>
>    </property>
> {code}
> *STEP 4. Connect via beeline to HS2 with KERBEROS*
> {code}
> !connect 
> jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
> {code}
> {code}
> select count(*) from t1;
> {code}
> *STEP 5. Close connection and reconnect*
> {code}
> !close
> {code}
> {code}
> !connect 
> jdbc:hive2://node8.cluster:10000/default;principal=myuser/node8.cluster@NODE8;ssl=true;sslTrustStore=/opt/myuser/conf/ssl_truststore
> {code}
> *STEP 6. Perform long playing query*
> This query lasts about 600s
> {code}
> select count(*) from t1 join t2 on t1.id = t2.id;
> {code}
> *EXPECTED RESULT*
> Query finishes successfully
> *ACTUAL RESULT*
> {code}
> 2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: 
> SASL negotiation failure
> javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
>         at 
> com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
>  
>         at 
> com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
>         at 
> org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
>  
>         at 
> org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
>         at 
> org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
>  
>         at 
> org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
>  
>         at java.security.AccessController.doPrivileged(Native Method) 
>         at javax.security.auth.Subject.doAs(Subject.java:360)
>         at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
>  
>         at 
> org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token 
> expired or does not exist: owner=myuser, renewer=myuser, realUser=, 
> issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, 
> masterKeyId=1
>         at 
> org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
>  
>         at 
> org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:56)
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.getPassword(HadoopThriftAuthBridge.java:472)
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$SaslDigestCallbackHandler.handle(HadoopThriftAuthBridge.java:503)
>  
>         at 
> com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:589)
>         ... 15 more
> 2018-05-17T13:54:54,966 ERROR [pool-7-thread-10] server.TThreadPoolServer: 
> Error occurred during processing of message.
> java.lang.RuntimeException: org.apache.thrift.transport.TTransportException: 
> DIGEST-MD5: IO error acquiring password
>         at 
> org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:219)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
>         at java.security.AccessController.doPrivileged(Native Method) 
>         at javax.security.auth.Subject.doAs(Subject.java:360)
>         at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
>  
>         at 
> org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>         at java.lang.Thread.run(Thread.java:748) 
> Caused by: org.apache.thrift.transport.TTransportException: DIGEST-MD5: IO 
> error acquiring password
>         at 
> org.apache.thrift.transport.TSaslTransport.sendAndThrowMessage(TSaslTransport.java:232)
>  
>         at 
> org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:316) 
>         at 
> org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
>  
>         at 
> org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
>  
>         ... 10 more
> {code}
> *ROOT-CAUSE*
> HS2 uses a cancelled delegation token to connect to the metastore. From the logs:
> 1. Metastore creates delegation token (issueDate=1526565229297)
> {code}
> 2018-05-17T13:53:49,297  INFO [pool-7-thread-3] 
> delegation.AbstractDelegationTokenSecretManager: Creating password for 
> identifier: owner=myuser, renewer=myuser, realUser=, issueDate=1526565229297, 
> m$
> 2018-05-17T13:53:50,789  INFO [Thread-18] txn.TxnHandler: 'Cleaner' locked by 
> 'node8.cluster'
> 2018-05-17T13:53:50,793  INFO [Thread-18] txn.TxnHandler: 'Cleaner' unlocked 
> by 'node8.cluster'
> 2018-05-17T13:53:54,840  INFO [pool-7-thread-4] metastore.HiveMetaStore: 3: 
> Opening raw store with implementation 
> class:org.apache.hadoop.hive.metastore.ObjectStore
> {code}
> 2. Metastore cancels delegation token (issueDate=1526565229297)
> {code}
> 2018-05-17T13:54:32,380  INFO [pool-7-thread-4] 
> thrift.TokenStoreDelegationTokenSecretManager: Token cancelation requested 
> for identifier: owner=mapr, renewer=mapr, realUser=, issueDate=1526565229297, 
> maxDate=1527170029297,$
> 2018-05-17T13:54:32,416  INFO [pool-7-thread-4] metastore.HiveMetaStore: 3: 
> Cleaning up thread local RawStore...
> {code}
> 3. HS2 uses the cancelled delegation token and crashes (issueDate=1526565229297):
> {code}
> 2018-05-17T13:54:54,921 ERROR [pool-7-thread-10] transport.TSaslTransport: 
> SASL negotiation failure
> javax.security.sasl.SaslException: DIGEST-MD5: IO error acquiring password
>         at 
> com.sun.security.sasl.digest.DigestMD5Server.validateClientResponse(DigestMD5Server.java:598)
>  
>         at 
> com.sun.security.sasl.digest.DigestMD5Server.evaluateResponse(DigestMD5Server.java:244)
>         at 
> org.apache.thrift.transport.TSaslTransport$SaslParticipant.evaluateChallengeOrResponse(TSaslTransport.java:539)
>  
>         at 
> org.apache.thrift.transport.TSaslTransport.open(TSaslTransport.java:283)
>         at 
> org.apache.thrift.transport.TSaslServerTransport.open(TSaslServerTransport.java:41)
>  
>         at 
> org.apache.thrift.transport.TSaslServerTransport$Factory.getTransport(TSaslServerTransport.java:216)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:663)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory$1.run(HadoopThriftAuthBridge.java:660)
>  
>         at java.security.AccessController.doPrivileged(Native Method) 
>         at javax.security.auth.Subject.doAs(Subject.java:360)
>         at 
> org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1613)
>  
>         at 
> org.apache.hadoop.hive.thrift.HadoopThriftAuthBridge$Server$TUGIAssumingTransportFactory.getTransport(HadoopThriftAuthBridge.java:660)
>  
>         at 
> org.apache.thrift.server.TThreadPoolServer$WorkerProcess.run(TThreadPoolServer.java:269)
>         at 
> java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
>  
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
>  
>         at java.lang.Thread.run(Thread.java:748)
> Caused by: org.apache.hadoop.security.token.SecretManager$InvalidToken: token 
> expired or does not exist: owner=myuser, renewer=myuser, realUser=, 
> issueDate=1526565229297, maxDate=1527170029297, sequenceNumber=1, 
> masterKeyId=1
>         at 
> org.apache.hadoop.hive.thrift.TokenStoreDelegationTokenSecretManager.retrievePassword(TokenStoreDelegationTokenSecretManager.java:104)
> {code}
> *WORKAROUND*
> From Hive sources:
> {code}
> heartBeatInterval = HIVE_TXN_TIMEOUT / 2.
> {code}
> So to skip the HB during the job one needs to set
> {code}
> HIVE_TXN_TIMEOUT = (max_job_duration * 2) * 1.25.
> {code}
> Here I added 1.25 factor to be sure there is no HB. In my case 
> max_job_duration = ~600s, then 
> {code}
> HIVE_TXN_TIMEOUT = 600s * 2 * 1.25 = 1500s
> {code}
> When I set {{hive.txn.timeout}} = 1500s, the issue was gone on my local env.
> FYI [~isnogood], [~ashutoshc]



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to