Stamatis Zampetakis created HIVE-24439: ------------------------------------------
Summary: HS2 memory leak when commitTxn fails and queries involve partitioned tables Key: HIVE-24439 URL: https://issues.apache.org/jira/browse/HIVE-24439 Project: Hive Issue Type: Task Reporter: Stamatis Zampetakis Attachments: heap_dump_overview.png Running explain plans on queries involving partitioned tables with many partitions (for instance TPC-DS 30TB) leads to a memory leak when there are failures during the commit of a transaction. The heap dump shows many {{FieldSchema}} instances which cannot be garbage collected since they are retained in the {{Context}} of the {{DriverTxnHandler}} due to a [shutdown hook|https://github.com/apache/hive/blob/aed7c86cdd59f0b2a4979633fbd191d451f2fd75/ql/src/java/org/apache/hadoop/hive/ql/DriverTxnHandler.java#L124] that keeps a reference to the enclosing instance of {{DriverTxnHandler}}. !heap_dump_overview.png! In this case the commit failures are due to a metastore with a broken schema (see stacktrace below), but I think that similar kinds of failures can lead to the same situation. 
{noformat} 2020-11-27T05:45:32,629 ERROR [c69f30a1-864e-4b66-973a-0cc03fb81f3f main] ql.Driver: FAILED: Hive Internal Error: org.apache.hadoop.hive.ql.lockmgr.LockException(Error communicating with the metastore) org.apache.hadoop.hive.ql.lockmgr.LockException: Error communicating with the metastore at org.apache.hadoop.hive.ql.lockmgr.DbTxnManager.commitTxn(DbTxnManager.java:535) at org.apache.hadoop.hive.ql.DriverTxnHandler.commitOrRollback(DriverTxnHandler.java:572) at org.apache.hadoop.hive.ql.DriverTxnHandler.endTransactionAndCleanup(DriverTxnHandler.java:554) at org.apache.hadoop.hive.ql.DriverTxnHandler.endTransactionAndCleanup(DriverTxnHandler.java:537) at org.apache.hadoop.hive.ql.DriverTxnHandler.handleTransactionAfterExecution(DriverTxnHandler.java:487) at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:333) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:149) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:144) at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:164) at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:230) at org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:258) at org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:203) at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:129) at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:424) at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:355) at org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:744) at org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:714) at org.apache.hadoop.hive.cli.control.CorePerfCliDriver.runTest(CorePerfCliDriver.java:103) at org.apache.hadoop.hive.cli.control.CliAdapter.runTest(CliAdapter.java:157) at org.apache.hadoop.hive.cli.TestTezTPCDS30TBPerfCliDriver.testCliDriver(TestTezTPCDS30TBPerfCliDriver.java:79) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at 
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.junit.runners.model.FrameworkMethod$1.runReflectiveCall(FrameworkMethod.java:59) at org.junit.internal.runners.model.ReflectiveCallable.run(ReflectiveCallable.java:12) at org.junit.runners.model.FrameworkMethod.invokeExplosively(FrameworkMethod.java:56) at org.junit.internal.runners.statements.InvokeMethod.evaluate(InvokeMethod.java:17) at org.apache.hadoop.hive.cli.TestTezTPCDS30TBPerfCliDriver$1.evaluate(TestTezTPCDS30TBPerfCliDriver.java:62) Caused by: MetaException(message:Unable to update transaction database org.postgresql.util.PSQLException: ERROR: column "CQ_TXN_ID" does not exist Position: 271 at org.postgresql.core.v3.QueryExecutorImpl.receiveErrorResponse(QueryExecutorImpl.java:2532) at org.postgresql.core.v3.QueryExecutorImpl.processResults(QueryExecutorImpl.java:2267) at org.postgresql.core.v3.QueryExecutorImpl.execute(QueryExecutorImpl.java:312) at org.postgresql.jdbc.PgStatement.executeInternal(PgStatement.java:448) at org.postgresql.jdbc.PgStatement.execute(PgStatement.java:369) at org.postgresql.jdbc.PgPreparedStatement.executeWithFlags(PgPreparedStatement.java:153) at org.postgresql.jdbc.PgPreparedStatement.executeQuery(PgPreparedStatement.java:103) at com.zaxxer.hikari.pool.ProxyPreparedStatement.executeQuery(ProxyPreparedStatement.java:52) at com.zaxxer.hikari.pool.HikariProxyPreparedStatement.executeQuery(HikariProxyPreparedStatement.java) at org.apache.hadoop.hive.metastore.txn.CompactionTxnHandler.getCompactionByTxnId(CompactionTxnHandler.java:1194) at org.apache.hadoop.hive.metastore.txn.CompactionTxnHandler.createCommitNotificationEvent(CompactionTxnHandler.java:1228) at org.apache.hadoop.hive.metastore.txn.TxnHandler.commitTxn(TxnHandler.java:1391) at 
org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.commit_txn(HiveMetaStore.java:8377) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.hive.metastore.RetryingHMSHandler.invokeInternal(RetryingHMSHandler.java:147) at org.apache.hadoop.hive.metastore.RetryingHMSHandler.invoke(RetryingHMSHandler.java:108) at com.sun.proxy.$Proxy62.commit_txn(Unknown Source) at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.commitTxn(HiveMetaStoreClient.java:3665) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.invoke(RetryingMetaStoreClient.java:218) at com.sun.proxy.$Proxy63.commitTxn(Unknown Source) at org.apache.hadoop.hive.ql.lockmgr.DbTxnManager.commitTxn(DbTxnManager.java:526) at org.apache.hadoop.hive.ql.DriverTxnHandler.commitOrRollback(DriverTxnHandler.java:572) at org.apache.hadoop.hive.ql.DriverTxnHandler.endTransactionAndCleanup(DriverTxnHandler.java:554) at org.apache.hadoop.hive.ql.DriverTxnHandler.endTransactionAndCleanup(DriverTxnHandler.java:537) at org.apache.hadoop.hive.ql.DriverTxnHandler.handleTransactionAfterExecution(DriverTxnHandler.java:487) at org.apache.hadoop.hive.ql.Driver.runInternal(Driver.java:333) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:149) at org.apache.hadoop.hive.ql.Driver.run(Driver.java:144) at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:164) at org.apache.hadoop.hive.ql.reexec.ReExecDriver.run(ReExecDriver.java:230) at 
org.apache.hadoop.hive.cli.CliDriver.processLocalCmd(CliDriver.java:258) at org.apache.hadoop.hive.cli.CliDriver.processCmd1(CliDriver.java:203) at org.apache.hadoop.hive.cli.CliDriver.processCmd(CliDriver.java:129) at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:424) at org.apache.hadoop.hive.cli.CliDriver.processLine(CliDriver.java:355) at org.apache.hadoop.hive.ql.QTestUtil.executeClientInternal(QTestUtil.java:744) at org.apache.hadoop.hive.ql.QTestUtil.executeClient(QTestUtil.java:714) at org.apache.hadoop.hive.cli.control.CorePerfCliDriver.runTest(CorePerfCliDriver.java:103) at org.apache.hadoop.hive.cli.control.CliAdapter.runTest(CliAdapter.java:157) at org.apache.hadoop.hive.cli.TestTezTPCDS30TBPerfCliDriver.testCliDriver(TestTezTPCDS30TBPerfCliDriver.java:79) {noformat} -- This message was sent by Atlassian Jira (v8.3.4#803005)