[ https://issues.apache.org/jira/browse/HIVE-25345?focusedWorklogId=625659&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-625659 ]
ASF GitHub Bot logged work on HIVE-25345: ----------------------------------------- Author: ASF GitHub Bot Created on: 20/Jul/21 16:03 Start Date: 20/Jul/21 16:03 Worklog Time Spent: 10m Work Description: lcspinter commented on a change in pull request #2493: URL: https://github.com/apache/hive/pull/2493#discussion_r673259608 ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + 
MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. 
" + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } else if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge()); + if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING, + TimeUnit.SECONDS) && + metrics.getOldestAbortedTxnAge() < + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } else if (metrics.getOldestAbortedTxnAge() >= + 
MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_LOCKS).set(metrics.getLocksCount()); Metrics.getOrCreateGauge(OLDEST_LOCK_AGE).set(metrics.getOldestLockAge()); Metrics.getOrCreateGauge(TABLES_WITH_X_ABORTED_TXNS).set(metrics.getTablesWithXAbortedTxns()); + if (metrics.getOldestAbortedTxnAge() > Review comment: Yes, that would definitely add some value. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscribe@hive.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 625659) Time Spent: 1h 10m (was: 1h) > Add logging based on new compaction metrics > ------------------------------------------- > > Key: HIVE-25345 > URL: https://issues.apache.org/jira/browse/HIVE-25345 > Project: Hive > Issue Type: Improvement > Reporter: László Pintér > Assignee: László Pintér > Priority: Major > Labels: pull-request-available > Time Spent: 1h 10m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)