[ https://issues.apache.org/jira/browse/HIVE-25345?focusedWorklogId=624787&page=com.atlassian.jira.plugin.system.issuetabpanels:worklog-tabpanel#worklog-624787 ]
ASF GitHub Bot logged work on HIVE-25345: ----------------------------------------- Author: ASF GitHub Bot Created on: 20/Jul/21 07:51 Start Date: 20/Jul/21 07:51 Worklog Time Spent: 10m Work Description: klcopp commented on a change in pull request #2493: URL: https://github.com/apache/hive/pull/2493#discussion_r672868711 ########## File path: common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ########## @@ -3194,6 +3194,15 @@ private static void populateLlapDaemonVarsSet(Set<String> llapDaemonVarsSetLocal "Age of table/partition's oldest aborted transaction when compaction will be triggered. " + "Default time unit is: hours. Set to a negative number to disable."), + HIVE_COMPACTOR_ACTIVE_DELTA_DIR_THRESHOLD("hive.compactor.active.delta.dir.threshold", 200, + "Number if active delta directories under a given table/partition."), Review comment: I think the descriptions here should reflect that these are thresholds, and that logging will happen if they are passed ########## File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java ########## @@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) { COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "", "Specify the user to run compactor Initiator and Worker as. If empty string, defaults to table/partition " + "directory owner."), + COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING( Review comment: For all of these: Instead of: "after which a warning should be raised" I think it would be clearer to say: "after which a warning will be logged" ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/HMSMetricsListener.java ########## @@ -101,7 +101,14 @@ public void onAllocWriteId(AllocWriteIdEvent allocWriteIdEvent, Connection dbCon Table table = getTable(allocWriteIdEvent); if (MetaStoreUtils.isNoAutoCompactSet(table.getParameters())) { - Metrics.getOrCreateGauge(MetricsConstants.WRITES_TO_DISABLED_COMPACTION_TABLE).incrementAndGet(); + int noAutoCompactSet = + Metrics.getOrCreateGauge(MetricsConstants.WRITES_TO_DISABLED_COMPACTION_TABLE).incrementAndGet(); + if (noAutoCompactSet >= + MetastoreConf.getIntVar(getConf(), + MetastoreConf.ConfVars.COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD)) { + LOGGER.warn("Number of tables where the compaction is turned off is: " + noAutoCompactSet); Review comment: This might be clearer: "There has been a write to a table where auto-compaction is disabled (tblproperties ("no_auto_compact"="true"))... And definitely log the db and table name, so users can find it and re-enable auto-compaction. 
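
As a rough illustration of that suggestion (a sketch only, not part of the patch or of the review), the branch in HMSMetricsListener.onAllocWriteId() could be reworded along these lines; the exact message text and the use of Table.getDbName()/getTableName() are assumptions about how the final version might look:

    // Hypothetical rewording of the branch shown above: same threshold check, but the
    // warning names the table so operators can find it and re-enable auto-compaction.
    if (MetaStoreUtils.isNoAutoCompactSet(table.getParameters())) {
      int noAutoCompactSet =
          Metrics.getOrCreateGauge(MetricsConstants.WRITES_TO_DISABLED_COMPACTION_TABLE).incrementAndGet();
      if (noAutoCompactSet >= MetastoreConf.getIntVar(getConf(),
          MetastoreConf.ConfVars.COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD)) {
        LOGGER.warn("There has been a write to table " + table.getDbName() + "." + table.getTableName()
            + " where auto-compaction is disabled (tblproperties (\"no_auto_compact\"=\"true\")). "
            + "Writes to tables with disabled auto-compaction so far: " + noAutoCompactSet);
      }
    }
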
########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. 
" + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); Review comment: open non-replication transaction* ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= Review comment: It would be more readable to separate out all the logging into a new method (can just pass the MetricsInfo object) ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An 
excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. 
" + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } else if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge()); + if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING, + TimeUnit.SECONDS) && + metrics.getOldestAbortedTxnAge() < + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } else if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_LOCKS).set(metrics.getLocksCount()); Metrics.getOrCreateGauge(OLDEST_LOCK_AGE).set(metrics.getOldestLockAge()); Metrics.getOrCreateGauge(TABLES_WITH_X_ABORTED_TXNS).set(metrics.getTablesWithXAbortedTxns()); + if (metrics.getOldestAbortedTxnAge() > Review comment: What do you think about saving the list of these tables in MetricsInfo, and logging that list here, so users know where to clean? It's just an idea ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -143,11 +220,33 @@ public static void updateMetricsFromShowCompact(ShowCompactResponse showCompactR Metrics.getOrCreateGauge(key).set(0); } } + + Long numFailedComp = counts.get(TxnStore.FAILED_RESPONSE); + Long numNotInitiatedComp = counts.get(TxnStore.DID_NOT_INITIATE_RESPONSE); + Long numSucceededComp = counts.get(TxnStore.SUCCEEDED_RESPONSE); + if (numFailedComp != null && numNotInitiatedComp != null && numSucceededComp != null && + ((numFailedComp + numNotInitiatedComp) / (numFailedComp + numNotInitiatedComp + numSucceededComp) > + MetastoreConf.getDoubleVar(conf, MetastoreConf.ConfVars.COMPACTOR_FAILED_COMPACTION_RATIO_THRESHOLD))) { + LOG.warn("Many compactions are failing. 
Check root cause of failed/not initiated compactions."); + } + if (oldestEnqueueTime == Long.MAX_VALUE) { Metrics.getOrCreateGauge(COMPACTION_OLDEST_ENQUEUE_AGE).set(0); } else { + int oldestEnqueueAge = (int) ((System.currentTimeMillis() - oldestEnqueueTime) / 1000L); Metrics.getOrCreateGauge(COMPACTION_OLDEST_ENQUEUE_AGE) - .set((int) ((System.currentTimeMillis() - oldestEnqueueTime) / 1000L)); + .set(oldestEnqueueAge); + if (oldestEnqueueAge >= MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_WARNING, TimeUnit.SECONDS) && + oldestEnqueueAge < MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_ERROR, TimeUnit.SECONDS)) { + LOG.warn("Found compaction entry in compaction queue with an age of " + oldestEnqueueAge + " seconds. " + + "Check the time of last successful compaction and number of worker threads."); Review comment: "Check the time of last successful compaction" -> I know this is part of the alert description but maybe this doesn't make sense...maybe just leave this at "Check the number of worker threads".. or "Consider increasing the number of worker threads" ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in 
" + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } else if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge()); + if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING, + TimeUnit.SECONDS) && + metrics.getOldestAbortedTxnAge() < + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } else if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); + } 
Metrics.getOrCreateGauge(NUM_LOCKS).set(metrics.getLocksCount()); Metrics.getOrCreateGauge(OLDEST_LOCK_AGE).set(metrics.getOldestLockAge()); Metrics.getOrCreateGauge(TABLES_WITH_X_ABORTED_TXNS).set(metrics.getTablesWithXAbortedTxns()); + if (metrics.getOldestAbortedTxnAge() > + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TABLES_WITH_ABORTEDTXN_THRESHOLD)) { + LOG.error("Found " + metrics.getOldestAbortedTxnAge() + " tables/partitions with more than " + Review comment: getTablesWithXAbortedTxns() instead of getOldestAbortedTxnAge() ########## File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java ########## @@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) { COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "", "Specify the user to run compactor Initiator and Worker as. If empty string, defaults to table/partition " + "directory owner."), + COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING( + "metastore.compactor.oldest.replication.open.txn.threshold.warning", + "hive.compactor.oldest.replication.open.txn.threshold.warning", + "14d", new TimeValidator(TimeUnit.DAYS), + "Age of open replication transaction after a warning should be raised. Default time unit: days"), + COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR( + "metastore.compactor.oldest.replication.open.txn.threshold.error", + "hive.compactor.oldest.replication.open.txn.threshold.error", + "21d", new TimeValidator(TimeUnit.DAYS), + "Age of open replication transaction after an error should be raised. Default time unit: days"), + COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING( Review comment: *oldest non-replication transaction ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, 
+ MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. 
" + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } else if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); + } Metrics.getOrCreateGauge(NUM_ABORTED_TXNS).set(metrics.getAbortedTxnsCount()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_ID).set(metrics.getOldestAbortedTxnId()); Metrics.getOrCreateGauge(OLDEST_ABORTED_TXN_AGE).set(metrics.getOldestAbortedTxnAge()); + if (metrics.getOldestAbortedTxnAge() >= + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING, + TimeUnit.SECONDS) && + metrics.getOldestAbortedTxnAge() < + MetastoreConf.getTimeVar(conf, + MetastoreConf.ConfVars.COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an aborted transaction with an age of " + metrics.getOldestAbortedTxnAge() + " seconds."); Review comment: Same as above: can log the txnid ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if 
(metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + Review comment: Here you can log the id too (OLDEST_OPEN_REPL_TXN_ID) ########## File path: ql/src/java/org/apache/hadoop/hive/ql/txn/compactor/metrics/DeltaFilesMetricReporter.java ########## @@ -212,6 +213,22 @@ public static void mergeDeltaFilesStats(AcidDirectory dir, long checkThresholdIn } } } + Review comment: It would be better to separate this block out into a new method, for better readability ########## File path: standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/metrics/AcidMetricService.java ########## @@ -85,36 +85,113 @@ public void run() { private void collectMetrics() throws MetaException { ShowCompactResponse currentCompactions = txnHandler.showCompact(new ShowCompactRequest()); - updateMetricsFromShowCompact(currentCompactions); + updateMetricsFromShowCompact(currentCompactions, conf); updateDBMetrics(); } private void updateDBMetrics() throws MetaException { MetricsInfo metrics = txnHandler.getMetricsInfo(); Metrics.getOrCreateGauge(NUM_TXN_TO_WRITEID).set(metrics.getTxnToWriteIdCount()); + if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING) && + metrics.getTxnToWriteIdCount() < + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } else if (metrics.getTxnToWriteIdCount() >= + MetastoreConf.getIntVar(conf, MetastoreConf.ConfVars.COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getTxnToWriteIdCount() + ") Hive ACID metadata found in " + + "TXN_TO_WRITEID table, which can cause serious performance degradation."); + } 
Metrics.getOrCreateGauge(NUM_COMPLETED_TXN_COMPONENTS).set(metrics.getCompletedTxnsCount()); - + if (metrics.getCompletedTxnsCount() >= + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING) && + metrics.getCompletedTxnsCount() < + MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.warn("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } else if (metrics.getCompletedTxnsCount() >= MetastoreConf.getIntVar(conf, + MetastoreConf.ConfVars.COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR)) { + LOG.error("An excessive amount of (" + metrics.getCompletedTxnsCount() + ") Hive ACID metadata found in " + + "COMPLETED_TXN_COMPONENTS table, which can cause serious performance degradation."); + } Metrics.getOrCreateGauge(NUM_OPEN_REPL_TXNS).set(metrics.getOpenReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_ID).set(metrics.getOldestOpenReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_REPL_TXN_AGE).set(metrics.getOldestOpenReplTxnAge()); + if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) && metrics.getOldestOpenReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } else if (metrics.getOldestOpenReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.error("A replication transaction has been open for " + metrics.getOldestOpenReplTxnAge() + " seconds. " + + "Before you abort a transaction that was created by replication, and which has been open a long time, " + + "make sure that the hive.repl.txn.timeout threshold has expired."); + } Metrics.getOrCreateGauge(NUM_OPEN_NON_REPL_TXNS).set(metrics.getOpenNonReplTxnsCount()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_ID).set(metrics.getOldestOpenNonReplTxnId()); Metrics.getOrCreateGauge(OLDEST_OPEN_NON_REPL_TXN_AGE).set(metrics.getOldestOpenNonReplTxnAge()); + if (metrics.getOldestOpenNonReplTxnAge() >= + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING, + TimeUnit.SECONDS) + && metrics.getOldestOpenNonReplTxnAge() < + MetastoreConf.getTimeVar(conf, MetastoreConf.ConfVars.COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR, + TimeUnit.SECONDS)) { + LOG.warn("Found an open transaction with an age of " + metrics.getOldestOpenNonReplTxnAge() + " seconds."); Review comment: Same as above: can log the txn id (OLDEST_OPEN_NON_REPL_TXN_ID) ########## File path: standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/conf/MetastoreConf.java ########## @@ -432,6 +432,88 @@ public static ConfVars getMetaConf(String name) { COMPACTOR_RUN_AS_USER("metastore.compactor.run.as.user", "hive.compactor.run.as.user", "", "Specify the user to run compactor Initiator and Worker as. 
If empty string, defaults to table/partition " + "directory owner."), + COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_WARNING( + "metastore.compactor.oldest.replication.open.txn.threshold.warning", + "hive.compactor.oldest.replication.open.txn.threshold.warning", + "14d", new TimeValidator(TimeUnit.DAYS), + "Age of open replication transaction after a warning should be raised. Default time unit: days"), + COMPACTOR_OLDEST_REPLICATION_OPENTXN_THRESHOLD_ERROR( + "metastore.compactor.oldest.replication.open.txn.threshold.error", + "hive.compactor.oldest.replication.open.txn.threshold.error", + "21d", new TimeValidator(TimeUnit.DAYS), + "Age of open replication transaction after an error should be raised. Default time unit: days"), + COMPACTOR_OLDEST_OPENTXN_THRESHOLD_WARNING( + "metastore.compactor.oldest.open.txn.threshold.warning", + "hive.compactor.oldest.open.txn.threshold.warning", "24h", + new TimeValidator(TimeUnit.HOURS), "Age of oldest open transaction after a warning should be raised. " + + "Default time unit: hours"), + COMPACTOR_OLDEST_OPENTXN_THRESHOLD_ERROR( + "metastore.compactor.oldest.open.txn.threshold.error", + "hive.compactor.oldest.open.txn.threshold.error", "72h", + new TimeValidator(TimeUnit.HOURS), "Age of oldest open transaction after an error should be raised. " + + "Default time unit: hours"), + COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_WARNING( + "metastore.compactor.oldest.uncleaned.aborted.txn.time.threshold.warning", + "hive.compactor.oldest.uncleaned.aborted.txn.time.threshold.warning", + "24h", new TimeValidator(TimeUnit.HOURS), + "Age of oldest aborted transaction after a warning should be raised. Default time unit: hours"), + COMPACTOR_OLDEST_UNCLEANED_ABORTEDTXN_TIME_THRESHOLD_ERROR( + "metastore.compactor.oldest.uncleaned.aborted.txn.time.threshold.error", + "hive.compactor.oldest.uncleaned.aborted.txn.time.threshold.error", + "48h", new TimeValidator(TimeUnit.HOURS), + "Age of oldest aborted transaction after an error should be raised. Default time unit: hours"), + COMPACTOR_TABLES_WITH_ABORTEDTXN_THRESHOLD( + "metastore.compactor.tables.with.aborted.txn.threshold", + "hive.compactor.tables.with.aborted.txn.threshold", 1, + "Number of tables has not been compacted and have more than " + + "hive.metastore.acidmetrics.table.aborted.txns.threshold (default 1500) aborted transactions."), + COMPACTOR_OLDEST_UNCLEANED_COMPACTION_TIME_THRESHOLD( + "metastore.compactor.oldest.uncleaned.compaction.time.threshold", + "hive.compactor.oldest.uncleaned.compaction.time.threshold", + "24h", new TimeValidator(TimeUnit.HOURS), + "Age of oldest ready for cleaning compaction in the compaction queue. Default time unit is: hours"), + COMPACTOR_FAILED_COMPACTION_RATIO_THRESHOLD( + "metastore.compactor.failed.compaction.ratio.threshold", + "hive.compactor.failed.compaction.ratio.threshold", .01, + "Ratio between the number of failed compactions + not initiated compactions and number of failed " + + "compactions + not initiated compactions + succeeded compactions."), + COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_WARNING( + "metastore.compactor.oldest.initiated.compaction.time.threshold.warning", + "hive.compactor.oldest.initiated.compaction.time.threshold.warning", + "1h", new TimeValidator(TimeUnit.HOURS), + "Age of oldest initiated compaction in the compaction queue when a warning should be raised. 
" + + "Default time unit is: hours"), + COMPACTOR_OLDEST_INITIATED_COMPACTION_TIME_THRESHOLD_ERROR( + "metastore.compactor.oldest.initiated.compaction.time.threshold.error", + "hive.compactor.oldest.initiated.compaction.time.threshold.error", + "12h", new TimeValidator(TimeUnit.HOURS), + "Age of oldest initiated compaction in the compaction queue when an error should be raised. " + + "Default time unit is: hours"), + COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_WARNING( + "metastore.compactor.completed.txn.components.record.threshold.warning", + "hive.compactor.completed.txn.components.record.threshold.warning", + 500000, + "Number of records in COMPLETED_TXN_COMPONENTS table, after a warning should be raised."), + COMPACTOR_COMPLETED_TXN_COMPONENTS_RECORD_THRESHOLD_ERROR( + "metastore.compactor.completed.txn.components.record.threshold.error", + "hive.compactor.completed.txn.components.record.threshold.error", + 1000000, + "Number of records in COMPLETED_TXN_COMPONENTS table, after an error should be raised."), + COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_WARNING( + "metastore.compactor.txn.to.writeid.record.threshold.warning", + "hive.compactor.txn.to.writeid.record.threshold.warning", + 500000, + "Number of records in TXN_TO_WRITEID table, after a warning should be raised."), + COMPACTOR_TXN_TO_WRITEID_RECORD_THRESHOLD_ERROR( + "metastore.compactor.txn.to.writeid.record.threshold.error", + "hive.compactor.txn.to.writeid.record.threshold.error", + 1000000, + "Number of records in TXN_TO_WRITEID table, after a warning should be error."), + COMPACTOR_NUMBER_OF_DISABLED_COMPACTION_TABLES_THRESHOLD( + "metastore.compactor.number.of.disabled.compaction.tables.threshold", + "hive.compactor.number.of.disabled.compaction.tables.threshold", + 1, + "Number of tables where the compaction is disabled"), Review comment: Something like this might be clearer: If the number of writes to tables where auto-compaction is disabled reaches this threshold, a warning will be logged after every subsequent write to any table where auto-compaction is disabled. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: gitbox-unsubscr...@hive.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org Issue Time Tracking ------------------- Worklog Id: (was: 624787) Time Spent: 20m (was: 10m) > Add logging based on new compaction metrics > ------------------------------------------- > > Key: HIVE-25345 > URL: https://issues.apache.org/jira/browse/HIVE-25345 > Project: Hive > Issue Type: Improvement > Reporter: László Pintér > Assignee: László Pintér > Priority: Major > Labels: pull-request-available > Time Spent: 20m > Remaining Estimate: 0h > -- This message was sent by Atlassian Jira (v8.3.4#803005)