This is an automated email from the ASF dual-hosted git repository. lide pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new abefe26fc9a [refactor](log) Add some log for table and db transaction manager's lock (#40746) abefe26fc9a is described below commit abefe26fc9a3c57f65d81c68971b95f568b5456e Author: Lijia Liu <liutang...@yeah.net> AuthorDate: Fri Sep 20 21:58:06 2024 +0800 [refactor](log) Add some log for table and db transaction manager's lock (#40746) Use the fe config like #39015, print table lock usage duration. In 2.0.6, I found `replayer` thread is blocked for a long time. ``` 2024-09-12 11:00:58,797 INFO (replayer|89) [DatabaseTransactionMgr.replayUpsertTransactionState():1977] replay a visible transaction TransactionState. transaction id: 471728922, label: flink_connector_20240912_103150_269f8d635ffb455bb4043eecd1a8406b, db id: 475180, table id list: 14473765, callback id: -1, coordinator: BE: 10.217.0.100, transaction status: VISIBLE, error replicas num: 0, replica ids: , prepare time: 1726108310505, commit time: 1726108310579, finish time: 1726108310604 [...] 2024-09-12 11:00:58,798 WARN (replayer|89) [Env.replayJournal():2575] entityCost:940256 loadJournalCost:940256 logId:1392353661 replayedJournalId:1392353661 code:100 size:562 ``` And, there are many table lock fail log: ``` 2024-09-12 10:56:53,167 WARN (mysql-nio-pool-32036|412611) [Table.tryReadLock():155] Failed to try table atdpersonpaycode's read lock. timeout 1 MINUTES. Current owner: null ``` But, I don't kown what thread hold the table lock. So, add some logs to print lock holding info. --- .../main/java/org/apache/doris/common/Config.java | 4 ++ .../main/java/org/apache/doris/catalog/Table.java | 46 ++++++++++++++++++++++ .../doris/transaction/DatabaseTransactionMgr.java | 15 +++++-- 3 files changed, 61 insertions(+), 4 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index 86547830282..ffc23900a0f 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2092,6 +2092,10 @@ public class Config extends ConfigBase { @ConfField public static long lock_reporting_threshold_ms = 500L; + @ConfField(mutable = true, description = {"表示最大锁持有时间,超过该时间会打印告警日志,单位秒", + "Maximum lock hold time; logs a warning if exceeded"}) + public static long max_lock_hold_threshold_seconds = 10L; + /** * If false, when select from tables in information_schema database, * the result will not contain the information of the table in external catalog. diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 6b5de7e581f..7cbef2a5d6c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -19,6 +19,7 @@ package org.apache.doris.catalog; import org.apache.doris.alter.AlterCancelException; import org.apache.doris.analysis.CreateTableStmt; +import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.common.io.Text; @@ -73,6 +74,8 @@ public abstract class Table extends MetaObject implements Writable, TableIf { protected TableType type; protected long createTime; protected QueryableReentrantReadWriteLock rwLock; + private final ThreadLocal<Long> lockReadStart = new ThreadLocal<>(); + private long lockWriteStart; /* * fullSchema and nameToColumn should contains all columns, both visible and shadow. @@ -146,6 +149,7 @@ public abstract class Table extends MetaObject implements Writable, TableIf { public void readLock() { this.rwLock.readLock().lock(); + lockReadStart.set(System.currentTimeMillis()); } public boolean tryReadLock(long timeout, TimeUnit unit) { @@ -155,6 +159,9 @@ public abstract class Table extends MetaObject implements Writable, TableIf { LOG.warn("Failed to try table {}'s read lock. timeout {} {}. Current owner: {}", name, timeout, unit.name(), rwLock.getOwner()); } + if (res) { + lockReadStart.set(System.currentTimeMillis()); + } return res; } catch (InterruptedException e) { LOG.warn("failed to try read lock at table[" + name + "]", e); @@ -163,11 +170,16 @@ public abstract class Table extends MetaObject implements Writable, TableIf { } public void readUnlock() { + if (lockReadStart.get() != null) { + checkAndLogLockDuration("read", lockReadStart.get(), System.currentTimeMillis()); + lockReadStart.remove(); + } this.rwLock.readLock().unlock(); } public void writeLock() { this.rwLock.writeLock().lock(); + lockWriteStart = System.currentTimeMillis(); } public boolean writeLockIfExist() { @@ -176,9 +188,39 @@ public abstract class Table extends MetaObject implements Writable, TableIf { this.rwLock.writeLock().unlock(); return false; } + lockWriteStart = System.currentTimeMillis(); return true; } + private void checkAndLogLockDuration(String type, long lockStart, long lockEnd) { + long duration = lockEnd - lockStart; + if (duration > Config.max_lock_hold_threshold_seconds * 1000) { + StringBuilder msgBuilder = new StringBuilder(); + msgBuilder.append(getId()) + .append(" ") + .append(getName()) + .append(" ") + .append(type) + .append(" lock is held at ") + .append(lockStart) + .append(".And release after ") + .append(duration) + .append(" ms.") + .append("Call stack is :\n") + .append(getStackTrace(Thread.currentThread())); + LOG.info(msgBuilder.toString()); + } + } + + private static String getStackTrace(Thread t) { + final StackTraceElement[] stackTrace = t.getStackTrace(); + StringBuilder msgBuilder = new StringBuilder(); + for (StackTraceElement e : stackTrace) { + msgBuilder.append(e.toString()).append("\n"); + } + return msgBuilder.toString(); + } + public boolean tryWriteLock(long timeout, TimeUnit unit) { try { boolean res = this.rwLock.writeLock().tryLock(timeout, unit); @@ -186,6 +228,9 @@ public abstract class Table extends MetaObject implements Writable, TableIf { LOG.warn("Failed to try table {}'s write lock. timeout {} {}. Current owner: {}", name, timeout, unit.name(), rwLock.getOwner()); } + if (res) { + lockWriteStart = System.currentTimeMillis(); + } return res; } catch (InterruptedException e) { LOG.warn("failed to try write lock at table[" + name + "]", e); @@ -194,6 +239,7 @@ public abstract class Table extends MetaObject implements Writable, TableIf { } public void writeUnlock() { + checkAndLogLockDuration("write", lockWriteStart, System.currentTimeMillis()); this.rwLock.writeLock().unlock(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java index cf4bed28e5b..3bc676a1b42 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/transaction/DatabaseTransactionMgr.java @@ -165,14 +165,20 @@ public class DatabaseTransactionMgr { private volatile long usedQuotaDataBytes = -1; private long lockWriteStart; + private final ThreadLocal<Long> lockReadStart = new ThreadLocal<>(); - private long lockReportingThresholdMs = Config.lock_reporting_threshold_ms; + private final long lockReportingThresholdMs = Config.lock_reporting_threshold_ms; protected void readLock() { this.transactionLock.readLock().lock(); + lockReadStart.set(System.currentTimeMillis()); } protected void readUnlock() { + if (lockReadStart.get() != null) { + checkAndLogLockDuration("read", lockReadStart.get(), System.currentTimeMillis()); + lockReadStart.remove(); + } this.transactionLock.readLock().unlock(); } @@ -182,7 +188,7 @@ public class DatabaseTransactionMgr { } protected void writeUnlock() { - checkAndLogWriteLockDuration(lockWriteStart, System.currentTimeMillis()); + checkAndLogLockDuration("write", lockWriteStart, System.currentTimeMillis()); this.transactionLock.writeLock().unlock(); } @@ -2211,11 +2217,12 @@ public class DatabaseTransactionMgr { * @param lockStart holing lock start time. * @param lockEnd release lock time. */ - private void checkAndLogWriteLockDuration(long lockStart, long lockEnd) { + private void checkAndLogLockDuration(String type, long lockStart, long lockEnd) { long duration = lockEnd - lockStart; if (duration > lockReportingThresholdMs) { StringBuilder msgBuilder = new StringBuilder(); - msgBuilder.append("lock is held at ") + msgBuilder.append(type) + .append(" lock is held at ") .append(lockStart) .append(".And release after ") .append(duration) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org