sorry, forgot to add a patch to the letter
чт, 20 мая 2021 г. в 13:52, Кирилл Решке <reshkekir...@gmail.com>: > Hi, > I recently ran into a problem in one of our production postgresql cluster. > I had noticed lock contention on procarray lock on standby, which causes > WAL replay lag growth. > To reproduce this, you can do the following: > > 1) set max_connections to big number, like 100000 > 2) begin a transaction on primary > 3) start pgbench workload on primary and on standby > > After a while it will be possible to see KnownAssignedXidsGetAndSetXmin in > perf top consuming abount 75 % of CPU. > > %% > PerfTop: 1060 irqs/sec kernel: 0.0% exact: 0.0% [4000Hz cycles:u], > (target_pid: 273361) > > ------------------------------------------------------------------------------- > > 73.92% postgres [.] KnownAssignedXidsGetAndSetXmin > 1.40% postgres [.] base_yyparse > 0.96% postgres [.] LWLockAttemptLock > 0.84% postgres [.] hash_search_with_hash_value > 0.84% postgres [.] AtEOXact_GUC > 0.72% postgres [.] ResetAllOptions > 0.70% postgres [.] AllocSetAlloc > 0.60% postgres [.] _bt_compare > 0.55% postgres [.] core_yylex > 0.42% libc-2.27.so [.] __strlen_avx2 > 0.23% postgres [.] LWLockRelease > 0.19% postgres [.] MemoryContextAllocZeroAligned > 0.18% postgres [.] expression_tree_walker.part.3 > 0.18% libc-2.27.so [.] __memmove_avx_unaligned_erms > 0.17% postgres [.] PostgresMain > 0.17% postgres [.] palloc > 0.17% libc-2.27.so [.] _int_malloc > 0.17% postgres [.] set_config_option > 0.17% postgres [.] ScanKeywordLookup > 0.16% postgres [.] _bt_checkpage > > %% > > > We have tried to fix this by using BitMapSet instead of boolean array > KnownAssignedXidsValid, but this does not help too much. > > Instead, using a doubly linked list helps a little more, we got +1000 tps > on pgbench workload with patched postgresql. The general idea of this patch > is that, instead of memorizing which elements in KnownAssignedXids are > valid, lets maintain a doubly linked list of them. This solution will work > in exactly the same way, except that taking a snapshot on the replica is > now O(running transaction) instead of O(head - tail) which is significantly > faster under some workloads. The patch helps to reduce CPU usage of > KnownAssignedXidsGetAndSetXmin to ~48% instead of ~74%, but does eliminate > it from perf top. > > The problem is better reproduced on PG13 since PG14 has some snapshot > optimization. > > Thanks! > > Best regards, reshke >
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 5ff8cab394..165cf56ea9 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -255,9 +255,18 @@ static PGPROC *allProcs; * Bookkeeping for tracking emulated transactions in recovery */ static TransactionId *KnownAssignedXids; -static bool *KnownAssignedXidsValid; + +typedef struct { + int prv; + int nxt; +} KnownAssignedXidValidNode; + +// Doubly linked list of valid xids +static KnownAssignedXidValidNode *KnownAssignedXidsValidDLL; static TransactionId latestObservedXid = InvalidTransactionId; + + /* * If we're in STANDBY_SNAPSHOT_PENDING state, standbySnapshotPendingXmin is * the highest xid that might still be running that we don't have in @@ -348,6 +357,9 @@ static void GlobalVisUpdateApply(ComputeXidHorizonsResult *horizons); /* * Report shared-memory space needed by CreateSharedProcArray. */ + +static KnownAssignedXidValidNode KAX_E_INVALID; + Size ProcArrayShmemSize(void) { @@ -380,13 +392,20 @@ ProcArrayShmemSize(void) size = add_size(size, mul_size(sizeof(TransactionId), TOTAL_MAX_CACHED_SUBXIDS)); - size = add_size(size, - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS)); + size = add_size(size, + mul_size(sizeof(KnownAssignedXidValidNode), TOTAL_MAX_CACHED_SUBXIDS)); } + KAX_E_INVALID.prv = -1; + KAX_E_INVALID.nxt = TOTAL_MAX_CACHED_SUBXIDS; + return size; } +#define KAX_DLL_INDEX_VALID(i) (-1 < i && i < TOTAL_MAX_CACHED_SUBXIDS) +#define KAX_DLL_ENTRY_INVALID(i) (-1 == KnownAssignedXidsValidDLL[i].prv && KnownAssignedXidsValidDLL[i].nxt == TOTAL_MAX_CACHED_SUBXIDS) + + /* * Initialize the shared PGPROC array during postmaster startup. */ @@ -431,10 +450,15 @@ CreateSharedProcArray(void) mul_size(sizeof(TransactionId), TOTAL_MAX_CACHED_SUBXIDS), &found); - KnownAssignedXidsValid = (bool *) - ShmemInitStruct("KnownAssignedXidsValid", - mul_size(sizeof(bool), TOTAL_MAX_CACHED_SUBXIDS), + + KnownAssignedXidsValidDLL = (KnownAssignedXidValidNode*) + ShmemInitStruct("KnownAssignedXidsValidDLL", + mul_size(sizeof(KnownAssignedXidValidNode), TOTAL_MAX_CACHED_SUBXIDS), &found); + + for (int i = 0; i < TOTAL_MAX_CACHED_SUBXIDS; ++ i) { + KnownAssignedXidsValidDLL[i] = KAX_E_INVALID; + } } } @@ -4545,15 +4569,17 @@ KnownAssignedXidsCompress(bool force) * re-aligning data to 0th element. */ compress_index = 0; - for (i = tail; i < head; i++) + int prv = -1; + for (i = tail; i < head; i = KnownAssignedXidsValidDLL[i].nxt) { - if (KnownAssignedXidsValid[i]) - { - KnownAssignedXids[compress_index] = KnownAssignedXids[i]; - KnownAssignedXidsValid[compress_index] = true; - compress_index++; - } + KnownAssignedXids[compress_index] = KnownAssignedXids[i]; + KnownAssignedXidsValidDLL[compress_index].prv = prv; + KnownAssignedXidsValidDLL[compress_index].nxt = compress_index + 1; + + compress_index++; } + // fix last entry + KnownAssignedXidsValidDLL[compress_index - 1].nxt = TOTAL_MAX_CACHED_SUBXIDS; pArray->tailKnownAssignedXids = 0; pArray->headKnownAssignedXids = compress_index; @@ -4652,7 +4678,8 @@ KnownAssignedXidsAdd(TransactionId from_xid, TransactionId to_xid, for (i = 0; i < nxids; i++) { KnownAssignedXids[head] = next_xid; - KnownAssignedXidsValid[head] = true; + KnownAssignedXidsValidDLL[head].nxt = 1 + head; + KnownAssignedXidsValidDLL[head + 1].prv = head; TransactionIdAdvance(next_xid); head++; } @@ -4741,12 +4768,23 @@ KnownAssignedXidsSearch(TransactionId xid, bool remove) if (result_index < 0) return false; /* not in array */ - if (!KnownAssignedXidsValid[result_index]) + if (KAX_DLL_ENTRY_INVALID(result_index)) return false; /* in array, but invalid */ if (remove) { - KnownAssignedXidsValid[result_index] = false; + int prv = KnownAssignedXidsValidDLL[result_index].prv; + int nxt = KnownAssignedXidsValidDLL[result_index].nxt; + + KnownAssignedXidsValidDLL[result_index] = KAX_E_INVALID; + + if (KAX_DLL_INDEX_VALID(prv)) { + KnownAssignedXidsValidDLL[prv].nxt = nxt; + } + + if (KAX_DLL_INDEX_VALID(nxt)) { + KnownAssignedXidsValidDLL[nxt].prv = prv; + } pArray->numKnownAssignedXids--; Assert(pArray->numKnownAssignedXids >= 0); @@ -4757,9 +4795,7 @@ KnownAssignedXidsSearch(TransactionId xid, bool remove) */ if (result_index == tail) { - tail++; - while (tail < head && !KnownAssignedXidsValid[tail]) - tail++; + tail = KnownAssignedXidsValidDLL[tail].nxt; if (tail >= head) { /* Array is empty, so we can reset both pointers */ @@ -4868,21 +4904,23 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid) tail = pArray->tailKnownAssignedXids; head = pArray->headKnownAssignedXids; - for (i = tail; i < head; i++) + for (i = tail; i < head; i = KnownAssignedXidsValidDLL[i].nxt) { - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; + TransactionId knownXid = KnownAssignedXids[i]; - if (TransactionIdFollowsOrEquals(knownXid, removeXid)) - break; + if (TransactionIdFollowsOrEquals(knownXid, removeXid)) + break; - if (!StandbyTransactionIdIsPrepared(knownXid)) - { - KnownAssignedXidsValid[i] = false; - count++; - } - } + if (!StandbyTransactionIdIsPrepared(knownXid)) + { + int nxt = KnownAssignedXidsValidDLL[i].nxt; + KnownAssignedXidsValidDLL[i] = KAX_E_INVALID; + + if (KAX_DLL_INDEX_VALID(nxt)) { + KnownAssignedXidsValidDLL[i].prv = -1; + } + count++; + } } pArray->numKnownAssignedXids -= count; @@ -4891,21 +4929,20 @@ KnownAssignedXidsRemovePreceding(TransactionId removeXid) /* * Advance the tail pointer if we've marked the tail item invalid. */ - for (i = tail; i < head; i++) - { - if (KnownAssignedXidsValid[i]) - break; - } - if (i >= head) - { - /* Array is empty, so we can reset both pointers */ - pArray->headKnownAssignedXids = 0; - pArray->tailKnownAssignedXids = 0; - } - else - { - pArray->tailKnownAssignedXids = i; - } + if (KAX_DLL_ENTRY_INVALID(tail)) { + tail = KnownAssignedXidsValidDLL[tail].nxt; + + if (!KAX_DLL_INDEX_VALID(tail)) + { + /* Array is empty, so we can reset both pointers */ + pArray->headKnownAssignedXids = 0; + pArray->tailKnownAssignedXids = 0; + } + else + { + pArray->tailKnownAssignedXids = tail; + } + } /* Opportunistically compress the array */ KnownAssignedXidsCompress(false); @@ -4957,32 +4994,29 @@ KnownAssignedXidsGetAndSetXmin(TransactionId *xarray, TransactionId *xmin, head = procArray->headKnownAssignedXids; SpinLockRelease(&procArray->known_assigned_xids_lck); - for (i = tail; i < head; i++) + for (i = tail; i < head; i = KnownAssignedXidsValidDLL[i].nxt) { /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - { - TransactionId knownXid = KnownAssignedXids[i]; - - /* - * Update xmin if required. Only the first XID need be checked, - * since the array is sorted. - */ - if (count == 0 && - TransactionIdPrecedes(knownXid, *xmin)) - *xmin = knownXid; - - /* - * Filter out anything >= xmax, again relying on sorted property - * of array. - */ - if (TransactionIdIsValid(xmax) && - TransactionIdFollowsOrEquals(knownXid, xmax)) - break; - - /* Add knownXid into output array */ - xarray[count++] = knownXid; - } + TransactionId knownXid = KnownAssignedXids[i]; + + /* + * Update xmin if required. Only the first XID need be checked, + * since the array is sorted. + */ + if (count == 0 && + TransactionIdPrecedes(knownXid, *xmin)) + *xmin = knownXid; + + /* + * Filter out anything >= xmax, again relying on sorted property + * of array. + */ + if (TransactionIdIsValid(xmax) && + TransactionIdFollowsOrEquals(knownXid, xmax)) + break; + + /* Add knownXid into output array */ + xarray[count++] = knownXid; } return count; @@ -5007,12 +5041,12 @@ KnownAssignedXidsGetOldestXmin(void) head = procArray->headKnownAssignedXids; SpinLockRelease(&procArray->known_assigned_xids_lck); - for (i = tail; i < head; i++) - { - /* Skip any gaps in the array */ - if (KnownAssignedXidsValid[i]) - return KnownAssignedXids[i]; - } + i = KnownAssignedXidsValidDLL[i].nxt; + + /* Skip any gaps in the array */ + if (i < head) { + return KnownAssignedXids[i]; + } return InvalidTransactionId; } @@ -5042,13 +5076,10 @@ KnownAssignedXidsDisplay(int trace_level) initStringInfo(&buf); - for (i = tail; i < head; i++) + for (i = tail; i < head; i = KnownAssignedXidsValidDLL[i].nxt) { - if (KnownAssignedXidsValid[i]) - { - nxids++; - appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); - } + nxids++; + appendStringInfo(&buf, "[%d]=%u ", i, KnownAssignedXids[i]); } elog(trace_level, "%d KnownAssignedXids (num=%d tail=%d head=%d) %s",