Hi! Here is the case.
Assume we have a master to slave replication with shared_buffers set up to 2 GB at the master and 4 GB at the slave. All of the data is written to the master, while reading occurs from slave.
Now we decided to drop many tables, let's say 1000 or 10000 not in a single transaction, but each table in a separate one. So, due to "plain" shared_buffers memory we have to do for loop for every relation which leads to lag between master and slave.
In real case scenario such issue lead to not a minutes lag, but hours lag. At the same time PostgreSQL have a great routine to delete many relations in a single transaction.
So, to get rid of this kind of issue here came up an idea: what if not to delete everyone of relations right away and just store them in an array, prevent shared buffers (correspond to a deleted relations) from been flushed. And then array reaches it max size we need to walk all buffers only once to "free" shared buffers correspond to a deleted relations.
Here some values from the test which I am made. Without patch: 1. (master 2 GB) - drop 1000 tables took 6 sec (slave 4 GB) - drop 1000 tables took 8 sec 2. (master 4 GB) - drop 1000 tables took 10 sec (slave 8 GB) - drop 1000 tables took 16 sec 3. (master 10 GB) - drop 1000 tables took 22 sec (slave 20 GB) - drop 1000 tables took 38 sec With patch: 1. (master 2 GB) - drop 1000 tables took 2 sec (slave 4 GB) - drop 1000 tables took 2 sec 2. (master 4 GB) - drop 1000 tables took 3 sec (slave 8 GB) - drop 1000 tables took 3 sec 3. (master 10 GB) - drop 1000 tables took 4 sec (slave 20 GB) - drop 1000 tables took 4 sec -- Max Orlov E-mail: m.or...@postgrespro.ru
init.sh
Description: application/shellscript
run-001.sh
Description: application/shellscript
run-001-core.sh
Description: application/shellscript
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 1f10a97dc78..e0d1fb6a824 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -66,8 +66,6 @@ #define BUF_WRITTEN 0x01 #define BUF_REUSABLE 0x02 -#define DROP_RELS_BSEARCH_THRESHOLD 20 - typedef struct PrivateRefCountEntry { Buffer buffer; @@ -410,6 +408,72 @@ ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref) } } +/* + * Lazy relations delete mechanism. + * + * On relation drop no need to walk shared buffers every time, just put a deleted + * RelFileNode into an array. Thus we store RelFileNodes in RelNodeDeleted + * struct and delete them later when number of deleted relations will + * exceed LAZY_DELETE_ARRAY_SIZE. + */ + +#define LAZY_DELETE_ARRAY_SIZE 128 + +/* Wrapper for array of lazy deleted RelFileNodes. */ +typedef struct RelNodeDeletedBuffer +{ + RelFileNode rnodes[LAZY_DELETE_ARRAY_SIZE]; /* Array of deleted */ + int idx; /* Current position */ +} RelNodeDeletedBuffer; + +static RelNodeDeletedBuffer *RelNodeDeleted = NULL; + +/* + * Initialize shared array of lazy deleted relations. + * + * This is called once during shared-memory initialization (either in the + * postmaster, or in a standalone backend). + */ +void InitRelNodeDeletedBuffer(void) +{ + bool found; + + RelNodeDeleted = ShmemInitStruct("Lazy Deleted Relations", + sizeof(RelNodeDeletedBuffer), + &found); + + if (!found) + MemSet(RelNodeDeleted, 0, sizeof(RelNodeDeletedBuffer)); +} + +/* + * Compute the size of shared memory for the buffer of lazy deleted relations. + */ +Size +RelNodeDeletedBufferSize(void) +{ + Size size = 0; + + size = add_size(size, sizeof(RelNodeDeletedBuffer)); + + return size; +} + +/* + * Check for relation is in lazy deleted buffer (up to n-th position). + */ +static inline bool +IsRelFileNodeDeleted(RelFileNode rnode, int n) +{ + int i; + + for (i = 0; i < n; i++) + if (RelFileNodeEquals(rnode, RelNodeDeleted->rnodes[i])) + return true; + + return false; +} + /* * BufferIsPinned * True iff the buffer is pinned (also checks for valid buffer number). @@ -452,6 +516,7 @@ static BufferDesc *BufferAlloc(SMgrRelation smgr, BlockNumber blockNum, BufferAccessStrategy strategy, bool *foundPtr); +static void InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes); static void FlushBuffer(BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); static void CheckForBufferLeaks(void); @@ -2690,6 +2755,22 @@ FlushBuffer(BufferDesc *buf, SMgrRelation reln) char *bufToWrite; uint32 buf_state; + LWLockAcquire(LazyRelDeleteLock, LW_SHARED); + + /* + * If rnode is in lazy deleted buffer clear dirty and checkpoint flags. + */ + if (IsRelFileNodeDeleted(buf->tag.rnode, RelNodeDeleted->idx)) + { + buf_state = LockBufHdr(buf); + buf_state &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED); + UnlockBufHdr(buf, buf_state); + LWLockRelease(LazyRelDeleteLock); + return; + } + + LWLockRelease(LazyRelDeleteLock); + /* * Acquire the buffer's io_in_progress lock. If StartBufferIO returns * false, then someone else flushed the buffer before we could, so we need @@ -2993,6 +3074,43 @@ DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum, } } +/* + * Invalidate all of the nodes buffers. + */ +void +InvalidateRelFileNodesBuffers(RelFileNode *nodes, int nnodes) +{ + int i; + + pg_qsort(nodes, nnodes, sizeof(RelFileNode), rnode_comparator); + + for (i = 0; i < NBuffers; i++) + { + RelFileNode *rnode = NULL; + BufferDesc *bufHdr = GetBufferDescriptor(i); + uint32 buf_state; + + /* + * As in DropRelFileNodeBuffers, an unlocked precheck should be safe + * and saves some cycles. + */ + + rnode = bsearch((const void *) &(bufHdr->tag.rnode), + nodes, nnodes, sizeof(RelFileNode), + rnode_comparator); + + /* buffer doesn't belong to any of the given relfilenodes; skip it */ + if (rnode == NULL) + continue; + + buf_state = LockBufHdr(bufHdr); + if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode))) + InvalidateBuffer(bufHdr); /* releases spinlock */ + else + UnlockBufHdr(bufHdr, buf_state); + } +} + /* --------------------------------------------------------------------- * DropRelFileNodesAllBuffers * @@ -3006,25 +3124,27 @@ void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes) { int i, + j, n = 0; RelFileNode *nodes; - bool use_bsearch; if (nnodes == 0) return; nodes = palloc(sizeof(RelFileNode) * nnodes); /* non-local relations */ - /* If it's a local relation, it's localbuf.c's problem. */ for (i = 0; i < nnodes; i++) { + /* If it's a local relation, it's localbuf.c's problem. */ if (RelFileNodeBackendIsTemp(rnodes[i])) { if (rnodes[i].backend == MyBackendId) DropRelFileNodeAllLocalBuffers(rnodes[i].node); } else + { nodes[n++] = rnodes[i].node; + } } /* @@ -3037,58 +3157,41 @@ DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes) return; } - /* - * For low number of relations to drop just use a simple walk through, to - * save the bsearch overhead. The threshold to use is rather a guess than - * an exactly determined value, as it depends on many factors (CPU and RAM - * speeds, amount of shared buffers etc.). - */ - use_bsearch = n > DROP_RELS_BSEARCH_THRESHOLD; + if (n > LAZY_DELETE_ARRAY_SIZE) + { + InvalidateRelFileNodesBuffers(nodes, n); + } + else + { + int idx; - /* sort the list of rnodes if necessary */ - if (use_bsearch) - pg_qsort(nodes, n, sizeof(RelFileNode), rnode_comparator); + LWLockAcquire(LazyRelDeleteLock, LW_SHARED); + idx = RelNodeDeleted->idx; + Assert(idx < LAZY_DELETE_ARRAY_SIZE); - for (i = 0; i < NBuffers; i++) - { - RelFileNode *rnode = NULL; - BufferDesc *bufHdr = GetBufferDescriptor(i); - uint32 buf_state; + /* Copy deleted relations into lazy deleted array, but watch the size ... */ + for (i = idx, j = 0; i < LAZY_DELETE_ARRAY_SIZE && j < n; i++, j++) + RelNodeDeleted->rnodes[i] = rnodes[j].node; - /* - * As in DropRelFileNodeBuffers, an unlocked precheck should be safe - * and saves some cycles. - */ + idx += j; - if (!use_bsearch) + if (idx == LAZY_DELETE_ARRAY_SIZE) { - int j; + InvalidateRelFileNodesBuffers(RelNodeDeleted->rnodes, idx); - for (j = 0; j < n; j++) - { - if (RelFileNodeEquals(bufHdr->tag.rnode, nodes[j])) - { - rnode = &nodes[j]; - break; - } - } + /* ... and сopy rest of deleted relations. */ + for (i = 0; j < n; i++, j++) + RelNodeDeleted->rnodes[i] = rnodes[j].node; + + RelNodeDeleted->idx = i; } else { - rnode = bsearch((const void *) &(bufHdr->tag.rnode), - nodes, n, sizeof(RelFileNode), - rnode_comparator); + RelNodeDeleted->idx = idx; } - /* buffer doesn't belong to any of the given relfilenodes; skip it */ - if (rnode == NULL) - continue; - - buf_state = LockBufHdr(bufHdr); - if (RelFileNodeEquals(bufHdr->tag.rnode, (*rnode))) - InvalidateBuffer(bufHdr); /* releases spinlock */ - else - UnlockBufHdr(bufHdr, buf_state); + Assert(RelNodeDeleted->idx < LAZY_DELETE_ARRAY_SIZE); + LWLockRelease(LazyRelDeleteLock); } pfree(nodes); @@ -3133,6 +3236,22 @@ DropDatabaseBuffers(Oid dbid) else UnlockBufHdr(bufHdr, buf_state); } + + /* + * Remove all relations of dbid from lazy deleted array. + */ + LWLockAcquire(LazyRelDeleteLock, LW_SHARED); + { + int idx = RelNodeDeleted->idx; + + for (i = 0; i < idx; i++) + { + if (RelNodeDeleted->rnodes[i].dbNode == dbid) + RelNodeDeleted->rnodes[i] = RelNodeDeleted->rnodes[--idx]; + } + RelNodeDeleted->idx = idx; + } + LWLockRelease(LazyRelDeleteLock); } /* ----------------------------------------------------------------- @@ -3330,6 +3449,15 @@ FlushDatabaseBuffers(Oid dbid) if (bufHdr->tag.rnode.dbNode != dbid) continue; + LWLockAcquire(LazyRelDeleteLock, LW_SHARED); + + if (IsRelFileNodeDeleted(bufHdr->tag.rnode, RelNodeDeleted->idx)) + { + LWLockRelease(LazyRelDeleteLock); + continue; + } + + LWLockRelease(LazyRelDeleteLock); ReservePrivateRefCountEntry(); buf_state = LockBufHdr(bufHdr); diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index 4829953ee64..fdfdb78dff8 100644 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -120,6 +120,7 @@ CreateSharedMemoryAndSemaphores(void) size = add_size(size, hash_estimate_size(SHMEM_INDEX_SIZE, sizeof(ShmemIndexEnt))); size = add_size(size, BufferShmemSize()); + size = add_size(size, RelNodeDeletedBufferSize()); size = add_size(size, LockShmemSize()); size = add_size(size, PredicateLockShmemSize()); size = add_size(size, ProcGlobalShmemSize()); @@ -217,6 +218,7 @@ CreateSharedMemoryAndSemaphores(void) SUBTRANSShmemInit(); MultiXactShmemInit(); InitBufferPool(); + InitRelNodeDeletedBuffer(); /* * Set up lock manager diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index db478432291..a35e99190ff 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -49,3 +49,4 @@ MultiXactTruncationLock 41 OldSnapshotTimeMapLock 42 LogicalRepWorkerLock 43 CLogTruncationLock 44 +LazyRelDeleteLock 45 diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 17b97f7e38f..9cb51bc5f0d 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -179,6 +179,7 @@ extern Buffer ReleaseAndReadBuffer(Buffer buffer, Relation relation, BlockNumber blockNum); extern void InitBufferPool(void); +extern void InitRelNodeDeletedBuffer(void); extern void InitBufferPoolAccess(void); extern void InitBufferPoolBackend(void); extern void AtEOXact_Buffers(bool isCommit); @@ -205,6 +206,7 @@ extern XLogRecPtr BufferGetLSNAtomic(Buffer buffer); extern void PrintPinnedBufs(void); #endif extern Size BufferShmemSize(void); +extern Size RelNodeDeletedBufferSize(void); extern void BufferGetTag(Buffer buffer, RelFileNode *rnode, ForkNumber *forknum, BlockNumber *blknum);