Hi, I've previously posted a patch at http://archives.postgresql.org/message-id/20141010160020.GG6670%40alap3.anarazel.de that reduces contention in StrategyGetBuffer() by making the clock sweep lockless. Robert asked me to post it to a new thread; I originally wrote it to see some other contention in more detail, that's why it ended up in that thread...
The performance numbers I've quoted over there are: > Test: > pgbench -M prepared -P 5 -S -c 496 -j 496 -T 5000 > on a scale=1000 database, with 4GB of shared buffers. > > Before: > progress: 40.0 s, 136252.3 tps, lat 3.628 ms stddev 4.547 > progress: 45.0 s, 135049.0 tps, lat 3.660 ms stddev 4.515 > progress: 50.0 s, 135788.9 tps, lat 3.640 ms stddev 4.398 > progress: 55.0 s, 135268.4 tps, lat 3.654 ms stddev 4.469 > progress: 60.0 s, 134991.6 tps, lat 3.661 ms stddev 4.739 > > after: > progress: 40.0 s, 207701.1 tps, lat 2.382 ms stddev 3.018 > progress: 45.0 s, 208022.4 tps, lat 2.377 ms stddev 2.902 > progress: 50.0 s, 209187.1 tps, lat 2.364 ms stddev 2.970 > progress: 55.0 s, 206462.7 tps, lat 2.396 ms stddev 2.871 > progress: 60.0 s, 210263.8 tps, lat 2.351 ms stddev 2.914 Imo the patch doesn't complicate the logic noticeably... I do wonder if we could make the freelist accesses lockless as well - but I think that's harder. So I don't want to mix that with this. Greetings, Andres Freund -- Andres Freund http://www.2ndQuadrant.com/ PostgreSQL Development, 24x7 Support, Training & Services
>From 6b486e5b467e94ab9297d7656a5b39b816c5c55a Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Fri, 10 Oct 2014 17:36:46 +0200 Subject: [PATCH] WIP: lockless StrategyGetBuffer hotpath --- src/backend/storage/buffer/freelist.c | 154 ++++++++++++++++++++-------------- 1 file changed, 90 insertions(+), 64 deletions(-) diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 5966beb..0c634a0 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -18,6 +18,12 @@ #include "storage/buf_internals.h" #include "storage/bufmgr.h" +#include "port/atomics.h" + + +#define LATCHPTR_ACCESS_ONCE(var) ((Latch *)(*((volatile Latch **)&(var)))) +#define INT_ACCESS_ONCE(var) ((int)(*((volatile int *)&(var)))) + /* * The shared freelist control information. @@ -27,8 +33,12 @@ typedef struct /* Spinlock: protects the values below */ slock_t buffer_strategy_lock; - /* Clock sweep hand: index of next buffer to consider grabbing */ - int nextVictimBuffer; + /* + * Clock sweep hand: index of next buffer to consider grabbing. Note that + * this isn't a concrete buffer - we only ever increase the value. So, to + * get an actual buffer, it needs to be used modulo NBuffers. + */ + pg_atomic_uint32 nextVictimBuffer; int firstFreeBuffer; /* Head of list of unused buffers */ int lastFreeBuffer; /* Tail of list of unused buffers */ @@ -42,8 +52,8 @@ typedef struct * Statistics. These counters should be wide enough that they can't * overflow during a single bgwriter cycle. */ - uint32 completePasses; /* Complete cycles of the clock sweep */ - uint32 numBufferAllocs; /* Buffers allocated since last reset */ + pg_atomic_uint32 completePasses; /* Complete cycles of the clock sweep */ + pg_atomic_uint32 numBufferAllocs; /* Buffers allocated since last reset */ /* * Notification latch, or NULL if none. See StrategyNotifyBgWriter. @@ -124,87 +134,107 @@ StrategyGetBuffer(BufferAccessStrategy strategy) return buf; } - /* Nope, so lock the freelist */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - /* * We count buffer allocation requests so that the bgwriter can estimate * the rate of buffer consumption. Note that buffers recycled by a * strategy object are intentionally not counted here. */ - StrategyControl->numBufferAllocs++; + pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1); /* - * If bgwriterLatch is set, we need to waken the bgwriter, but we should - * not do so while holding buffer_strategy_lock; so release and re-grab. - * This is annoyingly tedious, but it happens at most once per bgwriter - * cycle, so the performance hit is minimal. + * If bgwriterLatch is set, we need to waken the bgwriter. We don't want + * to check this while holding the spinlock, so we the latch from memory + * once to see whether it's set. There's no need to do so with a lock + * present - we'll just set the latch next call if we missed it once. + * + * Since we're not guaranteed atomic 8 byte reads we need to acquire the + * spinlock if not null to be sure we get a correct pointer. Because we + * don't want to set the latch while holding the buffer_strategy_lock we + * just grab the lock to read and reset the pointer. */ - bgwriterLatch = StrategyControl->bgwriterLatch; + bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch); if (bgwriterLatch) { + /* we don't have guaranteed atomic 64bit reads */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + bgwriterLatch = LATCHPTR_ACCESS_ONCE(StrategyControl->bgwriterLatch); StrategyControl->bgwriterLatch = NULL; SpinLockRelease(&StrategyControl->buffer_strategy_lock); - SetLatch(bgwriterLatch); - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + + /* recheck */ + if (bgwriterLatch) + SetLatch(bgwriterLatch); } /* - * Try to get a buffer from the freelist. Note that the freeNext fields - * are considered to be protected by the buffer_strategy_lock not the - * individual buffer spinlocks, so it's OK to manipulate them without - * holding the spinlock. + * First check, without acquiring the lock, wether there's buffers in the + * freelist. Since we otherwise don't require the spinlock in every + * StrategyGetBuffer() invocation, it'd be sad to acquire it here - + * uselessly in mos tcases. + * + * If there's buffers on the freelist, acquire the spinlock and look into + * them. + * + * Note that the freeNext fields are considered to be protected by + * the buffer_strategy_lock not the individual buffer spinlocks, so it's + * OK to manipulate them without holding the spinlock. */ - while (StrategyControl->firstFreeBuffer >= 0) + if (INT_ACCESS_ONCE(StrategyControl->firstFreeBuffer) >= 0) { - buf = &BufferDescriptors[StrategyControl->firstFreeBuffer]; - Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - /* Unconditionally remove buffer from freelist */ - StrategyControl->firstFreeBuffer = buf->freeNext; - buf->freeNext = FREENEXT_NOT_IN_LIST; + while (StrategyControl->firstFreeBuffer >= 0) + { + buf = &BufferDescriptors[StrategyControl->firstFreeBuffer]; + Assert(buf->freeNext != FREENEXT_NOT_IN_LIST); - /* - * Release the lock so someone else can access the freelist (or run - * the clocksweep) while we check out this buffer. - */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); + /* Unconditionally remove buffer from freelist */ + StrategyControl->firstFreeBuffer = buf->freeNext; + buf->freeNext = FREENEXT_NOT_IN_LIST; - /* - * If the buffer is pinned or has a nonzero usage_count, we cannot use - * it; discard it and retry. (This can only happen if VACUUM put a - * valid buffer in the freelist and then someone else used it before - * we got to it. It's probably impossible altogether as of 8.3, but - * we'd better check anyway.) - */ - LockBufHdr(buf); - if (buf->refcount == 0 && buf->usage_count == 0) - { - if (strategy != NULL) - AddBufferToRing(strategy, buf); - return buf; - } - UnlockBufHdr(buf); + /* + * Release the lock so someone else can access the freelist (or run + * the clocksweep) while we check out this buffer. + */ + SpinLockRelease(&StrategyControl->buffer_strategy_lock); - /* Reacquire the lock and go around for another pass. */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + /* + * If the buffer is pinned or has a nonzero usage_count, we cannot use + * it; discard it and retry. (This can only happen if VACUUM put a + * valid buffer in the freelist and then someone else used it before + * we got to it. It's probably impossible altogether as of 8.3, but + * we'd better check anyway.) + */ + LockBufHdr(buf); + if (buf->refcount == 0 && buf->usage_count == 0) + { + if (strategy != NULL) + AddBufferToRing(strategy, buf); + return buf; + } + UnlockBufHdr(buf); + + /* Reacquire the lock and go around for another pass. */ + SpinLockAcquire(&StrategyControl->buffer_strategy_lock); + } + SpinLockRelease(&StrategyControl->buffer_strategy_lock); } /* Nothing on the freelist, so run the "clock sweep" algorithm */ trycounter = NBuffers; for (;;) { - buf = &BufferDescriptors[StrategyControl->nextVictimBuffer]; + int victim; + + victim = pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1); - if (++StrategyControl->nextVictimBuffer >= NBuffers) + buf = &BufferDescriptors[victim % NBuffers]; + + if (victim % NBuffers == 0) { - StrategyControl->nextVictimBuffer = 0; - StrategyControl->completePasses++; + pg_atomic_add_fetch_u32(&StrategyControl->completePasses, 1); } - /* Release the lock before manipulating the candidate buffer. */ - SpinLockRelease(&StrategyControl->buffer_strategy_lock); - /* * If the buffer is pinned or has a nonzero usage_count, we cannot use * it; decrement the usage_count (unless pinned) and keep scanning. @@ -238,9 +268,6 @@ StrategyGetBuffer(BufferAccessStrategy strategy) elog(ERROR, "no unpinned buffers available"); } UnlockBufHdr(buf); - - /* Reacquire the lock and get a new candidate buffer. */ - SpinLockAcquire(&StrategyControl->buffer_strategy_lock); } } @@ -284,13 +311,12 @@ StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc) int result; SpinLockAcquire(&StrategyControl->buffer_strategy_lock); - result = StrategyControl->nextVictimBuffer; + result = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer) % NBuffers; if (complete_passes) - *complete_passes = StrategyControl->completePasses; + *complete_passes = pg_atomic_read_u32(&StrategyControl->completePasses); if (num_buf_alloc) { - *num_buf_alloc = StrategyControl->numBufferAllocs; - StrategyControl->numBufferAllocs = 0; + *num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0); } SpinLockRelease(&StrategyControl->buffer_strategy_lock); return result; @@ -389,11 +415,11 @@ StrategyInitialize(bool init) StrategyControl->lastFreeBuffer = NBuffers - 1; /* Initialize the clock sweep pointer */ - StrategyControl->nextVictimBuffer = 0; + pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0); /* Clear statistics */ - StrategyControl->completePasses = 0; - StrategyControl->numBufferAllocs = 0; + pg_atomic_init_u32(&StrategyControl->completePasses, 0); + pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0); /* No pending notification */ StrategyControl->bgwriterLatch = NULL; -- 1.8.3.251.g1462b67
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers