On 2016-04-10 09:03:37 +0300, Alexander Korotkov wrote:
> On Sun, Apr 10, 2016 at 8:36 AM, Alexander Korotkov <
> a.korot...@postgrespro.ru> wrote:
>
> > On Sat, Apr 9, 2016 at 10:49 PM, Andres Freund <and...@anarazel.de> wrote:
> >
> >> On April 9, 2016 12:43:03 PM PDT, Andres Freund <and...@anarazel.de>
> >> wrote:
> >> >On 2016-04-09 22:38:31 +0300, Alexander Korotkov wrote:
> >> >> There are results with 5364b357 reverted.
> >> >
> >> >Crazy that this has such a negative impact. Amit, can you reproduce
> >> >that? Alexander, I guess for r/w workload 5364b357 is a benefit on that
> >> >machine as well?
> >>
> >> How sure are you about these measurements?
> >
> > I'm pretty sure. I've retried it multiple times by hand before re-running
> > the script.
> >
> >> Because there really shouldn't be clog lookups once a steady state is
> >> reached...
> >
> > Hm... I'm also surprised. There shouldn't be clog lookups once hint bits
> > are set.
>
> I also tried to run perf top during pgbench and got some interesting
> results.
>
> Without 5364b357:
>   5,69%  postgres      [.] GetSnapshotData
>   4,47%  postgres      [.] LWLockAttemptLock
>   3,81%  postgres      [.] _bt_compare
>   3,42%  postgres      [.] hash_search_with_hash_value
>   3,08%  postgres      [.] LWLockRelease
>   2,49%  postgres      [.] PinBuffer.isra.3
>   1,58%  postgres      [.] AllocSetAlloc
>   1,17%  [kernel]      [k] __schedule
>   1,15%  postgres      [.] PostgresMain
>   1,13%  libc-2.17.so  [.] vfprintf
>   1,01%  libc-2.17.so  [.] __memcpy_ssse3_back
>
> With 5364b357:
>  18,54%  postgres      [.] GetSnapshotData
>   3,45%  postgres      [.] LWLockRelease
>   3,27%  postgres      [.] LWLockAttemptLock
>   3,21%  postgres      [.] _bt_compare
>   2,93%  postgres      [.] hash_search_with_hash_value
>   2,00%  postgres      [.] PinBuffer.isra.3
>   1,32%  postgres      [.] AllocSetAlloc
>   1,10%  libc-2.17.so  [.] vfprintf
>
> Very surprising. It appears that after 5364b357, GetSnapshotData consumes
> more time. But I can't see anything depending on clog buffers
> in GetSnapshotData code...
Could you retry after applying the attached series of patches?

- Andres
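For context on what the series below changes: all three patches are about
cache-line alignment of shared memory, on the theory that the GetSnapshotData
regression comes from hot structures straddling cache lines (GetSnapshotData
walks the dense PGXACT array, so it would plausibly show the effect first).
The rounding primitive is TYPEALIGN from src/include/c.h; the following
standalone sketch (illustrative only; it assumes MAXIMUM_ALIGNOF == 8 and the
128-byte PG_CACHE_LINE_SIZE hard-wired in pg_config_manual.h) shows what
switching a request size from MAXALIGN to CACHELINEALIGN does:

  #include <stdint.h>
  #include <stdio.h>

  /* mirrors TYPEALIGN from c.h: round LEN up to a multiple of ALIGNVAL,
   * which must be a power of two */
  #define TYPEALIGN(ALIGNVAL, LEN) \
      (((uintptr_t) (LEN) + ((ALIGNVAL) - 1)) & ~((uintptr_t) ((ALIGNVAL) - 1)))

  #define MAXALIGN(LEN)        TYPEALIGN(8, (LEN))   /* assumes MAXIMUM_ALIGNOF == 8 */
  #define CACHELINEALIGN(LEN)  TYPEALIGN(128, (LEN)) /* assumes PG_CACHE_LINE_SIZE == 128 */

  int
  main(void)
  {
      printf("MAXALIGN(12)       = %zu\n", (size_t) MAXALIGN(12));        /* 16 */
      printf("CACHELINEALIGN(12) = %zu\n", (size_t) CACHELINEALIGN(12));  /* 128 */
      return 0;
  }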
>From e8ad791c97004c64a1f27a500ba100b69fdc8d87 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 10 Apr 2016 21:47:04 -0700
Subject: [PATCH 1/3] Finish 09adc9a8c09c9640de05c7023b27fb83c761e91c.

---
 src/backend/access/transam/parallel.c |  2 +-
 src/backend/port/sysv_shmem.c         |  2 +-
 src/backend/port/win32_shmem.c        |  2 +-
 src/backend/storage/buffer/buf_init.c | 19 ++++++-------------
 src/backend/storage/ipc/shm_toc.c     |  6 +++---
 src/backend/storage/ipc/shmem.c       | 17 ++++++++++-------
 src/include/c.h                       |  2 --
 src/include/pg_config_manual.h        |  8 --------
 src/include/storage/shm_toc.h         |  2 +-
 9 files changed, 23 insertions(+), 37 deletions(-)

diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c
index 0bba9a7..f8ea89c 100644
--- a/src/backend/access/transam/parallel.c
+++ b/src/backend/access/transam/parallel.c
@@ -237,7 +237,7 @@ InitializeParallelDSM(ParallelContext *pcxt)
 		shm_toc_estimate_keys(&pcxt->estimator, 6);
 
 		/* Estimate space need for error queues. */
-		StaticAssertStmt(BUFFERALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
+		StaticAssertStmt(CACHELINEALIGN(PARALLEL_ERROR_QUEUE_SIZE) ==
 						 PARALLEL_ERROR_QUEUE_SIZE,
 						 "parallel error queue size not buffer-aligned");
 		shm_toc_estimate_chunk(&pcxt->estimator,
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 6c442b9..084bc31 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -559,7 +559,7 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port,
 	 * Initialize space allocation status for segment.
 	 */
 	hdr->totalsize = size;
-	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+	hdr->freeoffset = CACHELINEALIGN(sizeof(PGShmemHeader));
 	*shim = hdr;
 
 	/* Save info for possible future use */
diff --git a/src/backend/port/win32_shmem.c b/src/backend/port/win32_shmem.c
index 0ff2c7e..81705fc 100644
--- a/src/backend/port/win32_shmem.c
+++ b/src/backend/port/win32_shmem.c
@@ -241,7 +241,7 @@ PGSharedMemoryCreate(Size size, bool makePrivate, int port,
 	 * Initialize space allocation status for segment.
 	 */
 	hdr->totalsize = size;
-	hdr->freeoffset = MAXALIGN(sizeof(PGShmemHeader));
+	hdr->freeoffset = CACHELINEALIGN(sizeof(PGShmemHeader));
 	hdr->dsm_control = 0;
 
 	/* Save info for possible future use */
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index a5cffc7..61f9c34 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -76,11 +76,9 @@ InitBufferPool(void)
 
 	/* Align descriptors to a cacheline boundary. */
 	BufferDescriptors = (BufferDescPadded *)
-		CACHELINEALIGN(
-					   ShmemInitStruct("Buffer Descriptors",
-									   NBuffers * sizeof(BufferDescPadded)
-									   + PG_CACHE_LINE_SIZE,
-									   &foundDescs));
+		ShmemInitStruct("Buffer Descriptors",
+						NBuffers * sizeof(BufferDescPadded),
+						&foundDescs);
 
 	BufferBlocks = (char *)
 		ShmemInitStruct("Buffer Blocks",
@@ -88,10 +86,9 @@ InitBufferPool(void)
 
 	/* Align lwlocks to cacheline boundary */
 	BufferIOLWLockArray = (LWLockMinimallyPadded *)
-		CACHELINEALIGN(ShmemInitStruct("Buffer IO Locks",
-									   NBuffers * (Size) sizeof(LWLockMinimallyPadded)
-									   + PG_CACHE_LINE_SIZE,
-									   &foundIOLocks));
+		ShmemInitStruct("Buffer IO Locks",
+						NBuffers * (Size) sizeof(LWLockMinimallyPadded),
+						&foundIOLocks);
 
 	BufferIOLWLockTranche.name = "buffer_io";
 	BufferIOLWLockTranche.array_base = BufferIOLWLockArray;
@@ -179,8 +176,6 @@ BufferShmemSize(void)
 
 	/* size of buffer descriptors */
 	size = add_size(size, mul_size(NBuffers, sizeof(BufferDescPadded)));
-	/* to allow aligning buffer descriptors */
-	size = add_size(size, PG_CACHE_LINE_SIZE);
 
 	/* size of data pages */
 	size = add_size(size, mul_size(NBuffers, BLCKSZ));
@@ -197,8 +192,6 @@ BufferShmemSize(void)
 	 * not highly contentended, we lay out the array with minimal padding.
 	 */
 	size = add_size(size, mul_size(NBuffers, sizeof(LWLockMinimallyPadded)));
-	/* to allow aligning the above */
-	size = add_size(size, PG_CACHE_LINE_SIZE);
 
 	/* size of checkpoint sort array in bufmgr.c */
 	size = add_size(size, mul_size(NBuffers, sizeof(CkptSortItem)));
diff --git a/src/backend/storage/ipc/shm_toc.c b/src/backend/storage/ipc/shm_toc.c
index 55248c2..683abc4 100644
--- a/src/backend/storage/ipc/shm_toc.c
+++ b/src/backend/storage/ipc/shm_toc.c
@@ -89,7 +89,7 @@ shm_toc_allocate(shm_toc *toc, Size nbytes)
 	Size		toc_bytes;
 
 	/* Make sure request is well-aligned. */
-	nbytes = BUFFERALIGN(nbytes);
+	nbytes = CACHELINEALIGN(nbytes);
 
 	SpinLockAcquire(&toc->toc_mutex);
 
@@ -133,8 +133,8 @@ shm_toc_freespace(shm_toc *toc)
 	SpinLockRelease(&toc->toc_mutex);
 
 	toc_bytes = offsetof(shm_toc, toc_entry) +nentry * sizeof(shm_toc_entry);
-	Assert(allocated_bytes + BUFFERALIGN(toc_bytes) <= total_bytes);
-	return total_bytes - (allocated_bytes + BUFFERALIGN(toc_bytes));
+	Assert(allocated_bytes + CACHELINEALIGN(toc_bytes) <= total_bytes);
+	return total_bytes - (allocated_bytes + CACHELINEALIGN(toc_bytes));
 }
 
 /*
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 1ad68cd..ae26d4b 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -125,7 +125,7 @@ InitShmemAllocation(void)
 		PGSemaphore spinsemas;
 
 		spinsemas = (PGSemaphore) (((char *) shmhdr) + shmhdr->freeoffset);
-		shmhdr->freeoffset += MAXALIGN(SpinlockSemaSize());
+		shmhdr->freeoffset += CACHELINEALIGN(SpinlockSemaSize());
 		SpinlockSemaInit(spinsemas);
 		Assert(shmhdr->freeoffset <= shmhdr->totalsize);
 	}
@@ -136,7 +136,7 @@ InitShmemAllocation(void)
 	 * way, too, for the same reasons as above.
 	 */
 	ShmemLock = (slock_t *) (((char *) shmhdr) + shmhdr->freeoffset);
-	shmhdr->freeoffset += MAXALIGN(sizeof(slock_t));
+	shmhdr->freeoffset += CACHELINEALIGN(sizeof(slock_t));
 	Assert(shmhdr->freeoffset <= shmhdr->totalsize);
 	SpinLockInit(ShmemLock);
 
@@ -179,7 +179,9 @@ ShmemAlloc(Size size)
 	 * boundary.  The calling code will still need to be careful about how it
 	 * uses the allocated space - e.g. by padding each element in an array of
 	 * structures out to a power-of-two size - but without this, even that
-	 * won't be sufficient.
+	 * won't be sufficient.  We take care that freeoffset initially is
+	 * adequately aligned, so aligning the size for all allocations guarantees
+	 * allocations are aligned.
 	 */
 	size = CACHELINEALIGN(size);
 
@@ -189,10 +191,6 @@ ShmemAlloc(Size size)
 
 	newStart = ShmemSegHdr->freeoffset;
 
-	/* extra alignment for large requests, since they are probably buffers */
-	if (size >= BLCKSZ)
-		newStart = BUFFERALIGN(newStart);
-
 	newFree = newStart + size;
 	if (newFree <= ShmemSegHdr->totalsize)
 	{
@@ -209,6 +207,8 @@ ShmemAlloc(Size size)
 				(errcode(ERRCODE_OUT_OF_MEMORY),
 				 errmsg("out of shared memory")));
 
+	Assert(newSpace == (void *) CACHELINEALIGN(newSpace));
+
 	return newSpace;
 }
 
@@ -425,6 +425,9 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr)
 	LWLockRelease(ShmemIndexLock);
 
 	Assert(ShmemAddrIsValid(structPtr));
+
+	Assert(structPtr == (void *) CACHELINEALIGN(structPtr));
+
 	return structPtr;
 }
 
diff --git a/src/include/c.h b/src/include/c.h
index 7c57430..fb526b7 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -567,8 +567,6 @@ typedef NameData *Name;
 #define LONGALIGN(LEN)			TYPEALIGN(ALIGNOF_LONG, (LEN))
 #define DOUBLEALIGN(LEN)		TYPEALIGN(ALIGNOF_DOUBLE, (LEN))
 #define MAXALIGN(LEN)			TYPEALIGN(MAXIMUM_ALIGNOF, (LEN))
-/* MAXALIGN covers only built-in types, not buffers */
-#define BUFFERALIGN(LEN)		TYPEALIGN(ALIGNOF_BUFFER, (LEN))
 #define CACHELINEALIGN(LEN)		TYPEALIGN(PG_CACHE_LINE_SIZE, (LEN))
 
 #define TYPEALIGN_DOWN(ALIGNVAL,LEN)  \
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index ef89521..8f59b53 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -107,14 +107,6 @@
 #define BITS_PER_BYTE		8
 
 /*
- * Preferred alignment for disk I/O buffers.  On some CPUs, copies between
- * user space and kernel space are significantly faster if the user buffer
- * is aligned on a larger-than-MAXALIGN boundary.  Ideally this should be
- * a platform-dependent value, but for now we just hard-wire it.
- */
-#define ALIGNOF_BUFFER	32
-
-/*
  * Disable UNIX sockets for certain operating systems.
  */
 #if defined(WIN32)
diff --git a/src/include/storage/shm_toc.h b/src/include/storage/shm_toc.h
index 6822f91..2365b02 100644
--- a/src/include/storage/shm_toc.h
+++ b/src/include/storage/shm_toc.h
@@ -48,7 +48,7 @@ typedef struct
 	((e)->space_for_chunks = 0, (e)->number_of_keys = 0)
 #define shm_toc_estimate_chunk(e, sz) \
 	((e)->space_for_chunks = add_size((e)->space_for_chunks, \
-									  BUFFERALIGN((sz))))
+									  CACHELINEALIGN((sz))))
 #define shm_toc_estimate_keys(e, cnt) \
 	((e)->number_of_keys = add_size((e)->number_of_keys, (cnt)))
 
-- 
2.7.0.229.g701fa7f
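Restating the invariant patch 1 establishes, outside the diff: the initial
freeoffset is CACHELINEALIGN()ed and every request size is rounded the same
way, so every ShmemAlloc()/ShmemInitStruct() result is cache-line aligned by
construction. That is why buf_init.c can drop its over-allocate-then-align
dance, and it is what the two new Asserts check on the real allocator. A toy
model of that reasoning (illustrative names, not the shmem.c code):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>

  #define PG_CACHE_LINE_SIZE 128
  #define CACHELINEALIGN(LEN) \
      (((uintptr_t) (LEN) + PG_CACHE_LINE_SIZE - 1) & \
       ~((uintptr_t) (PG_CACHE_LINE_SIZE - 1)))

  /* toy segment; the real one is mapped by the kernel, page-aligned */
  static _Alignas(PG_CACHE_LINE_SIZE) char seg[64 * 1024];
  static size_t free_offset = 0;   /* starts aligned ... */

  static void *
  toy_shmem_alloc(size_t size)
  {
      void   *result = seg + free_offset;

      /* ... and rounding every size keeps the next offset aligned too */
      free_offset += CACHELINEALIGN(size);
      assert((uintptr_t) result % PG_CACHE_LINE_SIZE == 0);
      return result;
  }

  int
  main(void)
  {
      toy_shmem_alloc(1);      /* odd request sizes ... */
      toy_shmem_alloc(13);     /* ... still yield aligned pointers */
      toy_shmem_alloc(4096);
      return 0;
  }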
>From 5a19816c3b4268d4e2279119e1c7c4272bad73c9 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 10 Apr 2016 21:47:14 -0700
Subject: [PATCH 2/3] Align individual parts of an slru.

---
 src/backend/access/transam/slru.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 36a011c..71b6d16 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -146,18 +146,18 @@ SimpleLruShmemSize(int nslots, int nlsns)
 	Size		sz;
 
 	/* we assume nslots isn't so large as to risk overflow */
-	sz = MAXALIGN(sizeof(SlruSharedData));
-	sz += MAXALIGN(nslots * sizeof(char *));	/* page_buffer[] */
-	sz += MAXALIGN(nslots * sizeof(SlruPageStatus));	/* page_status[] */
-	sz += MAXALIGN(nslots * sizeof(bool));	/* page_dirty[] */
-	sz += MAXALIGN(nslots * sizeof(int));	/* page_number[] */
-	sz += MAXALIGN(nslots * sizeof(int));	/* page_lru_count[] */
-	sz += MAXALIGN(nslots * sizeof(LWLockPadded));	/* buffer_locks[] */
+	sz = CACHELINEALIGN(sizeof(SlruSharedData));
+	sz += CACHELINEALIGN(nslots * sizeof(char *));	/* page_buffer[] */
+	sz += CACHELINEALIGN(nslots * sizeof(SlruPageStatus));	/* page_status[] */
+	sz += CACHELINEALIGN(nslots * sizeof(bool));	/* page_dirty[] */
+	sz += CACHELINEALIGN(nslots * sizeof(int));	/* page_number[] */
+	sz += CACHELINEALIGN(nslots * sizeof(int));	/* page_lru_count[] */
+	sz += CACHELINEALIGN(nslots * sizeof(LWLockPadded));	/* buffer_locks[] */
 
 	if (nlsns > 0)
-		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
+		sz += CACHELINEALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
 
-	return BUFFERALIGN(sz) + BLCKSZ * nslots;
+	return CACHELINEALIGN(sz) + BLCKSZ * nslots;
 }
 
 void
@@ -192,22 +192,22 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		/* shared->latest_page_number will be set later */
 
 		ptr = (char *) shared;
-		offset = MAXALIGN(sizeof(SlruSharedData));
+		offset = CACHELINEALIGN(sizeof(SlruSharedData));
 		shared->page_buffer = (char **) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(char *));
+		offset += CACHELINEALIGN(nslots * sizeof(char *));
 		shared->page_status = (SlruPageStatus *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(SlruPageStatus));
+		offset += CACHELINEALIGN(nslots * sizeof(SlruPageStatus));
 		shared->page_dirty = (bool *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(bool));
+		offset += CACHELINEALIGN(nslots * sizeof(bool));
 		shared->page_number = (int *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(int));
+		offset += CACHELINEALIGN(nslots * sizeof(int));
 		shared->page_lru_count = (int *) (ptr + offset);
-		offset += MAXALIGN(nslots * sizeof(int));
+		offset += CACHELINEALIGN(nslots * sizeof(int));
 
 		if (nlsns > 0)
 		{
 			shared->group_lsn = (XLogRecPtr *) (ptr + offset);
-			offset += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));
+			offset += CACHELINEALIGN(nslots * nlsns * sizeof(XLogRecPtr));
 		}
 
 		/* Initialize LWLocks */
@@ -220,7 +220,7 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		shared->lwlock_tranche.array_base = shared->buffer_locks;
 		shared->lwlock_tranche.array_stride = sizeof(LWLockPadded);
 
-		ptr += BUFFERALIGN(offset);
+		ptr += CACHELINEALIGN(offset);
 		for (slotno = 0; slotno < nslots; slotno++)
 		{
 			LWLockInitialize(&shared->buffer_locks[slotno].lock,
-- 
2.7.0.229.g701fa7f
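The pattern patch 2 preserves is that SimpleLruShmemSize() and
SimpleLruInit() must apply identical rounding: the size function sums up
CACHELINEALIGN()ed chunks and the init function carves the allocation at the
same offsets, so each per-slot array now starts on its own cache line. A
condensed sketch of that size/carve lockstep (toy names, not the actual
slru.c code):

  #include <assert.h>
  #include <stddef.h>
  #include <stdint.h>
  #include <stdlib.h>

  #define CACHELINEALIGN(LEN) \
      (((uintptr_t) (LEN) + 127) & ~((uintptr_t) 127))

  typedef struct ToyShared
  {
      int     nslots;
      char  **page_buffer;    /* stand-ins for slru.c's per-slot arrays */
      int    *page_number;
  } ToyShared;

  /* size and carve must round identically, or init overruns the allocation */
  static size_t
  toy_shmem_size(int nslots)
  {
      size_t  sz = CACHELINEALIGN(sizeof(ToyShared));

      sz += CACHELINEALIGN(nslots * sizeof(char *));
      sz += CACHELINEALIGN(nslots * sizeof(int));
      return sz;
  }

  static void
  toy_init(ToyShared *shared, int nslots)
  {
      char   *ptr = (char *) shared;
      size_t  offset = CACHELINEALIGN(sizeof(ToyShared));

      shared->page_buffer = (char **) (ptr + offset);
      offset += CACHELINEALIGN(nslots * sizeof(char *));
      shared->page_number = (int *) (ptr + offset);
      offset += CACHELINEALIGN(nslots * sizeof(int));
      assert(offset == toy_shmem_size(nslots));  /* lockstep by construction */
  }

  int
  main(void)
  {
      int         nslots = 32;
      ToyShared  *shared = aligned_alloc(128, toy_shmem_size(nslots));

      toy_init(shared, nslots);
      free(shared);
      return 0;
  }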
>From 956044e7af1ee543c2e76d05b3367109db562477 Mon Sep 17 00:00:00 2001
From: Andres Freund <and...@anarazel.de>
Date: Sun, 10 Apr 2016 21:59:49 -0700
Subject: [PATCH 3/3] WIP: Force PGXACT stride to be 16 instead of 12 bytes.

The previous solution led to PGXACTs split across cachelines.
---
 src/include/storage/proc.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c3b462c..d2cd71d 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -204,6 +204,8 @@ typedef struct PGXACT
 								 * previously called InCommit */
 
 	uint8		nxids;
+
+	uint32		pad;			/* FIXME: proper padding solution */
 } PGXACT;
 
 /*
-- 
2.7.0.229.g701fa7f
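The arithmetic behind patch 3: PGXACT packs to 12 bytes (two TransactionIds
plus four single-byte fields), and since 12 doesn't divide any power-of-two
cache line size, elements of the dense PGXACT array periodically straddle a
line boundary - with 64-byte lines, 2 out of every 16 elements - so reading
one snapshot member can touch two lines. At a 16-byte stride none do. A
quick standalone check of that claim (the line size is a parameter here,
since PG_CACHE_LINE_SIZE is a worst-case constant, not the hardware value):

  #include <stdio.h>

  /* count elements of a densely packed array that straddle a cache-line
   * boundary, assuming the array itself starts line-aligned */
  static int
  count_straddlers(int stride, int nelems, int line)
  {
      int     n = 0;

      for (int i = 0; i < nelems; i++)
          if (i * stride / line != (i * stride + stride - 1) / line)
              n++;
      return n;
  }

  int
  main(void)
  {
      /* e.g. 1024 PGXACT slots on 64-byte lines */
      printf("stride 12: %d straddlers\n", count_straddlers(12, 1024, 64));  /* 128 */
      printf("stride 16: %d straddlers\n", count_straddlers(16, 1024, 64));  /* 0 */
      return 0;
  }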