This developed a slight merge conflict. I've rebased the attached version, and I also took the step of getting rid of buf_table.c, as I think I proposed somewhere upthread. This avoids the overhead of constructing a BufferTag only to copy it into a BufferLookupEnt, plus some function calls and so forth. A quick-and-dirty test suggests this might not have cut down on the 1-client overhead much, but I think it's worth doing anyway: it's certainly saving a few cycles, and I don't think it's complicating anything measurably.
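To make the buf_table.c removal concrete, here's a condensed before/after sketch of the mapping-table lookup, boiled down from the PrefetchBuffer hunk in the attached patch (surrounding declarations and error handling elided; this is just a reading aid, not an additional change):

    /* Before: build a BufferTag, hash it, take the partition lock, and probe. */
    BufferTag   newTag;
    uint32      newHash;
    LWLock     *newPartitionLock;
    int         buf_id;

    INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node, forkNum, blockNum);
    newHash = BufTableHashCode(&newTag);
    newPartitionLock = BufMappingPartitionLock(newHash);
    LWLockAcquire(newPartitionLock, LW_SHARED);
    buf_id = BufTableLookup(&newTag, newHash);  /* copies the tag into a BufferLookupEnt */
    LWLockRelease(newPartitionLock);

    /* After: fill the BufferLookupEnt key in place and probe the concurrent hash table. */
    BufferLookupEnt ent;

    INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node, forkNum, blockNum);
    if (!CHashSearch(SharedBufHash, &ent))      /* no partition lock, no tag copy */
        smgrprefetch(reln->rd_smgr, forkNum, blockNum);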
-- Robert Haas EnterpriseDB: http://www.enterprisedb.com The Enterprise PostgreSQL Company
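For anyone who wants the shape of the chash API without wading through the whole patch, the calling convention exercised by contrib/hashtest and bufmgr.c below boils down to roughly this (the table name, literal keys, and the chash_example* functions are invented for illustration; the chash.h calls themselves are the ones the patch adds):

    #include "postgres.h"
    #include "storage/ipc.h"
    #include "utils/chash.h"

    typedef struct
    {
        uint32      key;        /* first key_size bytes are the lookup key */
        uint32      val;        /* remainder is the payload */
    } hentry;

    static CHashDescriptor cdesc = {
        "example-chash",        /* shared memory name */
        1048576,                /* capacity (maximum number of elements) */
        sizeof(hentry),         /* element size */
        sizeof(uint32)          /* key size */
    };
    static CHashTable chash;

    /* Call from _PG_init when loaded via shared_preload_libraries. */
    static void
    chash_example_init(void)
    {
        chash = CHashBootstrap(&cdesc);
        RequestAddinShmemSpace(CHashEstimateSize(chash));
    }

    /* Call from the shmem_startup hook: creates the table, or attaches under EXEC_BACKEND. */
    static void
    chash_example_startup(void)
    {
        chash = CHashInitialize(chash, &cdesc);
    }

    /* Thereafter, from any backend: */
    static void
    chash_example(void)
    {
        hentry      e;

        e.key = 1;
        e.val = 31;
        if (!CHashInsert(chash, &e))    /* false if the key already exists */
            elog(LOG, "key already present");

        e.val = 0;
        if (CHashSearch(chash, &e))     /* fills in the non-key fields on success */
            elog(LOG, "found val = %u", e.val);

        if (!CHashDelete(chash, &e))    /* false if the key is not present */
            elog(LOG, "key not found");
    }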
diff --git a/contrib/Makefile b/contrib/Makefile index 195d447..141ef0a 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -19,6 +19,7 @@ SUBDIRS = \ earthdistance \ file_fdw \ fuzzystrmatch \ + hashtest \ hstore \ intagg \ intarray \ diff --git a/contrib/hashtest/Makefile b/contrib/hashtest/Makefile new file mode 100644 index 0000000..3ee42f8 --- /dev/null +++ b/contrib/hashtest/Makefile @@ -0,0 +1,18 @@ +# contrib/hashtest/Makefile + +MODULE_big = hashtest +OBJS = hashtest.o + +EXTENSION = hashtest +DATA = hashtest--1.0.sql + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/hashtest +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/hashtest/hashtest--1.0.sql b/contrib/hashtest/hashtest--1.0.sql new file mode 100644 index 0000000..e271baf --- /dev/null +++ b/contrib/hashtest/hashtest--1.0.sql @@ -0,0 +1,52 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION hashtest" to load this file. \quit + +CREATE FUNCTION chash_insert_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_insert_test' +LANGUAGE C; + +CREATE FUNCTION chash_search_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_search_test' +LANGUAGE C; + +CREATE FUNCTION chash_delete_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_delete_test' +LANGUAGE C; + +CREATE FUNCTION chash_concurrent_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_concurrent_test' +LANGUAGE C; + +CREATE FUNCTION chash_collision_test() +RETURNS void +AS 'MODULE_PATHNAME', 'chash_collision_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_insert_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_insert_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_search_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_search_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_delete_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_delete_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_concurrent_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_concurrent_test' +LANGUAGE C; + +CREATE FUNCTION dynahash_collision_test() +RETURNS void +AS 'MODULE_PATHNAME', 'dynahash_collision_test' +LANGUAGE C; diff --git a/contrib/hashtest/hashtest.c b/contrib/hashtest/hashtest.c new file mode 100644 index 0000000..172a5bb --- /dev/null +++ b/contrib/hashtest/hashtest.c @@ -0,0 +1,527 @@ +/*------------------------------------------------------------------------- + * hashtest.c + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "funcapi.h" +#include "libpq/auth.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "portability/instr_time.h" +#include "storage/ipc.h" +#include "utils/chash.h" + +PG_MODULE_MAGIC; + +void _PG_init(void); +Datum chash_insert_test(PG_FUNCTION_ARGS); +Datum chash_search_test(PG_FUNCTION_ARGS); +Datum chash_delete_test(PG_FUNCTION_ARGS); +Datum chash_concurrent_test(PG_FUNCTION_ARGS); +Datum chash_collision_test(PG_FUNCTION_ARGS); +Datum dynahash_insert_test(PG_FUNCTION_ARGS); +Datum dynahash_search_test(PG_FUNCTION_ARGS); +Datum dynahash_delete_test(PG_FUNCTION_ARGS); +Datum dynahash_concurrent_test(PG_FUNCTION_ARGS); +Datum dynahash_collision_test(PG_FUNCTION_ARGS); +static void hashtest_shmem_startup(void); + +PG_FUNCTION_INFO_V1(chash_insert_test); +PG_FUNCTION_INFO_V1(chash_search_test); +PG_FUNCTION_INFO_V1(chash_delete_test); 
+PG_FUNCTION_INFO_V1(chash_concurrent_test); +PG_FUNCTION_INFO_V1(chash_collision_test); +PG_FUNCTION_INFO_V1(dynahash_insert_test); +PG_FUNCTION_INFO_V1(dynahash_search_test); +PG_FUNCTION_INFO_V1(dynahash_delete_test); +PG_FUNCTION_INFO_V1(dynahash_concurrent_test); +PG_FUNCTION_INFO_V1(dynahash_collision_test); + +typedef struct +{ + uint32 key; + uint32 val; +} hentry; + +static CHashDescriptor cdesc = { + "hashtest-chash", /* name */ + 1048576, /* capacity */ + sizeof(hentry), /* element size */ + sizeof(uint32) /* key size */ +}; + +#define DYNAHASH_PARTITIONS 16 + +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +static CHashTable chash; +static HTAB *dynahash; +static LWLockId dynahash_lock[DYNAHASH_PARTITIONS]; +static ClientAuthentication_hook_type original_client_auth_hook = NULL; + +static void hashtest_client_auth_hook(Port *port, int status); +static void chash_write_stats_to_log(int code, Datum dummy); + +#define dynahash_get_lock(hashcode) \ + (dynahash_lock[(hashcode) % DYNAHASH_PARTITIONS]) + +void +_PG_init(void) +{ + Size cs; + Size ds; + + if (!process_shared_preload_libraries_in_progress) + return; + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = hashtest_shmem_startup; + chash = CHashBootstrap(&cdesc); + cs = CHashEstimateSize(chash); + RequestAddinShmemSpace(cs); + ds = hash_estimate_size(cdesc.capacity, cdesc.element_size); + RequestAddinShmemSpace(ds); + elog(LOG, "chash: %u bytes; dynahash: %u bytes", (unsigned) cs, + (unsigned) ds); + RequestAddinLWLocks(DYNAHASH_PARTITIONS); + original_client_auth_hook = ClientAuthentication_hook; + ClientAuthentication_hook = hashtest_client_auth_hook; + +} + +static void +hashtest_client_auth_hook(Port *port, int status) +{ + if (original_client_auth_hook) + original_client_auth_hook(port, status); + on_proc_exit(chash_write_stats_to_log, (Datum) 0); +} + +static void +chash_write_stats_to_log(int code, Datum dummy) +{ + uint64 stats[CHS_NumberOfStatistics]; + CHashStatisticsType i; + StringInfoData buf; + + CHashStatistics(chash, stats); + initStringInfo(&buf); + + for (i = 0; i < CHS_NumberOfStatistics; ++i) + { + if (stats[i] == 0) + continue; + appendStringInfo(&buf, UINT64_FORMAT " %s; ", stats[i], + CHashStatisticsNames[i]); + } + + if (buf.len > 1) + { + buf.data[buf.len-2] = '\0'; + elog(LOG, "chash statistics: %s", buf.data); + } +} + +static void +hashtest_shmem_startup(void) +{ + HASHCTL info; + uint32 i; + + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* Initialize concurrent hash table. */ + chash = CHashInitialize(chash, &cdesc); + + /* Initialize shared dynahash table. 
*/ + info.keysize = cdesc.key_size; + info.entrysize = cdesc.element_size; + info.hash = tag_hash; + info.num_partitions = DYNAHASH_PARTITIONS; + + dynahash = ShmemInitHash("hashtest-dynahash", + cdesc.capacity, cdesc.capacity, + &info, + HASH_ELEM | HASH_FUNCTION | HASH_PARTITION); + + for (i = 0; i < DYNAHASH_PARTITIONS; ++i) + dynahash_lock[i] = LWLockAssign(); +} + +Datum +chash_insert_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + e.val = i * 31; + ok = CHashInsert(chash, &e); + if (!ok) + elog(LOG, "insert %u: failed", i); + ok = CHashInsert(chash, &e); + if (ok) + elog(LOG, "insert %u: worked twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +chash_search_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + ok = CHashSearch(chash, &e); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (e.val != e.key * 31) + elog(LOG, "search %u: found %u", i, e.val); + } + + PG_RETURN_VOID(); +} + +Datum +chash_delete_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + e.key = i; + ok = CHashDelete(chash, &e); + if (!ok) + elog(LOG, "delete %u: not found", i); + ok = CHashDelete(chash, &e); + if (ok) + elog(LOG, "delete %u: found twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +chash_concurrent_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + uint32 seed = MyProcPid << 16; + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + e.val = MyProcPid; + ok = CHashInsert(chash, &e); + if (!ok) + elog(LOG, "insert %u: found", i); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + e.val = 0; + ok = CHashSearch(chash, &e); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "search %u: not found", i); + while (!CHashSearch(chash, &e)) + ++retry; + elog(LOG, "search %u: eventually found it after " + UINT64_FORMAT " retries", i, retry); + } + if (e.val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", i, (unsigned) MyProcPid, e.val); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = seed | i; + ok = CHashDelete(chash, &e); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "delete %u: not found", i); + while (!CHashDelete(chash, &e)) + ++retry; + elog(LOG, "delete %u: eventually deleted it after " + UINT64_FORMAT " retries", i, retry); + } + } + + PG_RETURN_VOID(); +} + +Datum +chash_collision_test(PG_FUNCTION_ARGS) +{ + uint32 i; + hentry e; + + /* Don't stack-allocate this. 
*/ + static bool mine[10000]; + + memset(mine, 0, 10000 * sizeof(bool)); + + for (i = 0; i < 10000; ++i) + { + bool ok; + + e.key = i; + e.val = MyProcPid; + ok = CHashInsert(chash, &e); + if (ok) + mine[i] = true; + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + if (!mine[i]) + continue; + e.key = i; + ok = CHashSearch(chash, &e); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (e.val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, e.val); + ok = CHashDelete(chash, &e); + if (!ok) + elog(LOG, "delete %u: not found", i); + } + + PG_RETURN_VOID(); +} + +static bool +dynahash_insert(uint32 key, uint32 val) +{ + bool found; + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_EXCLUSIVE); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_ENTER, &found); + if (!found) + e->val = val; + LWLockRelease(lockid); + + return !found; +} + +static bool +dynahash_search(uint32 key, uint32 *val) +{ + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_SHARED); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_FIND, NULL); + if (e) + *val = e->val; + LWLockRelease(lockid); + + return e != NULL; +} + +static bool +dynahash_delete(uint32 key) +{ + uint32 hashcode; + hentry *e; + LWLockId lockid; + + hashcode = get_hash_value(dynahash, (void *) &key); + lockid = dynahash_get_lock(hashcode); + LWLockAcquire(lockid, LW_EXCLUSIVE); + e = hash_search_with_hash_value(dynahash, (void *) &key, + hashcode, HASH_REMOVE, NULL); + LWLockRelease(lockid); + + return e != NULL; +} + +Datum +dynahash_insert_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + ok = dynahash_insert(i, i * 31); + if (!ok) + elog(LOG, "insert %u: failed", i); + ok = dynahash_insert(i, i * 31); + if (ok) + elog(LOG, "insert %u: worked twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_search_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + uint32 val; + + ok = dynahash_search(i, &val); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (val != i* 31) + elog(LOG, "search %u: found %u", i, val); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_delete_test(PG_FUNCTION_ARGS) +{ + uint32 i; + + for (i = 0; i < 1000000; ++i) + { + bool ok; + + ok = dynahash_delete(i); + if (!ok) + elog(LOG, "delete %u: not found", i); + ok = dynahash_delete(i); + if (ok) + elog(LOG, "delete %u: found twice", i); + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_concurrent_test(PG_FUNCTION_ARGS) +{ + uint32 i; + uint32 val; + uint32 seed = MyProcPid << 16; + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_insert(seed | i, MyProcPid); + if (!ok) + elog(LOG, "insert %u: found", i); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_search(seed | i, &val); + if (!ok) + { + uint64 retry = 1; + elog(LOG, "search %u: not found", i); + while (!dynahash_search(seed | i, &val)) + ++retry; + elog(LOG, "search %u: eventually found it after " + UINT64_FORMAT " retries", i, retry); + } + if (val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, val); + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_delete(seed | i); + if (!ok) + { + uint64 retry = 1; + 
elog(LOG, "delete %u: not found", i); + while (!dynahash_delete(seed | i)) + ++retry; + elog(LOG, "delete %u: eventually deleted it after " + UINT64_FORMAT " retries", i, retry); + } + } + + PG_RETURN_VOID(); +} + +Datum +dynahash_collision_test(PG_FUNCTION_ARGS) +{ + uint32 i; + uint32 val; + + /* Don't stack-allocate this. */ + static bool mine[10000]; + + memset(mine, 0, 10000 * sizeof(bool)); + + for (i = 0; i < 10000; ++i) + { + bool ok; + + ok = dynahash_insert(i, MyProcPid); + if (ok) + mine[i] = true; + } + + for (i = 0; i < 10000; ++i) + { + bool ok; + + if (!mine[i]) + continue; + ok = dynahash_search(i, &val); + if (!ok) + elog(LOG, "search %u: not found", i); + else if (val != MyProcPid) + elog(LOG, "search %u: expected %u found %u", + i, (unsigned) MyProcPid, val); + ok = dynahash_delete(i); + if (!ok) + elog(LOG, "delete %u: not found", i); + } + + PG_RETURN_VOID(); +} diff --git a/contrib/hashtest/hashtest.control b/contrib/hashtest/hashtest.control new file mode 100644 index 0000000..b8e0f01 --- /dev/null +++ b/contrib/hashtest/hashtest.control @@ -0,0 +1,4 @@ +comment = 'hash testing code' +default_version = '1.0' +module_pathname = '$libdir/hashtest' +relocatable = true diff --git a/src/backend/storage/buffer/Makefile b/src/backend/storage/buffer/Makefile index 2c10fba..b30a0da 100644 --- a/src/backend/storage/buffer/Makefile +++ b/src/backend/storage/buffer/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/storage/buffer top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = buf_table.o buf_init.o bufmgr.o freelist.o localbuf.o +OBJS = buf_init.o bufmgr.o freelist.o localbuf.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/buffer/README b/src/backend/storage/buffer/README index a4ebbcc..86697e9 100644 --- a/src/backend/storage/buffer/README +++ b/src/backend/storage/buffer/README @@ -100,30 +100,10 @@ Buffer Manager's Internal Locking Before PostgreSQL 8.1, all operations of the shared buffer manager itself were protected by a single system-wide lock, the BufMgrLock, which -unsurprisingly proved to be a source of contention. The new locking scheme -avoids grabbing system-wide exclusive locks in common code paths. It works -like this: - -* There is a system-wide LWLock, the BufMappingLock, that notionally -protects the mapping from buffer tags (page identifiers) to buffers. -(Physically, it can be thought of as protecting the hash table maintained -by buf_table.c.) To look up whether a buffer exists for a tag, it is -sufficient to obtain share lock on the BufMappingLock. Note that one -must pin the found buffer, if any, before releasing the BufMappingLock. -To alter the page assignment of any buffer, one must hold exclusive lock -on the BufMappingLock. This lock must be held across adjusting the buffer's -header fields and changing the buf_table hash table. The only common -operation that needs exclusive lock is reading in a page that was not -in shared buffers already, which will require at least a kernel call -and usually a wait for I/O, so it will be slow anyway. - -* As of PG 8.2, the BufMappingLock has been split into NUM_BUFFER_PARTITIONS -separate locks, each guarding a portion of the buffer tag space. This allows -further reduction of contention in the normal code paths. The partition -that a particular buffer tag belongs to is determined from the low-order -bits of the tag's hash value. The rules stated above apply to each partition -independently. 
If it is necessary to lock more than one partition at a time, -they must be locked in partition-number order to avoid risk of deadlock. +unsurprisingly proved to be a source of contention. In subsequent releases, +this lock was split into NUM_BUFFER_PARTITIONS locks, each guarding a portion +of the buffer tag space. Even this proved to be too much contention, so +now we use a highly concurrent hashtable (see chash.c and chash.h). * A separate system-wide spinlock, buffer_strategy_lock, provides mutual exclusion for operations that access the buffer free list or select diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c deleted file mode 100644 index 6ed47d5..0000000 --- a/src/backend/storage/buffer/buf_table.c +++ /dev/null @@ -1,163 +0,0 @@ -/*------------------------------------------------------------------------- - * - * buf_table.c - * routines for mapping BufferTags to buffer indexes. - * - * Note: the routines in this file do no locking of their own. The caller - * must hold a suitable lock on the appropriate BufMappingLock, as specified - * in the comments. We can't do the locking inside these functions because - * in most cases the caller needs to adjust the buffer header contents - * before the lock is released (see notes in README). - * - * - * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * src/backend/storage/buffer/buf_table.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "storage/bufmgr.h" -#include "storage/buf_internals.h" - - -/* entry for buffer lookup hashtable */ -typedef struct -{ - BufferTag key; /* Tag of a disk page */ - int id; /* Associated buffer ID */ -} BufferLookupEnt; - -static HTAB *SharedBufHash; - - -/* - * Estimate space needed for mapping hashtable - * size is the desired hash table size (possibly more than NBuffers) - */ -Size -BufTableShmemSize(int size) -{ - return hash_estimate_size(size, sizeof(BufferLookupEnt)); -} - -/* - * Initialize shmem hash table for mapping buffers - * size is the desired hash table size (possibly more than NBuffers) - */ -void -InitBufTable(int size) -{ - HASHCTL info; - - /* assume no locking is needed yet */ - - /* BufferTag maps to Buffer */ - info.keysize = sizeof(BufferTag); - info.entrysize = sizeof(BufferLookupEnt); - info.num_partitions = NUM_BUFFER_PARTITIONS; - - SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table", - size, size, - &info, - HASH_ELEM | HASH_BLOBS | HASH_PARTITION); -} - -/* - * BufTableHashCode - * Compute the hash code associated with a BufferTag - * - * This must be passed to the lookup/insert/delete routines along with the - * tag. We do it like this because the callers need to know the hash code - * in order to determine which buffer partition to lock, and we don't want - * to do the hash computation twice (hash_any is a bit slow). 
- */ -uint32 -BufTableHashCode(BufferTag *tagPtr) -{ - return get_hash_value(SharedBufHash, (void *) tagPtr); -} - -/* - * BufTableLookup - * Lookup the given BufferTag; return buffer ID, or -1 if not found - * - * Caller must hold at least share lock on BufMappingLock for tag's partition - */ -int -BufTableLookup(BufferTag *tagPtr, uint32 hashcode) -{ - BufferLookupEnt *result; - - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_FIND, - NULL); - - if (!result) - return -1; - - return result->id; -} - -/* - * BufTableInsert - * Insert a hashtable entry for given tag and buffer ID, - * unless an entry already exists for that tag - * - * Returns -1 on successful insertion. If a conflicting entry exists - * already, returns the buffer ID in that entry. - * - * Caller must hold exclusive lock on BufMappingLock for tag's partition - */ -int -BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id) -{ - BufferLookupEnt *result; - bool found; - - Assert(buf_id >= 0); /* -1 is reserved for not-in-table */ - Assert(tagPtr->blockNum != P_NEW); /* invalid tag */ - - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_ENTER, - &found); - - if (found) /* found something already in the table */ - return result->id; - - result->id = buf_id; - - return -1; -} - -/* - * BufTableDelete - * Delete the hashtable entry for given tag (which must exist) - * - * Caller must hold exclusive lock on BufMappingLock for tag's partition - */ -void -BufTableDelete(BufferTag *tagPtr, uint32 hashcode) -{ - BufferLookupEnt *result; - - result = (BufferLookupEnt *) - hash_search_with_hash_value(SharedBufHash, - (void *) tagPtr, - hashcode, - HASH_REMOVE, - NULL); - - if (!result) /* shouldn't happen */ - elog(ERROR, "shared buffer hash table corrupted"); -} diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 7eb2d22..4435b3e 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -24,9 +24,7 @@ * MarkBufferDirty() -- mark a pinned buffer's contents as "dirty". * The disk write is delayed until buffer replacement or checkpoint. * - * See also these files: - * freelist.c -- chooses victim for buffer replacement - * buf_table.c -- manages the buffer lookup table + * See also freelist.c, which chooses victim for buffer replacement */ #include "postgres.h" @@ -47,10 +45,25 @@ #include "storage/proc.h" #include "storage/smgr.h" #include "storage/standby.h" +#include "utils/chash.h" #include "utils/rel.h" #include "utils/resowner_private.h" #include "utils/timestamp.h" +/* entry for buffer lookup hashtable */ +typedef struct +{ + BufferTag key; /* Tag of a disk page */ + int id; /* Associated buffer ID */ +} BufferLookupEnt; + +static CHashDescriptor SharedBufDescriptor = { + "buffer lookup table", + 0, + sizeof(BufferLookupEnt), + sizeof(BufferTag) +}; +static CHashTable SharedBufHash; /* Note: these two macros only work on shared buffers, not local ones! 
*/ #define BufHdrGetBlock(bufHdr) ((Block) (BufferBlocks + ((Size) (bufHdr)->buf_id) * BLCKSZ)) @@ -138,6 +151,38 @@ static inline int32 GetPrivateRefCount(Buffer buffer); static void ForgetPrivateRefCountEntry(PrivateRefCountEntry *ref); /* + * Estimate space needed for mapping hashtable + * size is the desired hash table size (possibly more than NBuffers) + */ +Size +BufMgrShmemSize(int size) +{ + if (SharedBufHash == NULL) + { + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashBootstrap(&SharedBufDescriptor); + } + + return CHashEstimateSize(SharedBufHash); +} + +/* + * Initialize shmem hash table for mapping buffers + * size is the desired hash table size (possibly more than NBuffers) + */ +void +BufMgrInitShmem(int size) +{ + if (SharedBufHash == NULL || !IsUnderPostmaster) + { + Assert(SharedBufDescriptor.capacity == 0 || + SharedBufDescriptor.capacity == size); + SharedBufDescriptor.capacity = size; + SharedBufHash = CHashInitialize(SharedBufHash, &SharedBufDescriptor); + } +} + +/* * Ensure that the the PrivateRefCountArray has sufficient space to store one * more entry. This has to be called before using NewPrivateRefCountEntry() to * fill a new entry - but it's perfectly fine to not use a reserved entry. @@ -444,26 +489,14 @@ PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) } else { - BufferTag newTag; /* identity of requested block */ - uint32 newHash; /* hash value for newTag */ - LWLock *newPartitionLock; /* buffer partition lock for it */ - int buf_id; + BufferLookupEnt ent; /* identity of requested block */ /* create a tag so we can lookup the buffer */ - INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode.node, + INIT_BUFFERTAG(ent.key, reln->rd_smgr->smgr_rnode.node, forkNum, blockNum); - /* determine its hash code and partition lock ID */ - newHash = BufTableHashCode(&newTag); - newPartitionLock = BufMappingPartitionLock(newHash); - - /* see if the block is in the buffer pool already */ - LWLockAcquire(newPartitionLock, LW_SHARED); - buf_id = BufTableLookup(&newTag, newHash); - LWLockRelease(newPartitionLock); - /* If not in buffers, initiate prefetch */ - if (buf_id < 0) + if (!CHashSearch(SharedBufHash, &ent)) smgrprefetch(reln->rd_smgr, forkNum, blockNum); /* @@ -870,43 +903,39 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, BufferAccessStrategy strategy, bool *foundPtr) { - BufferTag newTag; /* identity of requested block */ - uint32 newHash; /* hash value for newTag */ - LWLock *newPartitionLock; /* buffer partition lock for it */ - BufferTag oldTag; /* previous identity of selected buffer */ - uint32 oldHash; /* hash value for oldTag */ - LWLock *oldPartitionLock; /* buffer partition lock for it */ + BufferLookupEnt newEnt; /* identity of requested block */ + BufferLookupEnt oldEnt; /* previous identity of selected buffer */ BufFlags oldFlags; - int buf_id; volatile BufferDesc *buf; bool valid; /* create a tag so we can lookup the buffer */ - INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum); - - /* determine its hash code and partition lock ID */ - newHash = BufTableHashCode(&newTag); - newPartitionLock = BufMappingPartitionLock(newHash); + INIT_BUFFERTAG(newEnt.key, smgr->smgr_rnode.node, forkNum, blockNum); /* see if the block is in the buffer pool already */ - LWLockAcquire(newPartitionLock, LW_SHARED); - buf_id = BufTableLookup(&newTag, newHash); - if (buf_id >= 0) +start: + if (CHashSearch(SharedBufHash, &newEnt)) { + BufferDesc *foundbuf; + /* * Found it. 
Now, pin the buffer so no one can steal it from the - * buffer pool, and check to see if the correct data has been loaded - * into the buffer. + * buffer pool. */ - buf = &BufferDescriptors[buf_id]; + foundbuf = &BufferDescriptors[newEnt.id]; - valid = PinBuffer(buf, strategy); + valid = PinBuffer(foundbuf, strategy); - /* Can release the mapping lock as soon as we've pinned it */ - LWLockRelease(newPartitionLock); + /* Check whether someone recycled the buffer before we pinned it. */ + if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag)) + { + UnpinBuffer(foundbuf, true); + goto start; + } *foundPtr = TRUE; + /* Check to see if the correct data has been loaded into the buffer. */ if (!valid) { /* @@ -916,7 +945,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * own read attempt if the page is still not BM_VALID. * StartBufferIO does it all. */ - if (StartBufferIO(buf, true)) + if (StartBufferIO(foundbuf, true)) { /* * If we get here, previous attempts to read the buffer must @@ -926,15 +955,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } - return buf; + return foundbuf; } - /* - * Didn't find it in the buffer pool. We'll have to initialize a new - * buffer. Remember to unlock the mapping lock while doing the work. - */ - LWLockRelease(newPartitionLock); - /* Loop here in case we have to try another victim buffer */ for (;;) { @@ -1041,42 +1064,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, */ if (oldFlags & BM_TAG_VALID) { - /* - * Need to compute the old tag's hashcode and partition lock ID. - * XXX is it worth storing the hashcode in BufferDesc so we need - * not recompute it here? Probably not. - */ - oldTag = buf->tag; - oldHash = BufTableHashCode(&oldTag); - oldPartitionLock = BufMappingPartitionLock(oldHash); - - /* - * Must lock the lower-numbered partition first to avoid - * deadlocks. - */ - if (oldPartitionLock < newPartitionLock) - { - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - } - else if (oldPartitionLock > newPartitionLock) - { - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - } - else - { - /* only one partition, only one lock */ - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - } - } - else - { - /* if it wasn't valid, we need only the new partition */ - LWLockAcquire(newPartitionLock, LW_EXCLUSIVE); - /* these just keep the compiler quiet about uninit variables */ - oldHash = 0; - oldPartitionLock = 0; + /* Save old tag. */ + oldEnt.key = buf->tag; } /* @@ -1086,32 +1075,33 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * Note that we have not yet removed the hashtable entry for the old * tag. */ - buf_id = BufTableInsert(&newTag, newHash, buf->buf_id); - - if (buf_id >= 0) +enter: + newEnt.id = buf->buf_id; + if (!CHashInsert(SharedBufHash, &newEnt)) { + BufferDesc *foundbuf; + + /* + * We've got a collision, apparently: it looks like someone else + * did what we were about to do. We can handle this as if we had + * found the buffer in the pool in the first place, but we must + * recheck the buffer tag after pinning it, because it could still + * get renamed under us. + */ + foundbuf = &BufferDescriptors[newEnt.id]; + valid = PinBuffer(foundbuf, strategy); + if (!BUFFERTAGS_EQUAL(newEnt.key, foundbuf->tag)) + { + UnpinBuffer(foundbuf, true); + goto enter; + } + /* - * Got a collision. Someone has already done what we were about to - * do. 
We'll just handle this as if it were found in the buffer - * pool in the first place. First, give up the buffer we were - * planning to use. + * Collision confirmed. Give up the buffer we were planning to + * use. */ UnpinBuffer(buf, true); - /* Can give up that buffer's mapping partition lock now */ - if ((oldFlags & BM_TAG_VALID) && - oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - - /* remaining code should match code at top of routine */ - - buf = &BufferDescriptors[buf_id]; - - valid = PinBuffer(buf, strategy); - - /* Can release the mapping lock as soon as we've pinned it */ - LWLockRelease(newPartitionLock); - *foundPtr = TRUE; if (!valid) @@ -1123,7 +1113,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * then set up our own read attempt if the page is still not * BM_VALID. StartBufferIO does it all. */ - if (StartBufferIO(buf, true)) + if (StartBufferIO(foundbuf, true)) { /* * If we get here, previous attempts to read the buffer @@ -1133,7 +1123,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, } } - return buf; + return foundbuf; } /* @@ -1152,11 +1142,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, break; UnlockBufHdr(buf); - BufTableDelete(&newTag, newHash); - if ((oldFlags & BM_TAG_VALID) && - oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - LWLockRelease(newPartitionLock); + if (!CHashDelete(SharedBufHash, &newEnt.key)) + elog(ERROR, "shared buffer hash table corrupted"); UnpinBuffer(buf, true); } @@ -1168,7 +1155,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, * the old content is no longer relevant. (The usage_count starts out at * 1 so that the buffer can survive one clock-sweep pass.) */ - buf->tag = newTag; + buf->tag = newEnt.key; buf->flags &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED | BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT); if (relpersistence == RELPERSISTENCE_PERMANENT) buf->flags |= BM_TAG_VALID | BM_PERMANENT; @@ -1178,14 +1165,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, UnlockBufHdr(buf); - if (oldFlags & BM_TAG_VALID) - { - BufTableDelete(&oldTag, oldHash); - if (oldPartitionLock != newPartitionLock) - LWLockRelease(oldPartitionLock); - } - - LWLockRelease(newPartitionLock); + if ((oldFlags & BM_TAG_VALID) != 0 && + !CHashDelete(SharedBufHash, &oldEnt)) + elog(ERROR, "shared buffer hash table corrupted"); /* * Buffer contents are currently invalid. Try to get the io_in_progress @@ -1220,42 +1202,11 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, static void InvalidateBuffer(volatile BufferDesc *buf) { - BufferTag oldTag; - uint32 oldHash; /* hash value for oldTag */ - LWLock *oldPartitionLock; /* buffer partition lock for it */ + BufferLookupEnt oldEnt; BufFlags oldFlags; /* Save the original buffer tag before dropping the spinlock */ - oldTag = buf->tag; - - UnlockBufHdr(buf); - - /* - * Need to compute the old tag's hashcode and partition lock ID. XXX is it - * worth storing the hashcode in BufferDesc so we need not recompute it - * here? Probably not. - */ - oldHash = BufTableHashCode(&oldTag); - oldPartitionLock = BufMappingPartitionLock(oldHash); - -retry: - - /* - * Acquire exclusive mapping lock in preparation for changing the buffer's - * association. 
- */ - LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE); - - /* Re-lock the buffer header */ - LockBufHdr(buf); - - /* If it's changed while we were waiting for lock, do nothing */ - if (!BUFFERTAGS_EQUAL(buf->tag, oldTag)) - { - UnlockBufHdr(buf); - LWLockRelease(oldPartitionLock); - return; - } + oldEnt.key = buf->tag; /* * We assume the only reason for it to be pinned is that someone else is @@ -1266,15 +1217,21 @@ retry: * yet done StartBufferIO, WaitIO will fall through and we'll effectively * be busy-looping here.) */ - if (buf->refcount != 0) + while (buf->refcount != 0) { UnlockBufHdr(buf); - LWLockRelease(oldPartitionLock); /* safety check: should definitely not be our *own* pin */ if (GetPrivateRefCount(buf->buf_id) > 0) elog(ERROR, "buffer is pinned in InvalidateBuffer"); WaitIO(buf); - goto retry; + LockBufHdr(buf); + + /* If it's changed while we were waiting for lock, do nothing */ + if (!BUFFERTAGS_EQUAL(buf->tag, oldEnt.key)) + { + UnlockBufHdr(buf); + return; + } } /* @@ -1291,13 +1248,9 @@ retry: /* * Remove the buffer from the lookup hashtable, if it was in there. */ - if (oldFlags & BM_TAG_VALID) - BufTableDelete(&oldTag, oldHash); - - /* - * Done with mapping lock. - */ - LWLockRelease(oldPartitionLock); + if ((oldFlags & BM_TAG_VALID) != 0 && + !CHashDelete(SharedBufHash, &oldEnt)) + elog(ERROR, "shared buffer hash table corrupted"); /* * Insert the buffer at the head of the list of free buffers. diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c index 3add619..2410dfc 100644 --- a/src/backend/storage/buffer/freelist.c +++ b/src/backend/storage/buffer/freelist.c @@ -432,7 +432,7 @@ StrategyShmemSize(void) Size size = 0; /* size of lookup hash table ... see comment in StrategyInitialize */ - size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); + size = add_size(size, BufMgrShmemSize(NBuffers + NUM_BUFFER_PARTITIONS)); /* size of the shared replacement strategy control block */ size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl))); @@ -462,7 +462,7 @@ StrategyInitialize(bool init) * happening in each partition concurrently, so we could need as many as * NBuffers + NUM_BUFFER_PARTITIONS entries. */ - InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS); + BufMgrInitShmem(NBuffers + NUM_BUFFER_PARTITIONS); /* * Get or create the shared strategy control block diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c index 250e312..5f94f57 100644 --- a/src/backend/storage/ipc/shmem.c +++ b/src/backend/storage/ipc/shmem.c @@ -423,6 +423,29 @@ ShmemInitStruct(const char *name, Size size, bool *foundPtr) return structPtr; } +/* + * ShmemAttachStruct -- Attach to an existing structure in shared memory. + */ +void * +ShmemAttachStruct(const char *name) +{ + ShmemIndexEnt *result; + void *ptr; + bool found; + + LWLockAcquire(ShmemIndexLock, LW_SHARED); + + result = (ShmemIndexEnt *) + hash_search(ShmemIndex, name, HASH_FIND, &found); + if (!found || result == NULL) + elog(ERROR, "shared memory structure %s not found", name); + ptr = result->location; + Assert(ptr != NULL); + + LWLockRelease(ShmemIndexLock); + + return ptr; +} /* * Add two Size values, checking for overflow diff --git a/src/backend/utils/hash/Makefile b/src/backend/utils/hash/Makefile index 05d347c..5d53382 100644 --- a/src/backend/utils/hash/Makefile +++ b/src/backend/utils/hash/Makefile @@ -12,6 +12,6 @@ subdir = src/backend/utils/hash top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global -OBJS = dynahash.o hashfn.o +OBJS = chash.o dynahash.o hashfn.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/utils/hash/chash.c b/src/backend/utils/hash/chash.c new file mode 100644 index 0000000..0d4dc78 --- /dev/null +++ b/src/backend/utils/hash/chash.c @@ -0,0 +1,1075 @@ +/*------------------------------------------------------------------------- + * + * chash.c + * concurrent hash tables + * + * A concurrent hash table stores a collection of fixed-size objects. + * From the point of view of this module, such objects are merely an + * opaque array of bytes, but the caller will typically implement them as + * a C "struct". Some fixed-size, leading portion of each object is + * designated as the key, which must be distinct for all objects in the + * collection. Since PostgreSQL's shared memory model does not permit + * dynamic shared-memory allocation, we preallocate shared-memory space + * for the maximum number of entities which can be stored (plus a few + * extra, for reasons that will be further explained below). This space + * is allocated as a single large array called the arena, and we often + * refer to entities by their position in the arena rather than via an + * ordinary pointer. This saves a considerable amount of memory, since + * most modern architectures are 64-bit and therefore use 8-byte pointers, + * while arena offsets can be stored in a 32-bit word. In fact, we + * reserve one bit in each such word as a mark bit, so the maximum size + * of the arena is 2^31 elements, a restriction that does not currently + * appear to be problematic. An additional advantage of this representation + * is that aligned 32-bit loads and stores are atomic on all architectures + * we support, but 64-bit loads and stores are not. + * + * When an element is inserted, we copy the data from the backend-private + * object supplied by the caller into one of these shared-memory entities. + * When the hash table is searched, the caller passes a backend-private + * entity with just the key filled in; if a matching element is found, + * data is copied from the shared memory entity into the non-key portion + * of the user-supplied entity. In this way, clients of this module + * never use pointers into shared memory directly. + * + * As normal, we structure the hash table as an array of buckets, whose + * size is always a power of two, so that the low-order bits of the + * hash code can be used to select a bucket. If multiple entities hash + * to the same bucket, we use separate chaining: each entity in the + * arena has an 8-byte header that stores the 4-byte arena offset of the + * next item in the bucket and the hash value of the entity's key. + * Bucket chains are maintained in order by ascending hash value and + * then by ascending entity key (as per memcmp) so that there is + * precisely one legal location at which a given new item can be inserted + * into a bucket. + * + * Items are inserted into buckets using compare-and-swap (CAS). Thus, this + * module cannot be used on architectures where we do not have 4-byte + * compare-and-swap. When an item is deleted, we first set its mark bit, + * which is stored within the next-pointer, again using CAS. Once this + * step is completed, the node is deleted. As a cleanup operation, we then + * use CAS to modify the next-pointer of the previous node to point to the + * node following the deleted node.
Note that, even once this cleanup + * operation has been performed, some other backend concurrently walking the + * chain might still be visiting the deleted node. Thus, we must be certain + * not to overwrite the deleted node's next-pointer until all concurrent + * bucket scans have completed. This means, in particular, that we cannot + * immediately view the deleted node as available for reuse. + * + * Instead, when a delete-marked node is removed from the bucket chain, it + * is added to one of many garbage lists. There is a many-to-one mapping from + * buckets to garbage lists, so that the choice of bucket determines the + * garbage list but not vice versa. Any process which wishes to scan a bucket + * must first advertise in shared memory the corresponding garbage list number. + * When a backend wishes to move entries from a garbage list to a free list, + * it must first wait (by spinning) for any backends scanning that garbage + * list to complete their scans. + * + * Ideally, it would be nice to make this completely lock-free, but because + * of the above-described choice of garbage collection algorithm, it currently + * isn't. For an algorithm to be lock-free, it must be possible to suspend + * the execution of any number of processes for an arbitrary period of time + * without impeding the overall progress of the system. For this code, that + * is true except when garbage collection occurs. In that case, an insert, + * search, or delete operation can obstruct an inserting thread attempting to + * perform garbage collection for an unbounded period of time. The algorithm + * could be adapted so as to be completely lock-free, essentially by guaranteeing + * that even in the worst case no combination of processes can obstruct garbage + * collection to a sufficient degree as to prevent an inserter from finding + * an available entry in a hash table containing fewer live elements than its + * declared maximum capacity. However, it's not clear that this is a good + * idea, because it would likely slow down operation in the non-contended + * case to solve a problem that we hope won't happen anyway. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/utils/hash/chash.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "access/hash.h" +#include "storage/barrier.h" +#include "storage/proc.h" +#include "storage/shmem.h" +#include "utils/chash.h" +#include "utils/memutils.h" + +/* + * CHashPtr represents an offset into the arena, plus a mark bit that is + * used to implement concurrent deletion. + */ +typedef uint32 CHashPtr; +#define InvalidCHashPtr ((uint32) -2) +#define CHashPtrIsInvalid(x) ((x) >= InvalidCHashPtr) +#define CHashPtrIsMarked(x) ((x) & 1) +#define CHashPtrGetOffset(x) ((x) >> 1) +#define CHashPtrMark(x) ((x) | 1) +#define CHashPtrUnmark(x) ((x) & ~1) +#define MakeCHashPtr(x) ((x) << 1) +#define CHashMaxCapacity CHashPtrGetOffset(InvalidCHashPtr) + +/* + * Each object stored in the hash table is represented by a CHashNode, which + * stores a pointer to the next item in the same bucket, and the exact hash + * value of the current item. Each CHashNode is followed by space for the + * item itself.
+ */ +typedef struct +{ + CHashPtr next; /* arena offset of next element */ + union + { + uint32 hashcode; /* hash(key) */ + CHashPtr gcnext; /* arena offset of next garbage item */ + } un; +} CHashNode; + +#define SizeOfCHashNode MAXALIGN(sizeof(CHashNode)) +#define CHashNodeGetItem(x) (((char *) x) + SizeOfCHashNode) + +/* + * CHashTableData stores all the information that we need in order to access + * a concurrent hash table. We store one copy of this data in shared memory, + * and an additional copy in the private memory of each backend accessing the + * table. + */ +typedef struct CHashTableData +{ + /* + * These fields do not change after initialization. + */ + CHashDescriptor desc; /* descriptor for this hash table */ + uint32 bucket_mask; /* # of buckets, minus one */ + uint8 garbage_shift; /* log2(# of buckets/# of garbage lists) */ + uint8 freelist_shift; /* log2(# of garbage lists/# freelists) */ + uint16 nfreelists; /* # of freelists */ + uint32 arena_limit; /* # of arena elements */ + uint32 arena_stride; /* bytes allocated per arena element */ + CHashPtr *bucket; /* array with 1 entry per bucket */ + CHashPtr *extra; /* entries for garbage and free lists */ + char *arena; /* arena */ + + /* + * These fields will be different in each backend; the shared copy is + * irrelevant. + */ + int gc_pid; /* PID that set gc_next */ + uint32 gc_next; /* next garbage list to reclaim */ + uint64 stats[CHS_NumberOfStatistics]; /* statistics */ +} CHashTableData; + +/* Compute # of buckets, garbage lists, or free lists. */ +#define CHashTableNBuckets(table) \ + ((table)->bucket_mask + 1) +#define CHashTableNGarbage(table) \ + (CHashTableNBuckets((table)) >> (table)->garbage_shift) +#define CHashTableNFreeLists(table) \ + ((table)->nfreelists) + +/* + * Garbage lists and free lists are interleaved to reduce cache line + * contention on the free lists, so the calculation of where an individual + * list is located is a bit complex. + */ +#define CHashTableGetGarbageList(table, n) \ + (&(table)->extra[(n) + ((n) >> (table)->freelist_shift)]) +#define CHashTableGetGarbageByBucket(table, n) \ + (CHashTableGetGarbageList((table), (n) >> (table)->garbage_shift)) +#define CHashTableGetFreeList(table, n) \ + (&(table)->extra[(n) + (((n) + 1) << (table)->freelist_shift)]) + +/* Access macros for arena nodes. */ +#define CHashTableGetRaw(table, offset) \ + (AssertMacro((offset) < (table)->arena_limit), \ + (CHashNode *) ((table)->arena + (table)->arena_stride * (offset))) +#define CHashTableGetNode(table, ptr) \ + (AssertMacro(!CHashPtrIsInvalid(ptr)), \ + CHashTableGetRaw((table), CHashPtrGetOffset((ptr)))) + +/* Statistics macros. */ +#define CHashTableIncrementStatistic(table, stat) \ + ((table)->stats[(stat)]++) + +/* + * Bucket scan. + */ +typedef struct +{ + CHashPtr target; + CHashPtr next; + CHashPtr *pointer_to_target; + CHashNode *target_node; + bool found; +} CHashScanResult; + +/* Human-readable statistics names. */ +char *CHashStatisticsNames[] = { + "searches", + "searches failed", + "inserts", + "inserts failed", + "inserts retried", + "deletions", + "deletions failed", + "deletions retried", + "scan expunges", + "scan expunges failed", + "scans restarted", + "cleanup scans", + "allocations failed", + "garbage enqueues retried", + "garbage dequeues failed", + "garbage collections", + "garbage collection spins", + "garbage collection reclaims skipped", + "garbage collection fast reclaims", + "garbage collection reclaims retried", + "<end>" +}; + +/* Function prototypes. 
*/ +static CHashPtr CHashAllocate(CHashTable table); +static CHashPtr CHashAllocateViaGC(CHashTable table); +static void CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c); +static void CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res); + +/* + * First stage of CHashTable initialization. We fill in all the constants + * here, but not the pointers. + */ +CHashTable +CHashBootstrap(CHashDescriptor *desc) +{ + CHashTable table; + uint32 bucket_shift; + + /* Sanity check. */ + Assert(!strcmp(CHashStatisticsNames[CHS_NumberOfStatistics], "<end>")); + + /* Allocate table and copy descriptor. */ + table = MemoryContextAllocZero(TopMemoryContext, sizeof(CHashTableData)); + memcpy(&table->desc, desc, sizeof(CHashDescriptor)); + + /* Sanity checks. */ + if (desc->capacity < 1 || desc->capacity > CHashMaxCapacity) + elog(ERROR, "invalid capacity for concurrent hash"); + if (desc->key_size < 1 || desc->key_size > desc->element_size) + elog(ERROR, "invalid key size for concurrent hash"); + + /* + * The number of buckets must be a power of two. To avoid (as much as + * possible) having to traverse long bucket chains, we aim for a load + * factor <= 1.0, so this is a pretty simple calculation: we just find the + * smallest power of two greater than or equal to the target capacity. + */ + bucket_shift = fls(desc->capacity) - 1; + table->bucket_mask = (1 << bucket_shift) - 1; + + /* + * We choose to have one garbage list for every 64 hash table buckets + * (that is, garbage_shift = 6) unless there are fewer than 64 buckets in + * total, in which case we still have a minimum of one garbage list. + * + * Increasing the garbage_shift would reduce the likelihood of a backend + * performing garbage collection needing to wait for a backend walking a + * bucket to finish its scan. On the other hand, decreasing the garbage + * shift would allow more items to be recovered in a single garbage + * collection cycle. It's not clear what the optimal value is. + */ + table->garbage_shift = Min(bucket_shift, 6); + table->gc_next = 0; + table->gc_pid = 0; + + /* + * Experimentation reveals that the free list manipulation is both one of + * the slowest parts of this algorithm and the most vulnerable to + * contention. Therefore, we want to have as many free lists as possible, + * but there's no need to have more than the number of CPU cores, so we + * limit the number of freelists to 64. There might be a benefit in some + * larger limit on a really big system, but we'll cap it here pending some + * actual test results. We're also limited to having no more freelists + * than we do garbage lists. + */ +#define LOG2_MAX_FREELIST 6 + if (bucket_shift - table->garbage_shift < LOG2_MAX_FREELIST) + table->freelist_shift = 0; + else + table->freelist_shift = + (bucket_shift - table->garbage_shift) - LOG2_MAX_FREELIST; + table->nfreelists = + 1 << (bucket_shift - (table->garbage_shift + table->freelist_shift)); + + /* + * To make garbage collection efficient, we overallocate. Normally, we + * overallocate by one-eighth, but if that would be less than 15 elements, + * then we allocate 15 elements instead. This extra capacity can actually + * be used, but for best performance, it shouldn't be. It's the caller's + * responsibility to avoid this. 
+ */ + table->arena_limit = desc->capacity; + if (desc->capacity < 120) + table->arena_limit += 15; + else + table->arena_limit += table->arena_limit / 8; + + /* Each arena element must be MAXALIGN'd and include per-node space. */ + table->arena_stride = SizeOfCHashNode + MAXALIGN(desc->element_size); + + /* Zero out statistics. */ + memset(table->stats, 0, sizeof(uint64) * CHS_NumberOfStatistics); + + return table; +} + +/* + * Estimate shared memory requirements. + */ +Size +CHashEstimateSize(CHashTable table) +{ + Size total_buckets; + Size size; + Size nbuckets = CHashTableNBuckets(table); + Size ngarbage = CHashTableNGarbage(table); + Size nfreelists = CHashTableNFreeLists(table); + + Assert(nbuckets > 0 && ngarbage > 0 && nfreelists > 0); + total_buckets = add_size(nbuckets, ngarbage); + total_buckets = add_size(total_buckets, nfreelists); + + size = MAXALIGN(sizeof(CHashTableData)); + size = add_size(size, mul_size(sizeof(CHashPtr), total_buckets)); + size = add_size(size, mul_size(table->arena_stride, table->arena_limit)); + + return size; +} + +/* + * Create a concurrent hash table in shared memory, or attach to an existing + * table. + */ +CHashTable +CHashInitialize(CHashTable table, CHashDescriptor *desc) +{ + Size size; + bool found; + void *shmem; + uint32 i; + uint32 nbuckets; + uint32 nfreelists; + uint32 ngarbage; + uint32 nextra; + + /* + * If we're under the postmaster, this must be the EXEC_BACKEND case where + * we need to attach to an existing shared-memory segment. + */ + if (IsUnderPostmaster) + { + void *shmem; + + Assert(table == NULL); + table = MemoryContextAlloc(TopMemoryContext, sizeof(CHashTableData)); + shmem = ShmemAttachStruct(desc->shmem_name); + memcpy(table, shmem, sizeof(CHashTableData)); + return table; + } + + /* + * Otherwise, the hash table should not already exist, and we must + * create it. But the table should already be bootstrapped, since we + * must previously have computed its size when figuring out our shared + * memory allocation. + */ + Assert(table != NULL); + size = CHashEstimateSize(table); + shmem = ShmemInitStruct(table->desc.shmem_name, size, &found); + Assert(!found); + + /* Bucket, garbage, and freelist arrays follow table info. */ + table->bucket = (CHashPtr *) + (((char *) shmem) + MAXALIGN(sizeof(CHashTableData))); + nbuckets = CHashTableNBuckets(table); + table->extra = &table->bucket[nbuckets]; + + /* Arena follows the various lists. */ + ngarbage = CHashTableNGarbage(table); + nfreelists = CHashTableNFreeLists(table); + nextra = ngarbage + nfreelists; + table->arena = (void *) (&table->extra[nextra]); + + /* Initialize all three sets of lists to empty. */ + for (i = 0; i < nbuckets; ++i) + table->bucket[i] = InvalidCHashPtr; + for (i = 0; i < nextra; ++i) + table->extra[i] = InvalidCHashPtr; + + /* Put all arena elements on the free lists. */ + for (i = 0; i < table->arena_limit; ++i) + { + CHashPtr *f = CHashTableGetFreeList(table, i % nfreelists); + CHashNode *n = CHashTableGetRaw(table, i); + + n->un.gcnext = *f; + *f = MakeCHashPtr(i); + } + + /* + * Copy table (with pointers now filled in) to shared memory. This is + * arguably unnecessary when not using EXEC_BACKEND, but we do it anyway. + */ + memcpy(shmem, table, sizeof(CHashTableData)); + + return table; +} + +/* + * Search a concurrent hash table. entry should be a block of memory large + * enough to hold a complete entry, with just the key portion filled in. 
If + * a matching entry is found, this function will fill in the rest of the entry + * from the data in the hash table and return true. If not, it will return + * false. + */ +bool +CHashSearch(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashScanResult scan; + + /* Prevent garbage collection for this bucket. */ + Assert(MyProc->hazard[0] == NULL); + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* Scan bucket and return data from any matching entry. */ + CHashBucketScan(table, b, hashcode, entry, &scan); + if (scan.found) + memcpy(((char *) entry) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, + table->desc.element_size - table->desc.key_size); + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + CHashTableIncrementStatistic(table, CHS_Search); + if (!scan.found) + CHashTableIncrementStatistic(table, CHS_Search_Failed); + return scan.found; +} + +/* + * Insert into a concurrent hash table. entry should be the complete entry + * to be inserted. If no duplicate entry is found in the table, this function + * will insert the entry and return true. Otherwise, the duplicate entry's + * value will be copied into the supplied entry and the function will return + * false. + * + * The caller is responsible for ensuring that no inserts are performed into + * a hash table which is at capacity. The behavior of such an insert is + * undefined (the actual behavior is that the insert may either succeed, + * degrading performance; or CHashAllocate may enter a tight loop until such + * time as an element is deleted). + */ +bool +CHashInsert(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashPtr new; + CHashNode *nnew; + CHashScanResult scan; + + /* + * Allocate and initialize a new entry, on the assumption that the insert + * will succeed. If it ends up failing, we must be sure to put this back + * on some free list, lest it be permanently leaked. + */ + new = CHashAllocate(table); + nnew = CHashTableGetNode(table, new); + nnew->un.hashcode = hashcode; + memcpy(CHashNodeGetItem(nnew), entry, table->desc.element_size); + + /* Prevent garbage collection for this bucket. */ + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* + * Scan the bucket. If we don't find a match, use compare-and-swap to + * insert the new node at the insert position. If we do find a match, + * return the data to the caller. + */ +retry: + CHashBucketScan(table, b, hashcode, entry, &scan); + if (scan.found) + memcpy(((char *) entry) + table->desc.key_size, + CHashNodeGetItem(scan.target_node) + table->desc.key_size, + table->desc.element_size - table->desc.key_size); + else + { + /* + * We didn't find a matching element, so use compare-and-swap to + * attempt to insert the new element we've prepared. If this fails, + * someone concurrently inserted or deleted an element. The correct + * insertion point might have changed, or the key we're trying to + * insert might now be present when it wasn't before, so we'll have + * to search the bucket chain anew. + * + * There is a risk of starvation here, but we hope it will not happen + * in practice.
Contention for inserting new elements should be + * spread out pretty much evenly across N+M possible insertion points, + * where N is the number of buckets and M is the number of elements + * in the table. Even for a quite modestly size table this is likely + * to exceed the number of CPU cores. + */ + Assert(!CHashPtrIsMarked(scan.target)); + nnew->next = scan.target; + if (!__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, new)) + { + CHashTableIncrementStatistic(table, CHS_Insert_Retry); + goto retry; + } + } + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + /* + * If the insert failed, add the entry we found to the appropriate + * garbage list. We can't simply put this back on the freelist, + * because that leads to an ABA problem: some other backend could + * read the head of the freelist, go into the tank, and then use + * CAS to pop an element. If at that point, it finds the same + * element at the head of the freelist but with a different successor, + * we're hosed. To prevent that, we ensure that elements are added + * to a given freelist only after verifying that any allocations in + * progress at the time we popped the freelist has completed. This + * guarantees that any allocation still in progress at the time this + * element makes it back to the freelist is trying to allocate some + * other node. + */ + CHashTableIncrementStatistic(table, CHS_Insert); + if (scan.found) + { + CHashTableIncrementStatistic(table, CHS_Insert_Failed); + CHashAddToGarbage(table, bucket, new); + } + + /* The insert succeeded if and only if no duplicate was found. */ + return !scan.found; +} + +/* + * Delete from a concurrent hash table. entry need only contain the key field. + * Returns true if we find and delete a matching key and false otherwise. + */ +bool +CHashDelete(CHashTable table, void *entry) +{ + uint32 hashcode = hash_any(entry, table->desc.key_size); + uint32 bucket = hashcode & table->bucket_mask; + CHashPtr *b = &table->bucket[bucket]; + CHashScanResult scan; + + /* Prevent garbage collection for this bucket. */ + Assert(MyProc->hazard[0] == NULL); + MyProc->hazard[0] = CHashTableGetGarbageByBucket(table, bucket); + pg_memory_barrier(); + + /* Scan bucket. */ +retry: + CHashBucketScan(table, b, hashcode, entry, &scan); + + /* If we found it, try to delete it. */ + if (scan.found) + { + Assert(!CHashPtrIsMarked(scan.next)); + + /* Attempt to apply delete-mark. */ + if (!__sync_bool_compare_and_swap(&scan.target_node->next, + scan.next, + CHashPtrMark(scan.next))) + { + CHashTableIncrementStatistic(table, CHS_Delete_Retry); + goto retry; + } + + /* Deletion is done; attempt to remove node from list. */ + if (__sync_bool_compare_and_swap(scan.pointer_to_target, + scan.target, + scan.next)) + CHashAddToGarbage(table, bucket, scan.target); + else + { + CHashScanResult cleanup_scan; + + /* + * If we weren't able to remove the deleted item, rescan + * the bucket to make sure it's really gone. This is just + * like a regular bucket scan, except that we don't care + * about the results. We're just doing it to achieve the + * side-effect of removing delete-marked nodes from the + * bucket chain. + */ + CHashTableIncrementStatistic(table, CHS_Cleanup_Scan); + CHashBucketScan(table, b, hashcode, entry, &cleanup_scan); + } + } + + /* Allow garbage collection for this bucket. */ + Assert(MyProc->hazard[0] != NULL); + pg_memory_barrier(); + MyProc->hazard[0] = NULL; + + /* We're done. 
*/ + CHashTableIncrementStatistic(table, CHS_Delete); + if (!scan.found) + CHashTableIncrementStatistic(table, CHS_Delete_Failed); + return scan.found; +} + +/* + * Provide backend-local statistics to caller. + */ +void +CHashStatistics(CHashTable table, uint64 *stats) +{ + memcpy(stats, &table->stats, sizeof(uint64) * CHS_NumberOfStatistics); +} + +/* + * Scan one bucket of a concurrent hash table, storing the results in a + * CHashResult object provided by the caller. + * + * Caller must suppress garbage collection for the target bucket before + * calling this function, and resume it afterwards. + * + * On return, res->found will be true if a matching item was found and false + * otherwise. res->target will be a CHashPtr referencing the matching item, + * or the first one following the position where the matching item should have + * been; res->pointer_to_target will be a pointer to the memory address from + * which res->target was read. + * + * If res->target is not invalid, then res->target_node is a pointer to its + * location in memory. If res->found, then res->next will be the next pointer + * of res->target_node; otherwise, it's undefined. + */ +static void +CHashBucketScan(CHashTable table, + CHashPtr *start, + uint32 hashcode, + const void *key, + CHashScanResult *res) +{ + CHashPtr target; + CHashPtr *pointer_to_target; + CHashNode *target_node = NULL; + +retry: + pointer_to_target = start; + target = *pointer_to_target; + for (;;) + { + CHashPtr next; + uint32 h; + int cmp; + + /* + * If we've reached the end of the bucket chain, stop; otherwise, + * figure out the actual address of the next item. + */ + if (CHashPtrIsInvalid(target)) + { + res->found = false; + break; + } + target_node = CHashTableGetNode(table, target); + + /* + * target may have been fetched from an arena entry that could be + * concurrently modified, so a dependency barrier is required before + * dereferencing the derived pointer. + */ + pg_read_barrier_depends(); + next = target_node->next; + + /* + * For simplicity, any bucket scan, even if it's servicing a search, + * will attempt to remove delete-marked items from the bucket. This + * ensures that delete-marked elements are removed from bucket chains + * as quickly as possible and reduces code duplication. See + * CHashDelete for further comments about why delete-marking is + * necessary and how it allows safe deletion. + */ + if (CHashPtrIsMarked(next)) + { +zap: + if (__sync_bool_compare_and_swap(pointer_to_target, + target, + CHashPtrUnmark(next))) + { + /* + * No one else can possibly have modified target_node->next, + * because such modifications are not allowed after a + * delete-mark has been applied. Thus, if we just keep + * following the next pointers, we're guaranteed to visit + * all non-deleted items (and possibly some deleted items) + * that were present at the time we began the scan. + */ + CHashTableIncrementStatistic(table, CHS_Scan_Expunge); + CHashAddToGarbage(table, hashcode & table->bucket_mask, + target); + target = CHashPtrUnmark(next); + } + else + { + /* + * If the previous node has been delete-marked, we can't + * further update the next-pointer, so restart the scan. + * Otherwise, we know that some other backend removed one + * or more deleted nodes from the bucket chain just as we + * were trying to do, and we can simply continue the scan + * from wherever the previous node is pointing now. 
It's + * possible that some concurrent inserts have happened, too, + * but that's OK; we can view those as having happened "before" + * whatever operation this scan is supporting. + * + * Although starvation is a theoretical possibility here if + * we are forced to retry repeatedly, even a single retry is + * vanishingly unlikely in practice. It requires some other + * backend to delete both the node that we're looking at and + * the node which precedes it before we advance to the next + * node. That could certainly happen occasionally, but we'd + * have to be pretty unlucky to have it happen even twice in + * a row. + */ + CHashTableIncrementStatistic(table, CHS_Scan_Expunge_Fail); + target = *pointer_to_target; + if (CHashPtrIsMarked(target)) + { + CHashTableIncrementStatistic(table, CHS_Scan_Restart); + goto retry; + } + } + continue; + } + + /* + * Bucket chains are kept in order, so that there is exactly one legal + * point at which any given key can be inserted. The ordering is by + * hashcode first, and then by memcmp ordering of the keys involved. + */ + h = target_node->un.hashcode; + if (h == hashcode) + cmp = memcmp(CHashNodeGetItem(target_node), key, + table->desc.key_size); + else if (h > hashcode) + cmp = 1; + else + cmp = -1; + + /* + * If cmp < 0, then we haven't yet reached the point at which we'd + * expect to find the key, so we must continue the scan. If cmp == 0, + * we've found the key and can stop. If cmp > 0, we've either passed + * the point where we expect to find the key OR someone delete-marked + * the item and overwrote the hashcode with a gcnext pointer. In the + * latter case we must take care not to be fooled into stopping the + * scan early. + */ + if (cmp >= 0) + { + if (cmp == 0) + { + res->found = true; + res->next = next; + break; + } + else + { + /* + * pg_read_barrier() prevents the reread of the next pointer + * from being speculated ahead of the read of the hash value. + */ + pg_read_barrier(); + next = target_node->next; + if (CHashPtrIsMarked(next)) + goto zap; + res->found = false; + break; + } + } + + /* Continue scan from next node. */ + pointer_to_target = &target_node->next; + target = next; + } + + /* Send results back to caller. */ + res->target = target; + res->pointer_to_target = pointer_to_target; + res->target_node = target_node; +} + +/* + * Allocate an arena slot for a new item to be inserted into a hash table. + * + * We don't want to wait until every single free-list is completely empty + * before beginning to garbage collect, because that could result in very + * fast allocation followed by a storm of garbage collection activity. + * It could also lead to every inserting backend ganging up on the only + * non-empty freelist. + * + * To avoid that, we check free lists and garbage lists in alternation. + * We always start with the same free list - which one is based on our + * backend ID - but we try to round-robin over all the available garbage + * lists. Whenever we successfully garbage collect, we put the recovered + * items on our own free list. In this way, if there's only one backend + * active, it will typically find a free buffer in the first place it looks: + * its own free list. It will also settle into a pattern of garbage + * collecting the garbage list which it has visited least recently, which + * is what we want. + */ +static CHashPtr +CHashAllocate(CHashTable table) +{ + uint32 f_current; + CHashPtr new; + + /* Pick a starting freelist base on our backend ID. 
*/ + f_current = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + + /* If this process hasn't initialized gc_next yet, do that now. */ + if (table->gc_pid != MyProcPid) + { + table->gc_pid = MyProcPid; + table->gc_next = ((uint32) MyProcPid) % CHashTableNGarbage(table); + } + + /* Loop until we allocate a buffer. */ + for (;;) + { + CHashPtr *b; + + /* + * Try to pop a buffer from a freelist using compare-and-swap. + * + * Note that this is only safe if it's impossible for the next pointer + * of the first element on the list to change between the time when + * we read it and the time we use CAS to pop it off the list. This + * means that we can't allow any element that is currently on this + * freelist to be allocated, marked as garbage, garbage collected, + * and returned back to this freelist before we finish the CAS. + * + * If we attempt to pop the free-list and fail, we retry immediately + * with the same free-list. This reduces the frequency with which + * we're obliged to update our hazard pointers, which is a material + * savings due to the associated memory barrier. + */ + b = CHashTableGetFreeList(table, f_current); + MyProc->hazard[0] = b; + pg_memory_barrier(); + new = *b; + while (!CHashPtrIsInvalid(new)) + { + CHashNode *n = CHashTableGetNode(table, new); + + /* + * n is computed from table->freelist[f_current], which could + * be modified by concurrent activity, so we need a dependency + * barrier here. + */ + pg_read_barrier_depends(); + if (__sync_bool_compare_and_swap(b, new, n->un.gcnext)) + return new; + CHashTableIncrementStatistic(table, CHS_Allocate_Fail); + new = *b; + } + + /* Attempt garbage collection. */ + new = CHashAllocateViaGC(table); + if (!CHashPtrIsInvalid(new)) + return new; + + /* Advance to next freelist. */ + f_current = (f_current + 1) % CHashTableNFreeLists(table); + } +} + +/* + * Attempt to satisfy an allocation request via garbage collection. + */ +static CHashPtr +CHashAllocateViaGC(CHashTable table) +{ + uint32 f_home; + CHashPtr *b; + CHashPtr *fh; + CHashPtr fhead; + CHashPtr garbage; + CHashPtr new; + CHashNode *n; + uint32 i; + + /* Pick a target freelist based on our backend ID. */ + f_home = ((uint32) MyBackendId) % CHashTableNFreeLists(table); + fh = CHashTableGetFreeList(table, f_home); + + /* Select target garbage list. */ + table->gc_next = (table->gc_next + 1) % CHashTableNGarbage(table); + b = CHashTableGetGarbageList(table, table->gc_next); + garbage = *b; + + /* If list is empty, fail. */ + if (CHashPtrIsInvalid(garbage)) + return InvalidCHashPtr; + + /* If we're unable to empty the list via compare-and-swap, fail. */ + if (!__sync_bool_compare_and_swap(b, garbage, InvalidCHashPtr)) + { + CHashTableIncrementStatistic(table, CHS_Garbage_Dequeue_Fail); + return InvalidCHashPtr; + } + + /* + * Before removing elements removed from the garbage list to the + * freelist, we must wait until (1) all bucket scans that might + * still see elements on the freelist as part of the bucket chain + * have completed and (2) all allocations that might see an old + * version of the freelist containing one of the elements to be + * garbage collected have completed. + * + * Note: We can't begin this operation until the clearing of the + * garbage list has been committed to memory, but since that was + * done using an atomic operation no explicit barrier is needed + * here. 
+ * + * Note: We could have a "soft" version of this that merely + * requeues the garbage if it's not immediately recycleable, but + * it's not clear that we need such a thing. On the flip side we + * might want to eventually enter a longer sleep here, or PANIC, + * but it's not clear exactly how to calibrate that. + */ + CHashTableIncrementStatistic(table, CHS_GC); + MyProc->hazard[0] = NULL; + for (i = 0; i < ProcGlobal->allProcCount; i++) + { + volatile PGPROC *proc = &ProcGlobal->allProcs[i]; + void *hazard; + + hazard = proc->hazard[0]; + if (hazard == b || hazard == fh) + { + CHashTableIncrementStatistic(table, CHS_GC_Spin); + do + { + hazard = proc->hazard[0]; + } while (hazard == b || hazard == fh); + } + } + + /* Remove one item from list to satisfy current allocation. */ + new = garbage; + n = CHashTableGetNode(table, new); + pg_read_barrier_depends(); + fhead = n->un.gcnext; + + if (CHashPtrIsInvalid(fhead)) + { + /* + * We have reclaimed exactly node, so there's nothing to put + * back on the free list. In this case (only) we need a + * memory barrier, because the reads above must complete + * before we overwrite n->un.gcnext with a new hashcode. + * (This is only needed when we reclaim exactly one node, + * because in any other case we'll do a compare-and-swap + * before returning, which implies a full barrier.) + */ + pg_memory_barrier(); + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Skipped); + } + else if (__sync_bool_compare_and_swap(fh, InvalidCHashPtr, fhead)) + { + /* + * Our free list is empty, and we've succesfully pushed the + * reclaimed nodes onto it. So we're done. + */ + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Fast); + } + else + { + CHashPtr fcurrent; + CHashPtr fnext; + CHashPtr oldhead; + + /* Walk list of reclaimed elements to end. */ + fcurrent = fhead; + for (;;) + { + n = CHashTableGetNode(table, fcurrent); + fnext = n->un.gcnext; + if (CHashPtrIsInvalid(fnext)) + break; + fcurrent = fnext; + } + + /* Push reclaimed elements onto home free list. */ + for (;;) + { + oldhead = *fh; + n->un.gcnext = oldhead; + if (__sync_bool_compare_and_swap(fh, oldhead, fhead)) + break; + CHashTableIncrementStatistic(table, CHS_GC_Reclaim_Retry); + } + } + + /* Return the element we saved for ourselves. */ + return new; +} + +/* + * Add an arena slot to the appropriate garbage list. + * + * The next garbage collection cycle for the affected bucket will move it + * to the free list. We can't do that immediately because there might be + * someone traversing the list who is counting on being able to follow the + * next pointer. It's OK to clobber the hash value, though, since a spurious + * failure to match an already-deleted item shouldn't cause any problems; + * this is why gcnext can share space with the hash value. + */ +static void +CHashAddToGarbage(CHashTable table, uint32 bucket, CHashPtr c) +{ + CHashPtr g; + CHashNode *n; + CHashPtr *garbage; + + n = CHashTableGetNode(table, c); + garbage = CHashTableGetGarbageByBucket(table, bucket); + + while (1) + { + g = *garbage; + n->un.gcnext = g; + if (__sync_bool_compare_and_swap(garbage, g, c)) + break; + CHashTableIncrementStatistic(table, CHS_Garbage_Enqueue_Retry); + } +} diff --git a/src/include/storage/barrier.h b/src/include/storage/barrier.h index cd85633..dbcc0f8 100644 --- a/src/include/storage/barrier.h +++ b/src/include/storage/barrier.h @@ -20,4 +20,12 @@ */ #include "port/atomics.h" +/* + * If dependency barriers are undefined, we define them as a no-op. 
The only + * known platform where anything more is required is DEC Alpha. + */ +#if !defined(pg_read_barrier_depends) +#define pg_read_barrier_depends() +#endif + #endif /* BARRIER_H */ diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 9b8ace5..b58af88 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -96,20 +96,6 @@ typedef struct buftag ) /* - * The shared buffer mapping table is partitioned to reduce contention. - * To determine which partition lock a given tag requires, compute the tag's - * hash code with BufTableHashCode(), then apply BufMappingPartitionLock(). - * NB: NUM_BUFFER_PARTITIONS must be a power of 2! - */ -#define BufTableHashPartition(hashcode) \ - ((hashcode) % NUM_BUFFER_PARTITIONS) -#define BufMappingPartitionLock(hashcode) \ - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \ - BufTableHashPartition(hashcode)].lock) -#define BufMappingPartitionLockByIndex(i) \ - (&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock) - -/* * BufferDesc -- shared descriptor/state data for a single shared buffer. * * Note: buf_hdr_lock must be held to examine or change the tag, flags, @@ -196,14 +182,6 @@ extern void StrategyNotifyBgWriter(int bgwprocno); extern Size StrategyShmemSize(void); extern void StrategyInitialize(bool init); -/* buf_table.c */ -extern Size BufTableShmemSize(int size); -extern void InitBufTable(int size); -extern uint32 BufTableHashCode(BufferTag *tagPtr); -extern int BufTableLookup(BufferTag *tagPtr, uint32 hashcode); -extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); -extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); - /* localbuf.c */ extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum); @@ -215,4 +193,8 @@ extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode); extern void AtEOXact_LocalBuffers(bool isCommit); +/* bufmgr.c */ +extern Size BufMgrShmemSize(int size); +extern void BufMgrInitShmem(int size); + #endif /* BUFMGR_INTERNALS_H */ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index e3c2efc..1b37447 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -144,7 +144,7 @@ extern PGDLLIMPORT LWLockPadded *MainLWLockArray; */ /* Number of partitions of the shared buffer mapping hashtable */ -#define NUM_BUFFER_PARTITIONS 128 +#define NUM_BUFFER_PARTITIONS 0 /* Number of partitions the shared lock tables are divided into */ #define LOG2_NUM_LOCK_PARTITIONS 4 diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h index d194f38..8d9b4dd 100644 --- a/src/include/storage/proc.h +++ b/src/include/storage/proc.h @@ -59,6 +59,14 @@ struct XidCache #define FP_LOCK_SLOTS_PER_BACKEND 16 /* + * Some lock-free algorithms require each backend process to be able to + * advertise a certain number of "hazard pointers" in shared memory. + * Right now one per backend is enough for our purpose, but some + * algorithms require more. + */ +#define NUM_HAZARD_POINTERS 1 + +/* * Each backend has a PGPROC struct in shared memory. There is also a list of * currently-unused PGPROC structs that will be reallocated to new backends. * @@ -143,6 +151,12 @@ struct PGPROC bool fpVXIDLock; /* are we holding a fast-path VXID lock? */ LocalTransactionId fpLocalTransactionId; /* lxid for fast-path VXID * lock */ + + /* + * Hazard pointers. 
Currently one is enough, though some algorithms + * require a few more. + */ + void *hazard[NUM_HAZARD_POINTERS]; }; /* NOTE: "typedef struct PGPROC PGPROC" appears in storage/lock.h. */ diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h index c94d620..855b65e 100644 --- a/src/include/storage/shmem.h +++ b/src/include/storage/shmem.h @@ -40,6 +40,7 @@ extern void InitShmemIndex(void); extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size, HASHCTL *infoP, int hash_flags); extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr); +extern void *ShmemAttachStruct(const char *name); extern Size add_size(Size s1, Size s2); extern Size mul_size(Size s1, Size s2); diff --git a/src/include/utils/chash.h b/src/include/utils/chash.h new file mode 100644 index 0000000..ee0573c --- /dev/null +++ b/src/include/utils/chash.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * chash.h + * Concurrent shared-memory hash table. + * + * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/utils/chash.h + * + *------------------------------------------------------------------------- + */ +#ifndef CHASH_H +#define CHASH_H + +/* Everything caller must supply to set up a concurrent hash table. */ +typedef struct +{ + const char *shmem_name; /* shared memory name for this hash table */ + uint32 capacity; /* maximum size of hash table */ + uint16 element_size; /* size of each element */ + uint16 key_size; /* size of each key */ +} CHashDescriptor; + +/* Concurrent hash table statistics. */ +typedef enum +{ + CHS_Search, /* search */ + CHS_Search_Failed, /* search failed (no such key) */ + CHS_Insert, /* insert */ + CHS_Insert_Failed, /* insert failed (duplicate key) */ + CHS_Insert_Retry, /* insert retried (concurrent update) */ + CHS_Delete, /* delete */ + CHS_Delete_Failed, /* delete failed (no such key) */ + CHS_Delete_Retry, /* delete retried (concurrent update) */ + CHS_Scan_Expunge, /* scan expunged deleted item */ + CHS_Scan_Expunge_Fail, /* scan failed to expunge */ + CHS_Scan_Restart, /* concurrent deletes forced a scan restart */ + CHS_Cleanup_Scan, /* concurrent update forced a cleanup scan */ + CHS_Allocate_Fail, /* allocation failed to pop freelist */ + CHS_Garbage_Enqueue_Retry, /* enqueue on garbage list retried */ + CHS_Garbage_Dequeue_Fail, /* dequeue of garbage failed */ + CHS_GC, /* garbage collection cycle */ + CHS_GC_Spin, /* GC spun waiting for concurrent process */ + CHS_GC_Reclaim_Skipped, /* GC recovered only one item */ + CHS_GC_Reclaim_Fast, /* GC put garbage on freelist via fast path */ + CHS_GC_Reclaim_Retry, /* enqueue of garbage on freelist retried */ + CHS_NumberOfStatistics /* number of statistics */ +} CHashStatisticsType; + +/* Human-readable names for statistics. */ +extern char *CHashStatisticsNames[]; + +/* Opaque handle for a concurrent hash table. */ +struct CHashTableData; +typedef struct CHashTableData *CHashTable; + +/* Initialization functions. */ +extern CHashTable CHashBootstrap(CHashDescriptor *desc); +extern Size CHashEstimateSize(CHashTable table); +extern CHashTable CHashInitialize(CHashTable table, CHashDescriptor *desc); + +/* Accessor functions. 
*/ +extern bool CHashInsert(CHashTable table, void *entry); +extern bool CHashDelete(CHashTable table, void *key); +extern bool CHashSearch(CHashTable table, void *entry); +extern void CHashStatistics(CHashTable table, uint64 *stats); + +#endif /* CHASH_H */
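
To make the API above a bit more concrete, here is a rough sketch of how an extension might wire a table up end to end. This is my own illustration, not code from the patch: my_entry, my_desc, the capacity, and the shmem name are invented, and the startup choreography (RequestAddinShmemSpace, shmem_startup_hook, process_shared_preload_libraries_in_progress) is just the usual shared_preload_libraries boilerplate rather than anything the patch adds. The only patch-specific calls are CHashBootstrap, CHashEstimateSize, CHashInitialize, CHashSearch, and CHashInsert. Note that the entry's leading key_size bytes are the key and are hashed and memcmp'd verbatim, so they need to be fully normalized (no uninitialized padding).

#include "postgres.h"

#include "miscadmin.h"
#include "storage/ipc.h"
#include "utils/chash.h"

PG_MODULE_MAGIC;

void		_PG_init(void);

/* Illustrative entry layout: the first key_size bytes are the key. */
typedef struct
{
	uint32		key;
	uint64		value;
} my_entry;

static CHashDescriptor my_desc = {
	"my chash table",			/* shmem_name */
	1024,						/* capacity */
	sizeof(my_entry),			/* element_size */
	sizeof(uint32)				/* key_size */
};
static CHashTable my_table;
static shmem_startup_hook_type prev_shmem_startup_hook = NULL;

static void
my_shmem_startup(void)
{
	if (prev_shmem_startup_hook)
		prev_shmem_startup_hook();
	/* Creates the table, or attaches to it in EXEC_BACKEND children. */
	my_table = CHashInitialize(my_table, &my_desc);
}

void
_PG_init(void)
{
	if (!process_shared_preload_libraries_in_progress)
		return;
	if (!IsUnderPostmaster)
	{
		/* In the postmaster: size the table and reserve shared memory. */
		my_table = CHashBootstrap(&my_desc);
		RequestAddinShmemSpace(CHashEstimateSize(my_table));
	}
	prev_shmem_startup_hook = shmem_startup_hook;
	shmem_startup_hook = my_shmem_startup;
}

/* Backend-side usage: CHashSearch fills in the non-key fields on a hit;
 * CHashInsert returns false (and hands back the existing value) on a
 * duplicate key. */
static bool
my_lookup(uint32 key, uint64 *value)
{
	my_entry	e;

	e.key = key;
	if (!CHashSearch(my_table, &e))
		return false;
	*value = e.value;
	return true;
}

static bool
my_store(uint32 key, uint64 value)
{
	my_entry	e;

	e.key = key;
	e.value = value;
	return CHashInsert(my_table, &e);
}

The sketch only bootstraps when !IsUnderPostmaster because, going by the comments in CHashInitialize above, EXEC_BACKEND children are expected to call it with a NULL table and take the attach path.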
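The delete protocol that CHashDelete and CHashBucketScan implement is the classic two-step lock-free list deletion: first CAS a mark into the victim's own next pointer, which freezes its successor and keeps a concurrent insert from resurrecting it, and only then CAS the predecessor past it, with any failed unlink cleaned up by a later scan (the "zap" path). The patch encodes the mark inside a CHashPtr arena index via CHashPtrMark; the stand-alone fragment below is my own single-threaded illustration of the same pointer-tagging idea using raw pointers. It is not the patch's code and deliberately omits the retry and garbage-list machinery that makes the real thing safe under concurrency.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct node
{
	uintptr_t	next;			/* successor; low-order bit is the delete mark */
	int			key;
} node;

/* Nodes are at least pointer-aligned, so the low bit of a node address is
 * always zero and is free to carry the mark, much as CHashPtrMark does for
 * arena indexes in the patch. */
#define MARK(p)		((p) | (uintptr_t) 1)
#define UNMARK(p)	((p) & ~(uintptr_t) 1)
#define AS_NODE(p)	((node *) UNMARK(p))

int
main(void)
{
	node		a = {0, 1},
				b = {0, 2},
				c = {0, 3};
	uintptr_t	head = (uintptr_t) &a;
	uintptr_t	b_next;
	bool		ok;

	a.next = (uintptr_t) &b;
	b.next = (uintptr_t) &c;

	/* Step 1: delete-mark b.  Once the mark is set, b.next may never be
	 * changed again, so the rest of the chain stays reachable through b. */
	b_next = b.next;
	ok = __sync_bool_compare_and_swap(&b.next, b_next, MARK(b_next));
	assert(ok);

	/* Step 2: unlink b by swinging its predecessor past it.  If this CAS
	 * lost a race, the node would simply stay marked until a later scan
	 * expunged it. */
	ok = __sync_bool_compare_and_swap(&a.next, (uintptr_t) &b, UNMARK(b.next));
	assert(ok);

	for (node *n = AS_NODE(head); n != NULL; n = AS_NODE(n->next))
		printf("%d\n", n->key);	/* prints 1 and 3; 2 is gone */
	return 0;
}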