Hello, Robert

> Basically, the burden for you to impose a new coding rule on everybody
> who uses shared hash tables in the future is very high.

I have fixed the issue you described. The number of spinlocks no longer
depends on NUM_LOCK_PARTITIONS and can be specified for each hash table
by the calling side.
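
For illustration, here is roughly what the calling side could look like
(a sketch only: the table name, key/entry types and sizes below are
hypothetical; HASH_NMUTEXES and HASHCTL.num_mutexes are the additions
from the attached patch):

    HASHCTL     info;

    MemSet(&info, 0, sizeof(info));
    info.keysize = sizeof(MyKey);       /* hypothetical key type */
    info.entrysize = sizeof(MyEntry);   /* hypothetical entry type */
    info.num_partitions = NUM_LOCK_PARTITIONS;
    info.num_mutexes = 64;              /* hash_create rounds this up to a
                                         * power of 2 and caps it at
                                         * MAX_MUTEXES_NUM */

    my_hash = ShmemInitHash("My hash",  /* hypothetical shared table */
                            max_table_size,
                            &info,
                            HASH_ELEM | HASH_BLOBS |
                            HASH_PARTITION | HASH_NMUTEXES);

Without HASH_NMUTEXES the table gets DEFAULT_MUTEXES_NUM spinlocks.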

I ran the benchmark described in the first message of this thread again.
Currently I don't have access to the same 60-core server, so I used a
more common 12-core server (24 cores with HT). According to this
benchmark, the TPS increase depends on NUM_LOCK_PARTITIONS and the
default number of spinlocks as follows:

pgbench -f pgbench.sql -T 150 -P 1 -c 40 -j 12

 DMN | NLP = 16 | NLP = 32 | NLP = 64 | NLP = 128
-----|----------|----------|----------|----------
   8 |  +15.1%  |  +28.2%  |  +34.1%  |  +33.7%  
  16 |  +16.6%  |  +30.9%  |  +37.0%  |  +40.8%  
  32 |  +15.1%  |  +33.9%  |  +39.5%  |  +41.9%  
  64 |  +15.0%  |  +31.9%  |  +40.1%  |  +42.9%  
 128 |   +7.7%  |  +24.7%  |  +29.6%  |  +31.6%  

* NLP = NUM_LOCK_PARTITIONS
* DMN = DEFAULT_MUTEXES_NUM

I realize this benchmark doesn't represent every possible workload, so
for the attached patch I chose NUM_LOCK_PARTITIONS = DEFAULT_MUTEXES_NUM
= 32. It seems to be a reasonable compromise between the speedup shown
by this "synthetic and meaningless in practice" benchmark and the number
of locks used, which could mean quite a lot in practice. Still, these
values can easily be changed at any moment.

Here are before/after benchmark results for this particular patch.


BEFORE

pgbench (default):

tps = 1295.798531 (including connections establishing)
tps = 1295.858295 (excluding connections establishing)

pgbench -f pgbench.sql:

tps = 1020.072172 (including connections establishing)
tps = 1020.116888 (excluding connections establishing)


AFTER

pgbench (default):

tps = 1299.369807 (including connections establishing)
tps = 1299.429764 (excluding connections establishing)

pgbench -f pgbench.sql:

tps = 1365.749333 (including connections establishing)
tps = 1365.814384 (excluding connections establishing)


So as I understand it, this patch solves the lock contention problem
and doesn't make anything else worse.
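
To summarize the core idea of the patch: for a partitioned table the
low-order bits of an entry's hash value now select one of num_mutexes
spinlock-protected freelists, so backends working on different
partitions mostly touch different spinlocks. A condensed sketch of the
mapping (the real code is in the dynahash.c hunks below):

    /* Pick a freelist (and its spinlock) by the hash value's low bits;
     * a non-partitioned table keeps using the single freelist 0. */
    #define FREELIST_IDX(hctl, hashcode) \
        (IS_PARTITIONED(hctl) ? ((hashcode) & (hctl)->mutexes_mask) : 0)

For example, with num_mutexes = 32 the mask is 31, so hash values 0x00,
0x20, 0x40 and so on all map to freelist 0. If a freelist runs empty and
no more shared memory can be allocated, get_hash_entry() borrows an
element from a neighboring freelist before giving up.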
diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c
index dffc477..9a15a2a 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -495,7 +495,7 @@ pgss_shmem_startup(void)
 	info.hash = pgss_hash_fn;
 	info.match = pgss_match_fn;
 	pgss_hash = ShmemInitHash("pg_stat_statements hash",
-							  pgss_max, pgss_max,
+							  pgss_max,
 							  &info,
 							  HASH_ELEM | HASH_FUNCTION | HASH_COMPARE);
 
diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index 39e8baf..dd5acb7 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -62,7 +62,7 @@ InitBufTable(int size)
 	info.num_partitions = NUM_BUFFER_PARTITIONS;
 
 	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
-								  size, size,
+								  size,
 								  &info,
 								  HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 }
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index 81506ea..4c18701 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -237,7 +237,7 @@ InitShmemIndex(void)
 	hash_flags = HASH_ELEM;
 
 	ShmemIndex = ShmemInitHash("ShmemIndex",
-							   SHMEM_INDEX_SIZE, SHMEM_INDEX_SIZE,
+							   SHMEM_INDEX_SIZE,
 							   &info, hash_flags);
 }
 
@@ -255,17 +255,12 @@ InitShmemIndex(void)
  * exceeded substantially (since it's used to compute directory size and
  * the hash table buckets will get overfull).
  *
- * init_size is the number of hashtable entries to preallocate.  For a table
- * whose maximum size is certain, this should be equal to max_size; that
- * ensures that no run-time out-of-shared-memory failures can occur.
- *
  * Note: before Postgres 9.0, this function returned NULL for some failure
  * cases.  Now, it always throws error instead, so callers need not check
  * for NULL.
  */
 HTAB *
 ShmemInitHash(const char *name, /* table string name for shmem index */
-			  long init_size,	/* initial table size */
 			  long max_size,	/* max size of the table */
 			  HASHCTL *infoP,	/* info about key and bucket size */
 			  int hash_flags)	/* info about infoP */
@@ -299,7 +294,7 @@ ShmemInitHash(const char *name, /* table string name for shmem index */
 	/* Pass location of hashtable header to hash_create */
 	infoP->hctl = (HASHHDR *) location;
 
-	return hash_create(name, init_size, infoP, hash_flags);
+	return hash_create(name, max_size, infoP, hash_flags);
 }
 
 /*
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index e3e9599..fc20a67 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -374,18 +374,10 @@ void
 InitLocks(void)
 {
 	HASHCTL		info;
-	long		init_table_size,
-				max_table_size;
+	long		max_table_size;
 	bool		found;
 
 	/*
-	 * Compute init/max size to request for lock hashtables.  Note these
-	 * calculations must agree with LockShmemSize!
-	 */
-	max_table_size = NLOCKENTS();
-	init_table_size = max_table_size / 2;
-
-	/*
 	 * Allocate hash table for LOCK structs.  This stores per-locked-object
 	 * information.
 	 */
@@ -393,16 +385,15 @@ InitLocks(void)
 	info.keysize = sizeof(LOCKTAG);
 	info.entrysize = sizeof(LOCK);
 	info.num_partitions = NUM_LOCK_PARTITIONS;
+	max_table_size = NLOCKENTS();
 
 	LockMethodLockHash = ShmemInitHash("LOCK hash",
-									   init_table_size,
 									   max_table_size,
 									   &info,
 									HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
 
 	/* Assume an average of 2 holders per lock */
 	max_table_size *= 2;
-	init_table_size *= 2;
 
 	/*
 	 * Allocate hash table for PROCLOCK structs.  This stores
@@ -414,7 +405,6 @@ InitLocks(void)
 	info.num_partitions = NUM_LOCK_PARTITIONS;
 
 	LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
-										   init_table_size,
 										   max_table_size,
 										   &info,
 								 HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 026d2b9..be73a60 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -1117,7 +1117,6 @@ InitPredicateLocks(void)
 
 	PredicateLockTargetHash = ShmemInitHash("PREDICATELOCKTARGET hash",
 											max_table_size,
-											max_table_size,
 											&info,
 											HASH_ELEM | HASH_BLOBS |
 											HASH_PARTITION | HASH_FIXED_SIZE);
@@ -1145,7 +1144,6 @@ InitPredicateLocks(void)
 
 	PredicateLockHash = ShmemInitHash("PREDICATELOCK hash",
 									  max_table_size,
-									  max_table_size,
 									  &info,
 									  HASH_ELEM | HASH_FUNCTION |
 									  HASH_PARTITION | HASH_FIXED_SIZE);
@@ -1226,7 +1224,6 @@ InitPredicateLocks(void)
 
 	SerializableXidHash = ShmemInitHash("SERIALIZABLEXID hash",
 										max_table_size,
-										max_table_size,
 										&info,
 										HASH_ELEM | HASH_BLOBS |
 										HASH_FIXED_SIZE);
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da..6a49958 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,12 @@
 #define DEF_DIRSIZE			   256
 #define DEF_FFACTOR			   1	/* default fill factor */
 
+/*
+ * Default and maximum number of mutexes, and correspondingly of nentries
+ * counters and freeLists (see below). Both values must be a power of 2.
+ */
+#define DEFAULT_MUTEXES_NUM    32
+#define MAX_MUTEXES_NUM		   128
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
@@ -128,12 +134,24 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-	/* In a partitioned table, take this lock to touch nentries or freeList */
-	slock_t		mutex;			/* unused if not partitioned table */
-
-	/* These fields change during entry addition/deletion */
-	long		nentries;		/* number of entries in hash table */
-	HASHELEMENT *freeList;		/* linked list of free elements */
+	/*
+	 * Two fields are declared below: nentries and freeList. nentries stores
+	 * the current number of entries in the hash table. freeList is a linked
+	 * list of free elements.
+	 *
+	 * To keep these fields consistent in a partitioned table we need to
+	 * synchronize access to them with a spinlock. But it turned out that a
+	 * single spinlock can become a bottleneck. To prevent lock contention an
+	 * array of num_mutexes spinlocks is used, where num_mutexes must be a
+	 * power of two less than or equal to MAX_MUTEXES_NUM. Each spinlock
+	 * protects one element of the nentries and freeList arrays.
+	 *
+	 * If the hash table is not partitioned, only nentries[0] and freeList[0]
+	 * are used, and the spinlocks are not used at all.
+	 */
+	slock_t		mutex[MAX_MUTEXES_NUM]; /* array of spinlocks */
+	long		nentries[MAX_MUTEXES_NUM];		/* number of entries */
+	HASHELEMENT *freeList[MAX_MUTEXES_NUM];		/* lists of free elements */
 
 	/* These fields can change, but not in a partitioned table */
 	/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -147,6 +165,8 @@ struct HASHHDR
 	Size		keysize;		/* hash key length in bytes */
 	Size		entrysize;		/* total user element size in bytes */
 	long		num_partitions; /* # partitions (must be power of 2), or 0 */
+	long		num_mutexes;	/* # mutexes (must be power of 2) */
+	long		mutexes_mask;	/* equals num_mutexes - 1 */
 	long		ffactor;		/* target fill factor */
 	long		max_dsize;		/* 'dsize' limit if directory is fixed size */
 	long		ssize;			/* segment size --- must be power of 2 */
@@ -166,6 +186,8 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) (IS_PARTITIONED(hctl) ? ((hashcode) & (hctl)->mutexes_mask) : 0)
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +241,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -282,6 +304,10 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 {
 	HTAB	   *hashp;
 	HASHHDR    *hctl;
+	int			i,
+				partitions_number,
+				nelem_alloc,
+				nelem_alloc_first;
 
 	/*
 	 * For shared hash tables, we have a local hash header (HTAB struct) that
@@ -417,6 +443,24 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 
 	hctl = hashp->hctl;
 
+	/*
+	 * Set num_mutexes and mutexes_mask
+	 */
+	if (flags & HASH_NMUTEXES)
+	{
+		hctl->num_mutexes = next_pow2_int(info->num_mutexes);
+
+		if (hctl->num_mutexes > MAX_MUTEXES_NUM)
+			hctl->num_mutexes = MAX_MUTEXES_NUM;
+	}
+	else
+		hctl->num_mutexes = DEFAULT_MUTEXES_NUM;
+
+	Assert(hctl->num_mutexes == next_pow2_int(hctl->num_mutexes));
+
+	hctl->mutexes_mask = hctl->num_mutexes - 1;
+
+
 	if (flags & HASH_PARTITION)
 	{
 		/* Doesn't make sense to partition a local hash table */
@@ -482,10 +526,34 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
 	{
-		if (!element_alloc(hashp, (int) nelem))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		/*
+		 * If the hash table is partitioned, all freeLists have an equal
+		 * number of elements. Otherwise only freeList[0] is used.
+		 */
+		if (IS_PARTITIONED(hashp->hctl))
+			partitions_number = hctl->num_mutexes;
+		else
+			partitions_number = 1;
+
+		nelem_alloc = nelem / partitions_number;
+		if (nelem_alloc == 0)
+			nelem_alloc = 1;
+
+		if (nelem_alloc * partitions_number < nelem)
+			/* Make sure all memory will be used */
+			nelem_alloc_first = nelem - nelem_alloc * (partitions_number - 1);
+		else
+			nelem_alloc_first = nelem_alloc;
+
+		for (i = 0; i < partitions_number; i++)
+		{
+			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+			if (!element_alloc(hashp, temp, i))
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
 	}
 
 	if (flags & HASH_FIXED_SIZE)
@@ -503,9 +571,6 @@ hdefault(HTAB *hashp)
 
 	MemSet(hctl, 0, sizeof(HASHHDR));
 
-	hctl->nentries = 0;
-	hctl->freeList = NULL;
-
 	hctl->dsize = DEF_DIRSIZE;
 	hctl->nsegs = 0;
 
@@ -572,12 +637,14 @@ init_htab(HTAB *hashp, long nelem)
 	HASHSEGMENT *segp;
 	int			nbuckets;
 	int			nsegs;
+	int			i;
 
 	/*
 	 * initialize mutex if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
-		SpinLockInit(&hctl->mutex);
+		for (i = 0; i < hctl->num_mutexes; i++)
+			SpinLockInit(&(hctl->mutex[i]));
 
 	/*
 	 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +715,7 @@ init_htab(HTAB *hashp, long nelem)
 			"HIGH MASK       ", hctl->high_mask,
 			"LOW  MASK       ", hctl->low_mask,
 			"NSEGS           ", hctl->nsegs,
-			"NENTRIES        ", hctl->nentries);
+			"NENTRIES        ", hash_get_num_entries(hashp));
 #endif
 	return true;
 }
@@ -769,7 +836,7 @@ hash_stats(const char *where, HTAB *hashp)
 			where, hashp->hctl->accesses, hashp->hctl->collisions);
 
 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, (long) hashp->hctl->keysize,
+			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 			where, hash_accesses, hash_collisions);
@@ -863,6 +930,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -885,7 +953,7 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * order of these tests is to try to check cheaper conditions first.
 		 */
 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+		hctl->nentries[0] / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
 	}
@@ -943,20 +1011,20 @@ hash_search_with_hash_value(HTAB *hashp,
 			{
 				/* if partitioned, must lock to touch nentries and freeList */
 				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&hctl->mutex);
+					SpinLockAcquire(&(hctl->mutex[freelist_idx]));
 
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				Assert(hctl->nentries[freelist_idx] > 0);
+				hctl->nentries[freelist_idx]--;
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the freelist for this table.  */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = hctl->freeList[freelist_idx];
+				hctl->freeList[freelist_idx] = currBucket;
 
 				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->mutex);
+					SpinLockRelease(&hctl->mutex[freelist_idx]);
 
 				/*
 				 * better hope the caller is synchronizing access to this
@@ -982,7 +1050,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 					 hashp->tabname);
 
-			currBucket = get_hash_entry(hashp);
+			currBucket = get_hash_entry(hashp, freelist_idx);
 			if (currBucket == NULL)
 			{
 				/* out of memory */
@@ -1175,41 +1243,70 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
+	int			borrow_from_idx;
 
 	for (;;)
 	{
 		/* if partitioned, must lock to touch nentries and freeList */
 		if (IS_PARTITIONED(hctl))
-			SpinLockAcquire(&hctl->mutex);
+			SpinLockAcquire(&hctl->mutex[freelist_idx]);
 
 		/* try to get an entry from the freelist */
-		newElement = hctl->freeList;
+		newElement = hctl->freeList[freelist_idx];
+
 		if (newElement != NULL)
-			break;
+		{
+			/* remove entry from freelist, bump nentries */
+			hctl->freeList[freelist_idx] = newElement->link;
+			hctl->nentries[freelist_idx]++;
+			if (IS_PARTITIONED(hctl))
+				SpinLockRelease(&hctl->mutex[freelist_idx]);
+
+			return newElement;
+		}
 
-		/* no free elements.  allocate another chunk of buckets */
 		if (IS_PARTITIONED(hctl))
-			SpinLockRelease(&hctl->mutex);
+			SpinLockRelease(&hctl->mutex[freelist_idx]);
 
-		if (!element_alloc(hashp, hctl->nelem_alloc))
+		/* no free elements.  allocate another chunk of buckets */
+		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 		{
-			/* out of memory */
-			return NULL;
-		}
-	}
+			if (!IS_PARTITIONED(hctl))
+				return NULL;	/* out of memory */
 
-	/* remove entry from freelist, bump nentries */
-	hctl->freeList = newElement->link;
-	hctl->nentries++;
+			/* try to borrow element from another partition */
+			borrow_from_idx = freelist_idx;
+			for (;;)
+			{
+				borrow_from_idx = (borrow_from_idx + 1) & hctl->mutexes_mask;
+				if (borrow_from_idx == freelist_idx)
+					break;
 
-	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+				SpinLockAcquire(&(hctl->mutex[borrow_from_idx]));
+				newElement = hctl->freeList[borrow_from_idx];
+
+				if (newElement != NULL)
+				{
+					hctl->freeList[borrow_from_idx] = newElement->link;
+					SpinLockRelease(&(hctl->mutex[borrow_from_idx]));
+
+					SpinLockAcquire(&hctl->mutex[freelist_idx]);
+					hctl->nentries[freelist_idx]++;
+					SpinLockRelease(&hctl->mutex[freelist_idx]);
+
+					break;
+				}
 
-	return newElement;
+				SpinLockRelease(&(hctl->mutex[borrow_from_idx]));
+			}
+
+			return newElement;
+		}
+	}
 }
 
 /*
@@ -1218,11 +1315,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+	int			i;
+	long		sum = hashp->hctl->nentries[0];
+
 	/*
 	 * We currently don't bother with the mutex; it's only sensible to call
 	 * this function if you've got lock on all partitions of the table.
 	 */
-	return hashp->hctl->nentries;
+
+	if (!IS_PARTITIONED(hashp->hctl))
+		return sum;
+
+	for (i = 1; i < hashp->hctl->num_mutexes; i++)
+		sum += hashp->hctl->nentries[i];
+
+	return sum;
 }
 
 /*
@@ -1530,9 +1637,9 @@ seg_alloc(HTAB *hashp)
  * allocate some new elements and link them into the free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
-	HASHHDR *hctl = hashp->hctl;
+	HASHHDR    *hctl = hashp->hctl;
 	Size		elementSize;
 	HASHELEMENT *firstElement;
 	HASHELEMENT *tmpElement;
@@ -1563,14 +1670,14 @@ element_alloc(HTAB *hashp, int nelem)
 
 	/* if partitioned, must lock to touch freeList */
 	if (IS_PARTITIONED(hctl))
-		SpinLockAcquire(&hctl->mutex);
+		SpinLockAcquire(&hctl->mutex[freelist_idx]);
 
 	/* freelist could be nonempty if two backends did this concurrently */
-	firstElement->link = hctl->freeList;
-	hctl->freeList = prevElement;
+	firstElement->link = hctl->freeList[freelist_idx];
+	hctl->freeList[freelist_idx] = prevElement;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&hctl->mutex[freelist_idx]);
 
 	return true;
 }
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 2bbe1b6..6e44027 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -142,7 +142,7 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests;
 #define NUM_BUFFER_PARTITIONS  128
 
 /* Number of partitions the shared lock tables are divided into */
-#define LOG2_NUM_LOCK_PARTITIONS  4
+#define LOG2_NUM_LOCK_PARTITIONS  5
 #define NUM_LOCK_PARTITIONS  (1 << LOG2_NUM_LOCK_PARTITIONS)
 
 /* Number of partitions the shared predicate lock tables are divided into */
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index 6468e66..50cf928 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -37,7 +37,7 @@ extern void InitShmemAllocation(void);
 extern void *ShmemAlloc(Size size);
 extern bool ShmemAddrIsValid(const void *addr);
 extern void InitShmemIndex(void);
-extern HTAB *ShmemInitHash(const char *name, long init_size, long max_size,
+extern HTAB *ShmemInitHash(const char *name, long max_size,
 			  HASHCTL *infoP, int hash_flags);
 extern void *ShmemInitStruct(const char *name, Size size, bool *foundPtr);
 extern Size add_size(Size s1, Size s2);
diff --git a/src/include/utils/hsearch.h b/src/include/utils/hsearch.h
index 007ba2c..8e2e6a1 100644
--- a/src/include/utils/hsearch.h
+++ b/src/include/utils/hsearch.h
@@ -69,6 +69,7 @@ typedef struct HASHCTL
 	long		dsize;			/* (initial) directory size */
 	long		max_dsize;		/* limit to dsize if dir size is limited */
 	long		ffactor;		/* fill factor */
+	long		num_mutexes;	/* number of mutexes */
 	Size		keysize;		/* hash key length in bytes */
 	Size		entrysize;		/* total user element size in bytes */
 	HashValueFunc hash;			/* hash function */
@@ -94,6 +95,7 @@ typedef struct HASHCTL
 #define HASH_SHARED_MEM 0x0800	/* Hashtable is in shared memory */
 #define HASH_ATTACH		0x1000	/* Do not initialize hctl */
 #define HASH_FIXED_SIZE 0x2000	/* Initial size is a hard limit */
+#define HASH_NMUTEXES	0x4000	/* Set number of mutexes */
 
 
 /* max_dsize value to indicate expansible directory */