Hello, Robert

> It also strikes me that it's probably quite likely that slock_t
> mutex[NUM_FREELISTS] is a poor way to lay out this data in memory.
> For example, on a system where slock_t is just one byte, most likely
> all of those mutexes are going to be in the same cache line, which
> means you're going to get a LOT of false sharing.  It seems like it
> would be sensible to define a new struct that contains an slock_t, a
> long, and a HASHELEMENT *, and then make an array of those structs.
> That wouldn't completely eliminate false sharing, but it would reduce
> it quite a bit.  My guess is that if you did that, you could reduce
> the number of freelists to 8 or less and get pretty much the same
> performance benefit that you're getting right now with 32. And that,
> too, seems likely to be good for single-client performance.

I experimented for a while with fitting every spinlock into a separate
cache line. Indeed, we can gain some speedup this way. Here are
benchmark results on a 12-core server for NUM_LOCK_PARTITIONS = 32 (in
this case the difference is more noticeable):

| FREELISTS | SIZE =  32 | SIZE =  64 | SIZE = 128 | SIZE = 256 |
|-----------|------------|------------|------------|------------|
|     4     |   +25.4%   |   +28.7%   |   +28.4%   |   +28.3%   |
|     8     |   +28.2%   |   +29.4%   |   +31.7%   |   +31.4%   |
|    16     |   +29.9%   |   +32.6%   |   +31.6%   |   +30.8%   |
|    32     |   +33.7%   |   +34.2%   |   +33.6%   |   +32.6%   |

Here SIZE is short for FREELIST_BUFFER_SIZE (part of a hack I used to
align the FREELIST structure; see the attached patch). The cache line
on this CPU is 64 bytes, according to
/sys/devices/system/cpu/cpu0/cache/index0/*
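
For reference, here is the core of that hack, simplified from the
attached patch: each FREELIST is overlaid on an opaque fixed-size
buffer, so consecutive freelists can be kept in separate cache lines:

```
/* One freelist, with its own lock and entry counter */
typedef struct
{
	slock_t		mutex;		/* spinlock protecting this freelist */
	long		nentries;	/* number of entries */
	HASHELEMENT *freeList;	/* linked list of free elements */
} FREELIST;

/* Opaque buffer that pads a FREELIST to FREELIST_BUFFER_SIZE bytes */
typedef struct { uint8 x[FREELIST_BUFFER_SIZE]; } FREELISTBUFF;

/* In HASHHDR: an array of padded slots instead of a single freelist */
FREELISTBUFF __freeLists[NUM_FREELISTS];

/* Access the FREELIST stored in the idx'th slot */
#define FREELIST(hctl, idx) \
	(*(FREELIST *) &((hctl)->__freeLists[idx]))
```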

As a reminder, without these changes we had the following picture
(please note the "NLP = 32" column):

pgbench -f pgbench.sql -T 150 -P 1 -c 40 -j 12

 DMN | NLP = 16 | NLP = 32 | NLP = 64 | NLP = 128
-----|----------|----------|----------|----------
   8 |  +15.1%  |  +28.2%  |  +34.1%  |  +33.7%  
  16 |  +16.6%  |  +30.9%  |  +37.0%  |  +40.8%  
  32 |  +15.1%  |  +33.9%  |  +39.5%  |  +41.9%  
  64 |  +15.0%  |  +31.9%  |  +40.1%  |  +42.9%  
 128 |   +7.7%  |  +24.7%  |  +29.6%  |  +31.6%  

* NLP = NUM_LOCK_PARTITIONS
* DMN = DEFAULT_MUTEXES_NUM

So it's +34.2% vs +33.9%, and the number of freelists remains the same.

> I am however wondering if it would be better to set the freelist
> affinity based on something other than the hash value, like say the
> PID, so that the same process doesn't keep switching to a different
> freelist for every buffer eviction.

I also tried using the PID to determine the freelist number:

```
#include <sys/types.h>
#include <unistd.h>

...

#define FREELIST_IDX(hctl, hashcode) \
  (IS_PARTITIONED(hctl) ? ((uint32)getpid()) % NUM_FREELISTS : 0)

...

/* nentries can now go negative, so this assertion had to be disabled: */
/* Assert(FREELIST(hctl, freelist_idx).nentries > 0); */

```

Unfortunately, this approach gives only +33.9% TPS in the best case.
There is also the problem that the per-freelist nentries counters
become unbalanced: an entry can be inserted under one freelist index
and removed under another, so a counter can go negative and will
eventually overflow. So far I don't have a clear idea how to solve
this issue effectively.
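
A toy sketch (plain C, not part of the patch; the PIDs and the
NUM_FREELISTS value are made up) that makes the accounting problem
concrete:

```
#include <stdio.h>

#define NUM_FREELISTS 4

static long nentries[NUM_FREELISTS];

/* with a PID-based FREELIST_IDX each backend touches "its own" counter */
static void insert_as(int pid) { nentries[pid % NUM_FREELISTS]++; }
static void remove_as(int pid) { nentries[pid % NUM_FREELISTS]--; }

int main(void)
{
	int			i;

	insert_as(101);		/* backend 101 inserts an entry: freelist 1 ++ */
	remove_as(102);		/* backend 102 removes that entry: freelist 2 -- */

	for (i = 0; i < NUM_FREELISTS; i++)
		printf("freelist %d: nentries = %ld\n", i, nentries[i]);

	/* prints 0, 1, -1, 0: freelist 2 went negative */
	return 0;
}
```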

I'm not sure whether further improvements are worth the effort.
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da..4d9ed3e 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
  * to hash_create.  This prevents any attempt to split buckets on-the-fly.
  * Therefore, each hash bucket chain operates independently, and no fields
  * of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
  * This lets any subset of the hash buckets be treated as a separately
  * lockable partition.  We expect callers to use the low-order bits of a
  * lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,12 @@
 #define DEF_DIRSIZE			   256
 #define DEF_FFACTOR			   1	/* default fill factor */
 
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS			4
+
+/* Space reserved for each freelist, for cache line alignment purposes. */
+/* Should be a power of 2 and no less than sizeof(FREELIST). */
+#define FREELIST_BUFFER_SIZE 32
 
 /* A hash bucket is a linked list of HASHELEMENTs */
 typedef HASHELEMENT *HASHBUCKET;
@@ -118,6 +124,20 @@ typedef HASHELEMENT *HASHBUCKET;
 /* A hash segment is an array of bucket headers */
 typedef HASHBUCKET *HASHSEGMENT;
 
+/*
+ * A single freelist: a spinlock that guards the list of free elements
+ * and the nentries counter, plus the list itself.
+ */
+typedef struct
+{
+	slock_t		mutex;		/* spinlock protecting this freelist */
+	long		nentries;	/* number of entries */
+	HASHELEMENT *freeList;	/* linked list of free elements */
+} FREELIST;
+
+/* Opaque buffer that pads a FREELIST to FREELIST_BUFFER_SIZE bytes */
+typedef struct { uint8 x[FREELIST_BUFFER_SIZE]; } FREELISTBUFF;
+
 /*
  * Header structure for a hash table --- contains all changeable info
  *
@@ -128,12 +148,16 @@ typedef HASHBUCKET *HASHSEGMENT;
  */
 struct HASHHDR
 {
-	/* In a partitioned table, take this lock to touch nentries or freeList */
-	slock_t		mutex;			/* unused if not partitioned table */
-
-	/* These fields change during entry addition/deletion */
-	long		nentries;		/* number of entries in hash table */
-	HASHELEMENT *freeList;		/* linked list of free elements */
+	/*
+	 * The freelist can become a point of contention on high-concurrency hash
+	 * tables, so we use an array of freelists, each with its own mutex and
+	 * nentries count, instead of just a single one.
+	 *
+	 * If the hash table is not partitioned, only freeLists[0] is used and
+	 * spinlocks are not used at all.  Don't access this array directly;
+	 * use the FREELIST macro instead.
+	 */
+	FREELISTBUFF __freeLists[NUM_FREELISTS];
 
 	/* These fields can change, but not in a partitioned table */
 	/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +190,13 @@ struct HASHHDR
 
 #define IS_PARTITIONED(hctl)  ((hctl)->num_partitions != 0)
 
+#define FREELIST_IDX(hctl, hashcode) \
+	(IS_PARTITIONED(hctl) ? (hashcode) % NUM_FREELISTS : 0)
+
+/* Access the FREELIST overlaid on the idx'th padded buffer */
+#define FREELIST(hctl, idx) \
+	(*(FREELIST *) &((hctl)->__freeLists[idx]))
+
 /*
  * Top control structure for a hashtable --- in a shared table, each backend
  * has its own copy (OK since no fields change at runtime)
@@ -219,10 +250,10 @@ static long hash_accesses,
  */
 static void *DynaHashAlloc(Size size);
 static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
 static bool dir_realloc(HTAB *hashp);
 static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
 static void hdefault(HTAB *hashp);
 static int	choose_nelem_alloc(Size entrysize);
 static bool init_htab(HTAB *hashp, long nelem);
@@ -283,6 +314,9 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	HTAB	   *hashp;
 	HASHHDR    *hctl;
 
+	StaticAssertStmt(sizeof(FREELIST) <= sizeof(FREELISTBUFF),
+		"sizeof(FREELIST) must not exceed sizeof(FREELISTBUFF)");
+
 	/*
 	 * For shared hash tables, we have a local hash header (HTAB struct) that
 	 * we allocate in TopMemoryContext; all else is in shared memory.
@@ -482,10 +516,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
 	if ((flags & HASH_SHARED_MEM) ||
 		nelem < hctl->nelem_alloc)
 	{
-		if (!element_alloc(hashp, (int) nelem))
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		int			i,
+					freelist_partitions,
+					nelem_alloc,
+					nelem_alloc_first;
+
+		/*
+		 * If the hash table is partitioned, all freelists get an equal
+		 * number of elements.  Otherwise only freeLists[0] is used.
+		 */
+		if (IS_PARTITIONED(hashp->hctl))
+			freelist_partitions = NUM_FREELISTS;
+		else
+			freelist_partitions = 1;
+
+		nelem_alloc = nelem / freelist_partitions;
+		if (nelem_alloc == 0)
+			nelem_alloc = 1;
+
+		/* Make sure all elements are allocated; list 0 gets the remainder */
+		if (nelem_alloc * freelist_partitions < nelem)
+			nelem_alloc_first =
+				nelem - nelem_alloc * (freelist_partitions - 1);
+		else
+			nelem_alloc_first = nelem_alloc;
+
+		for (i = 0; i < freelist_partitions; i++)
+		{
+			int			temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+			if (!element_alloc(hashp, temp, i))
+				ereport(ERROR,
+						(errcode(ERRCODE_OUT_OF_MEMORY),
+						 errmsg("out of memory")));
+		}
 	}
 
 	if (flags & HASH_FIXED_SIZE)
@@ -503,9 +567,6 @@ hdefault(HTAB *hashp)
 
 	MemSet(hctl, 0, sizeof(HASHHDR));
 
-	hctl->nentries = 0;
-	hctl->freeList = NULL;
-
 	hctl->dsize = DEF_DIRSIZE;
 	hctl->nsegs = 0;
 
@@ -572,12 +633,14 @@ init_htab(HTAB *hashp, long nelem)
 	HASHSEGMENT *segp;
 	int			nbuckets;
 	int			nsegs;
+	int			i;
 
 	/*
 	 * initialize mutex if it's a partitioned table
 	 */
 	if (IS_PARTITIONED(hctl))
-		SpinLockInit(&hctl->mutex);
+		for (i = 0; i < NUM_FREELISTS; i++)
+			SpinLockInit(&(FREELIST(hctl, i).mutex));
 
 	/*
 	 * Divide number of elements by the fill factor to determine a desired
@@ -648,7 +711,7 @@ init_htab(HTAB *hashp, long nelem)
 			"HIGH MASK       ", hctl->high_mask,
 			"LOW  MASK       ", hctl->low_mask,
 			"NSEGS           ", hctl->nsegs,
-			"NENTRIES        ", hctl->nentries);
+			"NENTRIES        ", hash_get_num_entries(hctl));
 #endif
 	return true;
 }
@@ -769,7 +832,7 @@ hash_stats(const char *where, HTAB *hashp)
 			where, hashp->hctl->accesses, hashp->hctl->collisions);
 
 	fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
-			hashp->hctl->nentries, (long) hashp->hctl->keysize,
+			hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
 			hashp->hctl->max_bucket, hashp->hctl->nsegs);
 	fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
 			where, hash_accesses, hash_collisions);
@@ -863,6 +926,7 @@ hash_search_with_hash_value(HTAB *hashp,
 	HASHBUCKET	currBucket;
 	HASHBUCKET *prevBucketPtr;
 	HashCompareFunc match;
+	int			freelist_idx = FREELIST_IDX(hctl, hashvalue);
 
 #if HASH_STATISTICS
 	hash_accesses++;
@@ -885,7 +949,8 @@ hash_search_with_hash_value(HTAB *hashp,
 		 * order of these tests is to try to check cheaper conditions first.
 		 */
 		if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
-			hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+			FREELIST(hctl, 0).nentries / (long) (hctl->max_bucket + 1) >=
+			hctl->ffactor &&
 			!has_seq_scans(hashp))
 			(void) expand_table(hashp);
 	}
@@ -943,20 +1008,20 @@ hash_search_with_hash_value(HTAB *hashp,
 			{
 				/* if partitioned, must lock to touch nentries and freeList */
 				if (IS_PARTITIONED(hctl))
-					SpinLockAcquire(&hctl->mutex);
+					SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
 
-				Assert(hctl->nentries > 0);
-				hctl->nentries--;
+				Assert(FREELIST(hctl, freelist_idx).nentries > 0);
+				FREELIST(hctl, freelist_idx).nentries--;
 
 				/* remove record from hash bucket's chain. */
 				*prevBucketPtr = currBucket->link;
 
 				/* add the record to the freelist for this table.  */
-				currBucket->link = hctl->freeList;
-				hctl->freeList = currBucket;
+				currBucket->link = FREELIST(hctl, freelist_idx).freeList;
+				FREELIST(hctl, freelist_idx).freeList = currBucket;
 
 				if (IS_PARTITIONED(hctl))
-					SpinLockRelease(&hctl->mutex);
+					SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
 
 				/*
 				 * better hope the caller is synchronizing access to this
@@ -982,7 +1047,7 @@ hash_search_with_hash_value(HTAB *hashp,
 				elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
 					 hashp->tabname);
 
-			currBucket = get_hash_entry(hashp);
+			currBucket = get_hash_entry(hashp, freelist_idx);
 			if (currBucket == NULL)
 			{
 				/* out of memory */
@@ -1175,39 +1240,70 @@ hash_update_hash_key(HTAB *hashp,
  * create a new entry if possible
  */
 static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
 {
 	HASHHDR *hctl = hashp->hctl;
 	HASHBUCKET	newElement;
+	int			borrow_from_idx;
 
 	for (;;)
 	{
 		/* if partitioned, must lock to touch nentries and freeList */
 		if (IS_PARTITIONED(hctl))
-			SpinLockAcquire(&hctl->mutex);
+			SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
 
 		/* try to get an entry from the freelist */
-		newElement = hctl->freeList;
+		newElement = FREELIST(hctl, freelist_idx).freeList;
+
 		if (newElement != NULL)
 			break;
 
-		/* no free elements.  allocate another chunk of buckets */
 		if (IS_PARTITIONED(hctl))
-			SpinLockRelease(&hctl->mutex);
+			SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
 
-		if (!element_alloc(hashp, hctl->nelem_alloc))
+		/* no free elements.  allocate another chunk of buckets */
+		if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
 		{
-			/* out of memory */
-			return NULL;
+			if (!IS_PARTITIONED(hctl))
+				return NULL;	/* out of memory */
+
+			/* try to borrow an element from another freelist */
+			borrow_from_idx = freelist_idx;
+			for (;;)
+			{
+				borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+				if (borrow_from_idx == freelist_idx)
+					break;
+
+				SpinLockAcquire(&(FREELIST(hctl, borrow_from_idx).mutex));
+				newElement = FREELIST(hctl, borrow_from_idx).freeList;
+
+				if (newElement != NULL)
+				{
+					FREELIST(hctl, borrow_from_idx).freeList =
+						newElement->link;
+					SpinLockRelease(&(FREELIST(hctl, borrow_from_idx).mutex));
+
+					SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
+					FREELIST(hctl, freelist_idx).nentries++;
+					SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
+
+					break;
+				}
+
+				SpinLockRelease(&(FREELIST(hctl, borrow_from_idx).mutex));
+			}
+
+			return newElement;
 		}
 	}
 
 	/* remove entry from freelist, bump nentries */
-	hctl->freeList = newElement->link;
-	hctl->nentries++;
+	FREELIST(hctl, freelist_idx).freeList = newElement->link;
+	FREELIST(hctl, freelist_idx).nentries++;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
 
 	return newElement;
 }
@@ -1218,11 +1314,21 @@ get_hash_entry(HTAB *hashp)
 long
 hash_get_num_entries(HTAB *hashp)
 {
+	int			i;
+	long		sum = FREELIST(hashp->hctl, 0).nentries;
+
 	/*
 	 * We currently don't bother with the mutex; it's only sensible to call
 	 * this function if you've got lock on all partitions of the table.
 	 */
-	return hashp->hctl->nentries;
+
+	if (!IS_PARTITIONED(hashp->hctl))
+		return sum;
+
+	for (i = 1; i < NUM_FREELISTS; i++)
+		sum += FREELIST(hashp->hctl, i).nentries;
+
+	return sum;
 }
 
 /*
@@ -1527,10 +1633,10 @@ seg_alloc(HTAB *hashp)
 }
 
 /*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
  */
 static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
 {
 	HASHHDR *hctl = hashp->hctl;
 	Size		elementSize;
@@ -1563,14 +1669,14 @@ element_alloc(HTAB *hashp, int nelem)
 
 	/* if partitioned, must lock to touch freeList */
 	if (IS_PARTITIONED(hctl))
-		SpinLockAcquire(&hctl->mutex);
+		SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
 
 	/* freelist could be nonempty if two backends did this concurrently */
-	firstElement->link = hctl->freeList;
-	hctl->freeList = prevElement;
+	firstElement->link = FREELIST(hctl, freelist_idx).freeList;
+	FREELIST(hctl, freelist_idx).freeList = prevElement;
 
 	if (IS_PARTITIONED(hctl))
-		SpinLockRelease(&hctl->mutex);
+		SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
 
 	return true;
 }