Hello, Robert
> It also strikes me that it's probably quite likely that slock_t
> mutex[NUM_FREELISTS] is a poor way to lay out this data in memory.
> For example, on a system where slock_t is just one byte, most likely
> all of those mutexes are going to be in the same cache line, which
> means you're going to get a LOT of false sharing. It seems like it
> would be sensible to define a new struct that contains an slock_t, a
> long, and a HASHELEMENT *, and then make an array of those structs.
> That wouldn't completely eliminate false sharing, but it would reduce
> it quite a bit. My guess is that if you did that, you could reduce
> the number of freelists to 8 or less and get pretty much the same
> performance benefit that you're getting right now with 32. And that,
> too, seems likely to be good for single-client performance.
I experimented for a while with fitting every spinlock into a separate
cache line. Indeed, we can gain some speedup this way. Here are
benchmark results on a 12-core server for NUM_LOCK_PARTITIONS = 32 (the
difference is more noticeable in this case):
| FREELISTS | SIZE = 32 | SIZE = 64 | SIZE = 128 | SIZE = 256 |
|-----------|-----------|-----------|------------|------------|
|         4 |    +25.4% |    +28.7% |     +28.4% |     +28.3% |
|         8 |    +28.2% |    +29.4% |     +31.7% |     +31.4% |
|        16 |    +29.9% |    +32.6% |     +31.6% |     +30.8% |
|        32 |    +33.7% |    +34.2% |     +33.6% |     +32.6% |
Here SIZE is short for FREELIST_BUFFER_SIZE (part of a hack I used to
align the FREELIST structure; see the attached patch). The cache line on
this CPU is 64 bytes according to /sys/devices/system/cpu/cpu0/cache/index0/*
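In short, the hack pads each freelist to FREELIST_BUFFER_SIZE bytes, so
with a large enough buffer no two freelists share a cache line.
Simplified from the attached patch, the layout looks like this:
```
/* One freelist: its own spinlock, entry counter and list head */
typedef struct
{
    slock_t      mutex;      /* protects nentries and freeList */
    long         nentries;   /* number of entries in this partition */
    HASHELEMENT *freeList;   /* linked list of free elements */
} FREELIST;

/* Opaque padding buffer; FREELIST_BUFFER_SIZE >= 64 keeps adjacent
 * freelists on different cache lines on this CPU */
typedef struct { uint8 x[FREELIST_BUFFER_SIZE]; } FREELISTBUFF;

/* HASHHDR holds an array of padded buffers ... */
FREELISTBUFF __freeLists[NUM_FREELISTS];

/* ... and each one is accessed through this macro */
#define FREELIST(hctl, idx) (*(FREELIST *) &(hctl)->__freeLists[idx])
```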
I would like to remind you that without these changes we had the
following picture (please note the "NLP = 32" column):
pgbench -f pgbench.sql -T 150 -P 1 -c 40 -j 12
 DMN | NLP = 16 | NLP = 32 | NLP = 64 | NLP = 128
-----|----------|----------|----------|----------
   8 |   +15.1% |   +28.2% |   +34.1% |    +33.7%
  16 |   +16.6% |   +30.9% |   +37.0% |    +40.8%
  32 |   +15.1% |   +33.9% |   +39.5% |    +41.9%
  64 |   +15.0% |   +31.9% |   +40.1% |    +42.9%
 128 |    +7.7% |   +24.7% |   +29.6% |    +31.6%
* NLP = NUM_LOCK_PARTITIONS
* DMN = DEFAULT_MUTEXES_NUM
So it's +34.2% vs +33.9%, and the number of freelists remains the same.
> I am however wondering if it would make sense to set the freelist affinity based on
> something other than the hash value, like say the PID, so that the
> same process doesn't keep switching to a different freelist for every
> buffer eviction.
I also tried using the PID to determine the freelist number:
```
#include <sys/types.h>
#include <unistd.h>
...
#define FREELIST_IDX(hctl, hashcode) \
(IS_PARTITIONED(hctl) ? ((uint32)getpid()) % NUM_FREELISTS : 0)
...
// now nentries could be negative in this case
// Assert(FREELIST(hctl, freelist_idx).nentries > 0);
```
Unfortunately this approach gives +33.9% TPS in the best case. Also,
since an entry allocated under one backend's PID can be freed under
another backend's PID, the per-freelist nentries counters can go
negative or overflow. So far I don't have a clear idea how to solve
this issue effectively.
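To illustrate the problem (the PIDs, variable names and NUM_FREELISTS = 32
below are only for illustration): the backend that allocates an entry and
the backend that frees it can charge it to different lists:
```
/* Backend A, getpid() == 12345 -> freelist_idx == 12345 % 32 == 25:
 * allocates a shared entry, FREELIST(hctl, 25).nentries++ */
entry = hash_search(hashp, &key, HASH_ENTER, &found);

/* Backend B, getpid() == 12346 -> freelist_idx == 12346 % 32 == 26:
 * removes the same entry later, FREELIST(hctl, 26).nentries-- */
hash_search(hashp, &key, HASH_REMOVE, NULL);

/* Done often enough, the counter of list 26 goes negative while the one
 * of list 25 keeps growing, which is why the Assert() above had to be
 * commented out */
```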
I'm not sure whether further improvements are worth the effort.
diff --git a/src/backend/utils/hash/dynahash.c b/src/backend/utils/hash/dynahash.c
index 24a53da..4d9ed3e 100644
--- a/src/backend/utils/hash/dynahash.c
+++ b/src/backend/utils/hash/dynahash.c
@@ -15,7 +15,7 @@
* to hash_create. This prevents any attempt to split buckets on-the-fly.
* Therefore, each hash bucket chain operates independently, and no fields
* of the hash header change after init except nentries and freeList.
- * A partitioned table uses a spinlock to guard changes of those two fields.
+ * A partitioned table uses spinlocks to guard changes of those fields.
* This lets any subset of the hash buckets be treated as a separately
* lockable partition. We expect callers to use the low-order bits of a
* lookup key's hash value as a partition number --- this will work because
@@ -111,6 +111,12 @@
#define DEF_DIRSIZE 256
#define DEF_FFACTOR 1 /* default fill factor */
+/* Number of freelists to be used for a partitioned hash table. */
+#define NUM_FREELISTS 4
+
+// TODO: comment. Should be power of 2.
+// #define FREELIST_IDX_STEP 8
+#define FREELIST_BUFFER_SIZE 32
/* A hash bucket is a linked list of HASHELEMENTs */
typedef HASHELEMENT *HASHBUCKET;
@@ -118,6 +124,20 @@ typedef HASHELEMENT *HASHBUCKET;
/* A hash segment is an array of bucket headers */
typedef HASHBUCKET *HASHSEGMENT;
+// TODO: re-check comments
+// TODO: pgindent
+
+// TODO: comment!
+typedef struct
+{
+ slock_t mutex; /* a spinlock */
+ long nentries; /* number of entries */
+ HASHELEMENT* freeList; /* lists of free elements */
+} FREELIST;
+
+// TODO: comment
+typedef struct { uint8 x[FREELIST_BUFFER_SIZE]; } FREELISTBUFF;
+
/*
* Header structure for a hash table --- contains all changeable info
*
@@ -128,12 +148,16 @@ typedef HASHBUCKET *HASHSEGMENT;
*/
struct HASHHDR
{
- /* In a partitioned table, take this lock to touch nentries or freeList */
- slock_t mutex; /* unused if not partitioned table */
-
- /* These fields change during entry addition/deletion */
- long nentries; /* number of entries in hash table */
- HASHELEMENT *freeList; /* linked list of free elements */
+ /*
+ * The freelist can become a point of contention on high-concurrency hash
+ * tables, so we use an array of freelists, each with its own mutex and
+ * nentries count, instead of just a single one.
+ *
+ * If the hash table is not partitioned, only freeLists[0] is used and
+ * spinlocks are not used at all.
+ * TODO: don't access directly, use the FREELIST macro
+ */
+ FREELISTBUFF __freeLists[NUM_FREELISTS];
/* These fields can change, but not in a partitioned table */
/* Also, dsize can't change in a shared table, even if unpartitioned */
@@ -166,6 +190,13 @@ struct HASHHDR
#define IS_PARTITIONED(hctl) ((hctl)->num_partitions != 0)
+#define FREELIST_IDX(hctl, hashcode) \
+ (IS_PARTITIONED(hctl) ? hashcode % NUM_FREELISTS : 0)
+
+// TODO: comment
+#define FREELIST(hctl, idx) \
+ (*(FREELIST*)&(hctl->__freeLists[idx]))
+
/*
* Top control structure for a hashtable --- in a shared table, each backend
* has its own copy (OK since no fields change at runtime)
@@ -219,10 +250,10 @@ static long hash_accesses,
*/
static void *DynaHashAlloc(Size size);
static HASHSEGMENT seg_alloc(HTAB *hashp);
-static bool element_alloc(HTAB *hashp, int nelem);
+static bool element_alloc(HTAB *hashp, int nelem, int freelist_idx);
static bool dir_realloc(HTAB *hashp);
static bool expand_table(HTAB *hashp);
-static HASHBUCKET get_hash_entry(HTAB *hashp);
+static HASHBUCKET get_hash_entry(HTAB *hashp, int freelist_idx);
static void hdefault(HTAB *hashp);
static int choose_nelem_alloc(Size entrysize);
static bool init_htab(HTAB *hashp, long nelem);
@@ -283,6 +314,9 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
HTAB *hashp;
HASHHDR *hctl;
+ StaticAssertStmt(sizeof(FREELIST) <= sizeof(FREELISTBUFF),
+ "sizeof FREELIST should be less or equal sizeof FREELISTBUFF");
+
/*
* For shared hash tables, we have a local hash header (HTAB struct) that
* we allocate in TopMemoryContext; all else is in shared memory.
@@ -482,10 +516,40 @@ hash_create(const char *tabname, long nelem, HASHCTL *info, int flags)
if ((flags & HASH_SHARED_MEM) ||
nelem < hctl->nelem_alloc)
{
- if (!element_alloc(hashp, (int) nelem))
- ereport(ERROR,
- (errcode(ERRCODE_OUT_OF_MEMORY),
- errmsg("out of memory")));
+ int i,
+ freelist_partitions,
+ nelem_alloc,
+ nelem_alloc_first;
+
+ /*
+ * If the hash table is partitioned, all freeLists have an equal number of
+ * elements. Otherwise only freeLists[0] is used.
+ */
+ if (IS_PARTITIONED(hashp->hctl))
+ freelist_partitions = NUM_FREELISTS;
+ else
+ freelist_partitions = 1;
+
+ nelem_alloc = nelem / freelist_partitions;
+ if (nelem_alloc == 0)
+ nelem_alloc = 1;
+
+ /* Make sure all memory will be used */
+ if (nelem_alloc * freelist_partitions < nelem)
+ nelem_alloc_first =
+ nelem - nelem_alloc * (freelist_partitions - 1);
+ else
+ nelem_alloc_first = nelem_alloc;
+
+ for (i = 0; i < freelist_partitions; i++)
+ {
+ int temp = (i == 0) ? nelem_alloc_first : nelem_alloc;
+
+ if (!element_alloc(hashp, temp, i))
+ ereport(ERROR,
+ (errcode(ERRCODE_OUT_OF_MEMORY),
+ errmsg("out of memory")));
+ }
}
if (flags & HASH_FIXED_SIZE)
@@ -503,9 +567,6 @@ hdefault(HTAB *hashp)
MemSet(hctl, 0, sizeof(HASHHDR));
- hctl->nentries = 0;
- hctl->freeList = NULL;
-
hctl->dsize = DEF_DIRSIZE;
hctl->nsegs = 0;
@@ -572,12 +633,14 @@ init_htab(HTAB *hashp, long nelem)
HASHSEGMENT *segp;
int nbuckets;
int nsegs;
+ int i;
/*
* initialize mutex if it's a partitioned table
*/
if (IS_PARTITIONED(hctl))
- SpinLockInit(&hctl->mutex);
+ for (i = 0; i < NUM_FREELISTS; i++)
+ SpinLockInit(&(FREELIST(hctl, i).mutex));
/*
* Divide number of elements by the fill factor to determine a desired
@@ -648,7 +711,7 @@ init_htab(HTAB *hashp, long nelem)
"HIGH MASK ", hctl->high_mask,
"LOW MASK ", hctl->low_mask,
"NSEGS ", hctl->nsegs,
- "NENTRIES ", hctl->nentries);
+ "NENTRIES ", hash_get_num_entries(hctl));
#endif
return true;
}
@@ -769,7 +832,7 @@ hash_stats(const char *where, HTAB *hashp)
where, hashp->hctl->accesses, hashp->hctl->collisions);
fprintf(stderr, "hash_stats: entries %ld keysize %ld maxp %u segmentcount %ld\n",
- hashp->hctl->nentries, (long) hashp->hctl->keysize,
+ hash_get_num_entries(hashp), (long) hashp->hctl->keysize,
hashp->hctl->max_bucket, hashp->hctl->nsegs);
fprintf(stderr, "%s: total accesses %ld total collisions %ld\n",
where, hash_accesses, hash_collisions);
@@ -863,6 +926,7 @@ hash_search_with_hash_value(HTAB *hashp,
HASHBUCKET currBucket;
HASHBUCKET *prevBucketPtr;
HashCompareFunc match;
+ int freelist_idx = FREELIST_IDX(hctl, hashvalue);
#if HASH_STATISTICS
hash_accesses++;
@@ -885,7 +949,8 @@ hash_search_with_hash_value(HTAB *hashp,
* order of these tests is to try to check cheaper conditions first.
*/
if (!IS_PARTITIONED(hctl) && !hashp->frozen &&
- hctl->nentries / (long) (hctl->max_bucket + 1) >= hctl->ffactor &&
+ ((FREELIST(hctl, 0).nentries / (long) (hctl->max_bucket + 1)) >=
+ hctl->ffactor) &&
!has_seq_scans(hashp))
(void) expand_table(hashp);
}
@@ -943,20 +1008,20 @@ hash_search_with_hash_value(HTAB *hashp,
{
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctl))
- SpinLockAcquire(&hctl->mutex);
+ SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
- Assert(hctl->nentries > 0);
- hctl->nentries--;
+ Assert(FREELIST(hctl, freelist_idx).nentries > 0);
+ FREELIST(hctl, freelist_idx).nentries--;
/* remove record from hash bucket's chain. */
*prevBucketPtr = currBucket->link;
/* add the record to the freelist for this table. */
- currBucket->link = hctl->freeList;
- hctl->freeList = currBucket;
+ currBucket->link = FREELIST(hctl, freelist_idx).freeList;
+ FREELIST(hctl, freelist_idx).freeList = currBucket;
if (IS_PARTITIONED(hctl))
- SpinLockRelease(&hctl->mutex);
+ SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
/*
* better hope the caller is synchronizing access to this
@@ -982,7 +1047,7 @@ hash_search_with_hash_value(HTAB *hashp,
elog(ERROR, "cannot insert into frozen hashtable \"%s\"",
hashp->tabname);
- currBucket = get_hash_entry(hashp);
+ currBucket = get_hash_entry(hashp, freelist_idx);
if (currBucket == NULL)
{
/* out of memory */
@@ -1175,39 +1240,70 @@ hash_update_hash_key(HTAB *hashp,
* create a new entry if possible
*/
static HASHBUCKET
-get_hash_entry(HTAB *hashp)
+get_hash_entry(HTAB *hashp, int freelist_idx)
{
HASHHDR *hctl = hashp->hctl;
HASHBUCKET newElement;
+ int borrow_from_idx;
for (;;)
{
/* if partitioned, must lock to touch nentries and freeList */
if (IS_PARTITIONED(hctl))
- SpinLockAcquire(&hctl->mutex);
+ SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
/* try to get an entry from the freelist */
- newElement = hctl->freeList;
+ newElement = FREELIST(hctl, freelist_idx).freeList;
+
if (newElement != NULL)
break;
- /* no free elements. allocate another chunk of buckets */
if (IS_PARTITIONED(hctl))
- SpinLockRelease(&hctl->mutex);
+ SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
- if (!element_alloc(hashp, hctl->nelem_alloc))
+ /* no free elements. allocate another chunk of buckets */
+ if (!element_alloc(hashp, hctl->nelem_alloc, freelist_idx))
{
- /* out of memory */
- return NULL;
+ if (!IS_PARTITIONED(hctl))
+ return NULL; /* out of memory */
+
+ /* try to borrow element from another partition */
+ borrow_from_idx = freelist_idx;
+ for (;;)
+ {
+ borrow_from_idx = (borrow_from_idx + 1) % NUM_FREELISTS;
+ if (borrow_from_idx == freelist_idx)
+ break;
+
+ SpinLockAcquire(&(FREELIST(hctl, borrow_from_idx).mutex));
+ newElement = FREELIST(hctl, borrow_from_idx).freeList;
+
+ if (newElement != NULL)
+ {
+ FREELIST(hctl, borrow_from_idx).freeList =
+ newElement->link;
+ SpinLockRelease(&(FREELIST(hctl, borrow_from_idx).mutex));
+
+ SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
+ FREELIST(hctl, freelist_idx).nentries++;
+ SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
+
+ break;
+ }
+
+ SpinLockRelease(&(FREELIST(hctl, borrow_from_idx).mutex));
+ }
+
+ return newElement;
}
}
/* remove entry from freelist, bump nentries */
- hctl->freeList = newElement->link;
- hctl->nentries++;
+ FREELIST(hctl, freelist_idx).freeList = newElement->link;
+ FREELIST(hctl, freelist_idx).nentries++;
if (IS_PARTITIONED(hctl))
- SpinLockRelease(&hctl->mutex);
+ SpinLockRelease(&(FREELIST(hctl, freelist_idx).mutex));
return newElement;
}
@@ -1218,11 +1314,21 @@ get_hash_entry(HTAB *hashp)
long
hash_get_num_entries(HTAB *hashp)
{
+ int i;
+ long sum = FREELIST(hashp->hctl, 0).nentries;
+
/*
* We currently don't bother with the mutex; it's only sensible to call
* this function if you've got lock on all partitions of the table.
*/
- return hashp->hctl->nentries;
+
+ if (!IS_PARTITIONED(hashp->hctl))
+ return sum;
+
+ for (i = 1; i < NUM_FREELISTS; i++)
+ sum += FREELIST(hashp->hctl, i).nentries;
+
+ return sum;
}
/*
@@ -1527,10 +1633,10 @@ seg_alloc(HTAB *hashp)
}
/*
- * allocate some new elements and link them into the free list
+ * allocate some new elements and link them into the indicated free list
*/
static bool
-element_alloc(HTAB *hashp, int nelem)
+element_alloc(HTAB *hashp, int nelem, int freelist_idx)
{
HASHHDR *hctl = hashp->hctl;
Size elementSize;
@@ -1563,14 +1669,14 @@ element_alloc(HTAB *hashp, int nelem)
/* if partitioned, must lock to touch freeList */
if (IS_PARTITIONED(hctl))
- SpinLockAcquire(&hctl->mutex);
+ SpinLockAcquire(&(FREELIST(hctl, freelist_idx).mutex));
/* freelist could be nonempty if two backends did this concurrently */
- firstElement->link = hctl->freeList;
- hctl->freeList = prevElement;
+ firstElement->link = FREELIST(hctl, freelist_idx).freeList;
+ FREELIST(hctl, freelist_idx).freeList = prevElement;
if (IS_PARTITIONED(hctl))
- SpinLockRelease(&hctl->mutex);
+ SpinLockRelease(&FREELIST(hctl, freelist_idx).mutex);
return true;
}
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers