> I'd really like to see it being replaced by a queuing lock
> (i.e. lwlock) before we go there. And then maybe partition the
> freelist, and make nentries an atomic.
I believe I just implemented something like this (see attachment). The
idea is to partition the PROCLOCK hash table manually into NUM_LOCK_
PARTITIONS smaller, non-partitioned hash tables. Since these tables
are non-partitioned, no spinlock is used and there is no lock
contention.
On a 60-core server we gain 3.5-4 times more TPS according to the
benchmark described above. As I understand it, there is no performance
degradation in other cases (different CPU, traditional pgbench, etc).
If this patch seems to be OK, I believe we could consider applying the
same change not only to the PROCLOCK hash table.
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index 76fc615..1a86f86 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -249,15 +249,56 @@ static volatile FastPathStrongRelationLockData *FastPathStrongRelationLocks;
* shared memory; LockMethodLocalHash is local to each backend.
*/
static HTAB *LockMethodLockHash;
-static HTAB *LockMethodProcLockHash;
+static HTAB *LockMethodProcLockHashPartitions[NUM_LOCK_PARTITIONS];
static HTAB *LockMethodLocalHash;
+/*
+ * LockMethodProcLockHash hash table is partitioned into NUM_LOCK_PARTITIONS
+ * usual (non-partitioned, i.e. without HASH_PARTITION flag) hash tables. The
+ * reason for partitioning LockMethodProcLockHash manually instead of just
+ * using a single hash table with the HASH_PARTITION flag is as follows. If a hash
+ * table has HASH_PARTITION flag it uses a single spinlock to safely
+ * access some of its fields. Turned out in case of this particular hash
+ * table it causes a considerable performance degradation because of lock
+ * contention on servers with tens of cores.
+ */
+#define LockMethodProcLockHash(hashcode) \
+ (LockMethodProcLockHashPartitions[LockHashPartition(hashcode)])
+
+/*
+ * Each partition of the LockMethodProcLockHash hash table has a unique name
+ * generated from this template using snprintf.
+ */
+static const char ProcLockHashNameTemplate[] = "PROCLOCK hash, partition %d";
+
+/*
+ * Size of buffer required to store LockMethodProcLockHash partition name.
+ *
+ * 10 is the number of digits in 2^32; 2 is the length of the "%d" string.
+ */
+#define PROC_LOCK_HASH_NAME_BUFF_SIZE (sizeof(ProcLockHashNameTemplate)+10-2)
/* private state for error cleanup */
static LOCALLOCK *StrongLockInProgress;
static LOCALLOCK *awaitedLock;
static ResourceOwner awaitedOwner;
+/*
+ * Calculate total number of entries in LockMethodProcLockHash hash table.
+ *
+ * Caller is responsible for having acquired appropriate LWLocks.
+ */
+static long
+GetProcLockHashNumEntries()
+{
+ int i;
+ long sum = 0;
+
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ sum += hash_get_num_entries(LockMethodProcLockHashPartitions[i]);
+
+ return sum;
+}
#ifdef LOCK_DEBUG
@@ -373,16 +414,16 @@ void
InitLocks(void)
{
HASHCTL info;
- long init_table_size,
- max_table_size;
+ long shmem_table_size;
bool found;
+ int i;
+ char proclock_hash_name[PROC_LOCK_HASH_NAME_BUFF_SIZE];
/*
* Compute init/max size to request for lock hashtables. Note these
* calculations must agree with LockShmemSize!
*/
- max_table_size = NLOCKENTS();
- init_table_size = max_table_size / 2;
+ shmem_table_size = NLOCKENTS();
/*
* Allocate hash table for LOCK structs. This stores per-locked-object
@@ -394,14 +435,13 @@ InitLocks(void)
info.num_partitions = NUM_LOCK_PARTITIONS;
LockMethodLockHash = ShmemInitHash("LOCK hash",
- init_table_size,
- max_table_size,
+ shmem_table_size,
+ shmem_table_size,
&info,
HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
/* Assume an average of 2 holders per lock */
- max_table_size *= 2;
- init_table_size *= 2;
+ shmem_table_size = (shmem_table_size * 2) / NUM_LOCK_PARTITIONS;
/*
* Allocate hash table for PROCLOCK structs. This stores
@@ -412,11 +452,17 @@ InitLocks(void)
info.hash = proclock_hash;
info.num_partitions = NUM_LOCK_PARTITIONS;
- LockMethodProcLockHash = ShmemInitHash("PROCLOCK hash",
- init_table_size,
- max_table_size,
- &info,
- HASH_ELEM | HASH_FUNCTION | HASH_PARTITION);
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
+ {
+ snprintf(proclock_hash_name, sizeof(proclock_hash_name),
+ ProcLockHashNameTemplate, i + 1);
+
+ LockMethodProcLockHashPartitions[i] = ShmemInitHash(proclock_hash_name,
+ shmem_table_size,
+ shmem_table_size,
+ &info,
+ HASH_ELEM | HASH_FUNCTION);
+ }
/*
* Allocate fast-path structures.
@@ -943,7 +989,7 @@ LockAcquireExtended(const LOCKTAG *locktag,
proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
SHMQueueDelete(&proclock->lockLink);
SHMQueueDelete(&proclock->procLink);
- if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ if (!hash_search_with_hash_value(LockMethodProcLockHash(hashcode),
(void *) &(proclock->tag),
proclock_hashcode,
HASH_REMOVE,
@@ -1102,7 +1148,7 @@ SetupLockInTable(LockMethod lockMethodTable, PGPROC *proc,
/*
* Find or create a proclock entry with this tag
*/
- proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash(hashcode),
(void *) &proclocktag,
proclock_hashcode,
HASH_ENTER_NULL,
@@ -1424,7 +1470,7 @@ CleanUpLock(LOCK *lock, PROCLOCK *proclock,
SHMQueueDelete(&proclock->lockLink);
SHMQueueDelete(&proclock->procLink);
proclock_hashcode = ProcLockHashCode(&proclock->tag, hashcode);
- if (!hash_search_with_hash_value(LockMethodProcLockHash,
+ if (!hash_search_with_hash_value(LockMethodProcLockHash(hashcode),
(void *) &(proclock->tag),
proclock_hashcode,
HASH_REMOVE,
@@ -1881,7 +1927,7 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock)
proclocktag.myLock = lock;
proclocktag.myProc = MyProc;
- locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash,
+ locallock->proclock = (PROCLOCK *) hash_search(LockMethodProcLockHash(locallock->hashcode),
(void *) &proclocktag,
HASH_FIND,
NULL);
@@ -2631,7 +2677,7 @@ FastPathGetRelationLockEntry(LOCALLOCK *locallock)
proclock_hashcode = ProcLockHashCode(&proclocktag, locallock->hashcode);
proclock = (PROCLOCK *)
- hash_search_with_hash_value(LockMethodProcLockHash,
+ hash_search_with_hash_value(LockMethodProcLockHash(locallock->hashcode),
(void *) &proclocktag,
proclock_hashcode,
HASH_FIND,
@@ -2914,7 +2960,8 @@ LockRefindAndRelease(LockMethod lockMethodTable, PGPROC *proc,
proclock_hashcode = ProcLockHashCode(&proclocktag, hashcode);
- proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ proclock = (PROCLOCK *) hash_search_with_hash_value(
+ LockMethodProcLockHash(hashcode),
(void *) &proclocktag,
proclock_hashcode,
HASH_FIND,
@@ -3243,7 +3290,7 @@ PostPrepare_Locks(TransactionId xid)
* the same hash key, since there can be only one entry for any
* given lock with my own proc.
*/
- if (!hash_update_hash_key(LockMethodProcLockHash,
+ if (!hash_update_hash_key(LockMethodProcLockHashPartitions[partition],
(void *) proclock,
(void *) &proclocktag))
elog(PANIC, "duplicate entry found while reassigning a prepared transaction's locks");
@@ -3410,7 +3457,7 @@ GetLockStatusData(void)
LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
/* Now we can safely count the number of proclocks */
- data->nelements = el + hash_get_num_entries(LockMethodProcLockHash);
+ data->nelements = el + GetProcLockHashNumEntries();
if (data->nelements > els)
{
els = data->nelements;
@@ -3418,27 +3465,30 @@ GetLockStatusData(void)
repalloc(data->locks, sizeof(LockInstanceData) * els);
}
- /* Now scan the tables to copy the data */
- hash_seq_init(&seqstat, LockMethodProcLockHash);
-
- while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
{
- PGPROC *proc = proclock->tag.myProc;
- LOCK *lock = proclock->tag.myLock;
- LockInstanceData *instance = &data->locks[el];
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHashPartitions[i]);
- memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
- instance->holdMask = proclock->holdMask;
- if (proc->waitLock == proclock->tag.myLock)
- instance->waitLockMode = proc->waitLockMode;
- else
- instance->waitLockMode = NoLock;
- instance->backend = proc->backendId;
- instance->lxid = proc->lxid;
- instance->pid = proc->pid;
- instance->fastpath = false;
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ LOCK *lock = proclock->tag.myLock;
+ LockInstanceData *instance = &data->locks[el];
- el++;
+ memcpy(&instance->locktag, &lock->tag, sizeof(LOCKTAG));
+ instance->holdMask = proclock->holdMask;
+ if (proc->waitLock == proclock->tag.myLock)
+ instance->waitLockMode = proc->waitLockMode;
+ else
+ instance->waitLockMode = NoLock;
+ instance->backend = proc->backendId;
+ instance->lxid = proc->lxid;
+ instance->pid = proc->pid;
+ instance->fastpath = false;
+
+ el++;
+ }
}
/*
@@ -3487,7 +3537,7 @@ GetRunningTransactionLocks(int *nlocks)
LWLockAcquire(LockHashPartitionLockByIndex(i), LW_SHARED);
/* Now we can safely count the number of proclocks */
- els = hash_get_num_entries(LockMethodProcLockHash);
+ els = GetProcLockHashNumEntries();
/*
* Allocating enough space for all locks in the lock table is overkill,
@@ -3495,42 +3545,46 @@ GetRunningTransactionLocks(int *nlocks)
*/
accessExclusiveLocks = palloc(els * sizeof(xl_standby_lock));
- /* Now scan the tables to copy the data */
- hash_seq_init(&seqstat, LockMethodProcLockHash);
-
- /*
- * If lock is a currently granted AccessExclusiveLock then it will have
- * just one proclock holder, so locks are never accessed twice in this
- * particular case. Don't copy this code for use elsewhere because in the
- * general case this will give you duplicate locks when looking at
- * non-exclusive lock types.
- */
- index = 0;
- while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
{
- /* make sure this definition matches the one used in LockAcquire */
- if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) &&
- proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION)
+ /* Now scan the tables to copy the data */
+ hash_seq_init(&seqstat, LockMethodProcLockHashPartitions[i]);
+
+ /*
+ * If lock is a currently granted AccessExclusiveLock then it will
+ * have just one proclock holder, so locks are never accessed twice in
+ * this particular case. Don't copy this code for use elsewhere
+ * because in the general case this will give you duplicate locks when
+ * looking at non-exclusive lock types.
+ */
+ index = 0;
+ while ((proclock = (PROCLOCK *) hash_seq_search(&seqstat)))
{
- PGPROC *proc = proclock->tag.myProc;
- PGXACT *pgxact = &ProcGlobal->allPgXact[proc->pgprocno];
- LOCK *lock = proclock->tag.myLock;
- TransactionId xid = pgxact->xid;
+ /* make sure this definition matches the one used in LockAcquire */
+ if ((proclock->holdMask & LOCKBIT_ON(AccessExclusiveLock)) &&
+ proclock->tag.myLock->tag.locktag_type == LOCKTAG_RELATION)
+ {
+ PGPROC *proc = proclock->tag.myProc;
+ PGXACT *pgxact = &ProcGlobal->allPgXact[proc->pgprocno];
+ LOCK *lock = proclock->tag.myLock;
+ TransactionId xid = pgxact->xid;
- /*
- * Don't record locks for transactions if we know they have
- * already issued their WAL record for commit but not yet released
- * lock. It is still possible that we see locks held by already
- * complete transactions, if they haven't yet zeroed their xids.
- */
- if (!TransactionIdIsValid(xid))
- continue;
+ /*
+ * Don't record locks for transactions if we know they have
+ * already issued their WAL record for commit but not yet
+ * released lock. It is still possible that we see locks held
+ * by already complete transactions, if they haven't yet
+ * zeroed their xids.
+ */
+ if (!TransactionIdIsValid(xid))
+ continue;
- accessExclusiveLocks[index].xid = xid;
- accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1;
- accessExclusiveLocks[index].relOid = lock->tag.locktag_field2;
+ accessExclusiveLocks[index].xid = xid;
+ accessExclusiveLocks[index].dbOid = lock->tag.locktag_field1;
+ accessExclusiveLocks[index].relOid = lock->tag.locktag_field2;
- index++;
+ index++;
+ }
}
}
@@ -3614,23 +3668,27 @@ DumpAllLocks(void)
PROCLOCK *proclock;
LOCK *lock;
HASH_SEQ_STATUS status;
+ int i;
proc = MyProc;
if (proc && proc->waitLock)
LOCK_PRINT("DumpAllLocks: waiting on", proc->waitLock, 0);
- hash_seq_init(&status, LockMethodProcLockHash);
-
- while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+ for (i = 0; i < NUM_LOCK_PARTITIONS; i++)
{
- PROCLOCK_PRINT("DumpAllLocks", proclock);
+ hash_seq_init(&status, LockMethodProcLockHashPartitions[i]);
- lock = proclock->tag.myLock;
- if (lock)
- LOCK_PRINT("DumpAllLocks", lock, 0);
- else
- elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+ while ((proclock = (PROCLOCK *) hash_seq_search(&status)) != NULL)
+ {
+ PROCLOCK_PRINT("DumpAllLocks", proclock);
+
+ lock = proclock->tag.myLock;
+ if (lock)
+ LOCK_PRINT("DumpAllLocks", lock, 0);
+ else
+ elog(LOG, "DumpAllLocks: proclock->tag.myLock = NULL");
+ }
}
}
#endif /* LOCK_DEBUG */
@@ -3749,7 +3807,7 @@ lock_twophase_recover(TransactionId xid, uint16 info,
/*
* Find or create a proclock entry with this tag
*/
- proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash,
+ proclock = (PROCLOCK *) hash_search_with_hash_value(LockMethodProcLockHash(hashcode),
(void *) &proclocktag,
proclock_hashcode,
HASH_ENTER_NULL,
--
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers