On Tue, Nov 14, 2017 at 4:36 PM, Masahiko Sawada <sawada.m...@gmail.com> wrote: > Thank you for pointing out and comments. > > On Fri, Nov 10, 2017 at 12:38 AM, Tom Lane <t...@sss.pgh.pa.us> wrote: >> Robert Haas <robertmh...@gmail.com> writes: >>> No, that's not right. Now that you mention it, I realize that tuple >>> locks can definitely cause deadlocks. Example: >> >> Yeah. Foreign-key-related tuple locks are another rich source of >> examples. >> >>> ... So I don't >>> think we can remove speculative insertion locks from the deadlock >>> detector either. >> >> That scares me too. I think that relation extension can safely >> be transferred to some lower-level mechanism, because what has to >> be done while holding the lock is circumscribed and below the level >> of database operations (which might need other locks). These other >> ideas seem a lot riskier. >> >> (But see recent conversation where I discouraged Alvaro from holding >> extension locks across BRIN summarization activity. We'll need to look >> and make sure that nobody else has had creative ideas like that.) >> > > It seems that we should focus on transferring only relation extension > locks as a first step. The page locks would also be safe but it might > require some fundamental changes related to fast insertion, which is > discussed on other thread[1]. Also in this case I think it's better to > focus on relation extension locks so that we can optimize the > lower-level lock mechanism for it. > > So I'll update the patch based on the comment I got from Robert before. >
Attached updated version patch. I've moved only relation extension locks out of heavy-weight lock as per discussion so far. I've done a write-heavy benchmark on my laptop; loading 24kB data to one table using COPY by 1 client, for 10 seconds. The through-put of patched is 10% better than current HEAD. The result of 5 times is the following. ----- PATCHED ----- tps = 178.791515 (excluding connections establishing) tps = 176.522693 (excluding connections establishing) tps = 168.705442 (excluding connections establishing) tps = 158.158009 (excluding connections establishing) tps = 161.145709 (excluding connections establishing) ----- HEAD ----- tps = 147.079803 (excluding connections establishing) tps = 149.079540 (excluding connections establishing) tps = 149.082275 (excluding connections establishing) tps = 148.255376 (excluding connections establishing) tps = 145.542552 (excluding connections establishing) Also I've done a micro-benchmark; calling LockRelationForExtension and UnlockRelationForExtension tightly in order to measure the number of lock/unlock cycles per second. The result is, PATCHED = 3.95892e+06 (cycles/sec) HEAD = 1.15284e+06 (cycles/sec) The patched is 3 times faster than current HEAD. Attached updated patch and the function I used for micro-benchmark. Please review it. Regards, -- Masahiko Sawada NIPPON TELEGRAPH AND TELEPHONE CORPORATION NTT Open Source Software Center
Datum extlock_bench(PG_FUNCTION_ARGS) { Oid relid = PG_GETARG_OID(0); int nLoops = PG_GETARG_INT32(1); Relation rel; TimestampTz start, end; int i; long secs; int microsecs; float duration; rel = relation_open(relid, AccessShareLock); /* Start time */ start = GetCurrentTimestamp(); /* Bench */ for (i = 0; i < nLoops; i++) { #ifdef EXTENSION_LOCK_H LockRelationForExtension(rel, RELEXT_EXCLUSIVE); UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); #else LockRelationForExtension(rel, ExclusiveLock); UnlockRelationForExtension(rel, ExclusiveLock); #endif } /* End time */ end = GetCurrentTimestamp(); relation_close(rel, AccessShareLock); TimestampDifference(start, end, &secs, µsecs); duration = (float) microsecs / 1000000 + secs; PG_RETURN_FLOAT4(nLoops / duration); }
diff --git a/src/backend/access/brin/brin_pageops.c b/src/backend/access/brin/brin_pageops.c index 09db5c6..4e64258 100644 --- a/src/backend/access/brin/brin_pageops.c +++ b/src/backend/access/brin/brin_pageops.c @@ -623,8 +623,8 @@ brin_page_cleanup(Relation idxrel, Buffer buf) */ if (PageIsNew(page)) { - LockRelationForExtension(idxrel, ShareLock); - UnlockRelationForExtension(idxrel, ShareLock); + LockRelationForExtension(idxrel, RELEXT_SHARED); + UnlockRelationForExtension(idxrel, RELEXT_SHARED); LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (PageIsNew(page)) @@ -716,7 +716,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, */ if (!RELATION_IS_LOCAL(irel)) { - LockRelationForExtension(irel, ExclusiveLock); + LockRelationForExtension(irel, RELEXT_EXCLUSIVE); extensionLockHeld = true; } buf = ReadBuffer(irel, P_NEW); @@ -768,7 +768,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, } if (extensionLockHeld) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel, RELEXT_EXCLUSIVE); ReleaseBuffer(buf); return InvalidBuffer; @@ -778,7 +778,7 @@ brin_getinsertbuffer(Relation irel, Buffer oldbuf, Size itemsz, LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); if (extensionLockHeld) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel, RELEXT_EXCLUSIVE); page = BufferGetPage(buf); diff --git a/src/backend/access/brin/brin_revmap.c b/src/backend/access/brin/brin_revmap.c index 03e53ce..c8fc1ab 100644 --- a/src/backend/access/brin/brin_revmap.c +++ b/src/backend/access/brin/brin_revmap.c @@ -570,7 +570,7 @@ revmap_physical_extend(BrinRevmap *revmap) else { if (needLock) - LockRelationForExtension(irel, ExclusiveLock); + LockRelationForExtension(irel, RELEXT_EXCLUSIVE); buf = ReadBuffer(irel, P_NEW); if (BufferGetBlockNumber(buf) != mapBlk) @@ -582,7 +582,7 @@ revmap_physical_extend(BrinRevmap *revmap) * page from under whoever is using it. 
*/ if (needLock) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel, RELEXT_EXCLUSIVE); LockBuffer(revmap->rm_metaBuf, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buf); return; @@ -591,7 +591,7 @@ revmap_physical_extend(BrinRevmap *revmap) page = BufferGetPage(buf); if (needLock) - UnlockRelationForExtension(irel, ExclusiveLock); + UnlockRelationForExtension(irel, RELEXT_EXCLUSIVE); } /* Check that it's a regular block (or an empty page) */ diff --git a/src/backend/access/gin/ginutil.c b/src/backend/access/gin/ginutil.c index d9c6483..1af884a 100644 --- a/src/backend/access/gin/ginutil.c +++ b/src/backend/access/gin/ginutil.c @@ -325,13 +325,13 @@ GinNewBuffer(Relation index) /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, GIN_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index, RELEXT_EXCLUSIVE); return buffer; } diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index 394bc83..c1a89f9 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -716,10 +716,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); npages = RelationGetNumberOfBlocks(index); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index, RELEXT_EXCLUSIVE); totFreePages = 0; @@ -766,10 +766,10 @@ ginvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) stats->pages_free = totFreePages; if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); stats->num_pages = 
RelationGetNumberOfBlocks(index); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index, RELEXT_EXCLUSIVE); return stats; } diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c index d8d1c0a..5f4fe13 100644 --- a/src/backend/access/gist/gistutil.c +++ b/src/backend/access/gist/gistutil.c @@ -821,13 +821,13 @@ gistNewBuffer(Relation r) needLock = !RELATION_IS_LOCAL(r); if (needLock) - LockRelationForExtension(r, ExclusiveLock); + LockRelationForExtension(r, RELEXT_EXCLUSIVE); buffer = ReadBuffer(r, P_NEW); LockBuffer(buffer, GIST_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(r, ExclusiveLock); + UnlockRelationForExtension(r, RELEXT_EXCLUSIVE); return buffer; } diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 77d9d12..ca45b06 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -59,10 +59,10 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* try to find deleted pages */ if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); npages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); totFreePages = 0; for (blkno = GIST_ROOT_BLKNO + 1; blkno < npages; blkno++) @@ -91,10 +91,10 @@ gistvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) /* return statistics */ stats->pages_free = totFreePages; if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); stats->num_pages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); return stats; } diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 13e3bdc..a8ce6c7 100644 --- 
a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -519,11 +519,11 @@ loop: if (needLock) { if (!use_fsm) - LockRelationForExtension(relation, ExclusiveLock); - else if (!ConditionalLockRelationForExtension(relation, ExclusiveLock)) + LockRelationForExtension(relation, RELEXT_EXCLUSIVE); + else if (!ConditionalLockRelationForExtension(relation, RELEXT_EXCLUSIVE)) { /* Couldn't get the lock immediately; wait for it. */ - LockRelationForExtension(relation, ExclusiveLock); + LockRelationForExtension(relation, RELEXT_EXCLUSIVE); /* * Check if some other backend has extended a block for us while @@ -537,7 +537,7 @@ loop: */ if (targetBlock != InvalidBlockNumber) { - UnlockRelationForExtension(relation, ExclusiveLock); + UnlockRelationForExtension(relation, RELEXT_EXCLUSIVE); goto loop; } @@ -576,7 +576,7 @@ loop: * against vacuumlazy.c --- see comments therein. */ if (needLock) - UnlockRelationForExtension(relation, ExclusiveLock); + UnlockRelationForExtension(relation, RELEXT_EXCLUSIVE); /* * We need to initialize the empty new page. Double-check that it really diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c index 4c2a13a..7dc3088 100644 --- a/src/backend/access/heap/visibilitymap.c +++ b/src/backend/access/heap/visibilitymap.c @@ -641,7 +641,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) * Note that another backend might have extended or created the relation * by the time we get the lock. 
*/ - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); @@ -679,7 +679,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks) /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now; - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); pfree(pg); } diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index c774349..0eb1102 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -659,7 +659,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) needLock = !RELATION_IS_LOCAL(rel); if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); buf = ReadBuffer(rel, P_NEW); @@ -673,7 +673,7 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access) * condition against btvacuumscan --- see comments therein. 
*/ if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Initialize the new page before returning it */ page = BufferGetPage(buf); diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 399e6a1..be457b0 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1058,10 +1058,10 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, { /* Get the current relation length */ if (needLock) - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); num_pages = RelationGetNumberOfBlocks(rel); if (needLock) - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) diff --git a/src/backend/access/spgist/spgutils.c b/src/backend/access/spgist/spgutils.c index bd5301f..8f54015 100644 --- a/src/backend/access/spgist/spgutils.c +++ b/src/backend/access/spgist/spgutils.c @@ -230,13 +230,13 @@ SpGistNewBuffer(Relation index) /* Must extend the file */ needLock = !RELATION_IS_LOCAL(index); if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); buffer = ReadBuffer(index, P_NEW); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index, RELEXT_EXCLUSIVE); return buffer; } diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index d7d5e90..3888d93 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -824,10 +824,10 @@ spgvacuumscan(spgBulkDeleteState *bds) { /* Get the current relation length */ if (needLock) - LockRelationForExtension(index, ExclusiveLock); + LockRelationForExtension(index, RELEXT_EXCLUSIVE); num_pages = RelationGetNumberOfBlocks(index); 
if (needLock) - UnlockRelationForExtension(index, ExclusiveLock); + UnlockRelationForExtension(index, RELEXT_EXCLUSIVE); /* Quit if we've scanned the whole relation */ if (blkno >= num_pages) diff --git a/src/backend/commands/discard.c b/src/backend/commands/discard.c index f0dcd87..216c197 100644 --- a/src/backend/commands/discard.c +++ b/src/backend/commands/discard.c @@ -19,6 +19,7 @@ #include "commands/discard.h" #include "commands/prepare.h" #include "commands/sequence.h" +#include "storage/extension_lock.h" #include "utils/guc.h" #include "utils/portal.h" @@ -71,6 +72,7 @@ DiscardAll(bool isTopLevel) ResetAllOptions(); DropAllPreparedStatements(); Async_UnlistenAll(); + RelExtLockReleaseAll(); LockReleaseAll(USER_LOCKMETHOD, true); ResetPlanCache(); ResetTempTableNamespace(); diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c index 6587db7..56ee82b 100644 --- a/src/backend/commands/vacuumlazy.c +++ b/src/backend/commands/vacuumlazy.c @@ -860,8 +860,8 @@ lazy_scan_heap(Relation onerel, int options, LVRelStats *vacrelstats, * it's got exclusive lock on the whole relation. 
*/ LockBuffer(buf, BUFFER_LOCK_UNLOCK); - LockRelationForExtension(onerel, ExclusiveLock); - UnlockRelationForExtension(onerel, ExclusiveLock); + LockRelationForExtension(onerel, RELEXT_EXCLUSIVE); + UnlockRelationForExtension(onerel, RELEXT_EXCLUSIVE); LockBufferForCleanup(buf); if (PageIsNew(page)) { diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 5c256ff..5beba70 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -3628,6 +3628,9 @@ pgstat_get_wait_ipc(WaitEventIPC w) case WAIT_EVENT_SYNC_REP: event_name = "SyncRep"; break; + case WAIT_EVENT_RELATION_EXTENSION: + event_name = "RelationExtension"; + break; /* no default case, so that compiler will warn */ } diff --git a/src/backend/storage/freespace/freespace.c b/src/backend/storage/freespace/freespace.c index 4648473..498223a 100644 --- a/src/backend/storage/freespace/freespace.c +++ b/src/backend/storage/freespace/freespace.c @@ -624,7 +624,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) * Note that another backend might have extended or created the relation * by the time we get the lock. */ - LockRelationForExtension(rel, ExclusiveLock); + LockRelationForExtension(rel, RELEXT_EXCLUSIVE); /* Might have to re-open if a cache flush happened */ RelationOpenSmgr(rel); @@ -652,7 +652,7 @@ fsm_extend(Relation rel, BlockNumber fsm_nblocks) /* Update local cache with the up-to-date size */ rel->rd_smgr->smgr_fsm_nblocks = fsm_nblocks_now; - UnlockRelationForExtension(rel, ExclusiveLock); + UnlockRelationForExtension(rel, RELEXT_EXCLUSIVE); pfree(pg); } diff --git a/src/backend/storage/lmgr/Makefile b/src/backend/storage/lmgr/Makefile index e1b787e..2334a40 100644 --- a/src/backend/storage/lmgr/Makefile +++ b/src/backend/storage/lmgr/Makefile @@ -13,7 +13,7 @@ top_builddir = ../../../.. 
include $(top_builddir)/src/Makefile.global OBJS = lmgr.o lock.o proc.o deadlock.o lwlock.o lwlocknames.o spin.o \ - s_lock.o predicate.o condition_variable.o + s_lock.o predicate.o condition_variable.o extension_lock.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/storage/lmgr/README b/src/backend/storage/lmgr/README index 56b0a12..5e0a394 100644 --- a/src/backend/storage/lmgr/README +++ b/src/backend/storage/lmgr/README @@ -29,6 +29,12 @@ process has to wait for an LWLock, it blocks on a SysV semaphore so as to not consume CPU time. Waiting processes will be granted the lock in arrival order. There is no timeout. +* Relation extension locks. The relation extension lock manager is +specialized for relation extension, replacing the former use of the +regular lock manager for that purpose. It is similar to regular locks +but has neither deadlock detection nor group locking. A waiter whose +request conflicts with an existing lock sleeps on a condition variable. + * Regular locks (a/k/a heavyweight locks). The regular lock manager supports a variety of lock modes with table-driven semantics, and it has full deadlock detection and automatic release at transaction end. @@ -40,9 +46,9 @@ Acquisition of either a spinlock or a lightweight lock causes query cancel and die() interrupts to be held off until all such locks are released. No such restriction exists for regular locks, however. Also note that we can accept query cancel and die() interrupts while waiting -for a regular lock, but we will not accept them while waiting for -spinlocks or LW locks. It is therefore not a good idea to use LW locks -when the wait time might exceed a few seconds. +for a relation extension lock or a regular lock, but we will not accept +them while waiting for spinlocks or LW locks. It is therefore not a good +idea to use LW locks when the wait time might exceed a few seconds. The rest of this README file discusses the regular lock manager in detail. 
diff --git a/src/backend/storage/lmgr/extension_lock.c b/src/backend/storage/lmgr/extension_lock.c new file mode 100644 index 0000000..13acef7 --- /dev/null +++ b/src/backend/storage/lmgr/extension_lock.c @@ -0,0 +1,494 @@ +/*------------------------------------------------------------------------- + * + * extension_lock.c + * Relation extension lock manager + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * src/backend/storage/lmgr/extension_lock.c + * + * NOTES: + * + * This lock manager is specialized for relation extension locks: a + * lightweight, interruptible lock manager. It is similar to the + * heavyweight lock manager but has neither deadlock detection nor + * group locking. + * + * For lock acquisition we use an atomic compare-and-exchange on the + * state variable. When a process tries to acquire a lock that conflicts + * with an existing lock, it is put to sleep on a condition variable + * unless conditional locking was requested. On release we atomically + * decrement the state but do not remove the RELEXTLOCK entry from the + * hash table; all unused entries are reclaimed during acquisition once + * the hash table becomes full. + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "pgstat.h" +#include "pg_trace.h" +#include "postmaster/postmaster.h" +#include "replication/slot.h" +#include "storage/ipc.h" +#include "storage/proc.h" +#include "storage/proclist.h" +#include "storage/spin.h" +#include "storage/extension_lock.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +/* + * Compute the hash code associated with a RELEXTLOCK. + * + * To avoid unnecessary recomputations of the hash code, we try to do this + * just once per function, and then pass it around as needed. 
Aside from + * passing the hashcode to hash_search_with_hash_value(), we can extract + * the lock partition number from the hashcode. + */ +#define RelExtLockTargetTagHashCode(relextlocktargettag) \ + get_hash_value(RelExtLockHash, (const void *) relextlocktargettag) + +/* + * The lockmgr's shared hash tables are partitioned to reduce contention. + * To determine which partition a given relid belongs to, compute the tag's + * hash code with RelExtLockTargetTagHashCode(), then apply one of these + * macros. NB: NUM_RELEXTLOCK_PARTITIONS must be a power of 2! + */ +#define RelExtLockHashPartition(hashcode) \ + ((hashcode) % NUM_RELEXTLOCK_PARTITIONS) +#define RelExtLockHashPartitionLock(hashcode) \ + (&MainLWLockArray[RELEXTLOCK_MANAGER_LWLOCK_OFFSET + \ + LockHashPartition(hashcode)].lock) +#define RelExtLockHashPartitionLockByIndex(i) \ + (&MainLWLockArray[RELEXTLOCK_MANAGER_LWLOCK_OFFSET + (i)].lock) + +#define RELEXT_VAL_EXCLUSIVE ((uint32) 1 << 24) +#define RELEXT_VAL_SHARED 1 + +#define RELEXT_LOCK_MASK ((uint32) ((1 << 25) - 1)) + +typedef struct RELEXTLOCK +{ + /* hash key -- must be first */ + Oid relid; + + /* state of exclusive/non-exclusive lock */ + pg_atomic_uint32 state; + pg_atomic_uint32 pin_counts; + + ConditionVariable cv; +} RELEXTLOCK; + +/* + * This structure holds information about one held relation extension + * lock. held_relextlock represents the RelExtLock we're holding. + */ +typedef struct relextlock_handle +{ + RELEXTLOCK *lock; + RelExtLockMode mode; /* lock mode for this table entry */ + int nLocks; +} relextlock_handle; + +/* + * We use this structure to keep track of locked relation extension locks + * for release during error recovery. Normally, at most one lock associated + * with a relation will be held at once. However, sometimes we could try to + * acquire a new one while holding another one; for example, adding extra + * relation blocks for both a relation and its free space map. 
+ */ +static relextlock_handle held_relextlock; +static int num_held_relextlocks = 0; + +static bool RelExtLockAcquire(Oid relid, RelExtLockMode lockmode, bool conditional); +static void RelExtLockRelease(Oid rleid, RelExtLockMode lockmode); +static bool RelExtLockAttemptLock(RELEXTLOCK *extlock, RelExtLockMode lockmode); +static bool RelExtLockShrinkLocks(void); + +/* + * Pointers to hash tables containing lock state + * + * The RelExtLockHash hash table is in shared memory + */ +static HTAB *RelExtLockHash; + +/* + * InitRelExtLock + * Initialize the relation extension lock manager's data structures. + */ +void +InitRelExtLock(long max_table_size) +{ + HASHCTL info; + long init_table_size; + + /* + * Compute init/max size to request for lock hashtables. Note these + * calculations must agree with LockShmemSize! + */ + init_table_size = max_table_size / 2; + + /* + * Allocate hash table for RELEXTLOCK structs. This stores per-relation + * lock. + */ + MemSet(&info, 0, sizeof(info)); + info.keysize = sizeof(Oid); + info.entrysize = sizeof(RELEXTLOCK); + info.num_partitions = NUM_RELEXTLOCK_PARTITIONS; + + RelExtLockHash = ShmemInitHash("RELEXTLOCK Hash", + init_table_size, + max_table_size, + &info, + HASH_ELEM | HASH_BLOBS | HASH_PARTITION); +} + +/* + * LockRelationForExtension + * + * This lock is used to interlock addition of pages to relations. + * We need such locking because bufmgr/smgr definition of P_NEW is not + * race-condition-proof. + * + * We assume the caller is already holding some type of regular lock on + * the relation, so no AcceptInvalidationMessages call is needed here. + */ +void +LockRelationForExtension(Relation relation, RelExtLockMode lockmode) +{ + RelExtLockAcquire(relation->rd_id, lockmode, false); +} + +/* + * ConditionalLockRelationForExtension + * + * As above, but only lock if we can get the lock without blocking. + * Returns TRUE iff the lock was acquired. 
+ */ +bool +ConditionalLockRelationForExtension(Relation relation, RelExtLockMode lockmode) +{ + return RelExtLockAcquire(relation->rd_id, lockmode, true); +} + +/* + * RelationExtensionLockWaiterCount + * + * Count the number of processes waiting for the given relation extension lock. + */ +int +RelationExtensionLockWaiterCount(Relation relation) +{ + LWLock *partitionLock; + RELEXTLOCK *extlock; + Oid relid; + uint32 hashcode; + uint32 pin_counts; + bool found; + + relid = RelationGetRelid(relation); + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_SHARED); + + extlock = (RELEXTLOCK *) hash_search_with_hash_value(RelExtLockHash, + (void *) &relid, + hashcode, + HASH_FIND, &found); + + LWLockRelease(partitionLock); + + /* We assume that we already acquire this lock */ + Assert(found); + + pin_counts = pg_atomic_read_u32(&(extlock->pin_counts)); + + /* Except for me */ + return pin_counts - 1; +} + +/* + * UnlockRelationForExtension + */ +void +UnlockRelationForExtension(Relation relation, RelExtLockMode lockmode) +{ + RelExtLockRelease(relation->rd_id, lockmode); +} + +/* + * RelationExtensionLockReleaseAll - release all currently-held relation extension locks + */ +void +RelExtLockReleaseAll(void) +{ + if (num_held_relextlocks > 0) + { + HOLD_INTERRUPTS(); + RelExtLockRelease(held_relextlock.lock->relid, held_relextlock.mode); + } +} + +/* + * Acquire relation extension lock and create RELEXTLOCK hash entry on shared + * hash table. To avoid dead-lock with partition lock and LWLock, we acquire + * them but don't release it here. The caller must call DeleteRelExtLock later + * to release these locks. 
+ */ +static bool +RelExtLockAcquire(Oid relid, RelExtLockMode lockmode, bool conditional) +{ + RELEXTLOCK *extlock = NULL; + LWLock *partitionLock; + uint32 hashcode; + bool found; + bool mustwait; + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + /* If we already hold the lock, we can just increase the count locally */ + if (num_held_relextlocks > 0 && + relid == held_relextlock.lock->relid && + lockmode == held_relextlock.mode) + { + held_relextlock.nLocks++; + return true; + } + + for (;;) + { + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + if (!extlock) + extlock = (RELEXTLOCK *) hash_search_with_hash_value(RelExtLockHash, + (void * ) &relid, + hashcode, HASH_ENTER_NULL, + &found); + + /* + * Failed to create new hash entry. Try to shrink the hash table and + * retry. + */ + if (!extlock) + { + bool successed; + LWLockRelease(partitionLock); + successed = RelExtLockShrinkLocks(); + + if (!successed) + ereport(ERROR, + (errmsg("out of shared memory"), + errhint("You might need to increase max_pred_locks_per_transaction."))); + + continue; + } + + if (!found) + { + extlock->relid = relid; + pg_atomic_init_u32(&(extlock->state), 0); + pg_atomic_init_u32(&(extlock->pin_counts), 0); + ConditionVariableInit(&(extlock->cv)); + } + + /* Increment pin count */ + pg_atomic_add_fetch_u32(&(extlock->pin_counts), 1); + + mustwait = RelExtLockAttemptLock(extlock, lockmode); + + if (!mustwait) + break; /* got the lock */ + + /* Could not got the lock, return if in conditional locking */ + if (mustwait && conditional) + { + pg_atomic_sub_fetch_u32(&(extlock->pin_counts), 1); + LWLockRelease(partitionLock); + return false; + } + + /* Release the partition lock before sleep */ + LWLockRelease(partitionLock); + + /* Sleep until the lock is released */ + ConditionVariableSleep(&(extlock->cv), WAIT_EVENT_RELATION_EXTENSION); + } + + LWLockRelease(partitionLock); + ConditionVariableCancelSleep(); + + 
Assert(!mustwait); + + /* Remember lock held by this backend */ + held_relextlock.lock = extlock; + held_relextlock.mode = lockmode; + held_relextlock.nLocks = 1; + num_held_relextlocks++; + + /* Always return true if not conditional lock */ + return true; +} + +/* + * ExtLockRelease + * + * Release a previously acquired relation extension lock. We don't remove + * hash entry at the time. Once the hash table got full, all un-pinned hash + * entries will be removed. + */ +static void +RelExtLockRelease(Oid relid, RelExtLockMode lockmode) +{ + RELEXTLOCK *extlock; + RelExtLockMode mode; + LWLock *partitionLock; + uint32 hashcode; + uint32 pin_counts; + + /* We should have acquired a lock before releasing */ + Assert(num_held_relextlocks > 0); + + /* Decrease the lock count locally */ + held_relextlock.nLocks--; + + /* If we are still holding the lock, we're done */ + if (held_relextlock.nLocks > 0) + return; + + hashcode = RelExtLockTargetTagHashCode(&relid); + partitionLock = RelExtLockHashPartitionLock(hashcode); + + LWLockAcquire(partitionLock, LW_EXCLUSIVE); + + Assert(num_held_relextlocks > 0); + + if (relid != held_relextlock.lock->relid || lockmode != held_relextlock.mode) + ereport(ERROR, + (errcode(ERRCODE_LOCK_NOT_AVAILABLE), + errmsg("relation extension lock for %u in lock mode %d is not held", + relid, lockmode))); + + extlock = held_relextlock.lock; + mode = held_relextlock.mode; + + num_held_relextlocks--; + + if (mode == RELEXT_EXCLUSIVE) + pg_atomic_sub_fetch_u32(&(extlock->state), RELEXT_VAL_EXCLUSIVE); + else + pg_atomic_sub_fetch_u32(&(extlock->state), RELEXT_VAL_SHARED); + + /* Decrement pin counter */ + pin_counts = pg_atomic_sub_fetch_u32(&(extlock->pin_counts), 1); + + LWLockRelease(partitionLock); + + /* Wake up waiters if someone looking at this lock */ + if (pin_counts > 0) + ConditionVariableBroadcast(&(extlock->cv)); +} + +/* + * Internal function that tries to atomically acquire the relation extension + * lock in the passed in mode. 
+ * + * Returns true if the lock isn't free and we need to wait. + */ +static bool +RelExtLockAttemptLock(RELEXTLOCK *extlock, RelExtLockMode lockmode) +{ + uint32 oldstate; + + oldstate = pg_atomic_read_u32(&extlock->state); + + while (true) + { + uint32 desired_state; + bool lock_free; + + desired_state = oldstate; + + if (lockmode == RELEXT_EXCLUSIVE) + { + lock_free = (oldstate & RELEXT_LOCK_MASK) == 0; + if (lock_free) + desired_state += RELEXT_VAL_EXCLUSIVE; + } + else + { + lock_free = (oldstate & RELEXT_VAL_EXCLUSIVE) == 0; + if (lock_free) + desired_state += RELEXT_VAL_SHARED; + } + + if (pg_atomic_compare_exchange_u32(&extlock->state, + &oldstate, desired_state)) + { + if (lock_free) + return false; + else + return true; + } + } + pg_unreachable(); +} + +/* + * Reclaim all un-pinned RELEXTLOCK entries from the hash table. + */ +static bool +RelExtLockShrinkLocks(void) +{ + HASH_SEQ_STATUS hstat; + RELEXTLOCK *extlock; + List *entries_to_remove = NIL; + ListCell *cell; + int i; + + /* + * To ensure consistency, take all partition locks in exclusive + * mode. 
+ */ + for (i = 0; i < NUM_RELEXTLOCK_PARTITIONS; i++) + LWLockAcquire(RelExtLockHashPartitionLockByIndex(i), LW_EXCLUSIVE); + + /* Collect all un-pinned RELEXTLOCK entries */ + hash_seq_init(&hstat, RelExtLockHash); + while ((extlock = (RELEXTLOCK *) hash_seq_search(&hstat)) != NULL) + { + uint32 pin_count = pg_atomic_read_u32(&(extlock->pin_counts)); + + if (pin_count == 0) + entries_to_remove = lappend(entries_to_remove, extlock); + } + + /* We could not find any entries that we can remove right now */ + if (list_length(entries_to_remove) == 0) + return false; + + /* Remove collected entries from RelExtLockHash has table */ + foreach (cell, entries_to_remove) + { + RELEXTLOCK *el = (RELEXTLOCK *) lfirst(cell); + uint32 hc = RelExtLockTargetTagHashCode(&(el->relid)); + + hash_search_with_hash_value(RelExtLockHash, (void *) &(el->relid), + hc, HASH_REMOVE, NULL); + } + + /* Release all partition locks */ + for (i = 0; i < NUM_RELEXTLOCK_PARTITIONS; i++) + LWLockRelease(RelExtLockHashPartitionLockByIndex(i)); + + return true; +} diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c index da5679b..4fbc0c4 100644 --- a/src/backend/storage/lmgr/lmgr.c +++ b/src/backend/storage/lmgr/lmgr.c @@ -319,78 +319,6 @@ UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode) } /* - * LockRelationForExtension - * - * This lock tag is used to interlock addition of pages to relations. - * We need such locking because bufmgr/smgr definition of P_NEW is not - * race-condition-proof. - * - * We assume the caller is already holding some type of regular lock on - * the relation, so no AcceptInvalidationMessages call is needed here. 
- */ -void -LockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - (void) LockAcquire(&tag, lockmode, false, false); -} - -/* - * ConditionalLockRelationForExtension - * - * As above, but only lock if we can get the lock without blocking. - * Returns true iff the lock was acquired. - */ -bool -ConditionalLockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - return (LockAcquire(&tag, lockmode, false, true) != LOCKACQUIRE_NOT_AVAIL); -} - -/* - * RelationExtensionLockWaiterCount - * - * Count the number of processes waiting for the given relation extension lock. - */ -int -RelationExtensionLockWaiterCount(Relation relation) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - return LockWaiterCount(&tag); -} - -/* - * UnlockRelationForExtension - */ -void -UnlockRelationForExtension(Relation relation, LOCKMODE lockmode) -{ - LOCKTAG tag; - - SET_LOCKTAG_RELATION_EXTEND(tag, - relation->rd_lockInfo.lockRelId.dbId, - relation->rd_lockInfo.lockRelId.relId); - - LockRelease(&tag, lockmode, false); -} - -/* * LockPage * * Obtain a page-level lock. 
This is currently used by some index access @@ -961,12 +889,6 @@ DescribeLockTag(StringInfo buf, const LOCKTAG *tag) tag->locktag_field2, tag->locktag_field1); break; - case LOCKTAG_RELATION_EXTEND: - appendStringInfo(buf, - _("extension of relation %u of database %u"), - tag->locktag_field2, - tag->locktag_field1); - break; case LOCKTAG_PAGE: appendStringInfo(buf, _("page %u of relation %u of database %u"), diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 5833086..5ca1c27 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -45,6 +45,7 @@ #include "storage/sinvaladt.h" #include "storage/spin.h" #include "storage/standby.h" +#include "storage/lmgr.h" #include "utils/memutils.h" #include "utils/ps_status.h" #include "utils/resowner_private.h" @@ -388,6 +389,9 @@ InitLocks(void) max_table_size = NLOCKENTS(); init_table_size = max_table_size / 2; + /* Initialize lock structure for relation extension lock */ + InitRelExtLock(max_table_size); + /* * Allocate hash table for LOCK structs. This stores per-locked-object * information. 
@@ -3366,6 +3370,7 @@ LockShmemSize(void) /* lock hash table */ max_table_size = NLOCKENTS(); size = add_size(size, hash_estimate_size(max_table_size, sizeof(LOCK))); + size = add_size(size, hash_estimate_size(max_table_size, sizeof(LWLock))); /* proclock hash table */ max_table_size *= 2; diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c index e5c3e86..b12aba0 100644 --- a/src/backend/storage/lmgr/lwlock.c +++ b/src/backend/storage/lmgr/lwlock.c @@ -451,6 +451,13 @@ InitializeLWLocks(void) for (id = 0; id < NUM_PREDICATELOCK_PARTITIONS; id++, lock++) LWLockInitialize(&lock->lock, LWTRANCHE_PREDICATE_LOCK_MANAGER); + /* Initialize relation extension lmgr's LWLocks in main array */ + lock = MainLWLockArray + NUM_INDIVIDUAL_LWLOCKS + + NUM_BUFFER_PARTITIONS + NUM_LOCK_PARTITIONS + + NUM_PREDICATELOCK_PARTITIONS; + for (id = 0; id < NUM_RELEXTLOCK_PARTITIONS; id++, lock++) + LWLockInitialize(&lock->lock, LWTRANCHE_RELEXT_LOCK_MANAGER); + /* Initialize named tranches. 
*/ if (NamedLWLockTrancheRequests > 0) { @@ -508,6 +515,7 @@ RegisterLWLockTranches(void) LWLockRegisterTranche(LWTRANCHE_LOCK_MANAGER, "lock_manager"); LWLockRegisterTranche(LWTRANCHE_PREDICATE_LOCK_MANAGER, "predicate_lock_manager"); + LWLockRegisterTranche(LWTRANCHE_RELEXT_LOCK_MANAGER, "relext_lock_manager"); LWLockRegisterTranche(LWTRANCHE_PARALLEL_QUERY_DSA, "parallel_query_dsa"); LWLockRegisterTranche(LWTRANCHE_SESSION_DSA, diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 5f6727d..f698e9c 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -765,6 +765,8 @@ ProcReleaseLocks(bool isCommit) return; /* If waiting, get off wait queue (should only be needed after error) */ LockErrorCleanup(); + /* Release relation extension locks */ + RelExtLockReleaseAll(); /* Release standard locks, including session-level if aborting */ LockReleaseAll(DEFAULT_LOCKMETHOD, !isCommit); /* Release transaction-level advisory locks */ diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index 9e0a8ab..6d8916c 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -25,7 +25,6 @@ /* This must match enum LockTagType! 
*/ const char *const LockTagTypeNames[] = { "relation", - "extend", "page", "tuple", "transactionid", @@ -234,7 +233,6 @@ pg_lock_status(PG_FUNCTION_ARGS) switch ((LockTagType) instance->locktag.locktag_type) { case LOCKTAG_RELATION: - case LOCKTAG_RELATION_EXTEND: values[1] = ObjectIdGetDatum(instance->locktag.locktag_field1); values[2] = ObjectIdGetDatum(instance->locktag.locktag_field2); nulls[3] = true; diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c index 20f1d27..c004844 100644 --- a/src/backend/utils/init/postinit.c +++ b/src/backend/utils/init/postinit.c @@ -1153,6 +1153,7 @@ ShutdownPostgres(int code, Datum arg) * User locks are not released by transaction end, so be sure to release * them explicitly. */ + RelExtLockReleaseAll(); LockReleaseAll(USER_LOCKMETHOD, true); } diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 089b7c3..958822f 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -816,7 +816,8 @@ typedef enum WAIT_EVENT_REPLICATION_ORIGIN_DROP, WAIT_EVENT_REPLICATION_SLOT_DROP, WAIT_EVENT_SAFE_SNAPSHOT, - WAIT_EVENT_SYNC_REP + WAIT_EVENT_SYNC_REP, + WAIT_EVENT_RELATION_EXTENSION } WaitEventIPC; /* ---------- diff --git a/src/include/storage/extension_lock.h b/src/include/storage/extension_lock.h new file mode 100644 index 0000000..d373b04 --- /dev/null +++ b/src/include/storage/extension_lock.h @@ -0,0 +1,41 @@ +/*------------------------------------------------------------------------- + * + * extension_lock.h + * Relation extension lock manager + * + * + * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/storage/extension_lock.h + * + *------------------------------------------------------------------------- + */ + +#ifndef EXTENSION_LOCK_H +#define EXTENSION_LOCK_H + +#ifdef FRONTEND +#error "extension_lock.h may not be included from frontend code" +#endif + +#include 
"storage/proclist_types.h" +#include "storage/s_lock.h" +#include "storage/condition_variable.h" +#include "port/atomics.h" + +typedef enum RelExtLockMode +{ + RELEXT_EXCLUSIVE, + RELEXT_SHARED +} RelExtLockMode; + +/* Lock a relation for extension */ +extern void InitRelExtLock(long max_table_size); +extern void LockRelationForExtension(Relation relation, RelExtLockMode lockmode); +extern void UnlockRelationForExtension(Relation relation, RelExtLockMode lockmode); +extern bool ConditionalLockRelationForExtension(Relation relation, RelExtLockMode lockmode); +extern int RelationExtensionLockWaiterCount(Relation relation); +extern void RelExtLockReleaseAll(void); + +#endif /* EXTENSION_LOCK_H */ diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h index 0b92322..6b357aa 100644 --- a/src/include/storage/lmgr.h +++ b/src/include/storage/lmgr.h @@ -15,6 +15,7 @@ #define LMGR_H #include "lib/stringinfo.h" +#include "storage/extension_lock.h" #include "storage/itemptr.h" #include "storage/lock.h" #include "utils/rel.h" @@ -50,13 +51,6 @@ extern bool LockHasWaitersRelation(Relation relation, LOCKMODE lockmode); extern void LockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); extern void UnlockRelationIdForSession(LockRelId *relid, LOCKMODE lockmode); -/* Lock a relation for extension */ -extern void LockRelationForExtension(Relation relation, LOCKMODE lockmode); -extern void UnlockRelationForExtension(Relation relation, LOCKMODE lockmode); -extern bool ConditionalLockRelationForExtension(Relation relation, - LOCKMODE lockmode); -extern int RelationExtensionLockWaiterCount(Relation relation); - /* Lock a page (currently only used within indexes) */ extern void LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); extern bool ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode); diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h index 765431e..3be18ea 100644 --- a/src/include/storage/lock.h +++ 
b/src/include/storage/lock.h @@ -138,8 +138,6 @@ typedef uint16 LOCKMETHODID; typedef enum LockTagType { LOCKTAG_RELATION, /* whole relation */ - /* ID info for a relation is DB OID + REL OID; DB OID = 0 if shared */ - LOCKTAG_RELATION_EXTEND, /* the right to extend a relation */ /* same ID info as RELATION */ LOCKTAG_PAGE, /* one page of a relation */ /* ID info for a page is RELATION info + BlockNumber */ @@ -198,14 +196,6 @@ typedef struct LOCKTAG (locktag).locktag_type = LOCKTAG_RELATION, \ (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) -#define SET_LOCKTAG_RELATION_EXTEND(locktag,dboid,reloid) \ - ((locktag).locktag_field1 = (dboid), \ - (locktag).locktag_field2 = (reloid), \ - (locktag).locktag_field3 = 0, \ - (locktag).locktag_field4 = 0, \ - (locktag).locktag_type = LOCKTAG_RELATION_EXTEND, \ - (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD) - #define SET_LOCKTAG_PAGE(locktag,dboid,reloid,blocknum) \ ((locktag).locktag_field1 = (dboid), \ (locktag).locktag_field2 = (reloid), \ diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 596fdad..b138aad 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -120,14 +120,21 @@ extern PGDLLIMPORT int NamedLWLockTrancheRequests; #define LOG2_NUM_PREDICATELOCK_PARTITIONS 4 #define NUM_PREDICATELOCK_PARTITIONS (1 << LOG2_NUM_PREDICATELOCK_PARTITIONS) +/* Number of partitions the shared relation extension lock tables are divided into */ +#define LOG2_NUM_RELEXTLOCK_PARTITIONS 4 +#define NUM_RELEXTLOCK_PARTITIONS (1 << LOG2_NUM_RELEXTLOCK_PARTITIONS) + /* Offsets for various chunks of preallocated lwlocks. 
*/ #define BUFFER_MAPPING_LWLOCK_OFFSET NUM_INDIVIDUAL_LWLOCKS #define LOCK_MANAGER_LWLOCK_OFFSET \ (BUFFER_MAPPING_LWLOCK_OFFSET + NUM_BUFFER_PARTITIONS) #define PREDICATELOCK_MANAGER_LWLOCK_OFFSET \ (LOCK_MANAGER_LWLOCK_OFFSET + NUM_LOCK_PARTITIONS) -#define NUM_FIXED_LWLOCKS \ +#define RELEXTLOCK_MANAGER_LWLOCK_OFFSET \ (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS) +#define NUM_FIXED_LWLOCKS \ + (PREDICATELOCK_MANAGER_LWLOCK_OFFSET + NUM_PREDICATELOCK_PARTITIONS + \ + NUM_RELEXTLOCK_PARTITIONS) typedef enum LWLockMode { @@ -151,6 +158,8 @@ extern void LWLockReleaseClearVar(LWLock *lock, uint64 *valptr, uint64 val); extern void LWLockReleaseAll(void); extern bool LWLockHeldByMe(LWLock *lock); extern bool LWLockHeldByMeInMode(LWLock *lock, LWLockMode mode); +extern bool LWLockCheckForCleanup(LWLock *lock); +extern int LWLockWaiterCount(LWLock *lock); extern bool LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval); extern void LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 value); @@ -211,6 +220,7 @@ typedef enum BuiltinTrancheIds LWTRANCHE_BUFFER_MAPPING, LWTRANCHE_LOCK_MANAGER, LWTRANCHE_PREDICATE_LOCK_MANAGER, + LWTRANCHE_RELEXT_LOCK_MANAGER, LWTRANCHE_PARALLEL_QUERY_DSA, LWTRANCHE_SESSION_DSA, LWTRANCHE_SESSION_RECORD_TABLE,