On 17/12/2025 17:42, Heikki Linnakangas wrote:
> On 17/12/2025 16:51, Jasper Smit wrote:
>>> Thanks! That's not quite correct though. This is very subtle, but the
>>> 'tuple' argument to heap_lock_updated_tuple() points to the buffer
>>> holding the original tuple. So doing
>>> HeapTupleHeaderGetUpdateXid(tuple->t_data) reads the current xmax of the
>>> tuple, which can now be different from what it was earlier, i.e.
>>> different from the xwait that we waited for. It's important that the
>>> 'ctid' and 'priorXmax' that we use to follow the chain are read together.
>>
>> Oops, you are right, I was too quick; it has been quite some time since
>> I was initially dealing with this problem.
>>
>> I see you moved the check ItemPointerEquals(&tuple->t_self, ctid) out
>> to heap_lock_tuple(), but isn't it redundant in the check
>> `if (follow_updates && updated && !ItemPointerEquals(&tuple->t_self, &t_ctid))`,
>> when `updated` is already true in that condition?
>
> Hmm, I don't think so, HEAP_KEYS_UPDATED is also set on deleted tuples.

Ah, but this codepath is taken when HEAP_KEYS_UPDATED is *not* set. I got that backwards. So I agree the ItemPointerEquals(&tuple->t_self, ctid) check is redundant.

This is so subtle that I'm inclined to nevertheless keep it, at least in backbranches. Just in case we're missing something. In master, we could perhaps add an assertion for it.
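
The assertion could look roughly like this. This is only an illustrative
sketch, not part of the attached patch; the surrounding condition is the
same one the patch touches:

	if (follow_updates && updated)
	{
		/*
		 * The explicit !ItemPointerEquals(&tuple->t_self, &t_ctid) test
		 * should be redundant here: 'updated' with HEAP_KEYS_UPDATED clear
		 * means the tuple has a successor version at a different TID.
		 */
		Assert(!ItemPointerEquals(&tuple->t_self, &t_ctid));

		/* ... follow the update chain as in the patch ... */
	}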

Here's a new patch version. I worked some more on the test, making it less sensitive to the tuple layout. It should now pass on 32-bit systems and with different block sizes.
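
To restate the subtlety quoted above in code form: the idea is to read the
raw xmax, infomask and ctid together while holding the buffer content lock,
and after the wait derive the prior xmax from those copies rather than
re-reading tuple->t_data. This is only an untested sketch with simplified
declarations ('buffer' and 'tuple' stand for the buffer and tuple being
locked), not the actual hunks:

	uint16		infomask;
	TransactionId xwait;
	TransactionId priorXmax;
	ItemPointerData t_ctid;

	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);

	/* read everything needed to follow the chain in one go */
	xwait = HeapTupleHeaderGetRawXmax(tuple->t_data);
	infomask = tuple->t_data->t_infomask;
	t_ctid = tuple->t_data->t_ctid;

	LockBuffer(buffer, BUFFER_LOCK_UNLOCK);

	/* ... wait for xwait to complete ... */

	/*
	 * Wrong: HeapTupleHeaderGetUpdateXid(tuple->t_data) read here could
	 * return a newer xmax than the one we waited for.  Right: derive the
	 * prior xmax from the copies captured above, paired with the captured
	 * t_ctid (resolving the update member if xmax was a multixact).
	 */
	priorXmax = (infomask & HEAP_XMAX_IS_MULTI) ?
		MultiXactIdGetUpdateXid(xwait, infomask) : xwait;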

- Heikki
From c8e5eb9c74254520d0d6510fc11c9c4db303911f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <[email protected]>
Date: Wed, 17 Dec 2025 22:05:16 +0200
Subject: [PATCH v4 1/1] Fix bug in following update chain when locking a heap
 tuple

Author: Jasper Smit <[email protected]>
Discussion: https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
---
 src/backend/access/heap/heapam.c              |  48 ++++---
 src/test/modules/injection_points/Makefile    |   3 +-
 .../expected/heap_lock_update.out             |  83 +++++++++++++
 src/test/modules/injection_points/meson.build |   1 +
 .../specs/heap_lock_update.spec               | 117 ++++++++++++++++++
 5 files changed, 236 insertions(+), 16 deletions(-)
 create mode 100644 src/test/modules/injection_points/expected/heap_lock_update.out
 create mode 100644 src/test/modules/injection_points/specs/heap_lock_update.spec

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 6daf4a87dec..2df1aab6f29 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -85,8 +85,11 @@ static void compute_new_xmax_infomask(TransactionId xmax, uint16 old_infomask,
 									  LockTupleMode mode, bool is_update,
 									  TransactionId *result_xmax, uint16 *result_infomask,
 									  uint16 *result_infomask2);
-static TM_Result heap_lock_updated_tuple(Relation rel, HeapTuple tuple,
-										 const ItemPointerData *ctid, TransactionId xid,
+static TM_Result heap_lock_updated_tuple(Relation rel,
+										 uint16 prior_infomask,
+										 TransactionId prior_rawxmax,
+										 const ItemPointerData *prior_ctid,
+										 TransactionId xid,
 										 LockTupleMode mode);
 static void GetMultiXactIdHintBits(MultiXactId multi, uint16 *new_infomask,
 								   uint16 *new_infomask2);
@@ -4816,11 +4819,13 @@ l3:
 				 * If there are updates, follow the update chain; bail out if
 				 * that cannot be done.
 				 */
-				if (follow_updates && updated)
+				if (follow_updates && updated &&
+					!ItemPointerEquals(&tuple->t_self, &t_ctid))
 				{
 					TM_Result	res;
 
-					res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+					res = heap_lock_updated_tuple(relation,
+												  infomask, xwait, &t_ctid,
 												  GetCurrentTransactionId(),
 												  mode);
 					if (res != TM_Ok)
@@ -5063,11 +5068,13 @@ l3:
 			}
 
 			/* if there are updates, follow the update chain */
-			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask))
+			if (follow_updates && !HEAP_XMAX_IS_LOCKED_ONLY(infomask) &&
+				!ItemPointerEquals(&tuple->t_self, &t_ctid))
 			{
 				TM_Result	res;
 
-				res = heap_lock_updated_tuple(relation, tuple, &t_ctid,
+				res = heap_lock_updated_tuple(relation,
+											  infomask, xwait, &t_ctid,
 											  GetCurrentTransactionId(),
 											  mode);
 				if (res != TM_Ok)
@@ -5721,7 +5728,8 @@ test_lockmode_for_conflict(MultiXactStatus status, TransactionId xid,
  * version as well.
  */
 static TM_Result
-heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, TransactionId xid,
+heap_lock_updated_tuple_rec(Relation rel, TransactionId priorXmax,
+							const ItemPointerData *tid, TransactionId xid,
 							LockTupleMode mode)
 {
 	TM_Result	result;
@@ -5734,7 +5742,6 @@ heap_lock_updated_tuple_rec(Relation rel, const ItemPointerData *tid, Transactio
 				old_infomask2;
 	TransactionId xmax,
 				new_xmax;
-	TransactionId priorXmax = InvalidTransactionId;
 	bool		cleared_all_frozen = false;
 	bool		pinned_desired_page;
 	Buffer		vmbuffer = InvalidBuffer;
@@ -6048,7 +6055,10 @@ out_unlocked:
  *		Follow update chain when locking an updated tuple, acquiring locks (row
  *		marks) on the updated versions.
  *
- * The initial tuple is assumed to be already locked.
+ * 'prior_infomask', 'prior_raw_xmax' and 'prior_ctid' are the corresponding
+ * fields from the initial tuple.  We will lock the tuples starting from the
+ * one that 'prior_ctid' points to.  Note: This function does not lock the
+ * initial tuple itself.
  *
  * This function doesn't check visibility, it just unconditionally marks the
  * tuple(s) as locked.  If any tuple in the updated chain is being deleted
@@ -6066,16 +6076,22 @@ out_unlocked:
  * levels, because that would lead to a serializability failure.
  */
 static TM_Result
-heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ctid,
+heap_lock_updated_tuple(Relation rel,
+						uint16 prior_infomask,
+						TransactionId prior_raw_xmax,
+						const ItemPointerData *prior_ctid,
 						TransactionId xid, LockTupleMode mode)
 {
+	INJECTION_POINT("heap_lock_updated_tuple", NULL);
+
 	/*
-	 * If the tuple has not been updated, or has moved into another partition
-	 * (effectively a delete) stop here.
+	 * If the tuple has moved into another partition (effectively a delete)
+	 * stop here.
 	 */
-	if (!HeapTupleHeaderIndicatesMovedPartitions(tuple->t_data) &&
-		!ItemPointerEquals(&tuple->t_self, ctid))
+	if (!ItemPointerIndicatesMovedPartitions(prior_ctid))
 	{
+		TransactionId prior_xmax;
+
 		/*
 		 * If this is the first possibly-multixact-able operation in the
 		 * current transaction, set my per-backend OldestMemberMXactId
@@ -6087,7 +6103,9 @@ heap_lock_updated_tuple(Relation rel, HeapTuple tuple, const ItemPointerData *ct
 		 */
 		MultiXactIdSetOldestMember();
 
-		return heap_lock_updated_tuple_rec(rel, ctid, xid, mode);
+		prior_xmax = (prior_infomask & HEAP_XMAX_IS_MULTI) ?
+			MultiXactIdGetUpdateXid(prior_raw_xmax, prior_infomask) : prior_raw_xmax;
+		return heap_lock_updated_tuple_rec(rel, prior_xmax, prior_ctid, xid, mode);
 	}
 
 	/* nothing to lock */
diff --git a/src/test/modules/injection_points/Makefile b/src/test/modules/injection_points/Makefile
index bfdb3f53377..cc9be6dcdd2 100644
--- a/src/test/modules/injection_points/Makefile
+++ b/src/test/modules/injection_points/Makefile
@@ -14,7 +14,8 @@ REGRESS_OPTS = --dlpath=$(top_builddir)/src/test/regress
 
 ISOLATION = basic \
 	    inplace \
-	    syscache-update-pruned
+	    syscache-update-pruned \
+	    heap_lock_update
 
 # Temporarily disabled because of flakiness
 #ISOLATION =+
diff --git a/src/test/modules/injection_points/expected/heap_lock_update.out b/src/test/modules/injection_points/expected/heap_lock_update.out
new file mode 100644
index 00000000000..1ec8d876414
--- /dev/null
+++ b/src/test/modules/injection_points/expected/heap_lock_update.out
@@ -0,0 +1,83 @@
+Parsed test spec with 2 sessions
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert: 
+	INSERT INTO t VALUES (10001) RETURNING ctid;
+	UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+	SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+	SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
+
+starting permutation: s1begin s1update s2lock s1abort vacuum reinsert_and_lock wake
+step s1begin: BEGIN;
+step s1update: UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid;
+ctid 
+-----
+(1,2)
+(1 row)
+
+step s2lock: select * from t where id = 1 for update; <waiting ...>
+step s1abort: ABORT;
+step vacuum: VACUUM t;
+step reinsert_and_lock: 
+	BEGIN;
+	INSERT INTO t VALUES (10001) RETURNING ctid;
+	SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+	COMMIT;
+	UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+
+ctid 
+-----
+(1,2)
+(1 row)
+
+ctid |id
+-----+--
+(0,1)| 1
+(1 row)
+
+ctid 
+-----
+(1,3)
+(1 row)
+
+step wake: 
+	SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+	SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+ <waiting ...>
+step s2lock: <... completed>
+id
+--
+ 1
+(1 row)
+
+step wake: <... completed>
diff --git a/src/test/modules/injection_points/meson.build b/src/test/modules/injection_points/meson.build
index 493e11053dc..c9186ae04d1 100644
--- a/src/test/modules/injection_points/meson.build
+++ b/src/test/modules/injection_points/meson.build
@@ -46,6 +46,7 @@ tests += {
       'basic',
       'inplace',
       'syscache-update-pruned',
+      'heap_lock_update',
       # temporarily disabled because of flakiness
       # 'index-concurrently-upsert',
       # 'index-concurrently-upsert-predicate',
diff --git a/src/test/modules/injection_points/specs/heap_lock_update.spec b/src/test/modules/injection_points/specs/heap_lock_update.spec
new file mode 100644
index 00000000000..b3992a1eb7a
--- /dev/null
+++ b/src/test/modules/injection_points/specs/heap_lock_update.spec
@@ -0,0 +1,117 @@
+# Test race condition in tuple locking
+#
+# This is a reproducer for the bug reported at:
+# https://www.postgresql.org/message-id/CAOG%2BRQ74x0q%3DkgBBQ%3DmezuvOeZBfSxM1qu_o0V28bwDz3dHxLw%40mail.gmail.com
+#
+# The bug was that, when following an update chain while locking tuples,
+# we sometimes failed to check that the xmin on the next tuple matched
+# the prior's xmax. If the updated tuple version was vacuumed away and
+# the slot was reused for an unrelated tuple, we'd incorrectly follow
+# and lock the unrelated tuple.
+
+
+# Set up a test table with enough rows to fill a page. We need the
+# UPDATE used in the test to put the new tuple on a different page,
+# because otherwise the VACUUM cannot remove the aborted tuple, as we
+# hold a pin on the first page.
+#
+# The exact number of rows inserted doesn't otherwise matter, but we
+# arrange things in a deterministic fashion so that the last inserted
+# tuple goes to (1,1), and the updated and aborted tuple goes to
+# (1,2). That way we can just memorize those ctids in the expected
+# output, to verify that the test exercises the scenario we want.
+setup
+{
+	CREATE EXTENSION injection_points;
+
+	CREATE TABLE t (id int PRIMARY KEY);
+	do $$
+		DECLARE
+			i int;
+			tid tid;
+		BEGIN
+			FOR i IN 1..5000 LOOP
+				INSERT INTO t VALUES (i) RETURNING ctid INTO tid;
+				IF tid = '(1,1)' THEN
+					RETURN;
+				END IF;
+			END LOOP;
+			RAISE 'expected to insert tuple to (1,1)';
+	   END;
+   $$;
+}
+teardown
+{
+	DROP TABLE t;
+	DROP EXTENSION injection_points;
+}
+
+session s1
+step s1begin	{ BEGIN; }
+step s1update	{ UPDATE t SET id = 10000 WHERE id = 1 RETURNING ctid; }
+step s1abort	{ ABORT; }
+step vacuum		{ VACUUM t; }
+
+# Insert a new tuple, and update it.
+step reinsert	{
+	INSERT INTO t VALUES (10001) RETURNING ctid;
+	UPDATE t SET id = 10002 WHERE id = 10001 RETURNING ctid;
+}
+
+# Same as the 'reinsert' step, but for extra confusion, we also stamp
+# the original tuple with the same 'xmax' as the re-inserted one.
+step reinsert_and_lock {
+	BEGIN;
+	INSERT INTO t VALUES (10001) RETURNING ctid;
+	SELECT ctid, * FROM t WHERE id = 1 FOR UPDATE;
+	COMMIT;
+	UPDATE t SET id = 10002 WHERE id = 10001 returning ctid;
+}
+
+step wake	{
+	SELECT FROM injection_points_detach('heap_lock_updated_tuple');
+	SELECT FROM injection_points_wakeup('heap_lock_updated_tuple');
+}
+
+session s2
+setup	{
+	SELECT FROM injection_points_set_local();
+	SELECT FROM injection_points_attach('heap_lock_updated_tuple', 'wait');
+}
+step s2lock	{ select * from t where id = 1 for update; }
+
+permutation
+	# Begin transaction, update a row. Because of how we set up the
+	# test table, the updated tuple lands at (1,2)
+	s1begin
+	s1update
+
+	# While the updating transaction is open, start a new session that
+	# tries to lock the row. This blocks on the open transaction.
+	s2lock
+
+	# Abort the updating transaction. This unblocks session 2, but it
+	# will immediately hit the injection point and block on that.
+	s1abort
+	# Vacuum away the updated, aborted tuple.
+	vacuum
+
+	# Insert a new tuple. It lands at the same location where the
+	# updated tuple was.
+	reinsert
+
+	# Let the locking transaction continue. It should lock the
+	# original tuple, ignoring the re-inserted tuple.
+	wake(s2lock)
+
+# Variant where the re-inserted tuple is also locked by the inserting
+# transaction. This variant failed with an earlier version of the fix
+# during development.
+permutation
+	s1begin
+	s1update
+	s2lock
+	s1abort
+	vacuum
+	reinsert_and_lock
+	wake(s2lock)
-- 
2.47.3
