Hi,

On 2019-09-20 16:25:21 -0400, Tom Lane wrote:
> Andres Freund <and...@anarazel.de> writes:
> > Since now a number of people (I tried as well), failed to reproduce this
> > locally, I propose that we increase the log-level during this test on
> > master. And perhaps expand the set of debugging information. With the
> > hope that the additional information on the cases encountered on the bf
> > helps us build a reproducer or, even better, diagnose the issue
> > directly.  If people agree, I'll come up with a patch.
> 
> I recreated my freebsd-9-under-qemu setup and I can still reproduce
> the problem, though not with high reliability (order of 1 time in 10).
> Anything particular you want logged?

A DEBUG2 log would help a fair bit, because it'd log some information
about what changes the "horizons" determining when data may be removed.

Perhaps with the additional elogs attached? I lowered some messages to
DEBUG2 so we don't have to suffer the noise of the ipc.c DEBUG3
messages.

If I use a TEMP_CONFIG setting log_min_messages=DEBUG2 with the patches
applied, the subscription tests still pass.

I hope they still fail on your setup, even though the increased logging
volume probably changes timing somewhat.

Greetings,

Andres Freund
diff --git i/src/backend/access/transam/xlog.c w/src/backend/access/transam/xlog.c
index b7ff004234a..0d48ef3f039 100644
--- i/src/backend/access/transam/xlog.c
+++ w/src/backend/access/transam/xlog.c
@@ -2676,9 +2676,16 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
 void
 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
 {
+	XLogRecPtr old_lsn;
+
 	SpinLockAcquire(&XLogCtl->info_lck);
+	old_lsn = XLogCtl->replicationSlotMinLSN;
 	XLogCtl->replicationSlotMinLSN = lsn;
 	SpinLockRelease(&XLogCtl->info_lck);
+
+	elog(DEBUG2, "updating slot minimum lsn from %X/%X to %X/%X",
+		 (uint32) (old_lsn >> 32), (uint32) old_lsn,
+		 (uint32) (lsn >> 32), (uint32) lsn);
 }
 
 
diff --git i/src/backend/replication/logical/snapbuild.c w/src/backend/replication/logical/snapbuild.c
index 0bd1d0f9545..f7d270340de 100644
--- i/src/backend/replication/logical/snapbuild.c
+++ w/src/backend/replication/logical/snapbuild.c
@@ -913,7 +913,7 @@ SnapBuildPurgeCommittedTxn(SnapBuild *builder)
 	memcpy(builder->committed.xip, workspace,
 		   surviving_xids * sizeof(TransactionId));
 
-	elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+	elog(DEBUG2, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
 		 (uint32) builder->committed.xcnt, (uint32) surviving_xids,
 		 builder->xmin, builder->xmax);
 	builder->committed.xcnt = surviving_xids;
@@ -1140,7 +1140,7 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
 	xmin = ReorderBufferGetOldestXmin(builder->reorder);
 	if (xmin == InvalidTransactionId)
 		xmin = running->oldestRunningXid;
-	elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
+	elog(DEBUG2, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
 		 builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
 	LogicalIncreaseXminForSlot(lsn, xmin);
 
@@ -1952,6 +1952,10 @@ CheckPointSnapBuild(void)
 	/* now check for the restart ptrs from existing slots */
 	cutoff = ReplicationSlotsComputeLogicalRestartLSN();
 
+	elog(DEBUG2, "doing snapbuild checkpoint with restart lsn %X/%X, redo %X/%X",
+		 (uint32) (cutoff >> 32), (uint32) cutoff,
+		 (uint32) (redo >> 32), (uint32) redo);
+
 	/* don't start earlier than the restart lsn */
 	if (redo < cutoff)
 		cutoff = redo;
diff --git i/src/backend/storage/ipc/procarray.c w/src/backend/storage/ipc/procarray.c
index 8abcfdf841f..9e8a93e5962 100644
--- i/src/backend/storage/ipc/procarray.c
+++ w/src/backend/storage/ipc/procarray.c
@@ -2981,16 +2981,24 @@ void
 ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
 								bool already_locked)
 {
+	TransactionId old_xmin;
+	TransactionId old_catalog_xmin;
+
 	Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
 
 	if (!already_locked)
 		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 
+	old_xmin = procArray->replication_slot_xmin;
+	old_catalog_xmin = procArray->replication_slot_catalog_xmin;
 	procArray->replication_slot_xmin = xmin;
 	procArray->replication_slot_catalog_xmin = catalog_xmin;
 
 	if (!already_locked)
 		LWLockRelease(ProcArrayLock);
+
+	elog(DEBUG2, "updating slot xmin/catalog_xmin from %u/%u to %u/%u",
+		 old_xmin, old_catalog_xmin, xmin, catalog_xmin);
 }
 
 /*

Reply via email to