Hi,
On 2019-09-20 16:25:21 -0400, Tom Lane wrote:
> Andres Freund <[email protected]> writes:
> > Since now a number of people (I tried as well), failed to reproduce this
> > locally, I propose that we increase the log-level during this test on
> > master. And perhaps expand the set of debugging information. With the
> > hope that the additional information on the cases encountered on the bf
> > helps us build a reproducer or, even better, diagnose the issue
> > directly. If people agree, I'll come up with a patch.
>
> I recreated my freebsd-9-under-qemu setup and I can still reproduce
> the problem, though not with high reliability (order of 1 time in 10).
> Anything particular you want logged?
A DEBUG2 log would help a fair bit, because it'd log some information
about what changes the "horizons" determining when data may be removed.
Perhaps with the additional elogs attached? I lowered some messages to
DEBUG2 so we don't have to suffer the noise of the ipc.c DEBUG3
messages.
If I use a TEMP_CONFIG setting log_min_messages=DEBUG2 with the patches
applied, the subscription tests still pass.
I hope they still fail on your setup, even though the increased logging
volume probably changes timing somewhat.
Greetings,
Andres Freund
diff --git i/src/backend/access/transam/xlog.c w/src/backend/access/transam/xlog.c
index b7ff004234a..0d48ef3f039 100644
--- i/src/backend/access/transam/xlog.c
+++ w/src/backend/access/transam/xlog.c
@@ -2676,9 +2676,16 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
+ XLogRecPtr old_lsn;
+
SpinLockAcquire(&XLogCtl->info_lck);
+ old_lsn = XLogCtl->replicationSlotMinLSN;
XLogCtl->replicationSlotMinLSN = lsn;
SpinLockRelease(&XLogCtl->info_lck);
+
+ elog(DEBUG2, "updating slot minimum lsn from %X/%X to %X/%X",
+ (uint32) (old_lsn >> 32), (uint32) old_lsn,
+ (uint32) (lsn >> 32), (uint32) lsn);
}
diff --git i/src/backend/replication/logical/snapbuild.c w/src/backend/replication/logical/snapbuild.c
index 0bd1d0f9545..f7d270340de 100644
--- i/src/backend/replication/logical/snapbuild.c
+++ w/src/backend/replication/logical/snapbuild.c
@@ -913,7 +913,7 @@ SnapBuildPurgeCommittedTxn(SnapBuild *builder)
memcpy(builder->committed.xip, workspace,
surviving_xids * sizeof(TransactionId));
- elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
+ elog(DEBUG2, "purged committed transactions from %u to %u, xmin: %u, xmax: %u",
(uint32) builder->committed.xcnt, (uint32) surviving_xids,
builder->xmin, builder->xmax);
builder->committed.xcnt = surviving_xids;
@@ -1140,7 +1140,7 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact
xmin = ReorderBufferGetOldestXmin(builder->reorder);
if (xmin == InvalidTransactionId)
xmin = running->oldestRunningXid;
- elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
+ elog(DEBUG2, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u",
builder->xmin, builder->xmax, running->oldestRunningXid, xmin);
LogicalIncreaseXminForSlot(lsn, xmin);
@@ -1952,6 +1952,10 @@ CheckPointSnapBuild(void)
/* now check for the restart ptrs from existing slots */
cutoff = ReplicationSlotsComputeLogicalRestartLSN();
+ elog(DEBUG2, "doing snapbuild checkpoint with restart lsn %X/%X, redo %X/%X",
+ (uint32) (cutoff >> 32), (uint32) cutoff,
+ (uint32) (redo >> 32), (uint32) redo);
+
/* don't start earlier than the restart lsn */
if (redo < cutoff)
cutoff = redo;
diff --git i/src/backend/storage/ipc/procarray.c w/src/backend/storage/ipc/procarray.c
index 8abcfdf841f..9e8a93e5962 100644
--- i/src/backend/storage/ipc/procarray.c
+++ w/src/backend/storage/ipc/procarray.c
@@ -2981,16 +2981,24 @@ void
ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
bool already_locked)
{
+ TransactionId old_xmin;
+ TransactionId old_catalog_xmin;
+
Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
if (!already_locked)
LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ old_xmin = procArray->replication_slot_xmin;
+ old_catalog_xmin = procArray->replication_slot_catalog_xmin;
procArray->replication_slot_xmin = xmin;
procArray->replication_slot_catalog_xmin = catalog_xmin;
if (!already_locked)
LWLockRelease(ProcArrayLock);
+
+ elog(DEBUG2, "updating slot xmin/catalog_xmin from %u/%u to %u/%u",
+ old_xmin, old_catalog_xmin, xmin, catalog_xmin);
}
/*