Hi, On 2019-09-20 16:25:21 -0400, Tom Lane wrote: > Andres Freund <and...@anarazel.de> writes: > > Since now a number of people (I tried as well), failed to reproduce this > > locally, I propose that we increase the log-level during this test on > > master. And perhaps expand the set of debugging information. With the > > hope that the additional information on the cases encountered on the bf > > helps us build a reproducer or, even better, diagnose the issue > > directly. If people agree, I'll come up with a patch. > > I recreated my freebsd-9-under-qemu setup and I can still reproduce > the problem, though not with high reliability (order of 1 time in 10). > Anything particular you want logged?
A DEBUG2 log would help a fair bit, because it'd log some information about what changes the "horizons" determining when data may be removed. Perhaps with the additional elogs attached? I lowered some messages to DEBUG2 so we don't have to suffer the noise of the ipc.c DEBUG3 messages. If I use a TEMP_CONFIG setting log_min_messages=DEBUG2 with the patches applied, the subscription tests still pass. I hope they still fail on your setup, even though the increased logging volume probably changes timing somewhat. Greetings, Andres Freund
diff --git i/src/backend/access/transam/xlog.c w/src/backend/access/transam/xlog.c index b7ff004234a..0d48ef3f039 100644 --- i/src/backend/access/transam/xlog.c +++ w/src/backend/access/transam/xlog.c @@ -2676,9 +2676,16 @@ XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN) void XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn) { + XLogRecPtr old_lsn; + SpinLockAcquire(&XLogCtl->info_lck); + old_lsn = XLogCtl->replicationSlotMinLSN; XLogCtl->replicationSlotMinLSN = lsn; SpinLockRelease(&XLogCtl->info_lck); + + elog(DEBUG2, "updating slot minimum lsn from %X/%X to %X/%X", + (uint32) (old_lsn >> 32), (uint32) old_lsn, + (uint32) (lsn >> 32), (uint32) lsn); } diff --git i/src/backend/replication/logical/snapbuild.c w/src/backend/replication/logical/snapbuild.c index 0bd1d0f9545..f7d270340de 100644 --- i/src/backend/replication/logical/snapbuild.c +++ w/src/backend/replication/logical/snapbuild.c @@ -913,7 +913,7 @@ SnapBuildPurgeCommittedTxn(SnapBuild *builder) memcpy(builder->committed.xip, workspace, surviving_xids * sizeof(TransactionId)); - elog(DEBUG3, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", + elog(DEBUG2, "purged committed transactions from %u to %u, xmin: %u, xmax: %u", (uint32) builder->committed.xcnt, (uint32) surviving_xids, builder->xmin, builder->xmax); builder->committed.xcnt = surviving_xids; @@ -1140,7 +1140,7 @@ SnapBuildProcessRunningXacts(SnapBuild *builder, XLogRecPtr lsn, xl_running_xact xmin = ReorderBufferGetOldestXmin(builder->reorder); if (xmin == InvalidTransactionId) xmin = running->oldestRunningXid; - elog(DEBUG3, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u", + elog(DEBUG2, "xmin: %u, xmax: %u, oldest running: %u, oldest xmin: %u", builder->xmin, builder->xmax, running->oldestRunningXid, xmin); LogicalIncreaseXminForSlot(lsn, xmin); @@ -1952,6 +1952,10 @@ CheckPointSnapBuild(void) /* now check for the restart ptrs from existing slots */ cutoff = ReplicationSlotsComputeLogicalRestartLSN(); + elog(DEBUG2, "doing snapbuild checkpoint with restart lsn %X/%X, redo %X/%X", + (uint32) (cutoff >> 32), (uint32) cutoff, + (uint32) (redo >> 32), (uint32) redo); + /* don't start earlier than the restart lsn */ if (redo < cutoff) cutoff = redo; diff --git i/src/backend/storage/ipc/procarray.c w/src/backend/storage/ipc/procarray.c index 8abcfdf841f..9e8a93e5962 100644 --- i/src/backend/storage/ipc/procarray.c +++ w/src/backend/storage/ipc/procarray.c @@ -2981,16 +2981,24 @@ void ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin, bool already_locked) { + TransactionId old_xmin; + TransactionId old_catalog_xmin; + Assert(!already_locked || LWLockHeldByMe(ProcArrayLock)); if (!already_locked) LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + old_xmin = procArray->replication_slot_xmin; + old_catalog_xmin = procArray->replication_slot_catalog_xmin; procArray->replication_slot_xmin = xmin; procArray->replication_slot_catalog_xmin = catalog_xmin; if (!already_locked) LWLockRelease(ProcArrayLock); + + elog(DEBUG2, "updating slot xmin/catalog_xmin from %u/%u to %u/%u", + old_xmin, old_catalog_xmin, xmin, catalog_xmin); } /*