Hello hackers, 20.03.2023 09:10, Peter Smith wrote:
Using this I was also able to reproduce the problem. But test failures were rare. The make check-world seemed OK, and indeed the test_decoding tests would also appear to PASS around 14 out of 15 times.
I've stumbled upon this assertion failure again during testing following cd312adc5. This time I've simplified the reproducer to the attached modification. With this patch applied, `make -s check -C contrib/test_decoding` fails on master as below: ok 1 - pgstat_rc_1 14 ms not ok 2 - pgstat_rc_2 1351 ms contrib/test_decoding/output_iso/log/postmaster.log contains:TRAP: failed Assert("pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0"), File: "pgstat_shmem.c", Line: 562, PID: 1130928
With extra logging added, I see the following events happening: 1) pgstat_rc_1.setup calls pgstat_create_replslot(), gets ReplicationSlotIndex(slot) = 0 and calls pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, 0, 0). 2) pgstat_rc_1.s0_get_changes executes pg_logical_slot_get_changes(...) and then calls pgstat_gc_entry_refs on shmem_exit() -> pgstat_shutdown_hook() ...; with the sleep added inside pgstat_release_entry_ref, this backend waits after decreasing entry_ref->shared_entry->refcount to 0. 3) pgstat_rc_1.stop removes the replication slot. 4) pgstat_rc_2.setup calls pgstat_create_replslot(), gets ReplicationSlotIndex(slot) = 0 and calls pgstat_get_entry_ref_locked(PGSTAT_KIND_REPLSLOT, InvalidOid, 0, 0), which leads to the call pgstat_reinit_entry(), which increases refcount for the same shared_entry as in (1) and (2), and then to the call pgstat_acquire_entry_ref(), which increases refcount once more. 5) the backend 2 reaches Assert(pg_atomic_read_u32(&entry_ref->shared_entry->refcount) == 0), which fails due to refcount = 2. Best regards, Alexander
diff --git a/contrib/test_decoding/Makefile b/contrib/test_decoding/Makefile index c7ce603706..660eea87eb 100644 --- a/contrib/test_decoding/Makefile +++ b/contrib/test_decoding/Makefile @@ -6,10 +6,12 @@ PGFILEDESC = "test_decoding - example of a logical decoding output plugin" REGRESS = ddl xact rewrite toast permissions decoding_in_xact \ decoding_into_rel binary prepared replorigin time messages \ spill slot truncate stream stats twophase twophase_stream -ISOLATION = mxact delayed_startup ondisk_startup concurrent_ddl_dml \ +XISOLATION = mxact delayed_startup ondisk_startup concurrent_ddl_dml \ oldest_xmin snapshot_transfer subxact_without_top concurrent_stream \ twophase_snapshot slot_creation_error catalog_change_snapshot +ISOLATION = pgstat_rc_1 pgstat_rc_2 + REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf ISOLATION_OPTS = --temp-config $(top_srcdir)/contrib/test_decoding/logical.conf diff --git a/contrib/test_decoding/expected/pgstat_rc_1.out b/contrib/test_decoding/expected/pgstat_rc_1.out new file mode 100644 index 0000000000..850adf9930 --- /dev/null +++ b/contrib/test_decoding/expected/pgstat_rc_1.out @@ -0,0 +1,18 @@ +Parsed test spec with 1 sessions + +starting permutation: s0_get_changes +?column? +-------- +init +(1 row) + +step s0_get_changes: SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); +data +---- +(0 rows) + +?column? +-------- +stop +(1 row) + diff --git a/contrib/test_decoding/expected/pgstat_rc_2.out b/contrib/test_decoding/expected/pgstat_rc_2.out new file mode 100644 index 0000000000..f8d4ce83a6 --- /dev/null +++ b/contrib/test_decoding/expected/pgstat_rc_2.out @@ -0,0 +1,19 @@ +Parsed test spec with 1 sessions + +starting permutation: s0_sleep +?column? +-------- +init +(1 row) + +step s0_sleep: SELECT pg_sleep(1); +pg_sleep +-------- + +(1 row) + +?column? +-------- +stop +(1 row) + diff --git a/contrib/test_decoding/specs/pgstat_rc_1.spec b/contrib/test_decoding/specs/pgstat_rc_1.spec new file mode 100644 index 0000000000..7b08704a55 --- /dev/null +++ b/contrib/test_decoding/specs/pgstat_rc_1.spec @@ -0,0 +1,14 @@ +setup +{ + SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); -- must be first write in xact +} + +teardown +{ + SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot'); +} + +session "s0" +step "s0_get_changes" { SELECT data FROM pg_logical_slot_get_changes('isolation_slot', NULL, NULL, 'include-xids', '0', 'skip-empty-xacts', '1'); } + +permutation "s0_get_changes" diff --git a/contrib/test_decoding/specs/pgstat_rc_2.spec b/contrib/test_decoding/specs/pgstat_rc_2.spec new file mode 100644 index 0000000000..4e454ae8e8 --- /dev/null +++ b/contrib/test_decoding/specs/pgstat_rc_2.spec @@ -0,0 +1,14 @@ +setup +{ + SELECT 'init' FROM pg_create_logical_replication_slot('isolation_slot', 'test_decoding'); -- must be first write in xact +} + +teardown +{ + SELECT 'stop' FROM pg_drop_replication_slot('isolation_slot'); +} + +session "s0" +step "s0_sleep" { SELECT pg_sleep(1); } + +permutation "s0_sleep" diff --git a/src/backend/utils/activity/pgstat_shmem.c b/src/backend/utils/activity/pgstat_shmem.c index b1c9437edc..fe517d85af 100644 --- a/src/backend/utils/activity/pgstat_shmem.c +++ b/src/backend/utils/activity/pgstat_shmem.c @@ -551,6 +551,7 @@ pgstat_release_entry_ref(PgStat_HashKey key, PgStat_EntryRef *entry_ref, /* only dropped entries can reach a 0 refcount */ Assert(entry_ref->shared_entry->dropped); +pg_usleep(100000L); shent = dshash_find(pgStatLocal.shared_hash, &entry_ref->shared_entry->key, true);