On Sat, Oct 26, 2024 at 11:49:36AM -0700, Noah Misch wrote:
> intra-grant-inplace-db.spec got a novel failure today:
> https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=sarus&dt=2024-10-26%2014%3A08%3A58
> 
> The isolationtester_waiting query is supposed to detect that step vac2 is
> blocked.  vac2 didn't finish within the timeout, but isolationtester_waiting
> never considered it blocked.

> ... work on reproducing this.

I'm running loops of three workloads that might reproduce this, on s390x:

- A buildfarm config like buildfarm member sarus.

- A workload where autovacuum vac_update_datfrozenxid() takes >10s, but
  non-auto "VACUUM (ONLY_DATABASE_STATS)" runs in a loop under
  lock_timeout=5s.  This notices if the non-auto VACUUM ever fails to cancel
  autovacuum.  I apply the attached inplace200-bench-vac-v0.1.patch, then run
  contrib/amcheck/t/089_vac.pl.

- A workload to make vac2's deadlock detector run when an autovacuum worker is
  starting.  That's one of my candidate explanations of the events behind the
  failure log.  I apply the attached inplace190-repro-autovacuum-v0.1.patch,
  then run intra-grant-inplace-db.spec with debug_parallel_query=regress and
  autovacuum_naptime=1s.  This regularly gets vac2 to cancel autovacuum.

So far, those have seen nothing like the failed run.

I'm inclined to make isolationtester log pg_stat_activity and pg_locks
whenever a step times out.  That would have ruled out many explanations for
what happened on sarus.  However, logging on timeout won't help much until
something can reproduce the timeout.

> It's odd that a new
> auto-analyze starts every minute, each of which exits due to blocking vac2.

Every autovacuum_naptime, a new worker cancels the old one due to wanting the
old worker's LOCKTAG_DATABASE_FROZEN_IDS.  One can see this with "BEGIN; GRANT
TEMP ON DATABASE postgres TO pg_monitor;".  That open transaction's xmax
prevents vac_update_datfrozenxid() from finishing.  While that transaction
remains open, every autovacuum_naptime after the first will log a cancellation
of the previous autovacuum worker.  (This assumes each autovacuum worker,
having negligible real work to do, reaches vac_update_datfrozenxid() before
the next autovacuum_naptime.)  That's likely not ideal, but it doesn't explain
the sarus failure.
Author:     Noah Misch <n...@leadboat.com>
Commit:     Noah Misch <n...@leadboat.com>

    

diff --git a/contrib/amcheck/t/089_vac.pl b/contrib/amcheck/t/089_vac.pl
new file mode 100644
index 0000000..a28ebd9
--- /dev/null
+++ b/contrib/amcheck/t/089_vac.pl
@@ -0,0 +1,50 @@
+
+# Copyright (c) 2021-2024, PostgreSQL Global Development Group
+
+# Test VACUUM vs. slow autovac
+use strict;
+use warnings FATAL => 'all';
+
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+
+use Test::More;
+
+my ($node, $result);
+
+#
+# Test set-up
+#
+$node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf('postgresql.conf', 'autovacuum_naptime = 1s');
+$node->append_conf('postgresql.conf', 'log_lock_waits = on');
+$node->append_conf('postgresql.conf', 'deadlock_timeout = 10ms');
+$node->append_conf('postgresql.conf', q{backtrace_functions = 
'ProcessInterrupts'});
+$node->start;
+
+my $grant_h = $node->background_psql('postgres');
+
+# This running would block all vac_update_datfrozenxid => lock_timeout always
+# $grant_h->query_safe(
+#      q(
+# BEGIN;
+# GRANT TEMP ON DATABASE postgres TO pg_monitor;
+# ));
+
+{
+       local $ENV{PGOPTIONS} = '-clock_timeout=5s';
+$node->pgbench(
+       '--no-vacuum --client=1 --time=600',
+       0,
+       [qr{actually processed}],
+       [qr{^$}],
+       'vac',
+       {
+               'vac' => 'VACUUM (ONLY_DATABASE_STATS);',
+       });
+}
+
+$grant_h->quit;
+$node->stop;
+done_testing();
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index ac8f5d9..20fe426 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -1582,6 +1582,7 @@ vac_update_relstats(Relation relation,
  *             commits.  As with vac_update_relstats, this avoids leaving dead 
tuples
  *             behind after a VACUUM.
  */
+#include "utils/fmgrprotos.h"
 void
 vac_update_datfrozenxid(void)
 {
@@ -1607,6 +1608,9 @@ vac_update_datfrozenxid(void)
         */
        LockDatabaseFrozenIds(ExclusiveLock);
 
+       if (AmAutoVacuumWorkerProcess())
+               DirectFunctionCall1(pg_sleep, Float8GetDatum(10.0));
+
        /*
         * Initialize the "min" calculation with
         * GetOldestNonRemovableTransactionId(), which is a reasonable
Author:     Noah Misch <n...@leadboat.com>
Commit:     Noah Misch <n...@leadboat.com>

    

diff --git a/src/test/isolation/Makefile b/src/test/isolation/Makefile
index ade2256..ca487de 100644
--- a/src/test/isolation/Makefile
+++ b/src/test/isolation/Makefile
@@ -64,6 +64,9 @@ installcheck: all
 check: all
        $(pg_isolation_regress_check) --schedule=$(srcdir)/isolation_schedule
 
+nmcheck: all temp-install
+       $(pg_isolation_regress_check) intra-grant-inplace-db
+
 # Non-default tests.  It only makes sense to run these if set up to use
 # prepared transactions, via TEMP_CONFIG for the check case, or via the
 # postgresql.conf for the installcheck case.
diff --git a/src/test/isolation/expected/intra-grant-inplace-db.out 
b/src/test/isolation/expected/intra-grant-inplace-db.out
index a91402c..118cbaa 100644
--- a/src/test/isolation/expected/intra-grant-inplace-db.out
+++ b/src/test/isolation/expected/intra-grant-inplace-db.out
@@ -8,6 +8,12 @@ step snap3:
 step b1: BEGIN;
 step grant1: 
        GRANT TEMP ON DATABASE isolation_regression TO regress_temp_grantee;
+       SELECT pg_sleep(random(0.000, 1.200));
+
+pg_sleep
+--------
+        
+(1 row)
 
 step vac2: VACUUM (FREEZE); <waiting ...>
 step snap3: 
diff --git a/src/test/isolation/isolationtester.c 
b/src/test/isolation/isolationtester.c
index 0b342b5..8a5e585 100644
--- a/src/test/isolation/isolationtester.c
+++ b/src/test/isolation/isolationtester.c
@@ -863,7 +863,7 @@ try_complete_step(TestSpec *testspec, PermutationStep 
*pstep, int flags)
        while (PQisBusy(conn))
        {
                FD_SET(sock, &read_set);
-               timeout.tv_sec = 0;
+               timeout.tv_sec = 8;
                timeout.tv_usec = 10000;        /* Check for lock waits every 
10ms. */
 
                ret = select(sock + 1, &read_set, NULL, NULL, &timeout);
diff --git a/src/test/isolation/specs/intra-grant-inplace-db.spec 
b/src/test/isolation/specs/intra-grant-inplace-db.spec
index 9de40ec..408e23f 100644
--- a/src/test/isolation/specs/intra-grant-inplace-db.spec
+++ b/src/test/isolation/specs/intra-grant-inplace-db.spec
@@ -18,6 +18,7 @@ session s1
 step b1        { BEGIN; }
 step grant1    {
        GRANT TEMP ON DATABASE isolation_regression TO regress_temp_grantee;
+       SELECT pg_sleep(random(0.000, 1.200));
 }
 step c1        { COMMIT; }
 

Reply via email to