On Tue, Feb 15, 2022 at 08:58:56AM -0800, Andres Freund wrote:
> Pushed the test yesterday evening, after Tom checked if it is likely to be
> problematic. Seems to worked without problems so far.

 wrasse        │ 2022-02-15 09:29:06 │ HEAD   │ 
http://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=wrasse&dt=2022-02-15%2009%3A29%3A06
 flaviventris  │ 2022-02-24 15:17:30 │ HEAD   │ 
http://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=flaviventris&dt=2022-02-24%2015%3A17%3A30
 calliphoridae │ 2022-03-08 01:14:51 │ HEAD   │ 
http://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=calliphoridae&dt=2022-03-08%2001%3A14%3A51

The buildfarm failed to convey adequate logs for this particular test suite.
Here's regression.diffs from the wrasse case (saved via keep_error_builds):

===
diff -U3 
/export/home/nm/farm/studio64v12_6/HEAD/pgsql/contrib/test_decoding/expected/slot_creation_error.out
 
/export/home/nm/farm/studio64v12_6/HEAD/pgsql.build/contrib/test_decoding/output_iso/results/slot_creation_error.out
--- 
/export/home/nm/farm/studio64v12_6/HEAD/pgsql/contrib/test_decoding/expected/slot_creation_error.out
        Tue Feb 15 06:58:14 2022
+++ 
/export/home/nm/farm/studio64v12_6/HEAD/pgsql.build/contrib/test_decoding/output_iso/results/slot_creation_error.out
        Tue Feb 15 11:38:14 2022
@@ -29,16 +29,17 @@
 t                
 (1 row)
 
-step s2_init: <... completed>
-ERROR:  canceling statement due to user request
 step s1_view_slot: 
     SELECT slot_name, slot_type, active FROM pg_replication_slots WHERE 
slot_name = 'slot_creation_error'
 
-slot_name|slot_type|active
----------+---------+------
-(0 rows)
+slot_name          |slot_type|active
+-------------------+---------+------
+slot_creation_error|logical  |t     
+(1 row)
 
 step s1_c: COMMIT;
+step s2_init: <... completed>
+ERROR:  canceling statement due to user request
 
 starting permutation: s1_b s1_xid s2_init s1_c s1_view_slot s1_drop_slot
 step s1_b: BEGIN;
===

I can make it fail that way by injecting a 1s delay here:

--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -3339,6 +3339,7 @@ ProcessInterrupts(void)
                 */
                if (!DoingCommandRead)
                {
+                       pg_usleep(1 * 1000 * 1000);
                        LockErrorCleanup();
                        ereport(ERROR,
                                        (errcode(ERRCODE_QUERY_CANCELED),

I plan to fix this as attached, similar to how commit c04c767 fixed the same
challenge in detach-partition-concurrently-[34].
Author:     Noah Misch <n...@leadboat.com>
Commit:     Noah Misch <n...@leadboat.com>

    Close race condition in slot_creation_error.spec.
    
    Use the pattern from detach-partition-concurrently-3.spec.  Per
    buildfarm member wrasse.
    
    Reviewed by FIXME.
    
    Discussion: https://postgr.es/m/FIXME

diff --git a/contrib/test_decoding/expected/slot_creation_error.out 
b/contrib/test_decoding/expected/slot_creation_error.out
index 043bdae..25883b5 100644
--- a/contrib/test_decoding/expected/slot_creation_error.out
+++ b/contrib/test_decoding/expected/slot_creation_error.out
@@ -23,14 +23,15 @@ step s1_cancel_s2:
     SELECT pg_cancel_backend(pid)
     FROM pg_stat_activity
     WHERE application_name = 'isolation/slot_creation_error/s2';
-
+ <waiting ...>
+step s2_init: <... completed>
+ERROR:  canceling statement due to user request
+step s1_cancel_s2: <... completed>
 pg_cancel_backend
 -----------------
 t                
 (1 row)
 
-step s2_init: <... completed>
-ERROR:  canceling statement due to user request
 step s1_view_slot: 
     SELECT slot_name, slot_type, active FROM pg_replication_slots WHERE 
slot_name = 'slot_creation_error'
 
@@ -90,18 +91,19 @@ step s1_terminate_s2:
     SELECT pg_terminate_backend(pid)
     FROM pg_stat_activity
     WHERE application_name = 'isolation/slot_creation_error/s2';
-
-pg_terminate_backend
---------------------
-t                   
-(1 row)
-
+ <waiting ...>
 step s2_init: <... completed>
 FATAL:  terminating connection due to administrator command
 server closed the connection unexpectedly
        This probably means the server terminated abnormally
        before or while processing the request.
 
+step s1_terminate_s2: <... completed>
+pg_terminate_backend
+--------------------
+t                   
+(1 row)
+
 step s1_c: COMMIT;
 step s1_view_slot: 
     SELECT slot_name, slot_type, active FROM pg_replication_slots WHERE 
slot_name = 'slot_creation_error'
diff --git a/contrib/test_decoding/specs/slot_creation_error.spec 
b/contrib/test_decoding/specs/slot_creation_error.spec
index 6816696..d1e35bf 100644
--- a/contrib/test_decoding/specs/slot_creation_error.spec
+++ b/contrib/test_decoding/specs/slot_creation_error.spec
@@ -35,7 +35,7 @@ step s2_init {
 # The tests first start a transaction with an xid assigned in s1, then create
 # a slot in s2. The slot creation waits for s1's transaction to end. Instead
 # we cancel / terminate s2.
-permutation s1_b s1_xid s2_init s1_view_slot s1_cancel_s2 s1_view_slot s1_c
+permutation s1_b s1_xid s2_init s1_view_slot s1_cancel_s2(s2_init) 
s1_view_slot s1_c
 permutation s1_b s1_xid s2_init s1_c s1_view_slot s1_drop_slot # check slot 
creation still works
-permutation s1_b s1_xid s2_init s1_terminate_s2 s1_c s1_view_slot
+permutation s1_b s1_xid s2_init s1_terminate_s2(s2_init) s1_c s1_view_slot
 # can't run tests after this, due to s2's connection failure

Reply via email to