On Tue, Mar 10, 2020 at 12:09:12AM -0400, Tom Lane wrote: > Michael Paquier <mich...@paquier.xyz> writes: > > On Mon, Mar 09, 2020 at 10:32:27PM -0400, Tom Lane wrote: > >> It strikes me to wonder whether we could improve matters by teaching > >> isolationtester to watch for particular values in a connected backend's > >> pg_stat_activity.wait_event_type/wait_event columns. Those columns > >> didn't exist when isolationtester was designed, IIRC, so it's not > >> surprising that they're not used in the current design. But we could > >> use them perhaps to detect that a backend has arrived at some state > >> that's not a heavyweight-lock-wait state. > > > Interesting idea. So that would be basically an equivalent of > > PostgresNode::poll_query_until but for the isolation tester? > > No, more like the existing isolationtester wait query, which watches > for something being blocked on a heavyweight lock. Right now, that > one depends on a bespoke function pg_isolation_test_session_is_blocked(), > but it used to be a query on pg_stat_activity/pg_locks.
Ah interesting indeed! > > In short > > we gain a meta-command that runs a SELECT query that waits until the > > query defined in the command returns true. The polling interval may > > be tricky to set though. > > I think it'd be just the same as the polling interval for the existing > wait query. We'd have to have some way to mark a script step to say > what to check to decide that it's blocked ... So basically we could just change pg_isolation_test_session_is_blocked() to also return the wait_event_type and wait_event, and adding something like step "<name>" { SQL } [ cancel on "<wait_event_type>" "<wait_event>" ] to the step definition should be enough. I'm attaching a POC patch for that. On my laptop, the full test now complete in about 400ms. FTR the REINDEX TABLE CONCURRENTLY case is eventually locked on a virtualxid, I'm not sure if that's could lead to too early cancellation.
>From 77c214c9fb2fa7f9a003b96db0e5ec6506217e38 Mon Sep 17 00:00:00 2001 From: Julien Rouhaud <julien.rouh...@free.fr> Date: Fri, 6 Mar 2020 13:40:56 +0100 Subject: [PATCH v2 1/2] Add an optional cancel on clause to isolationtester steps. Some sanity checks can require a command to wait on a lock and eventually be cancelled. The only way to do that was to rely on calls to pg_cancel_backend() filtering pg_stat_activity view, but this isn't a satisfactory solution as there's no way to guarantee that only the wanted backend will be canceled. Instead, add a new optional "cancel on <wait_event_type> <wait_event>" clause to the step definition. When this clause is specified, isolationtester will actively wait on that command, and issue a cancel when the query is waiting on the given wait event. Author: Julien Rouhaud Reviewed-by: Discussion: https://postgr.es/m/20200305035354.GQ2593%40paquier.xyz --- src/backend/utils/adt/lockfuncs.c | 54 ++++++++++++++++++++++++++-- src/include/catalog/pg_proc.dat | 5 ++- src/test/isolation/README | 12 +++++-- src/test/isolation/isolationtester.c | 43 +++++++++++++++++----- src/test/isolation/isolationtester.h | 8 +++++ src/test/isolation/specparse.y | 20 +++++++++-- src/test/isolation/specscanner.l | 2 ++ 7 files changed, 127 insertions(+), 17 deletions(-) diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c index ecb1bf92ff..cc2bc94d0d 100644 --- a/src/backend/utils/adt/lockfuncs.c +++ b/src/backend/utils/adt/lockfuncs.c @@ -17,7 +17,9 @@ #include "catalog/pg_type.h" #include "funcapi.h" #include "miscadmin.h" +#include "pgstat.h" #include "storage/predicate_internals.h" +#include "storage/procarray.h" #include "utils/array.h" #include "utils/builtins.h" @@ -578,17 +580,28 @@ pg_safe_snapshot_blocking_pids(PG_FUNCTION_ARGS) Datum pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) { + TupleDesc tupdesc; int blocked_pid = PG_GETARG_INT32(0); ArrayType *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1); ArrayType *blocking_pids_a; int32 *interesting_pids; int32 *blocking_pids; + Datum values[3]; + bool nulls[3]; + PGPROC *proc; + uint32 raw_wait_event = 0; + const char *wait_event_type = NULL; + const char *wait_event = NULL; int num_interesting_pids; int num_blocking_pids; int dummy; int i, j; + /* Initialise values and NULL flags arrays */ + MemSet(values, 0, sizeof(values)); + MemSet(nulls, 0, sizeof(nulls)); + /* Validate the passed-in array */ Assert(ARR_ELEMTYPE(interesting_pids_a) == INT4OID); if (array_contains_nulls(interesting_pids_a)) @@ -597,6 +610,34 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) num_interesting_pids = ArrayGetNItems(ARR_NDIM(interesting_pids_a), ARR_DIMS(interesting_pids_a)); + /* Initialise attributes information in the tuple descriptor */ + tupdesc = CreateTemplateTupleDesc(3); + TupleDescInitEntry(tupdesc, (AttrNumber) 1, "blocked", + BOOLOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 2, "wait_event_type", + TEXTOID, -1, 0); + TupleDescInitEntry(tupdesc, (AttrNumber) 3, "wait_even", + TEXTOID, -1, 0); + + BlessTupleDesc(tupdesc); + + proc = BackendPidGetProc(blocked_pid); + if (proc) + { +#define UINT32_ACCESS_ONCE(var) ((uint32)(*((volatile uint32 *)&(var)))) + raw_wait_event = UINT32_ACCESS_ONCE(proc->wait_event_info); + wait_event_type = pgstat_get_wait_event_type(raw_wait_event); + wait_event = pgstat_get_wait_event(raw_wait_event); + + if (wait_event_type != NULL) + values[1] = CStringGetTextDatum(wait_event_type); + if (wait_event != NULL) + values[2] = CStringGetTextDatum(wait_event); + } + + nulls[1] = (wait_event_type == NULL); + nulls[2] = (wait_event == NULL); + /* * Get the PIDs of all sessions blocking the given session's attempt to * acquire heavyweight locks. @@ -623,7 +664,11 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) for (j = 0; j < num_interesting_pids; j++) { if (blocking_pids[i] == interesting_pids[j]) - PG_RETURN_BOOL(true); + { + values[0] = BoolGetDatum(true); + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, + values, nulls))); + } } /* @@ -636,9 +681,12 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS) * buffer and check if the number of safe snapshot blockers is non-zero. */ if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0) - PG_RETURN_BOOL(true); + values[0] = BoolGetDatum(true); + + values[0] = BoolGetDatum(false); - PG_RETURN_BOOL(false); + PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, + values, nulls))); } diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat index 7fb574f9dc..77529e181d 100644 --- a/src/include/catalog/pg_proc.dat +++ b/src/include/catalog/pg_proc.dat @@ -5869,7 +5869,10 @@ prosrc => 'pg_safe_snapshot_blocking_pids' }, { oid => '3378', descr => 'isolationtester support function', proname => 'pg_isolation_test_session_is_blocked', provolatile => 'v', - prorettype => 'bool', proargtypes => 'int4 _int4', + prorettype => 'record', proargtypes => 'int4 _int4', + proallargtypes => '{int4,_int4,bool,text,text}', + proargmodes => '{i,i,o,o,o}', + proargnames => '{blocking_pid,interesting_pids,blocked,wait_event_type,wait_event}', prosrc => 'pg_isolation_test_session_is_blocked' }, { oid => '1065', descr => 'view two-phase transactions', proname => 'pg_prepared_xact', prorows => '1000', proretset => 't', diff --git a/src/test/isolation/README b/src/test/isolation/README index 217953d183..04aed5cd17 100644 --- a/src/test/isolation/README +++ b/src/test/isolation/README @@ -86,10 +86,12 @@ session "<name>" Each step has the syntax - step "<name>" { <SQL> } + step "<name>" { <SQL> } [ cancel on "<wait_event_type>" "<wait_event>" ] - where <name> is a name identifying this step, and SQL is a SQL statement - (or statements, separated by semicolons) that is executed in the step. + where <name> is a name identifying this step, SQL is a SQL statement + (or statements, separated by semicolons) that is executed in the step and + <wait_event_type> and <wait_event> a wait event specification for which + isolationtester will cancel the query if it's blocked on it. Step names must be unique across the whole spec file. permutation "<step name>" ... @@ -125,6 +127,10 @@ after PGISOLATIONTIMEOUT seconds. If the cancel doesn't work, isolationtester will exit uncleanly after a total of twice PGISOLATIONTIMEOUT. Testing invalid permutations should be avoided because they can make the isolation tests take a very long time to run, and they serve no useful testing purpose. +If a test specified the optionnal cancel on specification, then isolationtester +will actively wait for the step commands completion rather than continuing with +the permutation next step, and send a cancel once the given wait event is +blocking the query. Note that isolationtester recognizes that a command has blocked by looking to see if it is shown as waiting in the pg_locks view; therefore, only diff --git a/src/test/isolation/isolationtester.c b/src/test/isolation/isolationtester.c index f80261c022..f530d9923d 100644 --- a/src/test/isolation/isolationtester.c +++ b/src/test/isolation/isolationtester.c @@ -43,6 +43,7 @@ static void run_permutation(TestSpec *testspec, int nsteps, Step **steps); #define STEP_NONBLOCK 0x1 /* return 0 as soon as cmd waits for a lock */ #define STEP_RETRY 0x2 /* this is a retry of a previously-waiting cmd */ +#define STEP_WAIT 0x4 /* active wait for a given wait event */ static bool try_complete_step(TestSpec *testspec, Step *step, int flags); static int step_qsort_cmp(const void *a, const void *b); @@ -212,7 +213,7 @@ main(int argc, char **argv) */ initPQExpBuffer(&wait_query); appendPQExpBufferStr(&wait_query, - "SELECT pg_catalog.pg_isolation_test_session_is_blocked($1, '{"); + "SELECT * FROM pg_catalog.pg_isolation_test_session_is_blocked($1, '{"); /* The spec syntax requires at least one session; assume that here. */ appendPQExpBufferStr(&wait_query, backend_pid_strs[1]); for (i = 2; i < nconns; i++) @@ -587,8 +588,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps) exit(1); } - /* Try to complete this step without blocking. */ - mustwait = try_complete_step(testspec, step, STEP_NONBLOCK); + /* + * Try to complete this step without blocking, unless the step has a + * cancel on clause. + */ + mustwait = try_complete_step(testspec, step, + (step->waitinfo ? STEP_WAIT : STEP_NONBLOCK)); + if (step->waitinfo) + report_error_message(step); /* Check for completion of any steps that were previously waiting. */ w = 0; @@ -720,10 +727,12 @@ try_complete_step(TestSpec *testspec, Step *step, int flags) else if (ret == 0) /* select() timeout: check for lock wait */ { struct timeval current_time; + char *wait_event_type = ""; + char *wait_event = ""; int64 td; /* If it's OK for the step to block, check whether it has. */ - if (flags & STEP_NONBLOCK) + if (flags & (STEP_NONBLOCK | STEP_WAIT)) { bool waiting; @@ -738,9 +747,17 @@ try_complete_step(TestSpec *testspec, Step *step, int flags) exit(1); } waiting = ((PQgetvalue(res, 0, 0))[0] == 't'); + if (waiting && (flags & STEP_WAIT)) + { + wait_event_type = pg_strdup(PQgetvalue(res, 0, 1)); + Assert(wait_event_type != NULL); + + wait_event = pg_strdup(PQgetvalue(res, 0, 2)); + Assert(wait_event != NULL); + } PQclear(res); - if (waiting) /* waiting to acquire a lock */ + if (waiting && (flags & STEP_NONBLOCK)) /* waiting to acquire a lock */ { /* * Since it takes time to perform the lock-check query, @@ -787,7 +804,11 @@ try_complete_step(TestSpec *testspec, Step *step, int flags) * failing, but remaining permutations and tests should still be * OK. */ - if (td > max_step_wait && !canceled) + if ((td > max_step_wait || + (step->waitinfo + && strcmp(step->waitinfo->wait_event_type, wait_event_type) == 0 + && strcmp(step->waitinfo->wait_event, wait_event) == 0)) + && !canceled) { PGcancel *cancel = PQgetCancel(conn); @@ -801,8 +822,14 @@ try_complete_step(TestSpec *testspec, Step *step, int flags) * print to stdout not stderr, as this should appear * in the test case's results */ - printf("isolationtester: canceling step %s after %d seconds\n", - step->name, (int) (td / USECS_PER_SEC)); + if (!step->waitinfo) + printf("isolationtester: canceling step %s after %d seconds\n", + step->name, (int) (td / USECS_PER_SEC)); + else + printf("isolationtester: canceling step %s on wait event %s/%s\n", + step->name, + step->waitinfo->wait_event_type, + step->waitinfo->wait_event); canceled = true; } else diff --git a/src/test/isolation/isolationtester.h b/src/test/isolation/isolationtester.h index 9cf5012416..5df540485c 100644 --- a/src/test/isolation/isolationtester.h +++ b/src/test/isolation/isolationtester.h @@ -26,6 +26,12 @@ struct Session int nsteps; }; +typedef struct WaitInfo +{ + char *wait_event_type; + char *wait_event; +} WaitInfo; + struct Step { int session; @@ -33,6 +39,8 @@ struct Step char *name; char *sql; char *errormsg; + WaitInfo *waitinfo; + struct timeval start_time; }; typedef struct diff --git a/src/test/isolation/specparse.y b/src/test/isolation/specparse.y index 5e007e1bf0..c8e72f4316 100644 --- a/src/test/isolation/specparse.y +++ b/src/test/isolation/specparse.y @@ -27,6 +27,7 @@ TestSpec parseresult; /* result of parsing is left here */ char *str; Session *session; Step *step; + WaitInfo *waitinfo; Permutation *permutation; struct { @@ -43,9 +44,11 @@ TestSpec parseresult; /* result of parsing is left here */ %type <session> session %type <step> step %type <permutation> permutation +%type <waitinfo> opt_cancel %token <str> sqlblock string_literal -%token PERMUTATION SESSION SETUP STEP TEARDOWN TEST +%token <str> LITERAL +%token CANCEL ON PERMUTATION SESSION SETUP STEP TEARDOWN TEST %% @@ -140,16 +143,29 @@ step_list: step: - STEP string_literal sqlblock + STEP string_literal sqlblock opt_cancel { $$ = pg_malloc(sizeof(Step)); $$->name = $2; $$->sql = $3; $$->used = false; $$->errormsg = NULL; + $$->waitinfo = $4; } ; +opt_cancel: + CANCEL ON string_literal string_literal + { + $$ = pg_malloc(sizeof(WaitInfo)); + $$->wait_event_type = $3; + $$->wait_event = $4; + } + | /* EMPTY */ + { + $$ = NULL; + } + ; opt_permutation_list: permutation_list diff --git a/src/test/isolation/specscanner.l b/src/test/isolation/specscanner.l index 410f17727e..672e5fa2fc 100644 --- a/src/test/isolation/specscanner.l +++ b/src/test/isolation/specscanner.l @@ -48,6 +48,8 @@ comment ("#"{non_newline}*) litbufsize = LITBUF_INIT; %} +cancel { return CANCEL; } +on { return ON; } permutation { return PERMUTATION; } session { return SESSION; } setup { return SETUP; } -- 2.25.1
>From 60267b2fb3ff8e3b632ee07d12ee4031543601d3 Mon Sep 17 00:00:00 2001 From: Julien Rouhaud <julien.rouh...@free.fr> Date: Fri, 6 Mar 2020 13:51:35 +0100 Subject: [PATCH v2 2/2] Add regression tests for failed REINDEX TABLE CONCURRENTLY. If a REINDEX TABLE CONCURRENTLY fails on a table having a TOAST table, an invalid index will be present for the TOAST table. As we only allow to drop invalid indexes on TOAST tables, reindexing those would lead to useless duplicated indexes that can't be dropped anymore. Author: Julien Rouhaud Reviewed-by: Discussion: https://postgr.es/m/20200216190835.ga21...@telsasoft.com --- .../expected/reindex-concurrently.out | 54 ++++++++++++++++++- .../isolation/specs/reindex-concurrently.spec | 19 +++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/test/isolation/expected/reindex-concurrently.out b/src/test/isolation/expected/reindex-concurrently.out index 9e04169b2f..674a8890bd 100644 --- a/src/test/isolation/expected/reindex-concurrently.out +++ b/src/test/isolation/expected/reindex-concurrently.out @@ -1,4 +1,4 @@ -Parsed test spec with 3 sessions +Parsed test spec with 5 sessions starting permutation: reindex sel1 upd2 ins2 del2 end1 end2 step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab; @@ -76,3 +76,55 @@ step end1: COMMIT; step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab; <waiting ...> step end2: COMMIT; step reindex: <... completed> + +starting permutation: check_invalid lock reindex_fail unlock check_invalid nowarn normal_reindex check_invalid reindex check_invalid +step check_invalid: SELECT i.indisvalid + FROM pg_class c + JOIN pg_class t ON t.oid = c.reltoastrelid + JOIN pg_index i ON i.indrelid = t.oid + WHERE c.relname = 'reind_con_tab' + ORDER BY i.indisvalid::text COLLATE "C"; +indisvalid + +t +step lock: BEGIN; SELECT data FROM reind_con_tab WHERE data = 'aa' FOR UPDATE; +data + +aa +isolationtester: canceling step reindex_fail on wait event Lock/virtualxid +step reindex_fail: REINDEX TABLE CONCURRENTLY reind_con_tab; +ERROR: canceling statement due to user request +step unlock: COMMIT; +step check_invalid: SELECT i.indisvalid + FROM pg_class c + JOIN pg_class t ON t.oid = c.reltoastrelid + JOIN pg_index i ON i.indrelid = t.oid + WHERE c.relname = 'reind_con_tab' + ORDER BY i.indisvalid::text COLLATE "C"; +indisvalid + +f +t +step nowarn: SET client_min_messages = 'ERROR'; +step normal_reindex: REINDEX TABLE reind_con_tab; +step check_invalid: SELECT i.indisvalid + FROM pg_class c + JOIN pg_class t ON t.oid = c.reltoastrelid + JOIN pg_index i ON i.indrelid = t.oid + WHERE c.relname = 'reind_con_tab' + ORDER BY i.indisvalid::text COLLATE "C"; +indisvalid + +f +t +step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab; +step check_invalid: SELECT i.indisvalid + FROM pg_class c + JOIN pg_class t ON t.oid = c.reltoastrelid + JOIN pg_index i ON i.indrelid = t.oid + WHERE c.relname = 'reind_con_tab' + ORDER BY i.indisvalid::text COLLATE "C"; +indisvalid + +f +t diff --git a/src/test/isolation/specs/reindex-concurrently.spec b/src/test/isolation/specs/reindex-concurrently.spec index eb59fe0cba..a3cd8b72a5 100644 --- a/src/test/isolation/specs/reindex-concurrently.spec +++ b/src/test/isolation/specs/reindex-concurrently.spec @@ -31,6 +31,22 @@ step "end2" { COMMIT; } session "s3" step "reindex" { REINDEX TABLE CONCURRENTLY reind_con_tab; } +step "reindex_fail" { REINDEX TABLE CONCURRENTLY reind_con_tab; } cancel on "Lock" "virtualxid" +step "nowarn" { SET client_min_messages = 'ERROR'; } + +session "s4" +step "lock" { BEGIN; SELECT data FROM reind_con_tab WHERE data = 'aa' FOR UPDATE; } +step "unlock" { COMMIT; } + +session "s5" +setup { SET client_min_messages = 'ERROR'; } +step "normal_reindex" { REINDEX TABLE reind_con_tab; } +step "check_invalid" {SELECT i.indisvalid + FROM pg_class c + JOIN pg_class t ON t.oid = c.reltoastrelid + JOIN pg_index i ON i.indrelid = t.oid + WHERE c.relname = 'reind_con_tab' + ORDER BY i.indisvalid::text COLLATE "C"; } permutation "reindex" "sel1" "upd2" "ins2" "del2" "end1" "end2" permutation "sel1" "reindex" "upd2" "ins2" "del2" "end1" "end2" @@ -38,3 +54,6 @@ permutation "sel1" "upd2" "reindex" "ins2" "del2" "end1" "end2" permutation "sel1" "upd2" "ins2" "reindex" "del2" "end1" "end2" permutation "sel1" "upd2" "ins2" "del2" "reindex" "end1" "end2" permutation "sel1" "upd2" "ins2" "del2" "end1" "reindex" "end2" +permutation "check_invalid" "lock" "reindex_fail" "unlock" "check_invalid" + "nowarn" "normal_reindex" "check_invalid" + "reindex" "check_invalid" -- 2.25.1