On Tue, Mar 10, 2020 at 12:09:12AM -0400, Tom Lane wrote:
> Michael Paquier <mich...@paquier.xyz> writes:
> > On Mon, Mar 09, 2020 at 10:32:27PM -0400, Tom Lane wrote:
> >> It strikes me to wonder whether we could improve matters by teaching
> >> isolationtester to watch for particular values in a connected backend's
> >> pg_stat_activity.wait_event_type/wait_event columns.  Those columns
> >> didn't exist when isolationtester was designed, IIRC, so it's not
> >> surprising that they're not used in the current design.  But we could
> >> use them perhaps to detect that a backend has arrived at some state
> >> that's not a heavyweight-lock-wait state.
>
> > Interesting idea.  So that would be basically an equivalent of
> > PostgresNode::poll_query_until but for the isolation tester?
>
> No, more like the existing isolationtester wait query, which watches
> for something being blocked on a heavyweight lock.  Right now, that
> one depends on a bespoke function pg_isolation_test_session_is_blocked(),
> but it used to be a query on pg_stat_activity/pg_locks.

Ah interesting indeed!

> > In short
> > we gain a meta-command that runs a SELECT query that waits until the
> > query defined in the command returns true.  The polling interval may
> > be tricky to set though.
>
> I think it'd be just the same as the polling interval for the existing
> wait query.  We'd have to have some way to mark a script step to say
> what to check to decide that it's blocked ...

So basically we could just change pg_isolation_test_session_is_blocked() to
also return the wait_event_type and wait_event, and adding something like

step "<name>" { SQL } [ cancel on "<wait_event_type>" "<wait_event>" ]

to the step definition should be enough.  I'm attaching a POC patch for that.
On my laptop, the full test now complete in about 400ms.

FTR the REINDEX TABLE CONCURRENTLY case is eventually locked on a virtualxid,
I'm not sure if that's could lead to too early cancellation.
>From 77c214c9fb2fa7f9a003b96db0e5ec6506217e38 Mon Sep 17 00:00:00 2001
From: Julien Rouhaud <julien.rouh...@free.fr>
Date: Fri, 6 Mar 2020 13:40:56 +0100
Subject: [PATCH v2 1/2] Add an optional cancel on clause to isolationtester
 steps.

Some sanity checks can require a command to wait on a lock and eventually be
cancelled.  The only way to do that was to rely on calls to pg_cancel_backend()
filtering pg_stat_activity view, but this isn't a satisfactory solution as
there's no way to guarantee that only the wanted backend will be canceled.

Instead, add a new optional "cancel on <wait_event_type> <wait_event>" clause
to the step definition.  When this clause is specified, isolationtester will
actively wait on that command, and issue a cancel when the query is waiting on
the given wait event.

Author: Julien Rouhaud
Reviewed-by:
Discussion: https://postgr.es/m/20200305035354.GQ2593%40paquier.xyz
---
 src/backend/utils/adt/lockfuncs.c    | 54 ++++++++++++++++++++++++++--
 src/include/catalog/pg_proc.dat      |  5 ++-
 src/test/isolation/README            | 12 +++++--
 src/test/isolation/isolationtester.c | 43 +++++++++++++++++-----
 src/test/isolation/isolationtester.h |  8 +++++
 src/test/isolation/specparse.y       | 20 +++++++++--
 src/test/isolation/specscanner.l     |  2 ++
 7 files changed, 127 insertions(+), 17 deletions(-)

diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index ecb1bf92ff..cc2bc94d0d 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -17,7 +17,9 @@
 #include "catalog/pg_type.h"
 #include "funcapi.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "storage/predicate_internals.h"
+#include "storage/procarray.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
 
@@ -578,17 +580,28 @@ pg_safe_snapshot_blocking_pids(PG_FUNCTION_ARGS)
 Datum
 pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 {
+	TupleDesc	tupdesc;
 	int			blocked_pid = PG_GETARG_INT32(0);
 	ArrayType  *interesting_pids_a = PG_GETARG_ARRAYTYPE_P(1);
 	ArrayType  *blocking_pids_a;
 	int32	   *interesting_pids;
 	int32	   *blocking_pids;
+	Datum		values[3];
+	bool		nulls[3];
+	PGPROC	   *proc;
+	uint32		raw_wait_event = 0;
+	const char *wait_event_type = NULL;
+	const char *wait_event = NULL;
 	int			num_interesting_pids;
 	int			num_blocking_pids;
 	int			dummy;
 	int			i,
 				j;
 
+	/* Initialise values and NULL flags arrays */
+	MemSet(values, 0, sizeof(values));
+	MemSet(nulls, 0, sizeof(nulls));
+
 	/* Validate the passed-in array */
 	Assert(ARR_ELEMTYPE(interesting_pids_a) == INT4OID);
 	if (array_contains_nulls(interesting_pids_a))
@@ -597,6 +610,34 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 	num_interesting_pids = ArrayGetNItems(ARR_NDIM(interesting_pids_a),
 										  ARR_DIMS(interesting_pids_a));
 
+	/* Initialise attributes information in the tuple descriptor */
+	tupdesc = CreateTemplateTupleDesc(3);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "blocked",
+					   BOOLOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "wait_event_type",
+					   TEXTOID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "wait_even",
+					   TEXTOID, -1, 0);
+
+	BlessTupleDesc(tupdesc);
+
+	proc = BackendPidGetProc(blocked_pid);
+	if (proc)
+	{
+#define UINT32_ACCESS_ONCE(var)		 ((uint32)(*((volatile uint32 *)&(var))))
+		raw_wait_event = UINT32_ACCESS_ONCE(proc->wait_event_info);
+		wait_event_type = pgstat_get_wait_event_type(raw_wait_event);
+		wait_event = pgstat_get_wait_event(raw_wait_event);
+
+		if (wait_event_type != NULL)
+			values[1] = CStringGetTextDatum(wait_event_type);
+		if (wait_event != NULL)
+			values[2] = CStringGetTextDatum(wait_event);
+	}
+
+	nulls[1] = (wait_event_type == NULL);
+	nulls[2] = (wait_event == NULL);
+
 	/*
 	 * Get the PIDs of all sessions blocking the given session's attempt to
 	 * acquire heavyweight locks.
@@ -623,7 +664,11 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 		for (j = 0; j < num_interesting_pids; j++)
 		{
 			if (blocking_pids[i] == interesting_pids[j])
-				PG_RETURN_BOOL(true);
+			{
+				values[0] = BoolGetDatum(true);
+				PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc,
+								values, nulls)));
+			}
 		}
 
 	/*
@@ -636,9 +681,12 @@ pg_isolation_test_session_is_blocked(PG_FUNCTION_ARGS)
 	 * buffer and check if the number of safe snapshot blockers is non-zero.
 	 */
 	if (GetSafeSnapshotBlockingPids(blocked_pid, &dummy, 1) > 0)
-		PG_RETURN_BOOL(true);
+		values[0] = BoolGetDatum(true);
+
+	values[0] = BoolGetDatum(false);
 
-	PG_RETURN_BOOL(false);
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc,
+					values, nulls)));
 }
 
 
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 7fb574f9dc..77529e181d 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -5869,7 +5869,10 @@
   prosrc => 'pg_safe_snapshot_blocking_pids' },
 { oid => '3378', descr => 'isolationtester support function',
   proname => 'pg_isolation_test_session_is_blocked', provolatile => 'v',
-  prorettype => 'bool', proargtypes => 'int4 _int4',
+  prorettype => 'record', proargtypes => 'int4 _int4',
+  proallargtypes => '{int4,_int4,bool,text,text}',
+  proargmodes => '{i,i,o,o,o}',
+  proargnames => '{blocking_pid,interesting_pids,blocked,wait_event_type,wait_event}',
   prosrc => 'pg_isolation_test_session_is_blocked' },
 { oid => '1065', descr => 'view two-phase transactions',
   proname => 'pg_prepared_xact', prorows => '1000', proretset => 't',
diff --git a/src/test/isolation/README b/src/test/isolation/README
index 217953d183..04aed5cd17 100644
--- a/src/test/isolation/README
+++ b/src/test/isolation/README
@@ -86,10 +86,12 @@ session "<name>"
 
   Each step has the syntax
 
-  step "<name>" { <SQL> }
+  step "<name>" { <SQL> } [ cancel on "<wait_event_type>" "<wait_event>" ]
 
-  where <name> is a name identifying this step, and SQL is a SQL statement
-  (or statements, separated by semicolons) that is executed in the step.
+  where <name> is a name identifying this step, SQL is a SQL statement
+  (or statements, separated by semicolons) that is executed in the step and
+  <wait_event_type> and <wait_event> a wait event specification for which
+  isolationtester will cancel the query if it's blocked on it.
   Step names must be unique across the whole spec file.
 
 permutation "<step name>" ...
@@ -125,6 +127,10 @@ after PGISOLATIONTIMEOUT seconds.  If the cancel doesn't work, isolationtester
 will exit uncleanly after a total of twice PGISOLATIONTIMEOUT.  Testing
 invalid permutations should be avoided because they can make the isolation
 tests take a very long time to run, and they serve no useful testing purpose.
+If a test specified the optionnal cancel on specification, then isolationtester
+will actively wait for the step commands completion rather than continuing with
+the permutation next step, and send a cancel once the given wait event is
+blocking the query.
 
 Note that isolationtester recognizes that a command has blocked by looking
 to see if it is shown as waiting in the pg_locks view; therefore, only
diff --git a/src/test/isolation/isolationtester.c b/src/test/isolation/isolationtester.c
index f80261c022..f530d9923d 100644
--- a/src/test/isolation/isolationtester.c
+++ b/src/test/isolation/isolationtester.c
@@ -43,6 +43,7 @@ static void run_permutation(TestSpec *testspec, int nsteps, Step **steps);
 
 #define STEP_NONBLOCK	0x1		/* return 0 as soon as cmd waits for a lock */
 #define STEP_RETRY		0x2		/* this is a retry of a previously-waiting cmd */
+#define STEP_WAIT		0x4		/* active wait for a given wait event */
 static bool try_complete_step(TestSpec *testspec, Step *step, int flags);
 
 static int	step_qsort_cmp(const void *a, const void *b);
@@ -212,7 +213,7 @@ main(int argc, char **argv)
 	 */
 	initPQExpBuffer(&wait_query);
 	appendPQExpBufferStr(&wait_query,
-						 "SELECT pg_catalog.pg_isolation_test_session_is_blocked($1, '{");
+						 "SELECT * FROM pg_catalog.pg_isolation_test_session_is_blocked($1, '{");
 	/* The spec syntax requires at least one session; assume that here. */
 	appendPQExpBufferStr(&wait_query, backend_pid_strs[1]);
 	for (i = 2; i < nconns; i++)
@@ -587,8 +588,14 @@ run_permutation(TestSpec *testspec, int nsteps, Step **steps)
 			exit(1);
 		}
 
-		/* Try to complete this step without blocking.  */
-		mustwait = try_complete_step(testspec, step, STEP_NONBLOCK);
+		/*
+		 * Try to complete this step without blocking, unless the step has a
+		 * cancel on clause.
+		 */
+		mustwait = try_complete_step(testspec, step,
+				(step->waitinfo ? STEP_WAIT : STEP_NONBLOCK));
+		if (step->waitinfo)
+			report_error_message(step);
 
 		/* Check for completion of any steps that were previously waiting. */
 		w = 0;
@@ -720,10 +727,12 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 		else if (ret == 0)		/* select() timeout: check for lock wait */
 		{
 			struct timeval current_time;
+			char  *wait_event_type = "";
+			char  *wait_event = "";
 			int64		td;
 
 			/* If it's OK for the step to block, check whether it has. */
-			if (flags & STEP_NONBLOCK)
+			if (flags & (STEP_NONBLOCK | STEP_WAIT))
 			{
 				bool		waiting;
 
@@ -738,9 +747,17 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 					exit(1);
 				}
 				waiting = ((PQgetvalue(res, 0, 0))[0] == 't');
+				if (waiting && (flags & STEP_WAIT))
+				{
+					wait_event_type = pg_strdup(PQgetvalue(res, 0, 1));
+					Assert(wait_event_type != NULL);
+
+					wait_event = pg_strdup(PQgetvalue(res, 0, 2));
+					Assert(wait_event != NULL);
+				}
 				PQclear(res);
 
-				if (waiting)	/* waiting to acquire a lock */
+				if (waiting && (flags & STEP_NONBLOCK))	/* waiting to acquire a lock */
 				{
 					/*
 					 * Since it takes time to perform the lock-check query,
@@ -787,7 +804,11 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 			 * failing, but remaining permutations and tests should still be
 			 * OK.
 			 */
-			if (td > max_step_wait && !canceled)
+			if ((td > max_step_wait ||
+				(step->waitinfo
+				 && strcmp(step->waitinfo->wait_event_type, wait_event_type) == 0
+				 && strcmp(step->waitinfo->wait_event, wait_event) == 0))
+				&& !canceled)
 			{
 				PGcancel   *cancel = PQgetCancel(conn);
 
@@ -801,8 +822,14 @@ try_complete_step(TestSpec *testspec, Step *step, int flags)
 						 * print to stdout not stderr, as this should appear
 						 * in the test case's results
 						 */
-						printf("isolationtester: canceling step %s after %d seconds\n",
-							   step->name, (int) (td / USECS_PER_SEC));
+						if (!step->waitinfo)
+							printf("isolationtester: canceling step %s after %d seconds\n",
+								   step->name, (int) (td / USECS_PER_SEC));
+						else
+							printf("isolationtester: canceling step %s on wait event %s/%s\n",
+									step->name,
+									step->waitinfo->wait_event_type,
+									step->waitinfo->wait_event);
 						canceled = true;
 					}
 					else
diff --git a/src/test/isolation/isolationtester.h b/src/test/isolation/isolationtester.h
index 9cf5012416..5df540485c 100644
--- a/src/test/isolation/isolationtester.h
+++ b/src/test/isolation/isolationtester.h
@@ -26,6 +26,12 @@ struct Session
 	int			nsteps;
 };
 
+typedef struct WaitInfo
+{
+	char	   *wait_event_type;
+	char	   *wait_event;
+} WaitInfo;
+
 struct Step
 {
 	int			session;
@@ -33,6 +39,8 @@ struct Step
 	char	   *name;
 	char	   *sql;
 	char	   *errormsg;
+	WaitInfo   *waitinfo;
+	struct timeval start_time;
 };
 
 typedef struct
diff --git a/src/test/isolation/specparse.y b/src/test/isolation/specparse.y
index 5e007e1bf0..c8e72f4316 100644
--- a/src/test/isolation/specparse.y
+++ b/src/test/isolation/specparse.y
@@ -27,6 +27,7 @@ TestSpec		parseresult;			/* result of parsing is left here */
 	char	   *str;
 	Session	   *session;
 	Step	   *step;
+	WaitInfo   *waitinfo;
 	Permutation *permutation;
 	struct
 	{
@@ -43,9 +44,11 @@ TestSpec		parseresult;			/* result of parsing is left here */
 %type <session> session
 %type <step> step
 %type <permutation> permutation
+%type <waitinfo> opt_cancel
 
 %token <str> sqlblock string_literal
-%token PERMUTATION SESSION SETUP STEP TEARDOWN TEST
+%token <str> LITERAL
+%token CANCEL ON PERMUTATION SESSION SETUP STEP TEARDOWN TEST
 
 %%
 
@@ -140,16 +143,29 @@ step_list:
 
 
 step:
-			STEP string_literal sqlblock
+			STEP string_literal sqlblock opt_cancel
 			{
 				$$ = pg_malloc(sizeof(Step));
 				$$->name = $2;
 				$$->sql = $3;
 				$$->used = false;
 				$$->errormsg = NULL;
+				$$->waitinfo = $4;
 			}
 		;
 
+opt_cancel:
+			CANCEL ON string_literal string_literal
+			{
+				$$ = pg_malloc(sizeof(WaitInfo));
+				$$->wait_event_type = $3;
+				$$->wait_event = $4;
+			}
+			| /* EMPTY */
+			{
+				$$ = NULL;
+			}
+		;
 
 opt_permutation_list:
 			permutation_list
diff --git a/src/test/isolation/specscanner.l b/src/test/isolation/specscanner.l
index 410f17727e..672e5fa2fc 100644
--- a/src/test/isolation/specscanner.l
+++ b/src/test/isolation/specscanner.l
@@ -48,6 +48,8 @@ comment			("#"{non_newline}*)
 	litbufsize = LITBUF_INIT;
 %}
 
+cancel			{ return CANCEL; }
+on				{ return ON; }
 permutation		{ return PERMUTATION; }
 session			{ return SESSION; }
 setup			{ return SETUP; }
-- 
2.25.1

>From 60267b2fb3ff8e3b632ee07d12ee4031543601d3 Mon Sep 17 00:00:00 2001
From: Julien Rouhaud <julien.rouh...@free.fr>
Date: Fri, 6 Mar 2020 13:51:35 +0100
Subject: [PATCH v2 2/2] Add regression tests for failed REINDEX TABLE
 CONCURRENTLY.

If a REINDEX TABLE CONCURRENTLY fails on a table having a TOAST table, an
invalid index will be present for the TOAST table.  As we only allow to drop
invalid indexes on TOAST tables, reindexing those would lead to useless
duplicated indexes that can't be dropped anymore.

Author: Julien Rouhaud
Reviewed-by:
Discussion: https://postgr.es/m/20200216190835.ga21...@telsasoft.com
---
 .../expected/reindex-concurrently.out         | 54 ++++++++++++++++++-
 .../isolation/specs/reindex-concurrently.spec | 19 +++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/src/test/isolation/expected/reindex-concurrently.out b/src/test/isolation/expected/reindex-concurrently.out
index 9e04169b2f..674a8890bd 100644
--- a/src/test/isolation/expected/reindex-concurrently.out
+++ b/src/test/isolation/expected/reindex-concurrently.out
@@ -1,4 +1,4 @@
-Parsed test spec with 3 sessions
+Parsed test spec with 5 sessions
 
 starting permutation: reindex sel1 upd2 ins2 del2 end1 end2
 step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab;
@@ -76,3 +76,55 @@ step end1: COMMIT;
 step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab; <waiting ...>
 step end2: COMMIT;
 step reindex: <... completed>
+
+starting permutation: check_invalid lock reindex_fail unlock check_invalid nowarn normal_reindex check_invalid reindex check_invalid
+step check_invalid: SELECT i.indisvalid
+    FROM pg_class c
+    JOIN pg_class t ON t.oid = c.reltoastrelid
+    JOIN pg_index i ON i.indrelid = t.oid
+    WHERE c.relname = 'reind_con_tab'
+    ORDER BY i.indisvalid::text COLLATE "C";
+indisvalid     
+
+t              
+step lock: BEGIN; SELECT data FROM reind_con_tab WHERE data = 'aa' FOR UPDATE;
+data           
+
+aa             
+isolationtester: canceling step reindex_fail on wait event Lock/virtualxid
+step reindex_fail: REINDEX TABLE CONCURRENTLY reind_con_tab;
+ERROR:  canceling statement due to user request
+step unlock: COMMIT;
+step check_invalid: SELECT i.indisvalid
+    FROM pg_class c
+    JOIN pg_class t ON t.oid = c.reltoastrelid
+    JOIN pg_index i ON i.indrelid = t.oid
+    WHERE c.relname = 'reind_con_tab'
+    ORDER BY i.indisvalid::text COLLATE "C";
+indisvalid     
+
+f              
+t              
+step nowarn: SET client_min_messages = 'ERROR';
+step normal_reindex: REINDEX TABLE reind_con_tab;
+step check_invalid: SELECT i.indisvalid
+    FROM pg_class c
+    JOIN pg_class t ON t.oid = c.reltoastrelid
+    JOIN pg_index i ON i.indrelid = t.oid
+    WHERE c.relname = 'reind_con_tab'
+    ORDER BY i.indisvalid::text COLLATE "C";
+indisvalid     
+
+f              
+t              
+step reindex: REINDEX TABLE CONCURRENTLY reind_con_tab;
+step check_invalid: SELECT i.indisvalid
+    FROM pg_class c
+    JOIN pg_class t ON t.oid = c.reltoastrelid
+    JOIN pg_index i ON i.indrelid = t.oid
+    WHERE c.relname = 'reind_con_tab'
+    ORDER BY i.indisvalid::text COLLATE "C";
+indisvalid     
+
+f              
+t              
diff --git a/src/test/isolation/specs/reindex-concurrently.spec b/src/test/isolation/specs/reindex-concurrently.spec
index eb59fe0cba..a3cd8b72a5 100644
--- a/src/test/isolation/specs/reindex-concurrently.spec
+++ b/src/test/isolation/specs/reindex-concurrently.spec
@@ -31,6 +31,22 @@ step "end2" { COMMIT; }
 
 session "s3"
 step "reindex" { REINDEX TABLE CONCURRENTLY reind_con_tab; }
+step "reindex_fail" { REINDEX TABLE CONCURRENTLY reind_con_tab; } cancel on "Lock" "virtualxid"
+step "nowarn" { SET client_min_messages = 'ERROR'; }
+
+session "s4"
+step "lock" { BEGIN; SELECT data FROM reind_con_tab WHERE data = 'aa' FOR UPDATE; }
+step "unlock" { COMMIT; }
+
+session "s5"
+setup { SET client_min_messages = 'ERROR'; }
+step "normal_reindex" { REINDEX TABLE reind_con_tab; }
+step "check_invalid" {SELECT i.indisvalid
+    FROM pg_class c
+    JOIN pg_class t ON t.oid = c.reltoastrelid
+    JOIN pg_index i ON i.indrelid = t.oid
+    WHERE c.relname = 'reind_con_tab'
+    ORDER BY i.indisvalid::text COLLATE "C"; }
 
 permutation "reindex" "sel1" "upd2" "ins2" "del2" "end1" "end2"
 permutation "sel1" "reindex" "upd2" "ins2" "del2" "end1" "end2"
@@ -38,3 +54,6 @@ permutation "sel1" "upd2" "reindex" "ins2" "del2" "end1" "end2"
 permutation "sel1" "upd2" "ins2" "reindex" "del2" "end1" "end2"
 permutation "sel1" "upd2" "ins2" "del2" "reindex" "end1" "end2"
 permutation "sel1" "upd2" "ins2" "del2" "end1" "reindex" "end2"
+permutation "check_invalid" "lock" "reindex_fail" "unlock" "check_invalid"
+            "nowarn" "normal_reindex" "check_invalid"
+            "reindex" "check_invalid"
-- 
2.25.1

Reply via email to