Re: Retry Cached Remote Connections for postgres_fdw in case remote backend gets killed/goes away

Bharath Rupireddy Thu, 01 Oct 2020 05:15:00 -0700

On Wed, Sep 30, 2020 at 11:32 PM Fujii Masao
<masao.fu...@oss.nttdata.com> wrote:
>
> > And another way, if we don't want to use wait_pid() is to have a
> > plpgsql stored procedure, that in a loop keeps on checking for the
> > backed pid from pg_stat_activity, exit when pid is 0. and then proceed
> > to issue the next foreign table query. Thoughts?
>
> +1 for this approach! We can use plpgsql or DO command.
>


Used plpgsql procedure as we have to use the procedure 2 times.

>
> >
> > mypid = -1;
> > while (mypid != 0)
> > SELECT pid INTO mypid FROM pg_stat_activity WHERE backend_type =
> > 'client backend' AND application_name = 'fdw_retry_check';
>
> Or we can just wait for the number of processes with
> appname='fdw_retry_check' to be zero rather than checking the pid.
>

Done.

Attaching v7 patch with above changes. Please review it.



With Regards,
Bharath Rupireddy.
EnterpriseDB: http://www.enterprisedb.com

From e2a85116116e79c76a7b0fcfc1a25eff57627ed4 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddy@enterprisedb.com>
Date: Thu, 1 Oct 2020 15:16:26 +0530
Subject: [PATCH v7] Retry Cached Remote Connections For postgres_fdw

Remote connections are cached for postgres_fdw in the local
backend. There are high chances that the remote backend may
not be available i.e. it can get killed, the subsequent foreign
queries from local backend/session that use cached connection
will fail as the remote backend would have become unavailable.

This patch proposes a retry mechanism. Local backend/session
uses cached connection and if it receives connection bad error,
it deletes the cached entry and retry to get a new connection.
This retry happens only at the start of remote xact.
---
 contrib/postgres_fdw/connection.c             | 55 +++++++++---
 .../postgres_fdw/expected/postgres_fdw.out    | 85 +++++++++++++++++++
 contrib/postgres_fdw/sql/postgres_fdw.sql     | 64 ++++++++++++++
 3 files changed, 192 insertions(+), 12 deletions(-)

diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c
index 08daf26fdf..441124963f 100644
--- a/contrib/postgres_fdw/connection.c
+++ b/contrib/postgres_fdw/connection.c
@@ -108,6 +108,7 @@ PGconn *
 GetConnection(UserMapping *user, bool will_prep_stmt)
 {
 	bool		found;
+	volatile bool retry_conn = false;
 	ConnCacheEntry *entry;
 	ConnCacheKey key;
 
@@ -159,23 +160,25 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 	/* Reject further use of connections which failed abort cleanup. */
 	pgfdw_reject_incomplete_xact_state_change(entry);
 
+retry:
 	/*
 	 * If the connection needs to be remade due to invalidation, disconnect as
-	 * soon as we're out of all transactions.
+	 * soon as we're out of all transactions. Also, if previous attempt to
+	 * start new remote transaction failed on the cached connection, disconnect
+	 * it to retry a new connection.
 	 */
-	if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0)
+	if ((entry->conn != NULL && entry->invalidated &&
+		 entry->xact_depth == 0) || retry_conn)
 	{
-		elog(DEBUG3, "closing connection %p for option changes to take effect",
-			 entry->conn);
+		if (retry_conn)
+			elog(DEBUG3, "closing connection %p to reestablish a new one",
+				 entry->conn);
+		else
+			elog(DEBUG3, "closing connection %p for option changes to take effect",
+				 entry->conn);
 		disconnect_pg_server(entry);
 	}
 
-	/*
-	 * We don't check the health of cached connection here, because it would
-	 * require some overhead.  Broken connection will be detected when the
-	 * connection is actually used.
-	 */
-
 	/*
 	 * If cache entry doesn't have a connection, we have to establish a new
 	 * connection.  (If connect_pg_server throws an error, the cache entry
@@ -206,9 +209,37 @@ GetConnection(UserMapping *user, bool will_prep_stmt)
 	}
 
 	/*
-	 * Start a new transaction or subtransaction if needed.
+	 * We check the health of the cached connection here, while the remote xact
+	 * is going to begin.  Broken connection will be detected and the
+	 * connection is retried. We do this only once during each begin remote
+	 * xact call, if the connection is still broken after the retry attempt,
+	 * then the query will fail.
 	 */
-	begin_remote_xact(entry);
+	PG_TRY();
+	{
+		/* Start a new transaction or subtransaction if needed. */
+		begin_remote_xact(entry);
+		retry_conn = false;
+	}
+	PG_CATCH();
+	{
+		if (!entry->conn ||
+			PQstatus(entry->conn) != CONNECTION_BAD ||
+			entry->xact_depth > 0 ||
+			retry_conn)
+			PG_RE_THROW();
+		retry_conn = true;
+	}
+	PG_END_TRY();
+
+	if (retry_conn)
+	{
+		ereport(DEBUG3,
+				(errmsg("could not start remote transaction on connection %p",
+				 entry->conn)),
+				 errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn))));
+		goto retry;
+	}
 
 	/* Remember if caller will prepare statements */
 	entry->have_prep_stmt |= will_prep_stmt;
diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out
index 10e23d02ed..09d0578d05 100644
--- a/contrib/postgres_fdw/expected/postgres_fdw.out
+++ b/contrib/postgres_fdw/expected/postgres_fdw.out
@@ -8987,3 +8987,88 @@ PREPARE TRANSACTION 'fdw_tpc';
 ERROR:  cannot PREPARE a transaction that has operated on postgres_fdw foreign tables
 ROLLBACK;
 WARNING:  there is no transaction in progress
+-- Connection retry feature
+-- Retry cached connections at the beginning of the remote xact in case remote
+-- backend is killed. Let's use a different application name for remote
+-- connection, so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+-- This procedure waits until the terminated backend has really gone away. We
+-- also had to check for wait event type and wait event of the terminated
+-- backend's row from pg_stat_activity. Otherwise, the usage of this procedure
+-- in an explicit txn block(BEGIN - END) after terminating the backend would
+-- still have an entry with wait event type wait event null, as we have not yet
+-- committed the txn. The entry of the terminated backend from pg_stat_activity
+-- would be removed only after the txn commit.
+CREATE OR REPLACE PROCEDURE wait_for_backend_termination()
+LANGUAGE plpgsql
+AS $$
+DECLARE proccnt int;
+BEGIN
+    LOOP
+		SELECT count(*) INTO proccnt FROM pg_stat_activity
+				WHERE backend_type = 'client backend' AND
+				  application_name = 'fdw_retry_check' AND
+				  wait_event_type IS NOT NULL AND
+				  wait_event IS NOT NULL;
+		EXIT WHEN proccnt = 0;
+    END LOOP;
+END;
+$$;
+-- Connection retry feature test case 001:
+-- Generate a connection to remote. Local backend will cache it.
+SELECT 1 FROM ft1 LIMIT 1;
+ ?column? 
+----------
+        1
+(1 row)
+
+-- Retrieve pid of remote backend with application name fdw_retry_check and
+-- kill intentionally here. Note that, local backend still has the remote
+-- backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+-- Despite issuing terminate, there are chances that the backend is still not
+-- actually terminated. Hence, wait for the terminated backend to go away.
+CALL wait_for_backend_termination();
+-- Next query using the same foreign server should succeed if connection retry
+-- works.
+SELECT 1 FROM ft1 LIMIT 1;
+ ?column? 
+----------
+        1
+(1 row)
+
+-- Connection retry feature test case 002:
+-- Subtransaction - remote backend is killed in the middle of a remote xact and
+-- we don't do retry connection, hence the subsequent query using the same
+-- foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT 1 FROM ft1 LIMIT 1;
+FETCH c;
+ ?column? 
+----------
+        1
+(1 row)
+
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+ pg_terminate_backend 
+----------------------
+ t
+(1 row)
+
+CALL wait_for_backend_termination();
+SELECT 1 FROM ft1 LIMIT 1;    -- should fail
+ERROR:  server closed the connection unexpectedly
+	This probably means the server terminated abnormally
+	before or while processing the request.
+CONTEXT:  remote SQL command: SAVEPOINT s2
+COMMIT;
+-- Clean up
+DROP PROCEDURE wait_for_backend_termination();
diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql
index 78156d10b4..17246f4176 100644
--- a/contrib/postgres_fdw/sql/postgres_fdw.sql
+++ b/contrib/postgres_fdw/sql/postgres_fdw.sql
@@ -2653,3 +2653,67 @@ SELECT count(*) FROM ft1;
 -- error here
 PREPARE TRANSACTION 'fdw_tpc';
 ROLLBACK;
+
+-- Connection retry feature
+-- Retry cached connections at the beginning of the remote xact in case remote
+-- backend is killed. Let's use a different application name for remote
+-- connection, so that this test will not kill other backends wrongly.
+ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check');
+
+-- This procedure waits until the terminated backend has really gone away. We
+-- also had to check for wait event type and wait event of the terminated
+-- backend's row from pg_stat_activity. Otherwise, the usage of this procedure
+-- in an explicit txn block(BEGIN - END) after terminating the backend would
+-- still have an entry with wait event type wait event null, as we have not yet
+-- committed the txn. The entry of the terminated backend from pg_stat_activity
+-- would be removed only after the txn commit.
+CREATE OR REPLACE PROCEDURE wait_for_backend_termination()
+LANGUAGE plpgsql
+AS $$
+DECLARE proccnt int;
+BEGIN
+    LOOP
+		SELECT count(*) INTO proccnt FROM pg_stat_activity
+				WHERE backend_type = 'client backend' AND
+				  application_name = 'fdw_retry_check' AND
+				  wait_event_type IS NOT NULL AND
+				  wait_event IS NOT NULL;
+		EXIT WHEN proccnt = 0;
+    END LOOP;
+END;
+$$;
+
+-- Connection retry feature test case 001:
+-- Generate a connection to remote. Local backend will cache it.
+SELECT 1 FROM ft1 LIMIT 1;
+
+-- Retrieve pid of remote backend with application name fdw_retry_check and
+-- kill intentionally here. Note that, local backend still has the remote
+-- backend info in it's cache.
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+
+-- Despite issuing terminate, there are chances that the backend is still not
+-- actually terminated. Hence, wait for the terminated backend to go away.
+CALL wait_for_backend_termination();
+
+-- Next query using the same foreign server should succeed if connection retry
+-- works.
+SELECT 1 FROM ft1 LIMIT 1;
+
+-- Connection retry feature test case 002:
+-- Subtransaction - remote backend is killed in the middle of a remote xact and
+-- we don't do retry connection, hence the subsequent query using the same
+-- foreign server should fail.
+BEGIN;
+DECLARE c CURSOR FOR SELECT 1 FROM ft1 LIMIT 1;
+FETCH c;
+SAVEPOINT s;
+SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE
+backend_type = 'client backend' AND application_name = 'fdw_retry_check';
+CALL wait_for_backend_termination();
+SELECT 1 FROM ft1 LIMIT 1;    -- should fail
+COMMIT;
+
+-- Clean up
+DROP PROCEDURE wait_for_backend_termination();
\ No newline at end of file
-- 
2.25.1

Re: Retry Cached Remote Connections for postgres_fdw in case remote backend gets killed/goes away

Reply via email to