On Wed, Sep 30, 2020 at 11:32 PM Fujii Masao <masao.fu...@oss.nttdata.com> wrote: > > > And another way, if we don't want to use wait_pid() is to have a > > plpgsql stored procedure, that in a loop keeps on checking for the > > backed pid from pg_stat_activity, exit when pid is 0. and then proceed > > to issue the next foreign table query. Thoughts? > > +1 for this approach! We can use plpgsql or DO command. >
Used plpgsql procedure as we have to use the procedure 2 times. > > > > > mypid = -1; > > while (mypid != 0) > > SELECT pid INTO mypid FROM pg_stat_activity WHERE backend_type = > > 'client backend' AND application_name = 'fdw_retry_check'; > > Or we can just wait for the number of processes with > appname='fdw_retry_check' to be zero rather than checking the pid. > Done. Attaching v7 patch with above changes. Please review it. With Regards, Bharath Rupireddy. EnterpriseDB: http://www.enterprisedb.com
From e2a85116116e79c76a7b0fcfc1a25eff57627ed4 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddy@enterprisedb.com> Date: Thu, 1 Oct 2020 15:16:26 +0530 Subject: [PATCH v7] Retry Cached Remote Connections For postgres_fdw Remote connections are cached for postgres_fdw in the local backend. There are high chances that the remote backend may not be available i.e. it can get killed, the subsequent foreign queries from local backend/session that use cached connection will fail as the remote backend would have become unavailable. This patch proposes a retry mechanism. Local backend/session uses cached connection and if it receives connection bad error, it deletes the cached entry and retry to get a new connection. This retry happens only at the start of remote xact. --- contrib/postgres_fdw/connection.c | 55 +++++++++--- .../postgres_fdw/expected/postgres_fdw.out | 85 +++++++++++++++++++ contrib/postgres_fdw/sql/postgres_fdw.sql | 64 ++++++++++++++ 3 files changed, 192 insertions(+), 12 deletions(-) diff --git a/contrib/postgres_fdw/connection.c b/contrib/postgres_fdw/connection.c index 08daf26fdf..441124963f 100644 --- a/contrib/postgres_fdw/connection.c +++ b/contrib/postgres_fdw/connection.c @@ -108,6 +108,7 @@ PGconn * GetConnection(UserMapping *user, bool will_prep_stmt) { bool found; + volatile bool retry_conn = false; ConnCacheEntry *entry; ConnCacheKey key; @@ -159,23 +160,25 @@ GetConnection(UserMapping *user, bool will_prep_stmt) /* Reject further use of connections which failed abort cleanup. */ pgfdw_reject_incomplete_xact_state_change(entry); +retry: /* * If the connection needs to be remade due to invalidation, disconnect as - * soon as we're out of all transactions. + * soon as we're out of all transactions. Also, if previous attempt to + * start new remote transaction failed on the cached connection, disconnect + * it to retry a new connection. */ - if (entry->conn != NULL && entry->invalidated && entry->xact_depth == 0) + if ((entry->conn != NULL && entry->invalidated && + entry->xact_depth == 0) || retry_conn) { - elog(DEBUG3, "closing connection %p for option changes to take effect", - entry->conn); + if (retry_conn) + elog(DEBUG3, "closing connection %p to reestablish a new one", + entry->conn); + else + elog(DEBUG3, "closing connection %p for option changes to take effect", + entry->conn); disconnect_pg_server(entry); } - /* - * We don't check the health of cached connection here, because it would - * require some overhead. Broken connection will be detected when the - * connection is actually used. - */ - /* * If cache entry doesn't have a connection, we have to establish a new * connection. (If connect_pg_server throws an error, the cache entry @@ -206,9 +209,37 @@ GetConnection(UserMapping *user, bool will_prep_stmt) } /* - * Start a new transaction or subtransaction if needed. + * We check the health of the cached connection here, while the remote xact + * is going to begin. Broken connection will be detected and the + * connection is retried. We do this only once during each begin remote + * xact call, if the connection is still broken after the retry attempt, + * then the query will fail. */ - begin_remote_xact(entry); + PG_TRY(); + { + /* Start a new transaction or subtransaction if needed. */ + begin_remote_xact(entry); + retry_conn = false; + } + PG_CATCH(); + { + if (!entry->conn || + PQstatus(entry->conn) != CONNECTION_BAD || + entry->xact_depth > 0 || + retry_conn) + PG_RE_THROW(); + retry_conn = true; + } + PG_END_TRY(); + + if (retry_conn) + { + ereport(DEBUG3, + (errmsg("could not start remote transaction on connection %p", + entry->conn)), + errdetail_internal("%s", pchomp(PQerrorMessage(entry->conn)))); + goto retry; + } /* Remember if caller will prepare statements */ entry->have_prep_stmt |= will_prep_stmt; diff --git a/contrib/postgres_fdw/expected/postgres_fdw.out b/contrib/postgres_fdw/expected/postgres_fdw.out index 10e23d02ed..09d0578d05 100644 --- a/contrib/postgres_fdw/expected/postgres_fdw.out +++ b/contrib/postgres_fdw/expected/postgres_fdw.out @@ -8987,3 +8987,88 @@ PREPARE TRANSACTION 'fdw_tpc'; ERROR: cannot PREPARE a transaction that has operated on postgres_fdw foreign tables ROLLBACK; WARNING: there is no transaction in progress +-- Connection retry feature +-- Retry cached connections at the beginning of the remote xact in case remote +-- backend is killed. Let's use a different application name for remote +-- connection, so that this test will not kill other backends wrongly. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); +-- This procedure waits until the terminated backend has really gone away. We +-- also had to check for wait event type and wait event of the terminated +-- backend's row from pg_stat_activity. Otherwise, the usage of this procedure +-- in an explicit txn block(BEGIN - END) after terminating the backend would +-- still have an entry with wait event type wait event null, as we have not yet +-- committed the txn. The entry of the terminated backend from pg_stat_activity +-- would be removed only after the txn commit. +CREATE OR REPLACE PROCEDURE wait_for_backend_termination() +LANGUAGE plpgsql +AS $$ +DECLARE proccnt int; +BEGIN + LOOP + SELECT count(*) INTO proccnt FROM pg_stat_activity + WHERE backend_type = 'client backend' AND + application_name = 'fdw_retry_check' AND + wait_event_type IS NOT NULL AND + wait_event IS NOT NULL; + EXIT WHEN proccnt = 0; + END LOOP; +END; +$$; +-- Connection retry feature test case 001: +-- Generate a connection to remote. Local backend will cache it. +SELECT 1 FROM ft1 LIMIT 1; + ?column? +---------- + 1 +(1 row) + +-- Retrieve pid of remote backend with application name fdw_retry_check and +-- kill intentionally here. Note that, local backend still has the remote +-- backend info in it's cache. +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + pg_terminate_backend +---------------------- + t +(1 row) + +-- Despite issuing terminate, there are chances that the backend is still not +-- actually terminated. Hence, wait for the terminated backend to go away. +CALL wait_for_backend_termination(); +-- Next query using the same foreign server should succeed if connection retry +-- works. +SELECT 1 FROM ft1 LIMIT 1; + ?column? +---------- + 1 +(1 row) + +-- Connection retry feature test case 002: +-- Subtransaction - remote backend is killed in the middle of a remote xact and +-- we don't do retry connection, hence the subsequent query using the same +-- foreign server should fail. +BEGIN; +DECLARE c CURSOR FOR SELECT 1 FROM ft1 LIMIT 1; +FETCH c; + ?column? +---------- + 1 +(1 row) + +SAVEPOINT s; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + pg_terminate_backend +---------------------- + t +(1 row) + +CALL wait_for_backend_termination(); +SELECT 1 FROM ft1 LIMIT 1; -- should fail +ERROR: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +CONTEXT: remote SQL command: SAVEPOINT s2 +COMMIT; +-- Clean up +DROP PROCEDURE wait_for_backend_termination(); diff --git a/contrib/postgres_fdw/sql/postgres_fdw.sql b/contrib/postgres_fdw/sql/postgres_fdw.sql index 78156d10b4..17246f4176 100644 --- a/contrib/postgres_fdw/sql/postgres_fdw.sql +++ b/contrib/postgres_fdw/sql/postgres_fdw.sql @@ -2653,3 +2653,67 @@ SELECT count(*) FROM ft1; -- error here PREPARE TRANSACTION 'fdw_tpc'; ROLLBACK; + +-- Connection retry feature +-- Retry cached connections at the beginning of the remote xact in case remote +-- backend is killed. Let's use a different application name for remote +-- connection, so that this test will not kill other backends wrongly. +ALTER SERVER loopback OPTIONS (application_name 'fdw_retry_check'); + +-- This procedure waits until the terminated backend has really gone away. We +-- also had to check for wait event type and wait event of the terminated +-- backend's row from pg_stat_activity. Otherwise, the usage of this procedure +-- in an explicit txn block(BEGIN - END) after terminating the backend would +-- still have an entry with wait event type wait event null, as we have not yet +-- committed the txn. The entry of the terminated backend from pg_stat_activity +-- would be removed only after the txn commit. +CREATE OR REPLACE PROCEDURE wait_for_backend_termination() +LANGUAGE plpgsql +AS $$ +DECLARE proccnt int; +BEGIN + LOOP + SELECT count(*) INTO proccnt FROM pg_stat_activity + WHERE backend_type = 'client backend' AND + application_name = 'fdw_retry_check' AND + wait_event_type IS NOT NULL AND + wait_event IS NOT NULL; + EXIT WHEN proccnt = 0; + END LOOP; +END; +$$; + +-- Connection retry feature test case 001: +-- Generate a connection to remote. Local backend will cache it. +SELECT 1 FROM ft1 LIMIT 1; + +-- Retrieve pid of remote backend with application name fdw_retry_check and +-- kill intentionally here. Note that, local backend still has the remote +-- backend info in it's cache. +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; + +-- Despite issuing terminate, there are chances that the backend is still not +-- actually terminated. Hence, wait for the terminated backend to go away. +CALL wait_for_backend_termination(); + +-- Next query using the same foreign server should succeed if connection retry +-- works. +SELECT 1 FROM ft1 LIMIT 1; + +-- Connection retry feature test case 002: +-- Subtransaction - remote backend is killed in the middle of a remote xact and +-- we don't do retry connection, hence the subsequent query using the same +-- foreign server should fail. +BEGIN; +DECLARE c CURSOR FOR SELECT 1 FROM ft1 LIMIT 1; +FETCH c; +SAVEPOINT s; +SELECT pg_terminate_backend(pid) FROM pg_stat_activity WHERE +backend_type = 'client backend' AND application_name = 'fdw_retry_check'; +CALL wait_for_backend_termination(); +SELECT 1 FROM ft1 LIMIT 1; -- should fail +COMMIT; + +-- Clean up +DROP PROCEDURE wait_for_backend_termination(); \ No newline at end of file -- 2.25.1