Hello Alvaro,

Let me show you another related anomaly, which drongo kindly discovered
recently: [1]. That test failed with:
 SELECT dblink_cancel_query('dtest1');
- dblink_cancel_query
----------------------
- OK
+   dblink_cancel_query
+--------------------------
+ cancel request timed out
 (1 row)

I've managed to reproduce this when running 20 dblink tests in parallel,
and with extra logging added (see attached) I've got:
...
2024-08-28 10:17:12.949 PDT [8236:204] pg_regress/dblink LOG: statement: SELECT 
dblink_cancel_query('dtest1');
!!!PQcancelPoll|8236| conn->status: 2
!!!PQcancelPoll|8236| conn->status: 3
!!!PQconnectPoll|8236| before pqPacketSend(..., &cancelpacket, ...)
!!!pqPacketSend|8236| before pqFlush
!!!pqsecure_raw_write|8236| could not send data to server: Socket is not 
connected (0x00002749/10057)
!!!pqPacketSend|8236| after pqFlush, STATUS_OK
!!!PQconnectPoll|8236| after pqPacketSend, STATUS_OK
2024-08-28 10:17:12.950 PDT [5548:7] pg_regress LOG:  statement: select * from foo 
where f1 < 3
2024-08-28 10:17:12.951 PDT [8692:157] DEBUG:  forked new backend, pid=4644 
socket=5160
2024-08-28 10:17:12.973 PDT [4644:1] [unknown] LOG:  connection received: 
host=::1 port=55073
2024-08-28 10:17:12.973 PDT [4644:2] [unknown] LOG: !!!BackendInitialize| 
before ProcessSSLStartup()
!!!PQcancelPoll|8236| conn->status: 4
!!!PQcancelPoll|8236| conn->status: 4
2024-08-28 10:17:24.060 PDT [1436:1] DEBUG:  snapshot of 0+0 running transaction ids (lsn 0/194C4E0 oldest xid 780 latest complete 779 next xid 780)
!!!PQcancelPoll|8236| conn->status: 4
2024-08-28 10:17:42.951 PDT [4644:3] [unknown] LOG: !!!BackendInitialize| 
ProcessSSLStartup() returned -1
2024-08-28 10:17:42.951 PDT [4644:4] [unknown] DEBUG: shmem_exit(0): 0 
before_shmem_exit callbacks to make
...

Thus, pqsecure_raw_write(), called via PQcancelPoll() -> PQconnectPoll() ->
pqPacketSend() -> pqFlush) -> pqSendSome() -> pqsecure_write(), returned
the WSAENOTCONN error, but it wasn't noticed at upper levels.
Consequently, the cancelling backend waited for the cancel packet that was
never sent.

The first commit, that I could reproduce this test failure on, is 2466d6654.

[1] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-26%2021%3A35%3A04

Best regards,
Alexander
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c
index cfa2755196..fc9abbfe53 100644
--- a/src/backend/tcop/backend_startup.c
+++ b/src/backend/tcop/backend_startup.c
@@ -252,8 +252,10 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac)
 	RegisterTimeout(STARTUP_PACKET_TIMEOUT, StartupPacketTimeoutHandler);
 	enable_timeout_after(STARTUP_PACKET_TIMEOUT, AuthenticationTimeout * 1000);
 
+elog(LOG, "!!!BackendInitialize| before ProcessSSLStartup()", status);
 	/* Handle direct SSL handshake */
 	status = ProcessSSLStartup(port);
+elog(LOG, "!!!BackendInitialize| ProcessSSLStartup() returned %d", status);
 
 	/*
 	 * Receive the startup packet (which might turn out to be a cancel request
diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c
index 213a6f43c2..8482dfa3e8 100644
--- a/src/interfaces/libpq/fe-cancel.c
+++ b/src/interfaces/libpq/fe-cancel.c
@@ -209,6 +209,7 @@ PQcancelPoll(PGcancelConn *cancelConn)
 	PGconn	   *conn = &cancelConn->conn;
 	int			n;
 
+fprintf(stderr, "!!!PQcancelPoll|%d| conn->status: %d\n", getpid(), conn->status);
 	/*
 	 * We leave most of the connection establishment to PQconnectPoll, since
 	 * it's very similar to normal connection establishment. But once we get
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index 360d9a4547..72a31d29ed 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -3407,12 +3407,14 @@ keep_going:						/* We will come back to here until there is
 					cancelpacket.cancelRequestCode = (MsgType) pg_hton32(CANCEL_REQUEST_CODE);
 					cancelpacket.backendPID = pg_hton32(conn->be_pid);
 					cancelpacket.cancelAuthCode = pg_hton32(conn->be_key);
+fprintf(stderr, "!!!PQconnectPoll|%d| before pqPacketSend(..., &cancelpacket, ...)\n", getpid());
 					if (pqPacketSend(conn, 0, &cancelpacket, packetlen) != STATUS_OK)
 					{
 						libpq_append_conn_error(conn, "could not send cancel packet: %s",
 												SOCK_STRERROR(SOCK_ERRNO, sebuf, sizeof(sebuf)));
 						goto error_return;
 					}
+fprintf(stderr, "!!!PQconnectPoll|%d| after pqPacketSend, STATUS_OK\n", getpid());
 					conn->status = CONNECTION_AWAITING_RESPONSE;
 					return PGRES_POLLING_READING;
 				}
@@ -5012,9 +5014,13 @@ pqPacketSend(PGconn *conn, char pack_type,
 	if (pqPutMsgEnd(conn))
 		return STATUS_ERROR;
 
+if (buf_len == 12)
+fprintf(stderr, "!!!pqPacketSend|%d| before pqFlush\n", getpid());
 	/* Flush to ensure backend gets it. */
 	if (pqFlush(conn))
 		return STATUS_ERROR;
+if (buf_len == 12)
+fprintf(stderr, "!!!pqPacketSend|%d| after pqFlush, STATUS_OK\n", getpid());
 
 	return STATUS_OK;
 }
diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c
index f628082337..1a85d9a40f 100644
--- a/src/interfaces/libpq/fe-secure.c
+++ b/src/interfaces/libpq/fe-secure.c
@@ -421,6 +421,7 @@ retry_masked:
 									   sebuf, sizeof(sebuf)));
 				/* keep newline out of translated string */
 				strlcat(msgbuf, "\n", sizeof(msgbuf));
+fprintf(stderr, "!!!pqsecure_raw_write|%d| %s", getpid(), msgbuf);
 				conn->write_err_msg = strdup(msgbuf);
 				/* Now claim the write succeeded */
 				n = len;

Reply via email to