Hello Alvaro, Let me show you another related anomaly, which drongo kindly discovered recently: [1]. That test failed with: SELECT dblink_cancel_query('dtest1'); - dblink_cancel_query ---------------------- - OK + dblink_cancel_query +-------------------------- + cancel request timed out (1 row)
I've managed to reproduce this when running 20 dblink tests in parallel, and with extra logging added (see attached) I've got: ... 2024-08-28 10:17:12.949 PDT [8236:204] pg_regress/dblink LOG: statement: SELECT dblink_cancel_query('dtest1'); !!!PQcancelPoll|8236| conn->status: 2 !!!PQcancelPoll|8236| conn->status: 3 !!!PQconnectPoll|8236| before pqPacketSend(..., &cancelpacket, ...) !!!pqPacketSend|8236| before pqFlush !!!pqsecure_raw_write|8236| could not send data to server: Socket is not connected (0x00002749/10057) !!!pqPacketSend|8236| after pqFlush, STATUS_OK !!!PQconnectPoll|8236| after pqPacketSend, STATUS_OK 2024-08-28 10:17:12.950 PDT [5548:7] pg_regress LOG: statement: select * from foo where f1 < 3 2024-08-28 10:17:12.951 PDT [8692:157] DEBUG: forked new backend, pid=4644 socket=5160 2024-08-28 10:17:12.973 PDT [4644:1] [unknown] LOG: connection received: host=::1 port=55073 2024-08-28 10:17:12.973 PDT [4644:2] [unknown] LOG: !!!BackendInitialize| before ProcessSSLStartup() !!!PQcancelPoll|8236| conn->status: 4 !!!PQcancelPoll|8236| conn->status: 42024-08-28 10:17:24.060 PDT [1436:1] DEBUG: snapshot of 0+0 running transaction ids (lsn 0/194C4E0 oldest xid 780 latest complete 779 next xid 780)
!!!PQcancelPoll|8236| conn->status: 4 2024-08-28 10:17:42.951 PDT [4644:3] [unknown] LOG: !!!BackendInitialize| ProcessSSLStartup() returned -1 2024-08-28 10:17:42.951 PDT [4644:4] [unknown] DEBUG: shmem_exit(0): 0 before_shmem_exit callbacks to make ... Thus, pqsecure_raw_write(), called via PQcancelPoll() -> PQconnectPoll() -> pqPacketSend() -> pqFlush) -> pqSendSome() -> pqsecure_write(), returned the WSAENOTCONN error, but it wasn't noticed at upper levels. Consequently, the cancelling backend waited for the cancel packet that was never sent. The first commit, that I could reproduce this test failure on, is 2466d6654. [1] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-26%2021%3A35%3A04 Best regards, Alexander
diff --git a/src/backend/tcop/backend_startup.c b/src/backend/tcop/backend_startup.c index cfa2755196..fc9abbfe53 100644 --- a/src/backend/tcop/backend_startup.c +++ b/src/backend/tcop/backend_startup.c @@ -252,8 +252,10 @@ BackendInitialize(ClientSocket *client_sock, CAC_state cac) RegisterTimeout(STARTUP_PACKET_TIMEOUT, StartupPacketTimeoutHandler); enable_timeout_after(STARTUP_PACKET_TIMEOUT, AuthenticationTimeout * 1000); +elog(LOG, "!!!BackendInitialize| before ProcessSSLStartup()", status); /* Handle direct SSL handshake */ status = ProcessSSLStartup(port); +elog(LOG, "!!!BackendInitialize| ProcessSSLStartup() returned %d", status); /* * Receive the startup packet (which might turn out to be a cancel request diff --git a/src/interfaces/libpq/fe-cancel.c b/src/interfaces/libpq/fe-cancel.c index 213a6f43c2..8482dfa3e8 100644 --- a/src/interfaces/libpq/fe-cancel.c +++ b/src/interfaces/libpq/fe-cancel.c @@ -209,6 +209,7 @@ PQcancelPoll(PGcancelConn *cancelConn) PGconn *conn = &cancelConn->conn; int n; +fprintf(stderr, "!!!PQcancelPoll|%d| conn->status: %d\n", getpid(), conn->status); /* * We leave most of the connection establishment to PQconnectPoll, since * it's very similar to normal connection establishment. But once we get diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c index 360d9a4547..72a31d29ed 100644 --- a/src/interfaces/libpq/fe-connect.c +++ b/src/interfaces/libpq/fe-connect.c @@ -3407,12 +3407,14 @@ keep_going: /* We will come back to here until there is cancelpacket.cancelRequestCode = (MsgType) pg_hton32(CANCEL_REQUEST_CODE); cancelpacket.backendPID = pg_hton32(conn->be_pid); cancelpacket.cancelAuthCode = pg_hton32(conn->be_key); +fprintf(stderr, "!!!PQconnectPoll|%d| before pqPacketSend(..., &cancelpacket, ...)\n", getpid()); if (pqPacketSend(conn, 0, &cancelpacket, packetlen) != STATUS_OK) { libpq_append_conn_error(conn, "could not send cancel packet: %s", SOCK_STRERROR(SOCK_ERRNO, sebuf, sizeof(sebuf))); goto error_return; } +fprintf(stderr, "!!!PQconnectPoll|%d| after pqPacketSend, STATUS_OK\n", getpid()); conn->status = CONNECTION_AWAITING_RESPONSE; return PGRES_POLLING_READING; } @@ -5012,9 +5014,13 @@ pqPacketSend(PGconn *conn, char pack_type, if (pqPutMsgEnd(conn)) return STATUS_ERROR; +if (buf_len == 12) +fprintf(stderr, "!!!pqPacketSend|%d| before pqFlush\n", getpid()); /* Flush to ensure backend gets it. */ if (pqFlush(conn)) return STATUS_ERROR; +if (buf_len == 12) +fprintf(stderr, "!!!pqPacketSend|%d| after pqFlush, STATUS_OK\n", getpid()); return STATUS_OK; } diff --git a/src/interfaces/libpq/fe-secure.c b/src/interfaces/libpq/fe-secure.c index f628082337..1a85d9a40f 100644 --- a/src/interfaces/libpq/fe-secure.c +++ b/src/interfaces/libpq/fe-secure.c @@ -421,6 +421,7 @@ retry_masked: sebuf, sizeof(sebuf))); /* keep newline out of translated string */ strlcat(msgbuf, "\n", sizeof(msgbuf)); +fprintf(stderr, "!!!pqsecure_raw_write|%d| %s", getpid(), msgbuf); conn->write_err_msg = strdup(msgbuf); /* Now claim the write succeeded */ n = len;