Hello hackers,

As a recent failure produced by drongo [1] shows, a pg_ctl stop/start
sequence may break on Windows due to the transient DELETE_PENDING state of
postmaster.pid.

Please look at the excerpt from the failure log:
...
pg_createsubscriber: stopping the subscriber
2024-08-19 18:02:47.608 UTC [6988:4] LOG:  received fast shutdown request
2024-08-19 18:02:47.608 UTC [6988:5] LOG:  aborting any active transactions
2024-08-19 18:02:47.612 UTC [5884:2] FATAL:  terminating walreceiver process 
due to administrator command
2024-08-19 18:02:47.705 UTC [7036:1] LOG:  shutting down
pg_createsubscriber: server was stopped
### the server instance (1) has emitted only "shutting down" so far, but pg_ctl
### considered it stopped and returned 0 to pg_createsubscriber
[18:02:47.900](2.828s) ok 29 - run pg_createsubscriber without --databases
...
pg_createsubscriber: starting the standby with command-line options
pg_createsubscriber: pg_ctl command is: ...
2024-08-19 18:02:48.163 UTC [5284:1] FATAL:  could not create lock file 
"postmaster.pid": File exists
pg_createsubscriber: server was started
pg_createsubscriber: checking settings on subscriber
### pg_createsubscriber attempts to start a new server instance (2), but
### it fails because "postmaster.pid" is still found on disk
2024-08-19 18:02:48.484 UTC [6988:6] LOG:  database system is shut down
### the server instance (1) is finally stopped and postmaster.pid unlinked

With extra debug logging and the ntries limit decreased to 10 (in
CreateLockFile()), I reproduced the failure easily (when running 20 tests
in parallel) and got additional information (see attached).

IIUC, the issue is caused by inconsistent checks for postmaster.pid
existence:
"pg_ctl stop" ... -> get_pgpid() calls fopen(pid_file, "r"),
 which fails with ENOENT for the DELETE_PENDING state (see
 pgwin32_open_handle()).

"pg_ctl start" ... -> CreateLockFile() calls
    fd = open(filename, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode);
which fails with EEXIST for the same state of postmaster.pid.

[1] 
https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-19%2017%3A32%3A54

Best regards,
Alexander
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c
index 537d92c0cf..570d2d2557 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1219,11 +1219,13 @@ CreateLockFile(const char *filename, bool amPostmaster,
 		/*
 		 * Couldn't create the pid file. Probably it already exists.
 		 */
-		if ((errno != EEXIST && errno != EACCES) || ntries > 100)
+		if ((errno != EEXIST && errno != EACCES) || ntries > 10)
+{
 			ereport(FATAL,
 					(errcode_for_file_access(),
-					 errmsg("could not create lock file \"%s\": %m",
-							filename)));
+					 errmsg("could not create lock file (ntries: %d) \"%s\": %m",
+							ntries, filename)));
+}
 
 		/*
 		 * Read the file to get the old owner's PID.  Note race condition
diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c
index 6295783cde..583ed7d449 100644
--- a/src/bin/pg_basebackup/pg_createsubscriber.c
+++ b/src/bin/pg_basebackup/pg_createsubscriber.c
@@ -1479,7 +1479,7 @@ stop_standby_server(const char *datadir)
 	char	   *pg_ctl_cmd;
 	int			rc;
 
-	pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" -s", pg_ctl_path,
+	pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" ", pg_ctl_path,
 						  datadir);
 	pg_log_debug("pg_ctl command is: %s", pg_ctl_cmd);
 	rc = system(pg_ctl_cmd);
diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c
index e7e878c22f..04787c6aec 100644
--- a/src/bin/pg_ctl/pg_ctl.c
+++ b/src/bin/pg_ctl/pg_ctl.c
@@ -276,6 +276,9 @@ get_pgpid(bool is_status_request)
 	pidf = fopen(pid_file, "r");
 	if (pidf == NULL)
 	{
+int en = errno;
+write_stderr("!!!get_pgpid| pid_file: %s, pidf: %p, errno: %d\n", pid_file, pidf, en);
+errno = en;
 		/* No pid file, not an error on startup */
 		if (errno == ENOENT)
 			return 0;
@@ -723,7 +726,10 @@ wait_for_postmaster_stop(void)
 		pid_t		pid;
 
 		if ((pid = get_pgpid(false)) == 0)
+{
+write_stderr("!!!wait_for_postmaster_stop| pid: %d\n", pid);
 			return true;		/* pid file is gone */
+}
 
 		if (kill(pid, 0) != 0)
 		{
diff --git a/src/port/open.c b/src/port/open.c
index 13e49af8d4..f7cbc819c0 100644
--- a/src/port/open.c
+++ b/src/port/open.c
@@ -138,8 +138,14 @@ pgwin32_open_handle(const char *fileName, int fileFlags, bool backup_semantics)
 		 * invisible.  With O_CREAT, we have no choice but to report that
 		 * there's a file in the way (which wouldn't happen on Unix).
 		 */
+DWORD ntstat = pg_RtlGetLastNtStatus();
+if (strstr(fileName, "postmaster.pid") != NULL)
+{
+fprintf(stderr, "!!!pgwin32_open_handle| fileFlags: %X, err: %d, ntstatus: %X\n", fileFlags, err, ntstat);
+}
+
 		if (err == ERROR_ACCESS_DENIED &&
-			pg_RtlGetLastNtStatus() == STATUS_DELETE_PENDING)
+			ntstat == STATUS_DELETE_PENDING)
 		{
 			if (fileFlags & O_CREAT)
 				err = ERROR_FILE_EXISTS;
@@ -214,6 +220,12 @@ pgwin32_fopen(const char *fileName, const char *mode)
 		openmode |= O_TEXT;
 
 	fd = pgwin32_open(fileName, openmode);
+if (strstr(fileName, "postmaster.pid") != NULL)
+{
+int en = errno;
+fprintf(stderr, "!!!pgwin32_fopen| fileName: %s, fd: %d, errno: %d\n", fileName, fd, en);
+errno = en;
+}
 	if (fd == -1)
 		return NULL;
 	return _fdopen(fd, mode);

Attachment: regress_log_040_pg_createsubscriber.tar.bz2
Description: application/bzip

Reply via email to