Hello hackers,

As a recent failure produced by drongo [1] shows, a pg_ctl stop/start sequence may break on Windows due to the transient DELETE_PENDING state of postmaster.pid.
Please look at this excerpt from the failure log:
...
pg_createsubscriber: stopping the subscriber
2024-08-19 18:02:47.608 UTC [6988:4] LOG: received fast shutdown request
2024-08-19 18:02:47.608 UTC [6988:5] LOG: aborting any active transactions
2024-08-19 18:02:47.612 UTC [5884:2] FATAL: terminating walreceiver process due to administrator command
2024-08-19 18:02:47.705 UTC [7036:1] LOG: shutting down
pg_createsubscriber: server was stopped
### the server instance (1) has emitted only "shutting down" so far, but pg_ctl
### considered it stopped and returned 0 to pg_createsubscriber
[18:02:47.900](2.828s) ok 29 - run pg_createsubscriber without --databases
...
pg_createsubscriber: starting the standby with command-line options
pg_createsubscriber: pg_ctl command is: ...
2024-08-19 18:02:48.163 UTC [5284:1] FATAL: could not create lock file "postmaster.pid": File exists
pg_createsubscriber: server was started
pg_createsubscriber: checking settings on subscriber
### pg_createsubscriber attempts to start a new server instance (2), but
### it fails because "postmaster.pid" is still found on disk
2024-08-19 18:02:48.484 UTC [6988:6] LOG: database system is shut down
### the server instance (1) is finally stopped and postmaster.pid is unlinked

With extra debug logging and the ntries limit decreased to 10 (in CreateLockFile()), I reproduced the failure easily (when running 20 tests in parallel) and got additional information (see attached).

IIUC, the issue is caused by inconsistent checks for the existence of postmaster.pid:

"pg_ctl stop" ... -> get_pgpid() calls fopen(pid_file, "r"), which fails with ENOENT for the DELETE_PENDING state (see pgwin32_open_handle()).

"pg_ctl start" ... -> CreateLockFile() calls fd = open(filename, O_RDWR | O_CREAT | O_EXCL, pg_file_create_mode); which fails with EEXIST for the same state of postmaster.pid.

[1] https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=drongo&dt=2024-08-19%2017%3A32%3A54

Best regards,
Alexander
diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index 537d92c0cf..570d2d2557 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1219,11 +1219,13 @@ CreateLockFile(const char *filename, bool amPostmaster, /* * Couldn't create the pid file. Probably it already exists. */ - if ((errno != EEXIST && errno != EACCES) || ntries > 100) + if ((errno != EEXIST && errno != EACCES) || ntries > 10) +{ ereport(FATAL, (errcode_for_file_access(), - errmsg("could not create lock file \"%s\": %m", - filename))); + errmsg("could not create lock file (ntries: %d) \"%s\": %m", + ntries, filename))); +} /* * Read the file to get the old owner's PID. Note race condition diff --git a/src/bin/pg_basebackup/pg_createsubscriber.c b/src/bin/pg_basebackup/pg_createsubscriber.c index 6295783cde..583ed7d449 100644 --- a/src/bin/pg_basebackup/pg_createsubscriber.c +++ b/src/bin/pg_basebackup/pg_createsubscriber.c @@ -1479,7 +1479,7 @@ stop_standby_server(const char *datadir) char *pg_ctl_cmd; int rc; - pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" -s", pg_ctl_path, + pg_ctl_cmd = psprintf("\"%s\" stop -D \"%s\" ", pg_ctl_path, datadir); pg_log_debug("pg_ctl command is: %s", pg_ctl_cmd); rc = system(pg_ctl_cmd); diff --git a/src/bin/pg_ctl/pg_ctl.c b/src/bin/pg_ctl/pg_ctl.c index e7e878c22f..04787c6aec 100644 --- a/src/bin/pg_ctl/pg_ctl.c +++ b/src/bin/pg_ctl/pg_ctl.c @@ -276,6 +276,9 @@ get_pgpid(bool is_status_request) pidf = fopen(pid_file, "r"); if (pidf == NULL) { +int en = errno; +write_stderr("!!!get_pgpid| pid_file: %s, pidf: %p, errno: %d\n", pid_file, pidf, en); +errno = en; /* No pid file, not an error on startup */ if (errno == ENOENT) return 0; @@ -723,7 +726,10 @@ wait_for_postmaster_stop(void) pid_t pid; if ((pid = get_pgpid(false)) == 0) +{ +write_stderr("!!!wait_for_postmaster_stop| pid: %d\n", pid); return true; /* pid file is gone */ +} if (kill(pid, 0) != 0) { diff --git a/src/port/open.c 
b/src/port/open.c index 13e49af8d4..f7cbc819c0 100644 --- a/src/port/open.c +++ b/src/port/open.c @@ -138,8 +138,14 @@ pgwin32_open_handle(const char *fileName, int fileFlags, bool backup_semantics) * invisible. With O_CREAT, we have no choice but to report that * there's a file in the way (which wouldn't happen on Unix). */ +DWORD ntstat = pg_RtlGetLastNtStatus(); +if (strstr(fileName, "postmaster.pid") != NULL) +{ +fprintf(stderr, "!!!pgwin32_open_handle| fileFlags: %X, err: %d, ntstatus: %X\n", fileFlags, err, ntstat); +} + if (err == ERROR_ACCESS_DENIED && - pg_RtlGetLastNtStatus() == STATUS_DELETE_PENDING) + ntstat == STATUS_DELETE_PENDING) { if (fileFlags & O_CREAT) err = ERROR_FILE_EXISTS; @@ -214,6 +220,12 @@ pgwin32_fopen(const char *fileName, const char *mode) openmode |= O_TEXT; fd = pgwin32_open(fileName, openmode); +if (strstr(fileName, "postmaster.pid") != NULL) +{ +int en = errno; +fprintf(stderr, "!!!pgwin32_fopen| fileName: %s, fd: %d, errno: %d\n", fileName, fd, en); +errno = en; +} if (fd == -1) return NULL; return _fdopen(fd, mode);
regress_log_040_pg_createsubscriber.tar.bz2
Description: application/bzip