Hi, On 2024-12-12 14:14:20 +1100, Michael Harris wrote: > On Thu, 12 Dec 2024 at 10:50, Andres Freund <and...@anarazel.de> wrote: > > Just to make sure - you're absolutely certain that you actually have space > > at > > the time of the errors? > > As sure as I can be. The RHEL8 system that I took prints from > yesterday has > 1.5TB free. I can't see it varying by that much.
That does seem unlikely, but it'd probably still be worth monitoring by how much it varies. > It does look as though the system needs to be quite full to provoke > this problem. The systems I have looked at so far have >90% full > filesystems. > > Another interesting snippet: the application has a number of ETL > workers going at once. The actual number varies depending on a number > of factors but might be somewhere from 10 - 150. Each worker will have > a single postgres backend that they are feeding data to. Are they all inserting into distinct tables/partitions or into shared tables? > At the time of the error, it is not the case that all ETL workers > strike it at once - it looks like a lot of the time only a single > worker is affected, or at most a handful of workers. I can't see for > sure what the other workers were doing at the time, but I would expect > they were all importing data as well. When you say that they're not "all striking it at once", do you mean that some of them aren't interacting with the database at the time, or that they're not erroring out? > > If I were to provide you with a patch that showed the amount of free disk > > space at the time of an error, the size of the relation etc, could you > > reproduce the issue with it applied? Or is that unrealistic? > > I have not been able to reproduce it on demand, and so far it has only > happened in production systems. > > As long as the patch doesn't degrade normal performance it should be > possible to deploy it to one of the systems that is regularly > reporting the error, although it might take a while to get approval to > do that. Cool. The patch only has an effect in the branches reporting out-of-space errors, so there's no overhead during normal operation. And the additional detail doesn't have much overhead in the error case either. I attached separate patches for 16, 17 and master, as there are some minor conflicts between the versions. Greetings, Andres Freund
>From c8ecdff54fcdbd2cf89ca7888f641db369f207ce Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 12 Dec 2024 12:57:12 -0500 Subject: [PATCH] md: Report more detail when encountering ENOSPC during extension Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- meson.build | 1 + configure.ac | 1 + src/include/pg_config.h.in | 3 ++ src/backend/storage/smgr/md.c | 63 +++++++++++++++++++++++++++++++---- configure | 2 +- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/meson.build b/meson.build index 4e59feb91da..e644db41ef9 100644 --- a/meson.build +++ b/meson.build @@ -2269,6 +2269,7 @@ header_checks = [ 'sys/procctl.h', 'sys/signalfd.h', 'sys/ucred.h', + 'sys/vfs.h', 'termios.h', 'ucred.h', ] diff --git a/configure.ac b/configure.ac index 23add80d8fd..0984949a3b9 100644 --- a/configure.ac +++ b/configure.ac @@ -1512,6 +1512,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/procctl.h sys/signalfd.h sys/ucred.h + sys/vfs.h termios.h ucred.h ])) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index ce3063b2b22..626de538821 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -478,6 +478,9 @@ /* Define to 1 if you have the <sys/ucred.h> header file. */ #undef HAVE_SYS_UCRED_H +/* Define to 1 if you have the <sys/vfs.h> header file. */ +#undef HAVE_SYS_VFS_H + /* Define to 1 if you have the <termios.h> header file. 
*/ #undef HAVE_TERMIOS_H diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index fdecbad1709..67c42a69c11 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -24,6 +24,9 @@ #include <unistd.h> #include <fcntl.h> #include <sys/file.h> +#ifdef HAVE_SYS_VFS_H +#include <sys/vfs.h> +#endif #include "access/xlog.h" #include "access/xlogutils.h" @@ -449,6 +452,37 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) pfree(path); } +static void +report_disk_space(const char *reason, const char *path) +{ + /* + * I'm sure there's a way to do this on other OSs too, but for the + * debugging here this should be sufficient. + */ +#ifdef HAVE_SYS_VFS_H + int saved_errno = errno; + struct statfs sf; + int ret; + + ret = statfs(path, &sf); + + if (ret != 0) + elog(WARNING, "%s: statfs failed: %m", reason); + else + elog(LOG, "%s: free space for filesystem containing \"%s\" " + "f_blocks: %llu, f_bfree: %llu, f_bavail: %llu " + "f_files: %llu, f_ffree: %llu", + reason, path, + (long long unsigned) sf.f_blocks, + (long long unsigned) sf.f_bfree, + (long long unsigned) sf.f_bavail, + (long long unsigned) sf.f_files, + (long long unsigned) sf.f_ffree); + + errno = saved_errno; +#endif +} + /* * mdextend() -- Add a block to the specified relation. 
* @@ -496,11 +530,16 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { + if (errno == ENOSPC) + report_disk_space("mdextend failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" from %u to %u blocks: %m", + FilePathName(v->mdfd_vfd), + blocknum, blocknum + 1), errhint("Check free disk space."))); /* short write: complain appropriately */ ereport(ERROR, @@ -586,10 +625,15 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileFallocate failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\" with FileFallocate(): %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileFallocate(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); } } @@ -608,11 +652,18 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, seekpos, (off_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) + { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileZero failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileZero(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); + } } if (!skipFsync && !SmgrIsTemp(reln)) diff --git a/configure b/configure index 8c2ab3a1973..f62f4f6d3ab 100755 --- a/configure +++ 
b/configure @@ -13768,7 +13768,7 @@ fi ## Header files ## -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h +for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h sys/vfs.h termios.h ucred.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" -- 2.45.2.746.g06e570c0df.dirty
>From e7119200b69e0c96f85cede510889be9548b0d73 Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 12 Dec 2024 12:57:12 -0500 Subject: [PATCH] md: Report more detail when encountering ENOSPC during extension Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- meson.build | 1 + configure.ac | 1 + src/include/pg_config.h.in | 3 ++ src/backend/storage/smgr/md.c | 63 +++++++++++++++++++++++++++++++---- configure | 2 +- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/meson.build b/meson.build index 005dc9f3532..a0113e84aef 100644 --- a/meson.build +++ b/meson.build @@ -2415,6 +2415,7 @@ header_checks = [ 'sys/procctl.h', 'sys/signalfd.h', 'sys/ucred.h', + 'sys/vfs.h', 'termios.h', 'ucred.h', ] diff --git a/configure.ac b/configure.ac index 3c76c9ebc87..074572aabf5 100644 --- a/configure.ac +++ b/configure.ac @@ -1478,6 +1478,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/procctl.h sys/signalfd.h sys/ucred.h + sys/vfs.h termios.h ucred.h ])) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 2397d90b465..7b53c994699 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -462,6 +462,9 @@ /* Define to 1 if you have the <sys/ucred.h> header file. */ #undef HAVE_SYS_UCRED_H +/* Define to 1 if you have the <sys/vfs.h> header file. */ +#undef HAVE_SYS_VFS_H + /* Define to 1 if you have the <termios.h> header file. 
*/ #undef HAVE_TERMIOS_H diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 6796756358f..8c49312db0d 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -24,6 +24,9 @@ #include <unistd.h> #include <fcntl.h> #include <sys/file.h> +#ifdef HAVE_SYS_VFS_H +#include <sys/vfs.h> +#endif #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -447,6 +450,37 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) pfree(path); } +static void +report_disk_space(const char *reason, const char *path) +{ + /* + * I'm sure there's a way to do this on other OSs too, but for the + * debugging here this should be sufficient. + */ +#ifdef HAVE_SYS_VFS_H + int saved_errno = errno; + struct statfs sf; + int ret; + + ret = statfs(path, &sf); + + if (ret != 0) + elog(WARNING, "%s: statfs failed: %m", reason); + else + elog(LOG, "%s: free space for filesystem containing \"%s\" " + "f_blocks: %llu, f_bfree: %llu, f_bavail: %llu " + "f_files: %llu, f_ffree: %llu", + reason, path, + (long long unsigned) sf.f_blocks, + (long long unsigned) sf.f_bfree, + (long long unsigned) sf.f_bavail, + (long long unsigned) sf.f_files, + (long long unsigned) sf.f_ffree); + + errno = saved_errno; +#endif +} + /* * mdextend() -- Add a block to the specified relation. 
* @@ -494,11 +528,16 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { + if (errno == ENOSPC) + report_disk_space("mdextend failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" from %u to %u blocks: %m", + FilePathName(v->mdfd_vfd), + blocknum, blocknum + 1), errhint("Check free disk space."))); /* short write: complain appropriately */ ereport(ERROR, @@ -584,10 +623,15 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileFallocate failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\" with FileFallocate(): %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileFallocate(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); } } @@ -606,11 +650,18 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, seekpos, (off_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) + { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileZero failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileZero(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); + } } if (!skipFsync && !SmgrIsTemp(reln)) diff --git a/configure b/configure index 97996b7f6b7..5efd85bb17a 100755 --- a/configure +++ 
b/configure @@ -13349,7 +13349,7 @@ fi ## Header files ## -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h +for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h langinfo.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h sys/vfs.h termios.h ucred.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" -- 2.45.2.746.g06e570c0df.dirty
>From 66a18a4565ec4cca4a8ce216aae9322a8c68731c Mon Sep 17 00:00:00 2001 From: Andres Freund <and...@anarazel.de> Date: Thu, 12 Dec 2024 12:57:12 -0500 Subject: [PATCH] md: Report more detail when encountering ENOSPC during extension Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch: --- meson.build | 1 + configure.ac | 1 + src/include/pg_config.h.in | 3 ++ src/backend/storage/smgr/md.c | 63 +++++++++++++++++++++++++++++++---- configure | 2 +- 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/meson.build b/meson.build index e5ce437a5c7..05f622ccd79 100644 --- a/meson.build +++ b/meson.build @@ -2389,6 +2389,7 @@ header_checks = [ 'sys/procctl.h', 'sys/signalfd.h', 'sys/ucred.h', + 'sys/vfs.h', 'termios.h', 'ucred.h', 'xlocale.h', diff --git a/configure.ac b/configure.ac index 247ae97fa4c..d68774f9c89 100644 --- a/configure.ac +++ b/configure.ac @@ -1446,6 +1446,7 @@ AC_CHECK_HEADERS(m4_normalize([ sys/procctl.h sys/signalfd.h sys/ucred.h + sys/vfs.h termios.h ucred.h xlocale.h diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 07b2f798abd..c5e083f8793 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -439,6 +439,9 @@ /* Define to 1 if you have the <sys/ucred.h> header file. */ #undef HAVE_SYS_UCRED_H +/* Define to 1 if you have the <sys/vfs.h> header file. */ +#undef HAVE_SYS_VFS_H + /* Define to 1 if you have the <termios.h> header file. 
*/ #undef HAVE_TERMIOS_H diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index cc8a80ee961..eac080f1a43 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -24,6 +24,9 @@ #include <unistd.h> #include <fcntl.h> #include <sys/file.h> +#ifdef HAVE_SYS_VFS_H +#include <sys/vfs.h> +#endif #include "access/xlogutils.h" #include "commands/tablespace.h" @@ -447,6 +450,37 @@ mdunlinkfork(RelFileLocatorBackend rlocator, ForkNumber forknum, bool isRedo) pfree(path); } +static void +report_disk_space(const char *reason, const char *path) +{ + /* + * I'm sure there's a way to do this on other OSs too, but for the + * debugging here this should be sufficient. + */ +#ifdef HAVE_SYS_VFS_H + int saved_errno = errno; + struct statfs sf; + int ret; + + ret = statfs(path, &sf); + + if (ret != 0) + elog(WARNING, "%s: statfs failed: %m", reason); + else + elog(LOG, "%s: free space for filesystem containing \"%s\" " + "f_blocks: %llu, f_bfree: %llu, f_bavail: %llu " + "f_files: %llu, f_ffree: %llu", + reason, path, + (long long unsigned) sf.f_blocks, + (long long unsigned) sf.f_bfree, + (long long unsigned) sf.f_bavail, + (long long unsigned) sf.f_files, + (long long unsigned) sf.f_ffree); + + errno = saved_errno; +#endif +} + /* * mdextend() -- Add a block to the specified relation. 
* @@ -494,11 +528,16 @@ mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ, seekpos, WAIT_EVENT_DATA_FILE_EXTEND)) != BLCKSZ) { + if (errno == ENOSPC) + report_disk_space("mdextend failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + if (nbytes < 0) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" from %u to %u blocks: %m", + FilePathName(v->mdfd_vfd), + blocknum, blocknum + 1), errhint("Check free disk space."))); /* short write: complain appropriately */ ereport(ERROR, @@ -584,10 +623,15 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, WAIT_EVENT_DATA_FILE_EXTEND); if (ret != 0) { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileFallocate failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\" with FileFallocate(): %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileFallocate(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); } } @@ -606,11 +650,18 @@ mdzeroextend(SMgrRelation reln, ForkNumber forknum, seekpos, (off_t) BLCKSZ * numblocks, WAIT_EVENT_DATA_FILE_EXTEND); if (ret < 0) + { + if (errno == ENOSPC) + report_disk_space("mdzeroextend FileZero failing with ENOSPC", + FilePathName(v->mdfd_vfd)); + ereport(ERROR, errcode_for_file_access(), - errmsg("could not extend file \"%s\": %m", - FilePathName(v->mdfd_vfd)), + errmsg("could not extend file \"%s\" by %u blocks, from %u to %u, using FileZero(): %m", + FilePathName(v->mdfd_vfd), + numblocks, segstartblock, segstartblock+numblocks), errhint("Check free disk space.")); + } } if (!skipFsync && !SmgrIsTemp(reln)) diff --git a/configure b/configure index 518c33b73a9..191cbca0844 100755 --- a/configure +++ 
b/configure @@ -13227,7 +13227,7 @@ fi ## Header files ## -for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h termios.h ucred.h xlocale.h +for ac_header in atomic.h copyfile.h execinfo.h getopt.h ifaddrs.h mbarrier.h sys/epoll.h sys/event.h sys/personality.h sys/prctl.h sys/procctl.h sys/signalfd.h sys/ucred.h sys/vfs.h termios.h ucred.h xlocale.h do : as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" -- 2.45.2.746.g06e570c0df.dirty