On Sat, Feb 5, 2022 at 2:10 AM Robert Haas <robertmh...@gmail.com> wrote: > So my impression is that today we ship defaults that are unsafe on > Windows. I don't really understand much of what you are saying here, > but if there's a way we can stop doing that, +1 from me, especially if > it allows us to retain reasonable performance.
The PostgreSQL default in combination with the Windows default is unsafe on SATA drives, but <get-out-clause>if you read our documentation carefully you might figure out that you need to disable write caching on your disk</>. https://www.postgresql.org/docs/14/wal-reliability.html says: "Consumer-grade IDE and SATA drives are particularly likely to have write-back caches that will not survive a power failure. Many solid-state drives (SSD) also have volatile write-back caches. [...] On Windows, if wal_sync_method is open_datasync (the default), write caching can be disabled by unchecking My Computer\Open\disk drive\Properties\Hardware\Properties\Policies\Enable write caching on the disk. Alternatively, set wal_sync_method to fsync or fsync_writethrough, which prevent write caching." I'm not proposing we change our default to this new level, because it doesn't work on non-NTFS, an annoying complication. This patch would just provide something faster to put after "Alternatively". (Actually that whole page needs a refresh. IDE is gone. The discussion about "recent" support for flushing caches is a bit out of date, and in fact the problem with open_datasync on this OS is because of problems with drivers and https://en.wikipedia.org/wiki/Disk_buffer#Force_Unit_Access_(FUA), not FLUSH CACHE EXT.) Here's an updated patch that fixes some silly problems seen on CI. There's something a bit clunkly/weird about this HAVE_FDATASYNC stuff, maybe I can find a tidier way, but it's enough for experimentation: For Mingw, I unconditionally add src/port/fdatasync.o to LIBOBJS, and I unconditionally #define HAVE_FDATASYNC in win32_port.h, and I also changed c.h's declaration of fdatasync() because it comes before port.h is included (I guess I could move it down instead?). For MSVC, I unconditionally add fdatasync.o to @pgportfiles, and HAVE_FDATASYNC is defined in Solution.pm. It'd be interesting to see pg_test_fsync.exe output on real hardware. Here's what a little Windows 10 VM with a virtual SATA drive says: C:\Users\thmunro>c:\pg\bin\pg_test_fsync.exe 5 seconds per test O_DIRECT supported on this platform for open_datasync and open_sync. Compare file sync methods using one 8kB write: (in wal_sync_method preference order, except fdatasync is Linux's default) open_datasync (direct) 7914.872 ops/sec 126 usecs/op open_datasync (buffered) 6593.056 ops/sec 152 usecs/op fdatasync 650.317 ops/sec 1538 usecs/op fsync 512.423 ops/sec 1952 usecs/op fsync_writethrough 550.881 ops/sec 1815 usecs/op open_sync (direct) n/a open_sync (buffered) n/a
From ea61dab5b8be308bd82e56a5063f7acac70ad291 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sun, 12 Dec 2021 14:13:35 +1300 Subject: [PATCH v2 1/2] Fix treatment of direct I/O in pg_test_fsync. pg_test_fsync traditionally enabled O_DIRECT for the open_datasync and open_sync tests. In fact current releases of PostgreSQL only do that if wal_level=minimal (less commonly used). So, run the test both ways. Discussion: https://postgr.es/m/CA%2BhUKGJZJVO%3DiX%2Beb-PXi2_XS9ZRqnn_4URh0NUQOwt6-_51xQ%40mail.gmail.com --- src/bin/pg_test_fsync/pg_test_fsync.c | 87 ++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 8 deletions(-) diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c index ddabf64c58..1be017fcd5 100644 --- a/src/bin/pg_test_fsync/pg_test_fsync.c +++ b/src/bin/pg_test_fsync/pg_test_fsync.c @@ -303,12 +303,12 @@ test_sync(int writes_per_op) printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n")); /* - * Test open_datasync if available + * Test open_datasync direct if available */ - printf(LABEL_FORMAT, "open_datasync"); + printf(LABEL_FORMAT, "open_datasync (direct)"); fflush(stdout); -#ifdef OPEN_DATASYNC_FLAG +#if defined(OPEN_DATASYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE)) if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); @@ -333,6 +333,38 @@ test_sync(int writes_per_op) printf(NA_FORMAT, _("n/a")); #endif + /* + * Test open_datasync buffered if available + */ + printf(LABEL_FORMAT, "open_datasync (buffered)"); + fflush(stdout); + +#ifdef OPEN_DATASYNC_FLAG + if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1) + { + printf(NA_FORMAT, _("n/a*")); + fs_warning = true; + } + else + { + START_TIMER; + for (ops = 0; alarm_triggered == false; ops++) + { + for (writes = 0; writes < writes_per_op; writes++) + if (pg_pwrite(tmpfile, + buf, + XLOG_BLCKSZ, + writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) + die("write failed"); + } + STOP_TIMER; + close(tmpfile); + } +#else + printf(NA_FORMAT, _("n/a")); +#endif + + /* * Test fdatasync if available */ @@ -409,13 +441,13 @@ test_sync(int writes_per_op) printf(NA_FORMAT, _("n/a")); #endif -/* - * Test open_sync if available - */ - printf(LABEL_FORMAT, "open_sync"); + /* + * Test open_sync if available + */ + printf(LABEL_FORMAT, "open_sync (direct)"); fflush(stdout); -#ifdef OPEN_SYNC_FLAG +#if defined(OPEN_SYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE)) if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) { printf(NA_FORMAT, _("n/a*")); @@ -447,6 +479,45 @@ test_sync(int writes_per_op) printf(NA_FORMAT, _("n/a")); #endif + /* + * Test open_sync if available + */ + printf(LABEL_FORMAT, "open_sync (buffered)"); + fflush(stdout); + +#ifdef OPEN_SYNC_FLAG + if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1) + { + printf(NA_FORMAT, _("n/a*")); + fs_warning = true; + } + else + { + START_TIMER; + for (ops = 0; alarm_triggered == false; ops++) + { + for (writes = 0; writes < writes_per_op; writes++) + if (pg_pwrite(tmpfile, + buf, + XLOG_BLCKSZ, + writes * XLOG_BLCKSZ) != XLOG_BLCKSZ) + + /* + * This can generate write failures if the filesystem has + * a large block size, e.g. 4k, and there is no support + * for O_DIRECT writes smaller than the file system block + * size, e.g. XFS. + */ + die("write failed"); + } + STOP_TIMER; + close(tmpfile); + } +#else + printf(NA_FORMAT, _("n/a")); +#endif + + if (fs_warning) { printf(_("* This file system and its mount options do not support direct\n" -- 2.30.2
From 4e80c19fd29454c0f8843488381dd24f928588fc Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 5 Feb 2022 09:15:47 +1300 Subject: [PATCH v2 2/2] Add wal_sync_method=fdatasync for Windows. Windows 10 gained support for flushing NTFS files with fdatasync() semantics. Add support for it, though it is not the default. XXX If we used pg_fdatasync() for more things in the future, we'd have to think about how to handle the case that it doesn't work on some file systems. XXX This patch blithely assumes that we're on Windows 10+ without any version checking. XXX For experimentation only. Discussion: https://postgr.es/m/CA%2BhUKGJZJVO%3DiX%2Beb-PXi2_XS9ZRqnn_4URh0NUQOwt6-_51xQ%40mail.gmail.com --- configure | 6 ++++ configure.ac | 1 + src/include/c.h | 2 +- src/include/port/win32_port.h | 6 ++++ src/include/port/win32ntdll.h | 8 ++++++ src/port/fdatasync.c | 53 +++++++++++++++++++++++++++++++++++ src/port/win32ntdll.c | 6 +++- src/tools/msvc/Mkvcbuild.pm | 3 +- src/tools/msvc/Solution.pm | 2 +- 9 files changed, 83 insertions(+), 4 deletions(-) create mode 100644 src/port/fdatasync.c diff --git a/configure b/configure index 879f92202f..3b9aa5ead6 100755 --- a/configure +++ b/configure @@ -16701,6 +16701,12 @@ fi ;; esac + case " $LIBOBJS " in + *" fdatasync.$ac_objext "* ) ;; + *) LIBOBJS="$LIBOBJS fdatasync.$ac_objext" + ;; +esac + case " $LIBOBJS " in *" kill.$ac_objext "* ) ;; *) LIBOBJS="$LIBOBJS kill.$ac_objext" diff --git a/configure.ac b/configure.ac index 95287705f6..a226f492e2 100644 --- a/configure.ac +++ b/configure.ac @@ -1928,6 +1928,7 @@ if test "$PORTNAME" = "win32"; then AC_CHECK_FUNCS(_configthreadlocale) AC_REPLACE_FUNCS(gettimeofday) AC_LIBOBJ(dirmod) + AC_LIBOBJ(fdatasync) AC_LIBOBJ(kill) AC_LIBOBJ(open) AC_LIBOBJ(system) diff --git a/src/include/c.h b/src/include/c.h index 4f16e589b3..3a7b79998a 100644 --- a/src/include/c.h +++ b/src/include/c.h @@ -1279,7 +1279,7 @@ typedef union PGAlignedXLogBlock * standard C library. */ -#if defined(HAVE_FDATASYNC) && !HAVE_DECL_FDATASYNC +#if !HAVE_DECL_FDATASYNC extern int fdatasync(int fildes); #endif diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h index d3cb765976..5a2f33eca0 100644 --- a/src/include/port/win32_port.h +++ b/src/include/port/win32_port.h @@ -83,6 +83,12 @@ #define HAVE_FSYNC_WRITETHROUGH #define FSYNC_WRITETHROUGH_IS_FSYNC +/* + * We have a replacement for fdatasync() in src/port/fdatasync.c, which is + * unconditionally used by MSVC and Mingw builds. + */ +#define HAVE_FDATASYNC + #define USES_WINSOCK /* diff --git a/src/include/port/win32ntdll.h b/src/include/port/win32ntdll.h index 663b9754bd..3a9f2761b5 100644 --- a/src/include/port/win32ntdll.h +++ b/src/include/port/win32ntdll.h @@ -23,9 +23,17 @@ #include <ntstatus.h> #include <winternl.h> +#ifndef FLUSH_FLAGS_FILE_DATA_SYNC_ONLY +#define FLUSH_FLAGS_FILE_DATA_SYNC_ONLY 0x4 +#endif + typedef NTSTATUS (__stdcall *RtlGetLastNtStatus_t) (void); +typedef ULONG (__stdcall *RtlNtStatusToDosError_t) (NTSTATUS); +typedef NTSTATUS (__stdcall *NtFlushBuffersFileEx_t) (HANDLE, ULONG, PVOID, ULONG, PIO_STATUS_BLOCK); extern RtlGetLastNtStatus_t pg_RtlGetLastNtStatus; +extern RtlNtStatusToDosError_t pg_RtlNtStatusToDosError; +extern NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx; extern int initialize_ntdll(void); diff --git a/src/port/fdatasync.c b/src/port/fdatasync.c new file mode 100644 index 0000000000..afef853aa3 --- /dev/null +++ b/src/port/fdatasync.c @@ -0,0 +1,53 @@ +/*------------------------------------------------------------------------- + * + * fdatasync.c + * Win32 fdatasync() replacement + * + * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * + * src/port/fdatasync.c + * + *------------------------------------------------------------------------- + */ + +#define UMDF_USING_NTSTATUS + +#ifdef FRONTEND +#include "postgres_fe.h" +#else +#include "postgres.h" +#endif + +#include "port/win32ntdll.h" + +int +fdatasync(int fd) +{ + IO_STATUS_BLOCK iosb; + NTSTATUS status; + HANDLE handle; + + handle = (HANDLE) _get_osfhandle(fd); + if (handle == INVALID_HANDLE_VALUE) + { + errno = EBADF; + return -1; + } + + if (initialize_ntdll() < 0) + return -1; + + memset(&iosb, 0, sizeof(iosb)); + status = pg_NtFlushBuffersFileEx(handle, + FLUSH_FLAGS_FILE_DATA_SYNC_ONLY, + NULL, + 0, + &iosb); + + if (NT_SUCCESS(status)) + return 0; + + _dosmaperr(pg_RtlNtStatusToDosError(status)); + return -1; +} diff --git a/src/port/win32ntdll.c b/src/port/win32ntdll.c index 10c33c6a01..eb61407754 100644 --- a/src/port/win32ntdll.c +++ b/src/port/win32ntdll.c @@ -20,6 +20,8 @@ #include "port/win32ntdll.h" RtlGetLastNtStatus_t pg_RtlGetLastNtStatus; +RtlNtStatusToDosError_t pg_RtlNtStatusToDosError; +NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx; typedef struct NtDllRoutine { @@ -28,7 +30,9 @@ typedef struct NtDllRoutine } NtDllRoutine; static const NtDllRoutine routines[] = { - {"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus} + {"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus}, + {"RtlNtStatusToDosError", (pg_funcptr_t *) &pg_RtlNtStatusToDosError}, + {"NtFlushBuffersFileEx", (pg_funcptr_t *) &pg_NtFlushBuffersFileEx} }; static bool initialized; diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm index a310bcb28c..8cc39d4f8d 100644 --- a/src/tools/msvc/Mkvcbuild.pm +++ b/src/tools/msvc/Mkvcbuild.pm @@ -99,7 +99,8 @@ sub mkvcbuild $solution = CreateSolution($vsVersion, $config); our @pgportfiles = qw( - chklocale.c explicit_bzero.c fls.c getpeereid.c getrusage.c inet_aton.c + chklocale.c explicit_bzero.c fls.c fdatasync.c + getpeereid.c getrusage.c inet_aton.c getaddrinfo.c gettimeofday.c inet_net_ntop.c kill.c open.c snprintf.c strlcat.c strlcpy.c dirmod.c noblock.c path.c dirent.c dlopen.c getopt.c getopt_long.c link.c diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index ed1c53000f..4f75c07fcf 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -256,7 +256,7 @@ sub GenerateFiles HAVE_EDITLINE_READLINE_H => undef, HAVE_EXECINFO_H => undef, HAVE_EXPLICIT_BZERO => undef, - HAVE_FDATASYNC => undef, + HAVE_FDATASYNC => 1, HAVE_FLS => undef, HAVE_FSEEKO => 1, HAVE_FUNCNAME__FUNC => undef, -- 2.30.2