On Sat, Feb 5, 2022 at 2:10 AM Robert Haas <robertmh...@gmail.com> wrote:
> So my impression is that today we ship defaults that are unsafe on
> Windows. I don't really understand much of what you are saying here,
> but if there's a way we can stop doing that, +1 from me, especially if
> it allows us to retain reasonable performance.

The PostgreSQL default in combination with the Windows default is
unsafe on SATA drives, but <get-out-clause>if you read our
documentation carefully you might figure out that you need to disable
write caching on your disk</>.
https://www.postgresql.org/docs/14/wal-reliability.html says:

"Consumer-grade IDE and SATA drives are particularly likely to have
write-back caches that will not survive a power failure. Many
solid-state drives (SSD) also have volatile write-back caches. [...]
On Windows, if wal_sync_method is open_datasync (the default), write
caching can be disabled by unchecking My Computer\Open\disk
drive\Properties\Hardware\Properties\Policies\Enable write caching on
the disk. Alternatively, set wal_sync_method to fsync or
fsync_writethrough, which prevent write caching."

I'm not proposing we change our default to this new level, because it
doesn't work on non-NTFS, an annoying complication.  This patch would
just provide something faster to put after "Alternatively".

(Actually that whole page needs a refresh.  IDE is gone.  The
discussion about "recent" support for flushing caches is a bit out of
date, and in fact the problem with open_datasync on this OS is because
of problems with drivers and
https://en.wikipedia.org/wiki/Disk_buffer#Force_Unit_Access_(FUA), not
FLUSH CACHE EXT.)

Here's an updated patch that fixes some silly problems seen on CI.
There's something a bit clunkly/weird about this HAVE_FDATASYNC stuff,
maybe I can find a tidier way, but it's enough for experimentation:

For Mingw, I unconditionally add src/port/fdatasync.o to LIBOBJS, and
I unconditionally #define HAVE_FDATASYNC in win32_port.h, and I also
changed c.h's declaration of fdatasync() because it comes before
port.h is included (I guess I could move it down instead?).

For MSVC, I unconditionally add fdatasync.o to @pgportfiles, and
HAVE_FDATASYNC is defined in Solution.pm.

It'd be interesting to see pg_test_fsync.exe output on real hardware.
Here's what a little Windows 10 VM with a virtual SATA drive says:

C:\Users\thmunro>c:\pg\bin\pg_test_fsync.exe
5 seconds per test
O_DIRECT supported on this platform for open_datasync and open_sync.

Compare file sync methods using one 8kB write:
(in wal_sync_method preference order, except fdatasync is Linux's default)
        open_datasync (direct)             7914.872 ops/sec     126 usecs/op
        open_datasync (buffered)           6593.056 ops/sec     152 usecs/op
        fdatasync                           650.317 ops/sec    1538 usecs/op
        fsync                               512.423 ops/sec    1952 usecs/op
        fsync_writethrough                  550.881 ops/sec    1815 usecs/op
        open_sync (direct)                              n/a
        open_sync (buffered)                            n/a
From ea61dab5b8be308bd82e56a5063f7acac70ad291 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 12 Dec 2021 14:13:35 +1300
Subject: [PATCH v2 1/2] Fix treatment of direct I/O in pg_test_fsync.

pg_test_fsync traditionally enabled O_DIRECT for the open_datasync and
open_sync tests.  In fact current releases of PostgreSQL only do that if
wal_level=minimal (less commonly used).  So, run the test both ways.

Discussion: https://postgr.es/m/CA%2BhUKGJZJVO%3DiX%2Beb-PXi2_XS9ZRqnn_4URh0NUQOwt6-_51xQ%40mail.gmail.com
---
 src/bin/pg_test_fsync/pg_test_fsync.c | 87 ++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 8 deletions(-)

diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c
index ddabf64c58..1be017fcd5 100644
--- a/src/bin/pg_test_fsync/pg_test_fsync.c
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -303,12 +303,12 @@ test_sync(int writes_per_op)
 	printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n"));
 
 	/*
-	 * Test open_datasync if available
+	 * Test open_datasync direct if available
 	 */
-	printf(LABEL_FORMAT, "open_datasync");
+	printf(LABEL_FORMAT, "open_datasync (direct)");
 	fflush(stdout);
 
-#ifdef OPEN_DATASYNC_FLAG
+#if defined(OPEN_DATASYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE))
 	if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
@@ -333,6 +333,38 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
+	/*
+	 * Test open_datasync buffered if available
+	 */
+	printf(LABEL_FORMAT, "open_datasync (buffered)");
+	fflush(stdout);
+
+#ifdef OPEN_DATASYNC_FLAG
+	if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
+	{
+		printf(NA_FORMAT, _("n/a*"));
+		fs_warning = true;
+	}
+	else
+	{
+		START_TIMER;
+		for (ops = 0; alarm_triggered == false; ops++)
+		{
+			for (writes = 0; writes < writes_per_op; writes++)
+				if (pg_pwrite(tmpfile,
+							  buf,
+							  XLOG_BLCKSZ,
+							  writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+					die("write failed");
+		}
+		STOP_TIMER;
+		close(tmpfile);
+	}
+#else
+	printf(NA_FORMAT, _("n/a"));
+#endif
+
+
 /*
  * Test fdatasync if available
  */
@@ -409,13 +441,13 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
-/*
- * Test open_sync if available
- */
-	printf(LABEL_FORMAT, "open_sync");
+	/*
+	 * Test open_sync if available
+	 */
+	printf(LABEL_FORMAT, "open_sync (direct)");
 	fflush(stdout);
 
-#ifdef OPEN_SYNC_FLAG
+#if defined(OPEN_SYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE))
 	if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
@@ -447,6 +479,45 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
+	/*
+	 * Test open_sync if available
+	 */
+	printf(LABEL_FORMAT, "open_sync (buffered)");
+	fflush(stdout);
+
+#ifdef OPEN_SYNC_FLAG
+	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
+	{
+		printf(NA_FORMAT, _("n/a*"));
+		fs_warning = true;
+	}
+	else
+	{
+		START_TIMER;
+		for (ops = 0; alarm_triggered == false; ops++)
+		{
+			for (writes = 0; writes < writes_per_op; writes++)
+				if (pg_pwrite(tmpfile,
+							  buf,
+							  XLOG_BLCKSZ,
+							  writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+
+					/*
+					 * This can generate write failures if the filesystem has
+					 * a large block size, e.g. 4k, and there is no support
+					 * for O_DIRECT writes smaller than the file system block
+					 * size, e.g. XFS.
+					 */
+					die("write failed");
+		}
+		STOP_TIMER;
+		close(tmpfile);
+	}
+#else
+	printf(NA_FORMAT, _("n/a"));
+#endif
+
+
 	if (fs_warning)
 	{
 		printf(_("* This file system and its mount options do not support direct\n"
-- 
2.30.2

From 4e80c19fd29454c0f8843488381dd24f928588fc Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sat, 5 Feb 2022 09:15:47 +1300
Subject: [PATCH v2 2/2] Add wal_sync_method=fdatasync for Windows.

Windows 10 gained support for flushing NTFS files with fdatasync()
semantics.  Add support for it, though it is not the default.

XXX If we used pg_fdatasync() for more things in the future, we'd have
to think about how to handle the case that it doesn't work on some file
systems.

XXX This patch blithely assumes that we're on Windows 10+ without any
version checking.

XXX For experimentation only.

Discussion: https://postgr.es/m/CA%2BhUKGJZJVO%3DiX%2Beb-PXi2_XS9ZRqnn_4URh0NUQOwt6-_51xQ%40mail.gmail.com
---
 configure                     |  6 ++++
 configure.ac                  |  1 +
 src/include/c.h               |  2 +-
 src/include/port/win32_port.h |  6 ++++
 src/include/port/win32ntdll.h |  8 ++++++
 src/port/fdatasync.c          | 53 +++++++++++++++++++++++++++++++++++
 src/port/win32ntdll.c         |  6 +++-
 src/tools/msvc/Mkvcbuild.pm   |  3 +-
 src/tools/msvc/Solution.pm    |  2 +-
 9 files changed, 83 insertions(+), 4 deletions(-)
 create mode 100644 src/port/fdatasync.c

diff --git a/configure b/configure
index 879f92202f..3b9aa5ead6 100755
--- a/configure
+++ b/configure
@@ -16701,6 +16701,12 @@ fi
  ;;
 esac
 
+  case " $LIBOBJS " in
+  *" fdatasync.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS fdatasync.$ac_objext"
+ ;;
+esac
+
   case " $LIBOBJS " in
   *" kill.$ac_objext "* ) ;;
   *) LIBOBJS="$LIBOBJS kill.$ac_objext"
diff --git a/configure.ac b/configure.ac
index 95287705f6..a226f492e2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1928,6 +1928,7 @@ if test "$PORTNAME" = "win32"; then
   AC_CHECK_FUNCS(_configthreadlocale)
   AC_REPLACE_FUNCS(gettimeofday)
   AC_LIBOBJ(dirmod)
+  AC_LIBOBJ(fdatasync)
   AC_LIBOBJ(kill)
   AC_LIBOBJ(open)
   AC_LIBOBJ(system)
diff --git a/src/include/c.h b/src/include/c.h
index 4f16e589b3..3a7b79998a 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -1279,7 +1279,7 @@ typedef union PGAlignedXLogBlock
  * standard C library.
  */
 
-#if defined(HAVE_FDATASYNC) && !HAVE_DECL_FDATASYNC
+#if !HAVE_DECL_FDATASYNC
 extern int	fdatasync(int fildes);
 #endif
 
diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index d3cb765976..5a2f33eca0 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -83,6 +83,12 @@
 #define HAVE_FSYNC_WRITETHROUGH
 #define FSYNC_WRITETHROUGH_IS_FSYNC
 
+/*
+ * We have a replacement for fdatasync() in src/port/fdatasync.c, which is
+ * unconditionally used by MSVC and Mingw builds.
+ */
+#define HAVE_FDATASYNC
+
 #define USES_WINSOCK
 
 /*
diff --git a/src/include/port/win32ntdll.h b/src/include/port/win32ntdll.h
index 663b9754bd..3a9f2761b5 100644
--- a/src/include/port/win32ntdll.h
+++ b/src/include/port/win32ntdll.h
@@ -23,9 +23,17 @@
 #include <ntstatus.h>
 #include <winternl.h>
 
+#ifndef FLUSH_FLAGS_FILE_DATA_SYNC_ONLY
+#define FLUSH_FLAGS_FILE_DATA_SYNC_ONLY 0x4
+#endif
+
 typedef NTSTATUS (__stdcall *RtlGetLastNtStatus_t) (void);
+typedef ULONG (__stdcall *RtlNtStatusToDosError_t) (NTSTATUS);
+typedef NTSTATUS (__stdcall *NtFlushBuffersFileEx_t) (HANDLE, ULONG, PVOID, ULONG, PIO_STATUS_BLOCK);
 
 extern RtlGetLastNtStatus_t pg_RtlGetLastNtStatus;
+extern RtlNtStatusToDosError_t pg_RtlNtStatusToDosError;
+extern NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx;
 
 extern int	initialize_ntdll(void);
 
diff --git a/src/port/fdatasync.c b/src/port/fdatasync.c
new file mode 100644
index 0000000000..afef853aa3
--- /dev/null
+++ b/src/port/fdatasync.c
@@ -0,0 +1,53 @@
+/*-------------------------------------------------------------------------
+ *
+ * fdatasync.c
+ *	   Win32 fdatasync() replacement
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ * src/port/fdatasync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define UMDF_USING_NTSTATUS
+
+#ifdef FRONTEND
+#include "postgres_fe.h"
+#else
+#include "postgres.h"
+#endif
+
+#include "port/win32ntdll.h"
+
+int
+fdatasync(int fd)
+{
+	IO_STATUS_BLOCK iosb;
+	NTSTATUS	status;
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	if (initialize_ntdll() < 0)
+		return -1;
+
+	memset(&iosb, 0, sizeof(iosb));
+	status = pg_NtFlushBuffersFileEx(handle,
+									 FLUSH_FLAGS_FILE_DATA_SYNC_ONLY,
+									 NULL,
+									 0,
+									 &iosb);
+
+	if (NT_SUCCESS(status))
+		return 0;
+
+	_dosmaperr(pg_RtlNtStatusToDosError(status));
+	return -1;
+}
diff --git a/src/port/win32ntdll.c b/src/port/win32ntdll.c
index 10c33c6a01..eb61407754 100644
--- a/src/port/win32ntdll.c
+++ b/src/port/win32ntdll.c
@@ -20,6 +20,8 @@
 #include "port/win32ntdll.h"
 
 RtlGetLastNtStatus_t pg_RtlGetLastNtStatus;
+RtlNtStatusToDosError_t pg_RtlNtStatusToDosError;
+NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx;
 
 typedef struct NtDllRoutine
 {
@@ -28,7 +30,9 @@ typedef struct NtDllRoutine
 } NtDllRoutine;
 
 static const NtDllRoutine routines[] = {
-	{"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus}
+	{"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus},
+	{"RtlNtStatusToDosError", (pg_funcptr_t *) &pg_RtlNtStatusToDosError},
+	{"NtFlushBuffersFileEx", (pg_funcptr_t *) &pg_NtFlushBuffersFileEx}
 };
 
 static bool initialized;
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index a310bcb28c..8cc39d4f8d 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -99,7 +99,8 @@ sub mkvcbuild
 	$solution = CreateSolution($vsVersion, $config);
 
 	our @pgportfiles = qw(
-	  chklocale.c explicit_bzero.c fls.c getpeereid.c getrusage.c inet_aton.c
+	  chklocale.c explicit_bzero.c fls.c fdatasync.c
+	  getpeereid.c getrusage.c inet_aton.c
 	  getaddrinfo.c gettimeofday.c inet_net_ntop.c kill.c open.c
 	  snprintf.c strlcat.c strlcpy.c dirmod.c noblock.c path.c
 	  dirent.c dlopen.c getopt.c getopt_long.c link.c
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index ed1c53000f..4f75c07fcf 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -256,7 +256,7 @@ sub GenerateFiles
 		HAVE_EDITLINE_READLINE_H                    => undef,
 		HAVE_EXECINFO_H                             => undef,
 		HAVE_EXPLICIT_BZERO                         => undef,
-		HAVE_FDATASYNC                              => undef,
+		HAVE_FDATASYNC                              => 1,
 		HAVE_FLS                                    => undef,
 		HAVE_FSEEKO                                 => 1,
 		HAVE_FUNCNAME__FUNC                         => undef,
-- 
2.30.2

Reply via email to