Hi,

While porting some new IO code to lots of OSes I noticed in passing
that there is now a way to do synchronous fdatasync() on Windows.
This mechanism doesn't have an async variant, which is what I was
actually looking for (which turns out to doable with bleeding edge
IoRings, more on that later), but I figured this might be useful
anyway.  I see that at least one other open source database has
discovered it and seen speedups.  Like some other file API
improvements discussed recently, it's Windows 10+ and NTFS only.  I
tried out a quick POC patch and it runs a bit faster than fsync(), as
expected.  I'm not sure if it's worth bothering with or not given the
other options, but figured it was worth sharing.

While testing that I also couldn't resist adding an extra output line
to pg_test_fsync to run open_datasync in buffered I/O mode, like
PostgreSQL actually does in real life.  I guess I should really change
it to duplicate less code, though...

[1] 
https://www.postgresql.org/message-id/flat/1527846213.2475.31.camel%40cybertec.at
From 914b4da18d8249b4c1cb1219bcffd17625b0e2f1 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 12 Dec 2021 14:13:35 +1300
Subject: [PATCH 1/4] Fix treatment of direct I/O in pg_test_fsync.

pg_test_fsync traditionally enabled O_DIRECT for the open_datasync and
open_sync tests.  In fact current releases of PostgreSQL only do that if
wal_level=minimal (less commonly used).  So, run the test both ways.
---
 src/bin/pg_test_fsync/pg_test_fsync.c | 87 ++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 8 deletions(-)

diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c
index ddabf64c58..1be017fcd5 100644
--- a/src/bin/pg_test_fsync/pg_test_fsync.c
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -303,12 +303,12 @@ test_sync(int writes_per_op)
 	printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n"));
 
 	/*
-	 * Test open_datasync if available
+	 * Test open_datasync direct if available
 	 */
-	printf(LABEL_FORMAT, "open_datasync");
+	printf(LABEL_FORMAT, "open_datasync (direct)");
 	fflush(stdout);
 
-#ifdef OPEN_DATASYNC_FLAG
+#if defined(OPEN_DATASYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE))
 	if ((tmpfile = open_direct(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
@@ -333,6 +333,38 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
+	/*
+	 * Test open_datasync buffered if available
+	 */
+	printf(LABEL_FORMAT, "open_datasync (buffered)");
+	fflush(stdout);
+
+#ifdef OPEN_DATASYNC_FLAG
+	if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_BINARY, 0)) == -1)
+	{
+		printf(NA_FORMAT, _("n/a*"));
+		fs_warning = true;
+	}
+	else
+	{
+		START_TIMER;
+		for (ops = 0; alarm_triggered == false; ops++)
+		{
+			for (writes = 0; writes < writes_per_op; writes++)
+				if (pg_pwrite(tmpfile,
+							  buf,
+							  XLOG_BLCKSZ,
+							  writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+					die("write failed");
+		}
+		STOP_TIMER;
+		close(tmpfile);
+	}
+#else
+	printf(NA_FORMAT, _("n/a"));
+#endif
+
+
 /*
  * Test fdatasync if available
  */
@@ -409,13 +441,13 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
-/*
- * Test open_sync if available
- */
-	printf(LABEL_FORMAT, "open_sync");
+	/*
+	 * Test open_sync if available
+	 */
+	printf(LABEL_FORMAT, "open_sync (direct)");
 	fflush(stdout);
 
-#ifdef OPEN_SYNC_FLAG
+#if defined(OPEN_SYNC_FLAG) && (defined(O_DIRECT) || defined(F_NOCACHE))
 	if ((tmpfile = open_direct(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
 	{
 		printf(NA_FORMAT, _("n/a*"));
@@ -447,6 +479,45 @@ test_sync(int writes_per_op)
 	printf(NA_FORMAT, _("n/a"));
 #endif
 
+	/*
+	 * Test open_sync if available
+	 */
+	printf(LABEL_FORMAT, "open_sync (buffered)");
+	fflush(stdout);
+
+#ifdef OPEN_SYNC_FLAG
+	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_BINARY, 0)) == -1)
+	{
+		printf(NA_FORMAT, _("n/a*"));
+		fs_warning = true;
+	}
+	else
+	{
+		START_TIMER;
+		for (ops = 0; alarm_triggered == false; ops++)
+		{
+			for (writes = 0; writes < writes_per_op; writes++)
+				if (pg_pwrite(tmpfile,
+							  buf,
+							  XLOG_BLCKSZ,
+							  writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+
+					/*
+					 * This can generate write failures if the filesystem has
+					 * a large block size, e.g. 4k, and there is no support
+					 * for O_DIRECT writes smaller than the file system block
+					 * size, e.g. XFS.
+					 */
+					die("write failed");
+		}
+		STOP_TIMER;
+		close(tmpfile);
+	}
+#else
+	printf(NA_FORMAT, _("n/a"));
+#endif
+
+
 	if (fs_warning)
 	{
 		printf(_("* This file system and its mount options do not support direct\n"
-- 
2.33.1

From 8f58ea44762b547288e55037701f3b0043837dd5 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Sun, 12 Dec 2021 11:55:54 +1300
Subject: [PATCH 2/4] Add fdatasync() wrapper for Windows.

Windows 10 gained support for flushing a file with fdatasync()
semantics.  It only works on NTFS local file systems, but that's
possibly OK because wal_sync_method defaults to open_datasync on that
platform.

XXX If we used pg_fdatasync() for more things in the future, we'd have
to think about how to handle the case that it doesn't work on some file
systems.

XXX This patch blithely assumes that we're on Windows 10+ without any
version checking.

XXX For experimentation only.
---
 configure                             |  6 +++
 configure.ac                          |  1 +
 src/bin/pg_test_fsync/pg_test_fsync.c |  2 +-
 src/include/port/win32ntdll.h         |  8 ++++
 src/port/fdatasync.c                  | 53 +++++++++++++++++++++++++++
 src/port/win32ntdll.c                 |  6 ++-
 src/tools/msvc/Mkvcbuild.pm           |  3 +-
 7 files changed, 76 insertions(+), 3 deletions(-)
 create mode 100644 src/port/fdatasync.c

diff --git a/configure b/configure
index 3b19105328..1a06e7e73e 100755
--- a/configure
+++ b/configure
@@ -16708,6 +16708,12 @@ fi
  ;;
 esac
 
+  case " $LIBOBJS " in
+  *" fdatasync.$ac_objext "* ) ;;
+  *) LIBOBJS="$LIBOBJS fdatasync.$ac_objext"
+ ;;
+esac
+
   case " $LIBOBJS " in
   *" kill.$ac_objext "* ) ;;
   *) LIBOBJS="$LIBOBJS kill.$ac_objext"
diff --git a/configure.ac b/configure.ac
index e77d4dcf2d..aaa0d01a7b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1927,6 +1927,7 @@ if test "$PORTNAME" = "win32"; then
   AC_CHECK_FUNCS(_configthreadlocale)
   AC_REPLACE_FUNCS(gettimeofday)
   AC_LIBOBJ(dirmod)
+  AC_LIBOBJ(fdatasync)
   AC_LIBOBJ(kill)
   AC_LIBOBJ(open)
   AC_LIBOBJ(system)
diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c b/src/bin/pg_test_fsync/pg_test_fsync.c
index 1be017fcd5..580b960e66 100644
--- a/src/bin/pg_test_fsync/pg_test_fsync.c
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -371,7 +371,7 @@ test_sync(int writes_per_op)
 	printf(LABEL_FORMAT, "fdatasync");
 	fflush(stdout);
 
-#ifdef HAVE_FDATASYNC
+#if defined(HAVE_FDATASYNC) || defined(WIN32)
 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
 		die("could not open output file");
 	START_TIMER;
diff --git a/src/include/port/win32ntdll.h b/src/include/port/win32ntdll.h
index 4d8808b3aa..1e15900030 100644
--- a/src/include/port/win32ntdll.h
+++ b/src/include/port/win32ntdll.h
@@ -20,8 +20,16 @@
 #include <ntstatus.h>
 #include <winternl.h>
 
+#ifndef FLUSH_FLAGS_FILE_DATA_SYNC_ONLY
+#define FLUSH_FLAGS_FILE_DATA_SYNC_ONLY 0x4
+#endif
+
 typedef NTSTATUS (__stdcall *RtlGetLastNtStatus_t) (void);
+typedef ULONG (__stdcall *RtlNtStatusToDosError_t) (NTSTATUS);
+typedef NTSTATUS (__stdcall *NtFlushBuffersFileEx_t) (HANDLE, ULONG, PVOID, ULONG, PIO_STATUS_BLOCK);
 
 extern RtlGetLastNtStatus_t pg_RtlGetLastNtStatus;
+extern RtlNtStatusToDosError_t pg_RtlNtStatusToDosError;
+extern NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx;
 
 extern int	initialize_ntdll(void);
diff --git a/src/port/fdatasync.c b/src/port/fdatasync.c
new file mode 100644
index 0000000000..9a57f87574
--- /dev/null
+++ b/src/port/fdatasync.c
@@ -0,0 +1,53 @@
+/*-------------------------------------------------------------------------
+ *
+ * fdatasync.c
+ *	   Win32 fdatasync() replacement
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ *
+ * src/port/fdatasync.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#define UMDF_USING_NTSTATUS
+
+#ifdef FRONTEND
+#include "postgres_fe.h"
+#else
+#include "postgres.h"
+#endif
+
+#include "port/win32ntdll.h"
+
+int
+fdatasync(int fd)
+{
+	IO_STATUS_BLOCK iosb;
+	NTSTATUS	status;
+	HANDLE		handle;
+
+	handle = (HANDLE) _get_osfhandle(fd);
+	if (handle == INVALID_HANDLE_VALUE)
+	{
+		errno = EBADF;
+		return -1;
+	}
+
+	if (initialize_ntdll() < 0)
+		return -1;
+
+	memset(&iosb, 0, sizeof(&iosb));
+	status = pg_NtFlushBuffersFileEx(handle,
+									 FLUSH_FLAGS_FILE_DATA_SYNC_ONLY,
+									 NULL,
+									 0,
+									 &iosb);
+
+	if (NT_SUCCESS(status))
+		return 0;
+
+	_dosmaperr(pg_RtlNtStatusToDosError(status));
+	return -1;
+}
diff --git a/src/port/win32ntdll.c b/src/port/win32ntdll.c
index aa3d37c50e..f15897084c 100644
--- a/src/port/win32ntdll.c
+++ b/src/port/win32ntdll.c
@@ -20,6 +20,8 @@
 #include "port/win32ntdll.h"
 
 RtlGetLastNtStatus_t pg_RtlGetLastNtStatus;
+RtlNtStatusToDosError_t pg_RtlNtStatusToDosError;
+NtFlushBuffersFileEx_t pg_NtFlushBuffersFileEx;
 
 typedef struct NtDllRoutine
 {
@@ -28,7 +30,9 @@ typedef struct NtDllRoutine
 } NtDllRoutine;
 
 static const NtDllRoutine routines[] = {
-	{"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus}
+	{"RtlGetLastNtStatus", (pg_funcptr_t *) &pg_RtlGetLastNtStatus},
+	{"RtlNtStatusToDosError", (pg_funcptr_t *) &pg_RtlNtStatusToDosError},
+	{"NtFlushBuffersFileEx", (pg_funcptr_t *) &pg_NtFlushBuffersFileEx}
 };
 
 static bool initialized;
diff --git a/src/tools/msvc/Mkvcbuild.pm b/src/tools/msvc/Mkvcbuild.pm
index 404c45a6f3..6e3d775eae 100644
--- a/src/tools/msvc/Mkvcbuild.pm
+++ b/src/tools/msvc/Mkvcbuild.pm
@@ -99,7 +99,8 @@ sub mkvcbuild
 	$solution = CreateSolution($vsVersion, $config);
 
 	our @pgportfiles = qw(
-	  chklocale.c explicit_bzero.c fls.c getpeereid.c getrusage.c inet_aton.c
+	  chklocale.c explicit_bzero.c fls.c fdatasync.c
+	  getpeereid.c getrusage.c inet_aton.c
 	  getaddrinfo.c gettimeofday.c inet_net_ntop.c kill.c open.c
 	  snprintf.c strlcat.c strlcpy.c dirmod.c noblock.c path.c
 	  dirent.c dlopen.c getopt.c getopt_long.c link.c
-- 
2.33.1

Reply via email to