On Fri, Sep 30, 2022 at 6:46 AM Thomas Munro <thomas.mu...@gmail.com> wrote:
>
> +1 for just doing it always, with a one-liner comment like
> "pg_pwritev*() might move the file position".  No reason to spam the
> source tree with more explanations of the exact reason.

+1 for resetting the file position in a platform-independent manner.
But, a description there won't hurt IMO and it saves time for the
hackers who spend time there and think why it's that way.

> If someone
> ever comes up with another case where p- and non-p- I/O functions are
> intermixed and it's really worth saving a system call (don't get me
> wrong, we call lseek() an obscene amount elsewhere and I'd like to fix
> that, but this case isn't hot?) then I like your idea of a macro to
> tell you whether you need to.

I don't think we go that route as the code isn't a hot path and an
extra system call wouldn't hurt performance much, a comment there
should work.

> Earlier I wondered why we'd want to include "pg_pwritev" in the name
> of this zero-filling function (pwritev being an internal
> implementation detail), but now it seems like maybe a good idea
> because it highlights the file position portability problem by being a
> member of that family of similarly-named functions.

Hm.

On Thu, Sep 29, 2022 at 10:57 PM Nathan Bossart
<nathandboss...@gmail.com> wrote:
>
> +        iov[0].iov_base = zbuffer.data;
>
> This seems superfluous, but I don't think it's hurting anything.

Yes, I removed it. Adding a comment, something like [1], would make it
more verbose, hence I've not added.

I'm attaching the v6 patch set, please review it further.

[1]
        /*
         * Use the first vector buffer to write the remaining size. Note that
         * zero buffer was already pointed to it above, hence just specifying
         * the size is enough here.
         */

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From 7ae3f77c0444e6c5383e13a502457813b850fd08 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Fri, 30 Sep 2022 01:26:03 +0000
Subject: [PATCH v6] Move pg_pwritev_with_retry() to file_utils.c

Move pg_pwritev_with_retry() from file/fd.c to common/file_utils.c
so that non-backend code or FRONTEND cases can also use it.

Author: Bharath Rupireddy
Reviewed-by: Thomas Munro
Reviewed-by: Nathan Bossart
Discussion: https://www.postgresql.org/message-id/CA%2BhUKGJKwUrpP0igOFAv5khj3dbHvfWqLzFeo7vtNzDYXv_YZw%40mail.gmail.com
---
 src/backend/storage/file/fd.c   | 65 ---------------------------------
 src/common/file_utils.c         | 65 +++++++++++++++++++++++++++++++++
 src/include/common/file_utils.h |  7 ++++
 src/include/storage/fd.h        |  6 ---
 4 files changed, 72 insertions(+), 71 deletions(-)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index e4d954578c..4151cafec5 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -93,7 +93,6 @@
 #include "common/pg_prng.h"
 #include "miscadmin.h"
 #include "pgstat.h"
-#include "port/pg_iovec.h"
 #include "portability/mem.h"
 #include "postmaster/startup.h"
 #include "storage/fd.h"
@@ -3738,67 +3737,3 @@ data_sync_elevel(int elevel)
 {
 	return data_sync_retry ? elevel : PANIC;
 }
-
-/*
- * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
- * error is returned, it is unspecified how much has been written.
- */
-ssize_t
-pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
-{
-	struct iovec iov_copy[PG_IOV_MAX];
-	ssize_t		sum = 0;
-	ssize_t		part;
-
-	/* We'd better have space to make a copy, in case we need to retry. */
-	if (iovcnt > PG_IOV_MAX)
-	{
-		errno = EINVAL;
-		return -1;
-	}
-
-	for (;;)
-	{
-		/* Write as much as we can. */
-		part = pg_pwritev(fd, iov, iovcnt, offset);
-		if (part < 0)
-			return -1;
-
-#ifdef SIMULATE_SHORT_WRITE
-		part = Min(part, 4096);
-#endif
-
-		/* Count our progress. */
-		sum += part;
-		offset += part;
-
-		/* Step over iovecs that are done. */
-		while (iovcnt > 0 && iov->iov_len <= part)
-		{
-			part -= iov->iov_len;
-			++iov;
-			--iovcnt;
-		}
-
-		/* Are they all done? */
-		if (iovcnt == 0)
-		{
-			/* We don't expect the kernel to write more than requested. */
-			Assert(part == 0);
-			break;
-		}
-
-		/*
-		 * Move whatever's left to the front of our mutable copy and adjust
-		 * the leading iovec.
-		 */
-		Assert(iovcnt > 0);
-		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
-		Assert(iov->iov_len > part);
-		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
-		iov_copy[0].iov_len -= part;
-		iov = iov_copy;
-	}
-
-	return sum;
-}
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index df4d6d240c..4a4bdc31c4 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -28,6 +28,7 @@
 #ifdef FRONTEND
 #include "common/logging.h"
 #endif
+#include "port/pg_iovec.h"
 
 #ifdef FRONTEND
 
@@ -460,3 +461,67 @@ get_dirent_type(const char *path,
 
 	return result;
 }
+
+/*
+ * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
+ * error is returned, it is unspecified how much has been written.
+ */
+ssize_t
+pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	struct iovec iov_copy[PG_IOV_MAX];
+	ssize_t		sum = 0;
+	ssize_t		part;
+
+	/* We'd better have space to make a copy, in case we need to retry. */
+	if (iovcnt > PG_IOV_MAX)
+	{
+		errno = EINVAL;
+		return -1;
+	}
+
+	for (;;)
+	{
+		/* Write as much as we can. */
+		part = pg_pwritev(fd, iov, iovcnt, offset);
+		if (part < 0)
+			return -1;
+
+#ifdef SIMULATE_SHORT_WRITE
+		part = Min(part, 4096);
+#endif
+
+		/* Count our progress. */
+		sum += part;
+		offset += part;
+
+		/* Step over iovecs that are done. */
+		while (iovcnt > 0 && iov->iov_len <= part)
+		{
+			part -= iov->iov_len;
+			++iov;
+			--iovcnt;
+		}
+
+		/* Are they all done? */
+		if (iovcnt == 0)
+		{
+			/* We don't expect the kernel to write more than requested. */
+			Assert(part == 0);
+			break;
+		}
+
+		/*
+		 * Move whatever's left to the front of our mutable copy and adjust
+		 * the leading iovec.
+		 */
+		Assert(iovcnt > 0);
+		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
+		Assert(iov->iov_len > part);
+		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
+		iov_copy[0].iov_len -= part;
+		iov = iov_copy;
+	}
+
+	return sum;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 2811744c12..2c5dbcb0b1 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -24,6 +24,8 @@ typedef enum PGFileType
 	PGFILETYPE_LNK
 } PGFileType;
 
+struct iovec;					/* avoid including port/pg_iovec.h here */
+
 #ifdef FRONTEND
 extern int	fsync_fname(const char *fname, bool isdir);
 extern void fsync_pgdata(const char *pg_data, int serverVersion);
@@ -37,4 +39,9 @@ extern PGFileType get_dirent_type(const char *path,
 								  bool look_through_symlinks,
 								  int elevel);
 
+extern ssize_t pg_pwritev_with_retry(int fd,
+									 const struct iovec *iov,
+									 int iovcnt,
+									 off_t offset);
+
 #endif							/* FILE_UTILS_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 5a48fccd9c..c0a212487d 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -51,8 +51,6 @@ typedef enum RecoveryInitSyncMethod
 	RECOVERY_INIT_SYNC_METHOD_SYNCFS
 }			RecoveryInitSyncMethod;
 
-struct iovec;					/* avoid including port/pg_iovec.h here */
-
 typedef int File;
 
 
@@ -178,10 +176,6 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern void pg_flush_data(int fd, off_t offset, off_t nbytes);
-extern ssize_t pg_pwritev_with_retry(int fd,
-									 const struct iovec *iov,
-									 int iovcnt,
-									 off_t offset);
 extern int	pg_truncate(const char *path, off_t length);
 extern void fsync_fname(const char *fname, bool isdir);
 extern int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
-- 
2.34.1

From 97a21536fd4ca274fc8730a99b3b27aea214aa70 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Fri, 30 Sep 2022 01:59:43 +0000
Subject: [PATCH v6] Use pg_pwritev_with_retry() instead of write() in
 walmethods.c

Use pg_pwritev_with_retry() while prepadding a WAL segment
instead of write() in walmethods.c dir_open_for_write() to avoid
partial writes. As the pg_pwritev_with_retry() function uses
pwritev, we can avoid explicit lseek() on non-Windows platforms
to seek to the beginning of the WAL segment. It looks like on
Windows, we need an explicit lseek() call here despite using
pwrite() implementation from win32pwrite.c. Otherwise
an error occurs.

These changes are inline with how core postgres initializes the
WAL segment in XLogFileInitInternal().

Author: Bharath Rupireddy
Reviewed-by: Nathan Bossart
Reviewed-by: Thomas Munro
Discussion: https://www.postgresql.org/message-id/CALj2ACUq7nAb7%3DbJNbK3yYmp-SZhJcXFR_pLk8un6XgDzDF3OA%40mail.gmail.com
---
 src/backend/access/transam/xlog.c  | 33 ++-----------
 src/bin/pg_basebackup/walmethods.c | 27 ++++++-----
 src/common/file_utils.c            | 76 ++++++++++++++++++++++++++++++
 src/include/common/file_utils.h    |  2 +
 4 files changed, 98 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8e15256db8..ecfb4b13fb 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2925,7 +2925,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 					 bool *added, char *path)
 {
 	char		tmppath[MAXPGPATH];
-	PGAlignedXLogBlock zbuffer;
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
@@ -2969,14 +2968,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
-	memset(zbuffer.data, 0, XLOG_BLCKSZ);
-
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int			blocks;
+		ssize_t rc;
 
 		/*
 		 * Zero-fill the file.  With this setting, we do this the hard way to
@@ -2987,29 +2983,10 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
+		rc = pg_pwritev_zeros(fd, wal_segment_size);
 
-		/* Prepare to write out a lot of copies of our zero buffer at once. */
-		for (int i = 0; i < lengthof(iov); ++i)
-		{
-			iov[i].iov_base = zbuffer.data;
-			iov[i].iov_len = XLOG_BLCKSZ;
-		}
-
-		/* Loop, writing as many blocks as we can for each system call. */
-		blocks = wal_segment_size / XLOG_BLCKSZ;
-		for (int i = 0; i < blocks;)
-		{
-			int			iovcnt = Min(blocks - i, lengthof(iov));
-			off_t		offset = i * XLOG_BLCKSZ;
-
-			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
-			{
-				save_errno = errno;
-				break;
-			}
-
-			i += iovcnt;
-		}
+		if (rc < 0)
+			save_errno = errno;
 	}
 	else
 	{
@@ -3018,7 +2995,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * enough.
 		 */
 		errno = 0;
-		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+		if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
 		{
 			/* if write didn't set errno, assume no disk space */
 			save_errno = errno ? errno : ENOSPC;
diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index bc2e83d02b..da18a738d4 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -220,22 +220,25 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	/* Do pre-padding on non-compressed files */
 	if (pad_to_size && wwmethod->compression_algorithm == PG_COMPRESSION_NONE)
 	{
-		PGAlignedXLogBlock zerobuf;
-		int			bytes;
+		ssize_t rc;
 
-		memset(zerobuf.data, 0, XLOG_BLCKSZ);
-		for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ)
+		rc = pg_pwritev_zeros(fd, pad_to_size);
+
+		if (rc < 0)
 		{
-			errno = 0;
-			if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
-			{
-				/* If write didn't set errno, assume problem is no disk space */
-				wwmethod->lasterrno = errno ? errno : ENOSPC;
-				close(fd);
-				return NULL;
-			}
+			wwmethod->lasterrno = errno;
+			close(fd);
+			return NULL;
 		}
 
+		/*
+		 * pg_pwritev*() might move the file position on some platforms, for
+		 * instance, see pg_pwrite() implementation in win32pwrite.c. We want
+		 * to explicitly reset the file position in a platform-independent
+		 * manner for extensibility even though it costs an extra system call
+		 * on the platforms where the file position is guaranteed to not
+		 * change.
+		 */
 		if (lseek(fd, 0, SEEK_SET) != 0)
 		{
 			wwmethod->lasterrno = errno;
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index 4a4bdc31c4..47ece647d7 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -525,3 +525,79 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 
 	return sum;
 }
+
+/*
+ * Function to zero-fill a file with given size. On failure, a negative value
+ * is returned and errno is set appropriately so that the caller can use it
+ * accordingly.
+ */
+ssize_t
+pg_pwritev_zeros(int fd, size_t size)
+{
+	PGAlignedXLogBlock	zbuffer;
+	struct iovec	iov[PG_IOV_MAX];
+	int		blocks;
+	size_t	block_sz;
+	size_t	remaining_size = 0;
+	int		i;
+	ssize_t	written;
+	ssize_t	total_written = 0;
+
+	/*
+	 * XXX: Writing more than one block of size XLOG_BLCKSZ bytes via
+	 * PGAlignedXLogBlock structure per vector buffer might improve write
+	 * performance on some platforms. However, tests (on some platforms, not
+	 * all) show not much improvements with varying block sizes. Hence we stick
+	 * to one block in PGAlignedXLogBlock structure for now.
+	 */
+	block_sz = sizeof(zbuffer.data);
+
+	memset(zbuffer.data, 0, block_sz);
+
+	/* Prepare to write out a lot of copies of our zero buffer at once. */
+	for (i = 0; i < lengthof(iov); ++i)
+	{
+		iov[i].iov_base = zbuffer.data;
+		iov[i].iov_len = block_sz;
+	}
+
+	/* Loop, writing as many blocks as we can for each system call. */
+	blocks = size / block_sz;
+	remaining_size = size % block_sz;
+	for (i = 0; i < blocks;)
+	{
+		int		iovcnt = Min(blocks - i, lengthof(iov));
+		off_t	offset = i * block_sz;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		i += iovcnt;
+		total_written += written;
+	}
+
+	/* Now, write the remaining size, if any, of the file with zeros. */
+	if (remaining_size > 0)
+	{
+		/* We'll never write more than one block here */
+		int		iovcnt = 1;
+
+		/* Jump on to the end of previously written blocks */
+		off_t	offset = i * block_sz;
+
+		iov[0].iov_len = remaining_size;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		total_written += written;
+	}
+
+	Assert(total_written == size);
+
+	return total_written;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 2c5dbcb0b1..f8d0d475fa 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -44,4 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd,
 									 int iovcnt,
 									 off_t offset);
 
+extern ssize_t pg_pwritev_zeros(int fd, size_t size);
+
 #endif							/* FILE_UTILS_H */
-- 
2.34.1

Reply via email to