On Thu, Oct 27, 2022 at 11:24 AM Michael Paquier <mich...@paquier.xyz> wrote:
>
> On Thu, Sep 29, 2022 at 08:09:56PM -0700, Nathan Bossart wrote:
> > Looks reasonable to me.
>
> 0001, to move pg_pwritev_with_retry() to a new home, seems fine, so
> applied.

Thanks.

> Regarding 0002, using pg_pwrite_zeros() as a routine name, as
> suggested by Thomas, sounds good to me.

Changed.

> However, I am not really a
> fan of its dependency with PGAlignedXLogBlock, because it should be
> able to work with any buffers of any sizes, as long as the input
> buffer is aligned, shouldn't it?  For example, what about
> PGAlignedBlock?  So, should we make this more extensible? My guess
> would be the addition of the block size and the block pointer to the
> arguments of pg_pwrite_zeros(), in combination with a check to make
> sure that the input buffer is MAXALIGN()'d (with an Assert() rather
> than just an elog/pg_log_error?).

+1 to pass in the aligned buffer, its size and an assertion on the buffer size.

Please see the attached v7 patch.

--
Bharath Rupireddy
PostgreSQL Contributors Team
RDS Open Source Databases
Amazon Web Services: https://aws.amazon.com
From 1d81929de5f5b56d39fc7f1273cdb4c1e36337bb Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 27 Oct 2022 08:51:07 +0000
Subject: [PATCH v7] Use pg_pwritev_with_retry() instead of write() in
 walmethods.c

Use pg_pwritev_with_retry() while prepadding a WAL segment
instead of write() in walmethods.c dir_open_for_write() to avoid
partial writes. As the pg_pwritev_with_retry() function uses
pwritev, we can avoid explicit lseek() on non-Windows platforms
to seek to the beginning of the WAL segment. It looks like on
Windows, we need an explicit lseek() call here despite using
pwrite() implementation from win32pwrite.c. Otherwise
an error occurs.

These changes are inline with how core postgres initializes the
WAL segment in XLogFileInitInternal().

Author: Bharath Rupireddy
Reviewed-by: Nathan Bossart, Michael Paquier
Reviewed-by: Thomas Munro
Discussion: https://www.postgresql.org/message-id/CALj2ACUq7nAb7%3DbJNbK3yYmp-SZhJcXFR_pLk8un6XgDzDF3OA%40mail.gmail.com
---
 src/backend/access/transam/xlog.c  | 35 +++-----------
 src/bin/pg_basebackup/walmethods.c | 28 ++++++-----
 src/common/file_utils.c            | 77 ++++++++++++++++++++++++++++++
 src/include/common/file_utils.h    |  5 ++
 4 files changed, 105 insertions(+), 40 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 8f10effe3a..638d6d448b 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2921,7 +2921,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 					 bool *added, char *path)
 {
 	char		tmppath[MAXPGPATH];
-	PGAlignedXLogBlock zbuffer;
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
@@ -2965,14 +2964,12 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
-	memset(zbuffer.data, 0, XLOG_BLCKSZ);
-
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int			blocks;
+		PGAlignedXLogBlock zbuffer;
+		ssize_t rc;
 
 		/*
 		 * Zero-fill the file.  With this setting, we do this the hard way to
@@ -2983,29 +2980,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
+		rc = pg_pwrite_zeros(fd, wal_segment_size, zbuffer.data,
+							 sizeof(zbuffer.data));
 
-		/* Prepare to write out a lot of copies of our zero buffer at once. */
-		for (int i = 0; i < lengthof(iov); ++i)
-		{
-			iov[i].iov_base = zbuffer.data;
-			iov[i].iov_len = XLOG_BLCKSZ;
-		}
-
-		/* Loop, writing as many blocks as we can for each system call. */
-		blocks = wal_segment_size / XLOG_BLCKSZ;
-		for (int i = 0; i < blocks;)
-		{
-			int			iovcnt = Min(blocks - i, lengthof(iov));
-			off_t		offset = i * XLOG_BLCKSZ;
-
-			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
-			{
-				save_errno = errno;
-				break;
-			}
-
-			i += iovcnt;
-		}
+		if (rc < 0)
+			save_errno = errno;
 	}
 	else
 	{
@@ -3014,7 +2993,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * enough.
 		 */
 		errno = 0;
-		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+		if (pg_pwrite(fd, "\0", 1, wal_segment_size - 1) != 1)
 		{
 			/* if write didn't set errno, assume no disk space */
 			save_errno = errno ? errno : ENOSPC;
diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index bc2e83d02b..f7af73c01b 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -220,22 +220,26 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname,
 	/* Do pre-padding on non-compressed files */
 	if (pad_to_size && wwmethod->compression_algorithm == PG_COMPRESSION_NONE)
 	{
-		PGAlignedXLogBlock zerobuf;
-		int			bytes;
+		PGAlignedXLogBlock zbuffer;
+		ssize_t rc;
 
-		memset(zerobuf.data, 0, XLOG_BLCKSZ);
-		for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ)
+		rc = pg_pwrite_zeros(fd, pad_to_size, zbuffer.data,
+							 sizeof(zbuffer.data));
+
+		if (rc < 0)
 		{
-			errno = 0;
-			if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
-			{
-				/* If write didn't set errno, assume problem is no disk space */
-				wwmethod->lasterrno = errno ? errno : ENOSPC;
-				close(fd);
-				return NULL;
-			}
+			wwmethod->lasterrno = errno;
+			close(fd);
+			return NULL;
 		}
 
+		/*
+		 * pg_pwrite() (called via pg_pwrite_zeros()) might move the file
+		 * position on some platforms (see win32pwrite.c). We want to
+		 * explicitly reset the file position in a platform-independent manner
+		 * for extensibility even though it costs an extra system call on the
+		 * platforms where the file position is guaranteed to not change.
+		 */
 		if (lseek(fd, 0, SEEK_SET) != 0)
 		{
 			wwmethod->lasterrno = errno;
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index eac05a13ed..38a62e3aed 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -527,3 +527,80 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 
 	return sum;
 }
+
+/*
+ * pg_pwrite_zeros
+ *
+ * Writes zeros to a given file. Input parameters are "fd" (file descriptor of
+ * the file), "size" (size of the file in bytes), "zbuffer" (pointer to a
+ * MAXALIGNed buffer), "zbuffer_sz" (size of the MAXALIGNed buffer).
+ *
+ * Note that this function zero-fills the given "zbuffer" and writes many of
+ * them at once.
+ *
+ * On failure, a negative value is returned and errno is set appropriately so
+ * that the caller can use it accordingly.
+ */
+ssize_t
+pg_pwrite_zeros(int fd, size_t size, char *zbuffer, size_t zbuffer_sz)
+{
+	struct iovec	iov[PG_IOV_MAX];
+	int		blocks;
+	size_t	remaining_size = 0;
+	int		i;
+	ssize_t	written;
+	ssize_t	total_written = 0;
+
+	/* The passed in zero buffer must be MAXALIGNed. */
+	Assert(zbuffer_sz == MAXALIGN(zbuffer_sz));
+
+	/* Zero-fill the passed in buffer. */
+	memset(zbuffer, 0, zbuffer_sz);
+
+	/* Prepare to write out a lot of copies of our zero buffer at once. */
+	for (i = 0; i < lengthof(iov); ++i)
+	{
+		iov[i].iov_base = zbuffer;
+		iov[i].iov_len = zbuffer_sz;
+	}
+
+	/* Loop, writing as many blocks as we can for each system call. */
+	blocks = size / zbuffer_sz;
+	remaining_size = size % zbuffer_sz;
+	for (i = 0; i < blocks;)
+	{
+		int		iovcnt = Min(blocks - i, lengthof(iov));
+		off_t	offset = i * zbuffer_sz;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		i += iovcnt;
+		total_written += written;
+	}
+
+	/* Now, write the remaining size, if any, of the file with zeros. */
+	if (remaining_size > 0)
+	{
+		/* We'll never write more than one block here */
+		int		iovcnt = 1;
+
+		/* Jump on to the end of previously written blocks */
+		off_t	offset = i * zbuffer_sz;
+
+		iov[0].iov_len = remaining_size;
+
+		written = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if (written < 0)
+			return written;
+
+		total_written += written;
+	}
+
+	Assert(total_written == size);
+
+	return total_written;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 2c5dbcb0b1..4e427bc657 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -44,4 +44,9 @@ extern ssize_t pg_pwritev_with_retry(int fd,
 									 int iovcnt,
 									 off_t offset);
 
+extern ssize_t pg_pwrite_zeros(int fd,
+							   size_t size,
+							   char *zbuffer,
+							   size_t zbuffer_sz);
+
 #endif							/* FILE_UTILS_H */
-- 
2.34.1

Reply via email to