On Wed, Sep 21, 2022 at 4:30 AM Nathan Bossart <nathandboss...@gmail.com> wrote: > > 0001 looks reasonable to me.
Thanks for reviewing. > + errno = 0; > + rc = pg_pwritev_zeros(fd, pad_to_size); > > Do we need to reset errno? pg_pwritev_zeros() claims to set errno > appropriately. Right, pg_pwritev_zeros(), (rather pg_pwritev_with_retry() ensures that pwritev() or pwrite()) sets the correct errno, please see Thomas's comments [1] as well. Removed it. > +/* > + * PWRITEV_BLCKSZ is same as XLOG_BLCKSZ for now, however it may change if > + * writing more bytes per pg_pwritev_with_retry() call is proven to be more > + * performant. > + */ > +#define PWRITEV_BLCKSZ XLOG_BLCKSZ > > This seems like something we should sort out now instead of leaving as > future work. Given your recent note, I think we should just use > XLOG_BLCKSZ and PGAlignedXLogBlock and add a comment about the performance > findings with different buffer sizes. Agreed. Removed the new structure and added a comment. Another change that I had to do was to allow lseek() even after pwrite() (via pg_pwritev_zeros()) on Windows in walmethods.c. Without this, the regression tests start to fail on Windows. And I figured out that it's not an issue with this patch, it looks like an issue with pwrite() implementation in win32pwrite.c, see the failure here [2], I plan to start a separate thread to discuss this. Please review the attached v4 patch set further. [1] https://www.postgresql.org/message-id/CA%2BhUKGJKwUrpP0igOFAv5khj3dbHvfWqLzFeo7vtNzDYXv_YZw%40mail.gmail.com [2] https://github.com/BRupireddy/postgres/tree/use_pwrite_without_lseek_on_windiws -- Bharath Rupireddy PostgreSQL Contributors Team RDS Open Source Databases Amazon Web Services: https://aws.amazon.com
From c60026519becfc513dc06ddf889caa3b4b0fade3 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Date: Fri, 23 Sep 2022 00:47:56 +0000 Subject: [PATCH v4] Move pg_pwritev_with_retry() to file_utils.c Move pg_pwritev_with_retry() from file/fd.c to common/file_utils.c so that non-backend code or FRONTEND cases can also use it. --- src/backend/storage/file/fd.c | 65 --------------------------------- src/common/file_utils.c | 65 +++++++++++++++++++++++++++++++++ src/include/common/file_utils.h | 7 ++++ src/include/storage/fd.h | 6 --- 4 files changed, 72 insertions(+), 71 deletions(-) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 073dab2be5..75ba178dfb 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -93,7 +93,6 @@ #include "common/pg_prng.h" #include "miscadmin.h" #include "pgstat.h" -#include "port/pg_iovec.h" #include "portability/mem.h" #include "postmaster/startup.h" #include "storage/fd.h" @@ -3738,67 +3737,3 @@ data_sync_elevel(int elevel) { return data_sync_retry ? elevel : PANIC; } - -/* - * A convenience wrapper for pwritev() that retries on partial write. If an - * error is returned, it is unspecified how much has been written. - */ -ssize_t -pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) -{ - struct iovec iov_copy[PG_IOV_MAX]; - ssize_t sum = 0; - ssize_t part; - - /* We'd better have space to make a copy, in case we need to retry. */ - if (iovcnt > PG_IOV_MAX) - { - errno = EINVAL; - return -1; - } - - for (;;) - { - /* Write as much as we can. */ - part = pwritev(fd, iov, iovcnt, offset); - if (part < 0) - return -1; - -#ifdef SIMULATE_SHORT_WRITE - part = Min(part, 4096); -#endif - - /* Count our progress. */ - sum += part; - offset += part; - - /* Step over iovecs that are done. */ - while (iovcnt > 0 && iov->iov_len <= part) - { - part -= iov->iov_len; - ++iov; - --iovcnt; - } - - /* Are they all done? */ - if (iovcnt == 0) - { - /* We don't expect the kernel to write more than requested. */ - Assert(part == 0); - break; - } - - /* - * Move whatever's left to the front of our mutable copy and adjust - * the leading iovec. - */ - Assert(iovcnt > 0); - memmove(iov_copy, iov, sizeof(*iov) * iovcnt); - Assert(iov->iov_len > part); - iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part; - iov_copy[0].iov_len -= part; - iov = iov_copy; - } - - return sum; -} diff --git a/src/common/file_utils.c b/src/common/file_utils.c index df4d6d240c..4af216e56c 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -28,6 +28,7 @@ #ifdef FRONTEND #include "common/logging.h" #endif +#include "port/pg_iovec.h" #ifdef FRONTEND @@ -460,3 +461,67 @@ get_dirent_type(const char *path, return result; } + +/* + * A convenience wrapper for pwritev() that retries on partial write. If an + * error is returned, it is unspecified how much has been written. + */ +ssize_t +pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) +{ + struct iovec iov_copy[PG_IOV_MAX]; + ssize_t sum = 0; + ssize_t part; + + /* We'd better have space to make a copy, in case we need to retry. */ + if (iovcnt > PG_IOV_MAX) + { + errno = EINVAL; + return -1; + } + + for (;;) + { + /* Write as much as we can. */ + part = pwritev(fd, iov, iovcnt, offset); + if (part < 0) + return -1; + +#ifdef SIMULATE_SHORT_WRITE + part = Min(part, 4096); +#endif + + /* Count our progress. */ + sum += part; + offset += part; + + /* Step over iovecs that are done. */ + while (iovcnt > 0 && iov->iov_len <= part) + { + part -= iov->iov_len; + ++iov; + --iovcnt; + } + + /* Are they all done? */ + if (iovcnt == 0) + { + /* We don't expect the kernel to write more than requested. */ + Assert(part == 0); + break; + } + + /* + * Move whatever's left to the front of our mutable copy and adjust + * the leading iovec. + */ + Assert(iovcnt > 0); + memmove(iov_copy, iov, sizeof(*iov) * iovcnt); + Assert(iov->iov_len > part); + iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part; + iov_copy[0].iov_len -= part; + iov = iov_copy; + } + + return sum; +} diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 2811744c12..2c5dbcb0b1 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -24,6 +24,8 @@ typedef enum PGFileType PGFILETYPE_LNK } PGFileType; +struct iovec; /* avoid including port/pg_iovec.h here */ + #ifdef FRONTEND extern int fsync_fname(const char *fname, bool isdir); extern void fsync_pgdata(const char *pg_data, int serverVersion); @@ -37,4 +39,9 @@ extern PGFileType get_dirent_type(const char *path, bool look_through_symlinks, int elevel); +extern ssize_t pg_pwritev_with_retry(int fd, + const struct iovec *iov, + int iovcnt, + off_t offset); + #endif /* FILE_UTILS_H */ diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 5a48fccd9c..c0a212487d 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -51,8 +51,6 @@ typedef enum RecoveryInitSyncMethod RECOVERY_INIT_SYNC_METHOD_SYNCFS } RecoveryInitSyncMethod; -struct iovec; /* avoid including port/pg_iovec.h here */ - typedef int File; @@ -178,10 +176,6 @@ extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern void pg_flush_data(int fd, off_t offset, off_t nbytes); -extern ssize_t pg_pwritev_with_retry(int fd, - const struct iovec *iov, - int iovcnt, - off_t offset); extern int pg_truncate(const char *path, off_t length); extern void fsync_fname(const char *fname, bool isdir); extern int fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel); -- 2.34.1
From 7de9b6d239214e6a467dd30c2715527ae3d37360 Mon Sep 17 00:00:00 2001 From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com> Date: Fri, 23 Sep 2022 05:44:13 +0000 Subject: [PATCH v4] Use pg_pwritev_with_retry() instead of write() in walmethods.c Use pg_pwritev_with_retry() while prepadding a WAL segment instead of write() in walmethods.c dir_open_for_write() to avoid partial writes. As the pg_pwritev_with_retry() function uses pwritev, we can avoid explicit lseek() on non-Windows platforms to seek to the beginning of the WAL segment. It looks like on Windows, we need an explicit lseek() call here despite using pwrite() implementation from win32pwrite.c. Otherwise an error occurs. These changes are inline with how core postgres initializes the WAL segment in XLogFileInitInternal(). --- src/backend/access/transam/xlog.c | 35 ++++---------- src/bin/pg_basebackup/walmethods.c | 26 +++++----- src/common/file_utils.c | 77 ++++++++++++++++++++++++++++++ src/include/common/file_utils.h | 2 + 4 files changed, 101 insertions(+), 39 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f32b2124e6..c8a1808133 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2925,7 +2925,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, bool *added, char *path) { char tmppath[MAXPGPATH]; - PGAlignedXLogBlock zbuffer; XLogSegNo installed_segno; XLogSegNo max_segno; int fd; @@ -2969,14 +2968,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmppath))); - memset(zbuffer.data, 0, XLOG_BLCKSZ); - pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); save_errno = 0; if (wal_init_zero) { - struct iovec iov[PG_IOV_MAX]; - int blocks; + ssize_t rc; /* * Zero-fill the file. With this setting, we do this the hard way to @@ -2987,32 +2983,17 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, * indirect blocks are down on disk. Therefore, fdatasync(2) or * O_DSYNC will be sufficient to sync future writes to the log file. */ + rc = pg_pwritev_zeros(fd, wal_segment_size); - /* Prepare to write out a lot of copies of our zero buffer at once. */ - for (int i = 0; i < lengthof(iov); ++i) - { - iov[i].iov_base = zbuffer.data; - iov[i].iov_len = XLOG_BLCKSZ; - } - - /* Loop, writing as many blocks as we can for each system call. */ - blocks = wal_segment_size / XLOG_BLCKSZ; - for (int i = 0; i < blocks;) - { - int iovcnt = Min(blocks - i, lengthof(iov)); - off_t offset = i * XLOG_BLCKSZ; - - if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0) - { - save_errno = errno; - break; - } - - i += iovcnt; - } + if (rc < 0) + save_errno = errno; } else { + PGAlignedXLogBlock zbuffer; + + memset(zbuffer.data, 0, XLOG_BLCKSZ); + /* * Otherwise, seeking to the end and writing a solitary byte is * enough. diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c index bc2e83d02b..9b4df62447 100644 --- a/src/bin/pg_basebackup/walmethods.c +++ b/src/bin/pg_basebackup/walmethods.c @@ -220,28 +220,30 @@ dir_open_for_write(WalWriteMethod *wwmethod, const char *pathname, /* Do pre-padding on non-compressed files */ if (pad_to_size && wwmethod->compression_algorithm == PG_COMPRESSION_NONE) { - PGAlignedXLogBlock zerobuf; - int bytes; + ssize_t rc; - memset(zerobuf.data, 0, XLOG_BLCKSZ); - for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ) + rc = pg_pwritev_zeros(fd, pad_to_size); + + if (rc < 0) { - errno = 0; - if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ) - { - /* If write didn't set errno, assume problem is no disk space */ - wwmethod->lasterrno = errno ? errno : ENOSPC; - close(fd); - return NULL; - } + wwmethod->lasterrno = errno; + close(fd); + return NULL; } +#ifdef WIN32 + /* + * XXX: It looks like on Windows, we need an explicit lseek() call here + * despite using pwrite() implementation from win32pwrite.c. Otherwise + * an error occurs. + */ if (lseek(fd, 0, SEEK_SET) != 0) { wwmethod->lasterrno = errno; close(fd); return NULL; } +#endif } /* diff --git a/src/common/file_utils.c b/src/common/file_utils.c index 4af216e56c..40e12c9130 100644 --- a/src/common/file_utils.c +++ b/src/common/file_utils.c @@ -525,3 +525,80 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset) return sum; } + +/* + * Function to zero-fill a file with given size. On failure, a negative value + * is returned and errno is set appropriately so that the caller can use it + * accordingly. + */ +ssize_t +pg_pwritev_zeros(int fd, size_t size) +{ + PGAlignedXLogBlock zbuffer; + struct iovec iov[PG_IOV_MAX]; + int blocks; + size_t block_sz; + size_t remaining_size = 0; + int i; + ssize_t written; + ssize_t total_written = 0; + + /* + * XXX: Writing more than one block of size XLOG_BLCKSZ bytes via + * PGAlignedXLogBlock structure per vector buffer might improve write + * performance on some platforms. However, tests (on some platforms, not + * all) show not much improvements with varying block sizes. Hence we stick + * to one block in PGAlignedXLogBlock structure. + */ + block_sz = sizeof(zbuffer.data); + + memset(zbuffer.data, 0, block_sz); + + /* Prepare to write out a lot of copies of our zero buffer at once. */ + for (i = 0; i < lengthof(iov); ++i) + { + iov[i].iov_base = zbuffer.data; + iov[i].iov_len = block_sz; + } + + /* Loop, writing as many blocks as we can for each system call. */ + blocks = size / block_sz; + remaining_size = size % block_sz; + for (i = 0; i < blocks;) + { + int iovcnt = Min(blocks - i, lengthof(iov)); + off_t offset = i * block_sz; + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + i += iovcnt; + total_written += written; + } + + /* Now, write the remaining size, if any, of the file with zeros. */ + if (remaining_size > 0) + { + /* We'll never write more than one block here */ + int iovcnt = 1; + + /* Jump on to the end of previously written blocks */ + off_t offset = i * block_sz; + + iov[0].iov_base = zbuffer.data; + iov[0].iov_len = remaining_size; + + written = pg_pwritev_with_retry(fd, iov, iovcnt, offset); + + if (written < 0) + return written; + + total_written += written; + } + + Assert(total_written == size); + + return total_written; +} diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h index 2c5dbcb0b1..f8d0d475fa 100644 --- a/src/include/common/file_utils.h +++ b/src/include/common/file_utils.h @@ -44,4 +44,6 @@ extern ssize_t pg_pwritev_with_retry(int fd, int iovcnt, off_t offset); +extern ssize_t pg_pwritev_zeros(int fd, size_t size); + #endif /* FILE_UTILS_H */ -- 2.34.1