On Tue, Sep 27, 2022 at 6:43 PM Bharath Rupireddy
<bharath.rupireddyforpostg...@gmail.com> wrote:
> On Tue, Sep 27, 2022 at 3:08 AM Thomas Munro <thomas.mu...@gmail.com> wrote:
> >
> > I don't think so, that's an extra kernel call.  I think I'll just have
> > to revert part of my recent change that removed the pg_ prefix from
> > those function names in our code, and restore the comment that warns
> > you about the portability hazard (I thought it went away with HP-UX
> > 10, where we were literally calling lseek() before every write()).
> > The majority of users of these functions don't intermix them with
> > calls to read()/write(), so they don't care about the file position,
> > so I think it's just something we'll have to continue to be mindful of
> > in the places that do.
>
> Yes, all of the existing pwrite() callers don't care about the file
> position, but the new callers such as the actual idea and patch
> proposed here in this thread cares.
>
> Is this the commit cf112c122060568aa06efe4e6e6fb9b2dd4f1090 part of
> which [1] you're planning to revert?

Yeah, just the renaming parts of that.  The lseek()-based emulation is
definitely not coming back.  Something like the attached.
From fc72be32668f0c37b899fd922d6986f39d8a6a2a Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Tue, 27 Sep 2022 21:12:03 +1300
Subject: [PATCH] Restore pg_pread and friends.

Commit cf112c12 was a little too hasty in getting rid of the pg_
prefixes where we use pread(), pwrite() and vectored variants.  We
dropped support for ancient Unixes that needed to use lseek() to
implement replacements for those, but it turned out that Windows also
changes the current position even when you pass in an offset to
ReadFile() and WriteFile(), if the file handle is synchronous.

Switching to asynchronous file handles would fix that, but have other
complications.  For now let's just put back the pg_ prefix and add some
comments to highlight the non-standard side-effect.

Reported-by: Bharath Rupireddy <bharath.rupireddyforpostg...@gmail.com>
Discussion: https://postgr.es/m/20220923202439.GA1156054%40nathanxps13
---
 .../pg_stat_statements/pg_stat_statements.c   |  4 +-
 src/backend/access/heap/rewriteheap.c         |  2 +-
 src/backend/access/transam/slru.c             |  4 +-
 src/backend/access/transam/xlog.c             |  4 +-
 src/backend/access/transam/xlogreader.c       |  2 +-
 src/backend/access/transam/xlogrecovery.c     |  2 +-
 src/backend/backup/basebackup.c               |  2 +-
 src/backend/replication/walreceiver.c         |  2 +-
 src/backend/storage/file/fd.c                 |  8 +--
 src/backend/utils/init/miscinit.c             |  2 +-
 src/bin/pg_test_fsync/pg_test_fsync.c         | 50 +++++++++----------
 src/include/access/xlogreader.h               |  4 +-
 src/include/port.h                            |  9 ++++
 src/include/port/pg_iovec.h                   | 13 ++++-
 src/include/port/win32_port.h                 |  4 +-
 src/port/preadv.c                             |  4 +-
 src/port/pwritev.c                            |  4 +-
 src/port/win32pread.c                         |  2 +-
 src/port/win32pwrite.c                        |  2 +-
 19 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/contrib/pg_stat_statements/pg_stat_statements.c 
b/contrib/pg_stat_statements/pg_stat_statements.c
index ba868f0de9..73439c0199 100644
--- a/contrib/pg_stat_statements/pg_stat_statements.c
+++ b/contrib/pg_stat_statements/pg_stat_statements.c
@@ -2103,9 +2103,9 @@ qtext_store(const char *query, int query_len,
        if (fd < 0)
                goto error;
 
-       if (pwrite(fd, query, query_len, off) != query_len)
+       if (pg_pwrite(fd, query, query_len, off) != query_len)
                goto error;
-       if (pwrite(fd, "\0", 1, off + query_len) != 1)
+       if (pg_pwrite(fd, "\0", 1, off + query_len) != 1)
                goto error;
 
        CloseTransientFile(fd);
diff --git a/src/backend/access/heap/rewriteheap.c 
b/src/backend/access/heap/rewriteheap.c
index 2f08fbe8d3..b01b39b008 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1150,7 +1150,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r)
        /* write out tail end of mapping file (again) */
        errno = 0;
        pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE);
-       if (pwrite(fd, data, len, xlrec->offset) != len)
+       if (pg_pwrite(fd, data, len, xlrec->offset) != len)
        {
                /* if write didn't set errno, assume problem is no disk space */
                if (errno == 0)
diff --git a/src/backend/access/transam/slru.c 
b/src/backend/access/transam/slru.c
index c9a7b97949..b65cb49d7f 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -718,7 +718,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 
        errno = 0;
        pgstat_report_wait_start(WAIT_EVENT_SLRU_READ);
-       if (pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
+       if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
        {
                pgstat_report_wait_end();
                slru_errcause = SLRU_READ_FAILED;
@@ -873,7 +873,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, 
SlruWriteAll fdata)
 
        errno = 0;
        pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE);
-       if (pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ)
+       if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != 
BLCKSZ)
        {
                pgstat_report_wait_end();
                /* if write didn't set errno, assume problem is no disk space */
diff --git a/src/backend/access/transam/xlog.c 
b/src/backend/access/transam/xlog.c
index 1dd6df0fe1..7f9f350531 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2196,7 +2196,7 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool 
flexible)
                                        INSTR_TIME_SET_CURRENT(start);
 
                                pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
-                               written = pwrite(openLogFile, from, nleft, 
startoffset);
+                               written = pg_pwrite(openLogFile, from, nleft, 
startoffset);
                                pgstat_report_wait_end();
 
                                /*
@@ -3018,7 +3018,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID 
logtli,
                 * enough.
                 */
                errno = 0;
-               if (pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
+               if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
                {
                        /* if write didn't set errno, assume no disk space */
                        save_errno = errno ? errno : ENOSPC;
diff --git a/src/backend/access/transam/xlogreader.c 
b/src/backend/access/transam/xlogreader.c
index 4d6c34e0fc..5a8fe81f82 100644
--- a/src/backend/access/transam/xlogreader.c
+++ b/src/backend/access/transam/xlogreader.c
@@ -1544,7 +1544,7 @@ WALRead(XLogReaderState *state,
 
                /* Reset errno first; eases reporting non-errno-affecting 
errors */
                errno = 0;
-               readbytes = pread(state->seg.ws_file, p, segbytes, (off_t) 
startoff);
+               readbytes = pg_pread(state->seg.ws_file, p, segbytes, (off_t) 
startoff);
 
 #ifndef FRONTEND
                pgstat_report_wait_end();
diff --git a/src/backend/access/transam/xlogrecovery.c 
b/src/backend/access/transam/xlogrecovery.c
index b41e682664..cb07694aea 100644
--- a/src/backend/access/transam/xlogrecovery.c
+++ b/src/backend/access/transam/xlogrecovery.c
@@ -3271,7 +3271,7 @@ retry:
        readOff = targetPageOff;
 
        pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
-       r = pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
+       r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
        if (r != XLOG_BLCKSZ)
        {
                char            fname[MAXFNAMELEN];
diff --git a/src/backend/backup/basebackup.c b/src/backend/backup/basebackup.c
index 411cac9be3..e252ad7421 100644
--- a/src/backend/backup/basebackup.c
+++ b/src/backend/backup/basebackup.c
@@ -1828,7 +1828,7 @@ basebackup_read_file(int fd, char *buf, size_t nbytes, 
off_t offset,
        int                     rc;
 
        pgstat_report_wait_start(WAIT_EVENT_BASEBACKUP_READ);
-       rc = pread(fd, buf, nbytes, offset);
+       rc = pg_pread(fd, buf, nbytes, offset);
        pgstat_report_wait_end();
 
        if (rc < 0)
diff --git a/src/backend/replication/walreceiver.c 
b/src/backend/replication/walreceiver.c
index f6ef0ace2c..3767466ef3 100644
--- a/src/backend/replication/walreceiver.c
+++ b/src/backend/replication/walreceiver.c
@@ -915,7 +915,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr, 
TimeLineID tli)
                /* OK to write the logs */
                errno = 0;
 
-               byteswritten = pwrite(recvFile, buf, segbytes, (off_t) 
startoff);
+               byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) 
startoff);
                if (byteswritten <= 0)
                {
                        char            xlogfname[MAXFNAMELEN];
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 073dab2be5..e4d954578c 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2053,7 +2053,7 @@ FileRead(File file, char *buffer, int amount, off_t 
offset,
 
 retry:
        pgstat_report_wait_start(wait_event_info);
-       returnCode = pread(vfdP->fd, buffer, amount, offset);
+       returnCode = pg_pread(vfdP->fd, buffer, amount, offset);
        pgstat_report_wait_end();
 
        if (returnCode < 0)
@@ -2135,7 +2135,7 @@ FileWrite(File file, char *buffer, int amount, off_t 
offset,
 retry:
        errno = 0;
        pgstat_report_wait_start(wait_event_info);
-       returnCode = pwrite(VfdCache[file].fd, buffer, amount, offset);
+       returnCode = pg_pwrite(VfdCache[file].fd, buffer, amount, offset);
        pgstat_report_wait_end();
 
        /* if write didn't set errno, assume problem is no disk space */
@@ -3740,7 +3740,7 @@ data_sync_elevel(int elevel)
 }
 
 /*
- * A convenience wrapper for pwritev() that retries on partial write.  If an
+ * A convenience wrapper for pg_pwritev() that retries on partial write.  If an
  * error is returned, it is unspecified how much has been written.
  */
 ssize_t
@@ -3760,7 +3760,7 @@ pg_pwritev_with_retry(int fd, const struct iovec *iov, 
int iovcnt, off_t offset)
        for (;;)
        {
                /* Write as much as we can. */
-               part = pwritev(fd, iov, iovcnt, offset);
+               part = pg_pwritev(fd, iov, iovcnt, offset);
                if (part < 0)
                        return -1;
 
diff --git a/src/backend/utils/init/miscinit.c 
b/src/backend/utils/init/miscinit.c
index 683f616b1a..4d341d3f7f 100644
--- a/src/backend/utils/init/miscinit.c
+++ b/src/backend/utils/init/miscinit.c
@@ -1527,7 +1527,7 @@ AddToDataDirLockFile(int target_line, const char *str)
        len = strlen(destbuffer);
        errno = 0;
        pgstat_report_wait_start(WAIT_EVENT_LOCK_FILE_ADDTODATADIR_WRITE);
-       if (pwrite(fd, destbuffer, len, 0) != len)
+       if (pg_pwrite(fd, destbuffer, len, 0) != len)
        {
                pgstat_report_wait_end();
                /* if write didn't set errno, assume problem is no disk space */
diff --git a/src/bin/pg_test_fsync/pg_test_fsync.c 
b/src/bin/pg_test_fsync/pg_test_fsync.c
index 585b3ef731..5f8cbb75ff 100644
--- a/src/bin/pg_test_fsync/pg_test_fsync.c
+++ b/src/bin/pg_test_fsync/pg_test_fsync.c
@@ -312,10 +312,10 @@ test_sync(int writes_per_op)
                for (ops = 0; alarm_triggered == false; ops++)
                {
                        for (writes = 0; writes < writes_per_op; writes++)
-                               if (pwrite(tmpfile,
-                                                  buf,
-                                                  XLOG_BLCKSZ,
-                                                  writes * XLOG_BLCKSZ) != 
XLOG_BLCKSZ)
+                               if (pg_pwrite(tmpfile,
+                                                         buf,
+                                                         XLOG_BLCKSZ,
+                                                         writes * XLOG_BLCKSZ) 
!= XLOG_BLCKSZ)
                                        die("write failed");
                }
                STOP_TIMER;
@@ -337,10 +337,10 @@ test_sync(int writes_per_op)
        for (ops = 0; alarm_triggered == false; ops++)
        {
                for (writes = 0; writes < writes_per_op; writes++)
-                       if (pwrite(tmpfile,
-                                          buf,
-                                          XLOG_BLCKSZ,
-                                          writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+                       if (pg_pwrite(tmpfile,
+                                                 buf,
+                                                 XLOG_BLCKSZ,
+                                                 writes * XLOG_BLCKSZ) != 
XLOG_BLCKSZ)
                                die("write failed");
                fdatasync(tmpfile);
        }
@@ -359,10 +359,10 @@ test_sync(int writes_per_op)
        for (ops = 0; alarm_triggered == false; ops++)
        {
                for (writes = 0; writes < writes_per_op; writes++)
-                       if (pwrite(tmpfile,
-                                          buf,
-                                          XLOG_BLCKSZ,
-                                          writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+                       if (pg_pwrite(tmpfile,
+                                                 buf,
+                                                 XLOG_BLCKSZ,
+                                                 writes * XLOG_BLCKSZ) != 
XLOG_BLCKSZ)
                                die("write failed");
                if (fsync(tmpfile) != 0)
                        die("fsync failed");
@@ -383,10 +383,10 @@ test_sync(int writes_per_op)
        for (ops = 0; alarm_triggered == false; ops++)
        {
                for (writes = 0; writes < writes_per_op; writes++)
-                       if (pwrite(tmpfile,
-                                          buf,
-                                          XLOG_BLCKSZ,
-                                          writes * XLOG_BLCKSZ) != XLOG_BLCKSZ)
+                       if (pg_pwrite(tmpfile,
+                                                 buf,
+                                                 XLOG_BLCKSZ,
+                                                 writes * XLOG_BLCKSZ) != 
XLOG_BLCKSZ)
                                die("write failed");
                if (pg_fsync_writethrough(tmpfile) != 0)
                        die("fsync failed");
@@ -415,10 +415,10 @@ test_sync(int writes_per_op)
                for (ops = 0; alarm_triggered == false; ops++)
                {
                        for (writes = 0; writes < writes_per_op; writes++)
-                               if (pwrite(tmpfile,
-                                                  buf,
-                                                  XLOG_BLCKSZ,
-                                                  writes * XLOG_BLCKSZ) != 
XLOG_BLCKSZ)
+                               if (pg_pwrite(tmpfile,
+                                                         buf,
+                                                         XLOG_BLCKSZ,
+                                                         writes * XLOG_BLCKSZ) 
!= XLOG_BLCKSZ)
 
                                        /*
                                         * This can generate write failures if 
the filesystem has
@@ -480,10 +480,10 @@ test_open_sync(const char *msg, int writes_size)
                for (ops = 0; alarm_triggered == false; ops++)
                {
                        for (writes = 0; writes < 16 / writes_size; writes++)
-                               if (pwrite(tmpfile,
-                                                  buf,
-                                                  writes_size * 1024,
-                                                  writes * writes_size * 1024) 
!=
+                               if (pg_pwrite(tmpfile,
+                                                         buf,
+                                                         writes_size * 1024,
+                                                         writes * writes_size 
* 1024) !=
                                        writes_size * 1024)
                                        die("write failed");
                }
@@ -582,7 +582,7 @@ test_non_sync(void)
        START_TIMER;
        for (ops = 0; alarm_triggered == false; ops++)
        {
-               if (pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
+               if (pg_pwrite(tmpfile, buf, XLOG_BLCKSZ, 0) != XLOG_BLCKSZ)
                        die("write failed");
        }
        STOP_TIMER;
diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h
index 6dcde2523a..e87f91316a 100644
--- a/src/include/access/xlogreader.h
+++ b/src/include/access/xlogreader.h
@@ -378,11 +378,11 @@ extern void XLogReaderResetError(XLogReaderState *state);
 
 /*
  * Error information from WALRead that both backend and frontend caller can
- * process.  Currently only errors from pread can be reported.
+ * process.  Currently only errors from pg_pread can be reported.
  */
 typedef struct WALReadError
 {
-       int                     wre_errno;              /* errno set by the 
last pread() */
+       int                     wre_errno;              /* errno set by the 
last pg_pread() */
        int                     wre_off;                /* Offset we tried to 
read from. */
        int                     wre_req;                /* Bytes requested to 
be read. */
        int                     wre_read;               /* Bytes read by the 
last read(). */
diff --git a/src/include/port.h b/src/include/port.h
index 3562d471b5..69d8818d61 100644
--- a/src/include/port.h
+++ b/src/include/port.h
@@ -213,6 +213,15 @@ extern int pg_fprintf(FILE *stream, const char *fmt,...) 
pg_attribute_printf(2,
 extern int     pg_vprintf(const char *fmt, va_list args) 
pg_attribute_printf(1, 0);
 extern int     pg_printf(const char *fmt,...) pg_attribute_printf(1, 2);
 
+#ifndef WIN32
+/*
+ * We add a pg_ prefix as a warning that the Windows implementations have the
+ * non-standard side-effect of changing the current file position.
+ */
+#define pg_pread pread
+#define pg_pwrite pwrite
+#endif
+
 /*
  * We use __VA_ARGS__ for printf to prevent replacing references to
  * the "printf" format archetype in format() attribute declarations.
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index ecdddba7fc..9e1d5dbab3 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -35,12 +35,21 @@ struct iovec
 /* Define a reasonable maximum that is safe to use on the stack. */
 #define PG_IOV_MAX Min(IOV_MAX, 32)
 
+/*
+ * Note that pg_preadv and pg_writev have a pg_ prefix as a warning that the
+ * Windows implementations have the side-effect of changing the file position.
+ */
+
 #if !HAVE_DECL_PREADV
-extern ssize_t preadv(int fd, const struct iovec *iov, int iovcnt, off_t 
offset);
+extern ssize_t pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t 
offset);
+#else
+#define pg_preadv preadv
 #endif
 
 #if !HAVE_DECL_PWRITEV
-extern ssize_t pwritev(int fd, const struct iovec *iov, int iovcnt, off_t 
offset);
+extern ssize_t pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t 
offset);
+#else
+#define pg_pwritev pwritev
 #endif
 
 #endif                                                 /* PG_IOVEC_H */
diff --git a/src/include/port/win32_port.h b/src/include/port/win32_port.h
index 707f8760ca..a22867d295 100644
--- a/src/include/port/win32_port.h
+++ b/src/include/port/win32_port.h
@@ -564,9 +564,9 @@ typedef unsigned short mode_t;
 #endif
 
 /* in port/win32pread.c */
-extern ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset);
+extern ssize_t pg_pread(int fd, void *buf, size_t nbyte, off_t offset);
 
 /* in port/win32pwrite.c */
-extern ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
+extern ssize_t pg_pwrite(int fd, const void *buf, size_t nbyte, off_t offset);
 
 #endif                                                 /* PG_WIN32_PORT_H */
diff --git a/src/port/preadv.c b/src/port/preadv.c
index 188e10f065..ce5863b696 100644
--- a/src/port/preadv.c
+++ b/src/port/preadv.c
@@ -19,14 +19,14 @@
 #include "port/pg_iovec.h"
 
 ssize_t
-preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+pg_preadv(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 {
        ssize_t         sum = 0;
        ssize_t         part;
 
        for (int i = 0; i < iovcnt; ++i)
        {
-               part = pread(fd, iov[i].iov_base, iov[i].iov_len, offset);
+               part = pg_pread(fd, iov[i].iov_base, iov[i].iov_len, offset);
                if (part < 0)
                {
                        if (i == 0)
diff --git a/src/port/pwritev.c b/src/port/pwritev.c
index de9b7e4e3d..6712a8986c 100644
--- a/src/port/pwritev.c
+++ b/src/port/pwritev.c
@@ -19,14 +19,14 @@
 #include "port/pg_iovec.h"
 
 ssize_t
-pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+pg_pwritev(int fd, const struct iovec *iov, int iovcnt, off_t offset)
 {
        ssize_t         sum = 0;
        ssize_t         part;
 
        for (int i = 0; i < iovcnt; ++i)
        {
-               part = pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
+               part = pg_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
                if (part < 0)
                {
                        if (i == 0)
diff --git a/src/port/win32pread.c b/src/port/win32pread.c
index ebcdd33756..fb6f00bcbf 100644
--- a/src/port/win32pread.c
+++ b/src/port/win32pread.c
@@ -17,7 +17,7 @@
 #include <windows.h>
 
 ssize_t
-pread(int fd, void *buf, size_t size, off_t offset)
+pg_pread(int fd, void *buf, size_t size, off_t offset)
 {
        OVERLAPPED      overlapped = {0};
        HANDLE          handle;
diff --git a/src/port/win32pwrite.c b/src/port/win32pwrite.c
index 7f2e62e8a7..aa374d0ffc 100644
--- a/src/port/win32pwrite.c
+++ b/src/port/win32pwrite.c
@@ -17,7 +17,7 @@
 #include <windows.h>
 
 ssize_t
-pwrite(int fd, const void *buf, size_t size, off_t offset)
+pg_pwrite(int fd, const void *buf, size_t size, off_t offset)
 {
        OVERLAPPED      overlapped = {0};
        HANDLE          handle;
-- 
2.37.0 (Apple Git-136)

Reply via email to