Hello, Patch 0001 gets rid of the unconditional lseek() calls for SLRU I/O, as a small follow-up to commit c24dcd0c. Patch 0002 gets rid of a few places that usually do a good job of avoiding lseek() calls while reading and writing WAL, but it seems better to have no code at all.
-- Thomas Munro https://enterprisedb.com
From 68c1ec0531f802d39c80b8fa5c310c3c70d795d9 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 30 Mar 2019 17:04:08 +1300 Subject: [PATCH 1/2] Use pg_pread() and pg_pwrite() in slru.c. This avoids lseek() system calls at every SLRU I/O, as was done for relation files in commit c24dcd0c. Author: Thomas Munro Discussion: --- src/backend/access/transam/slru.c | 25 ++++--------------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 974d42fc86..6527cfce3b 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -647,7 +647,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; + off_t offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd; @@ -677,17 +677,9 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) return true; } - if (lseek(fd, (off_t) offset, SEEK_SET) < 0) - { - slru_errcause = SLRU_SEEK_FAILED; - slru_errno = errno; - CloseTransientFile(fd); - return false; - } - errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); - if (read(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) + if (pg_pread(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) { pgstat_report_wait_end(); slru_errcause = SLRU_READ_FAILED; @@ -727,7 +719,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; - int offset = rpageno * BLCKSZ; + off_t offset = rpageno * BLCKSZ; char path[MAXPGPATH]; int fd = -1; @@ -837,18 +829,9 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) } } - if (lseek(fd, (off_t) offset, SEEK_SET) < 0) - { - slru_errcause = SLRU_SEEK_FAILED; - slru_errno = errno; - if (!fdata) - CloseTransientFile(fd); - return false; - } - errno = 0; pgstat_report_wait_start(WAIT_EVENT_SLRU_WRITE); - if (write(fd, shared->page_buffer[slotno], BLCKSZ) != BLCKSZ) + if (pg_pwrite(fd, shared->page_buffer[slotno], BLCKSZ, offset) != BLCKSZ) { pgstat_report_wait_end(); /* if write didn't set errno, assume problem is no disk space */ -- 2.21.0
From 446007aed7fb1bafb125eaca5569f62acdf41069 Mon Sep 17 00:00:00 2001 From: Thomas Munro <thomas.mu...@gmail.com> Date: Sat, 30 Mar 2019 17:33:30 +1300 Subject: [PATCH 2/2] Use pg_pread() and pg_pwrite() in various places. Remove several copies of some code that usually does a good job of avoiding lseek() calls, but it seems better to have no code at all. Also remove a simple case of an unconditional lseek() call. Author: Thomas Munro Discussion: --- src/backend/access/heap/rewriteheap.c | 9 +-------- src/backend/access/transam/xlogutils.c | 25 ++----------------------- src/backend/replication/walreceiver.c | 21 +++------------------ src/backend/replication/walsender.c | 23 ++++------------------- src/bin/pg_waldump/pg_waldump.c | 25 +++---------------------- 5 files changed, 13 insertions(+), 90 deletions(-) diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c index bce4274362..c121e44719 100644 --- a/src/backend/access/heap/rewriteheap.c +++ b/src/backend/access/heap/rewriteheap.c @@ -1165,13 +1165,6 @@ heap_xlog_logical_rewrite(XLogReaderState *r) path, (uint32) xlrec->offset))); pgstat_report_wait_end(); - /* now seek to the position we want to write our data to */ - if (lseek(fd, xlrec->offset, SEEK_SET) != xlrec->offset) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek to end of file \"%s\": %m", - path))); - data = XLogRecGetData(r) + sizeof(*xlrec); len = xlrec->num_mappings * sizeof(LogicalRewriteMappingData); @@ -1179,7 +1172,7 @@ heap_xlog_logical_rewrite(XLogReaderState *r) /* write out tail end of mapping file (again) */ errno = 0; pgstat_report_wait_start(WAIT_EVENT_LOGICAL_REWRITE_MAPPING_WRITE); - if (write(fd, data, len) != len) + if (pg_pwrite(fd, data, len, xlrec->offset) != len) { /* if write didn't set errno, assume problem is no disk space */ if (errno == 0) diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 10a663bae6..a63bfed3a6 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -664,7 +664,6 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, static int sendFile = -1; static XLogSegNo sendSegNo = 0; static TimeLineID sendTLI = 0; - static uint32 sendOff = 0; Assert(segsize == wal_segment_size); @@ -708,28 +707,9 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, errmsg("could not open file \"%s\": %m", path))); } - sendOff = 0; sendTLI = tli; } - /* Need to seek in the file? */ - if (sendOff != startoff) - { - if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) - { - char path[MAXPGPATH]; - int save_errno = errno; - - XLogFilePath(path, tli, sendSegNo, segsize); - errno = save_errno; - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek in log segment %s to offset %u: %m", - path, startoff))); - } - sendOff = startoff; - } - /* How many bytes are within this segment? */ if (nbytes > (segsize - startoff)) segbytes = segsize - startoff; @@ -737,7 +717,7 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, segbytes = nbytes; pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - readbytes = read(sendFile, p, segbytes); + readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff); pgstat_report_wait_end(); if (readbytes <= 0) { @@ -749,13 +729,12 @@ XLogRead(char *buf, int segsize, TimeLineID tli, XLogRecPtr startptr, ereport(ERROR, (errcode_for_file_access(), errmsg("could not read from log segment %s, offset %u, length %lu: %m", - path, sendOff, (unsigned long) segbytes))); + path, startoff, (unsigned long) segbytes))); } /* Update state for read */ recptr += readbytes; - sendOff += readbytes; nbytes -= readbytes; p += readbytes; } diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index f32cf91ffb..d2c0f964d0 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -83,14 +83,13 @@ WalReceiverFunctionsType *WalReceiverFunctions = NULL; #define NAPTIME_PER_CYCLE 100 /* max sleep time between cycles (100ms) */ /* - * These variables are used similarly to openLogFile/SegNo/Off, + * These variables are used similarly to openLogFile/SegNo, * but for walreceiver to write the XLOG. recvFileTLI is the TimeLineID * corresponding the filename of recvFile. */ static int recvFile = -1; static TimeLineID recvFileTLI = 0; static XLogSegNo recvSegNo = 0; -static uint32 recvOff = 0; /* * Flags set by interrupt handlers of walreceiver for later service in the @@ -971,7 +970,6 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) use_existent = true; recvFile = XLogFileInit(recvSegNo, &use_existent, true); recvFileTLI = ThisTimeLineID; - recvOff = 0; } /* Calculate the start offset of the received logs */ @@ -982,22 +980,10 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) else segbytes = nbytes; - /* Need to seek in the file? */ - if (recvOff != startoff) - { - if (lseek(recvFile, (off_t) startoff, SEEK_SET) < 0) - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not seek in log segment %s to offset %u: %m", - XLogFileNameP(recvFileTLI, recvSegNo), - startoff))); - recvOff = startoff; - } - /* OK to write the logs */ errno = 0; - byteswritten = write(recvFile, buf, segbytes); + byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); if (byteswritten <= 0) { /* if write didn't set errno, assume no disk space */ @@ -1008,13 +994,12 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) errmsg("could not write to log segment %s " "at offset %u, length %lu: %m", XLogFileNameP(recvFileTLI, recvSegNo), - recvOff, (unsigned long) segbytes))); + startoff, (unsigned long) segbytes))); } /* Update state for write */ recptr += byteswritten; - recvOff += byteswritten; nbytes -= byteswritten; buf += byteswritten; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 21f5c868f1..bc6d4bc500 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -129,12 +129,11 @@ bool log_replication_commands = false; bool wake_wal_senders = false; /* - * These variables are used similarly to openLogFile/SegNo/Off, + * These variables are used similarly to openLogFile/SegNo, * but for walsender to read the XLOG. */ static int sendFile = -1; static XLogSegNo sendSegNo = 0; -static uint32 sendOff = 0; /* Timeline ID of the currently open file */ static TimeLineID curFileTimeLine = 0; @@ -2441,19 +2440,6 @@ retry: errmsg("could not open file \"%s\": %m", path))); } - sendOff = 0; - } - - /* Need to seek in the file? */ - if (sendOff != startoff) - { - if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not seek in log segment %s to offset %u: %m", - XLogFileNameP(curFileTimeLine, sendSegNo), - startoff))); - sendOff = startoff; } /* How many bytes are within this segment? */ @@ -2463,7 +2449,7 @@ retry: segbytes = nbytes; pgstat_report_wait_start(WAIT_EVENT_WAL_READ); - readbytes = read(sendFile, p, segbytes); + readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff); pgstat_report_wait_end(); if (readbytes < 0) { @@ -2471,7 +2457,7 @@ retry: (errcode_for_file_access(), errmsg("could not read from log segment %s, offset %u, length %zu: %m", XLogFileNameP(curFileTimeLine, sendSegNo), - sendOff, (Size) segbytes))); + startoff, (Size) segbytes))); } else if (readbytes == 0) { @@ -2479,13 +2465,12 @@ retry: (errcode(ERRCODE_DATA_CORRUPTED), errmsg("could not read from log segment %s, offset %u: read %d of %zu", XLogFileNameP(curFileTimeLine, sendSegNo), - sendOff, readbytes, (Size) segbytes))); + startoff, readbytes, (Size) segbytes))); } /* Update state for read */ recptr += readbytes; - sendOff += readbytes; nbytes -= readbytes; p += readbytes; } diff --git a/src/bin/pg_waldump/pg_waldump.c b/src/bin/pg_waldump/pg_waldump.c index 1e5379eb3e..35216b4550 100644 --- a/src/bin/pg_waldump/pg_waldump.c +++ b/src/bin/pg_waldump/pg_waldump.c @@ -329,7 +329,6 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, static int sendFile = -1; static XLogSegNo sendSegNo = 0; - static uint32 sendOff = 0; p = buf; recptr = startptr; @@ -384,23 +383,6 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, if (sendFile < 0) fatal_error("could not find file \"%s\": %s", fname, strerror(errno)); - sendOff = 0; - } - - /* Need to seek in the file? */ - if (sendOff != startoff) - { - if (lseek(sendFile, (off_t) startoff, SEEK_SET) < 0) - { - int err = errno; - char fname[MAXPGPATH]; - - XLogFileName(fname, timeline_id, sendSegNo, WalSegSz); - - fatal_error("could not seek in log file %s to offset %u: %s", - fname, startoff, strerror(err)); - } - sendOff = startoff; } /* How many bytes are within this segment? */ @@ -409,7 +391,7 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, else segbytes = nbytes; - readbytes = read(sendFile, p, segbytes); + readbytes = pg_pread(sendFile, p, segbytes, (off_t) startoff); if (readbytes <= 0) { int err = errno; @@ -421,16 +403,15 @@ XLogDumpXLogRead(const char *directory, TimeLineID timeline_id, if (readbytes < 0) fatal_error("could not read from log file %s, offset %u, length %d: %s", - fname, sendOff, segbytes, strerror(err)); + fname, startoff, segbytes, strerror(err)); else if (readbytes == 0) fatal_error("could not read from log file %s, offset %u: read %d of %zu", - fname, sendOff, readbytes, (Size) segbytes); + fname, startoff, readbytes, (Size) segbytes); } /* Update state for read */ recptr += readbytes; - sendOff += readbytes; nbytes -= readbytes; p += readbytes; } -- 2.21.0