On 2021/03/22 9:50, ikedamsh wrote:
Agreed. I separated the patches. If only the former is committed, my trivial concern is that there may be a disadvantage, but no advantage for the standby server. It may lead to performance degradation to the wal receiver by calling INSTR_TIME_SET_CURRENT(), but the stats can't visible for users until the latter patch is committed.
Your concern is valid, so let's polish and commit also the 0003 patch to v14. I'm still thinking that it's better to separate wal_xxx columns into walreceiver's and the others. But if we count even walreceiver activity on the existing columns, regarding 0003 patch, we need to update the document? For example, "Number of times WAL buffers were written out to disk via XLogWrite request." should be "Number of times WAL buffers were written out to disk via XLogWrite request and by WAL receiver process."? Maybe we need to append some descriptions about this into "WAL configuration" section?
I followed the argument of pg_pwrite(). But, I think "char *" is better, so fixed it.
Thanks for updating the patch! +extern int XLogWriteFile(int fd, char *buf, + size_t nbyte, off_t offset, + TimeLineID timelineid, XLogSegNo segno, + bool write_all); write_all seems not to be necessary. You added this flag for walreceiver, I guess. But even without the argument, walreceiver seems to work expectedly. So, what about the attached patch? I applied some cosmetic changes to the patch. Regards, -- Fujii Masao Advanced Computing Technology Center Research and Development Headquarters NTT DATA CORPORATION
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 6f8810e149..9d8ea7edca 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2534,63 +2534,15 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) { char *from; Size nbytes; - Size nleft; int written; - instr_time start; /* OK to write the page(s) */ from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ; nbytes = npages * (Size) XLOG_BLCKSZ; - nleft = nbytes; - do - { - errno = 0; - - /* Measure I/O timing to write WAL data */ - if (track_wal_io_timing) - INSTR_TIME_SET_CURRENT(start); - - pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = pg_pwrite(openLogFile, from, nleft, startoffset); - pgstat_report_wait_end(); - - /* - * Increment the I/O timing and the number of times WAL data - * were written out to disk. - */ - if (track_wal_io_timing) - { - instr_time duration; - - INSTR_TIME_SET_CURRENT(duration); - INSTR_TIME_SUBTRACT(duration, start); - WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); - } - - WalStats.m_wal_write++; - - if (written <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - if (errno == EINTR) - continue; - - save_errno = errno; - XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, - wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log file %s " - "at offset %u, length %zu: %m", - xlogfname, startoffset, nleft))); - } - nleft -= written; - from += written; - startoffset += written; - } while (nleft > 0); + written = XLogWriteFile(openLogFile, from, nbytes, (off_t) startoffset, + ThisTimeLineID, openLogSegNo); + Assert(written == nbytes); + startoffset += written; npages = 0; @@ -2707,6 +2659,94 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible) } } +/* + * Write the WAL data to the WAL file. + * + * 'fd' is the file descriptor for the WAL file to write WAL to. 'buf' is + * the starting address of the buffer storing WAL data to write. + * 'nbytes' is the number of bytes to write WAL data up to. 'offset' + * is the offset of WAL file to be set. 'tli' and 'segno' are the + * timeline ID and segment number of WAL file. + * + * Return the total number of bytes written. This must be the same as + * 'nbytes'. PANIC error is thrown if WAL data fails to be written. + */ +int +XLogWriteFile(int fd, char *buf, Size nbytes, off_t offset, + TimeLineID tli, XLogSegNo segno) +{ + Size total_written = 0; + + /* + * Loop until 'nbytes' bytes of the buffer data have been written or an + * error occurs. + */ + do + { + int written; + instr_time start; + + errno = 0; + + /* Measure I/O timing to write WAL data */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); + written = pg_pwrite(fd, buf, nbytes, offset); + pgstat_report_wait_end(); + + if (written <= 0) + { + char xlogfname[MAXFNAMELEN]; + int save_errno; + + /* + * Retry on EINTR. All signals used in the backend and background + * processes are flagged SA_RESTART, so it shouldn't happen, but + * better to be defensive. + */ + if (errno == EINTR) + continue; + + /* if write didn't set errno, assume no disk space */ + if (errno == 0) + errno = ENOSPC; + + save_errno = errno; + XLogFileName(xlogfname, tli, segno, wal_segment_size); + errno = save_errno; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to log file %s " + "at offset %u, length %zu: %m", + xlogfname, (unsigned int) offset, nbytes))); + } + + /* + * Increment the I/O timing and the number of times WAL data were + * written out to disk. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + } + + WalStats.m_wal_write++; + + nbytes -= written; + buf += written; + offset += written; + total_written += written; + } while (nbytes > 0); + + return total_written; +} + /* * Record the LSN for an asynchronous transaction commit/abort * and nudge the WALWriter if there is work for it to do. diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index f719ab4f6d..e33565d859 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -871,7 +871,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) static void XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) { - int startoff; + uint32 startoff; int byteswritten; while (nbytes > 0) @@ -938,27 +938,9 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) segbytes = nbytes; /* OK to write the logs */ - errno = 0; - - byteswritten = pg_pwrite(recvFile, buf, segbytes, (off_t) startoff); - if (byteswritten <= 0) - { - char xlogfname[MAXFNAMELEN]; - int save_errno; - - /* if write didn't set errno, assume no disk space */ - if (errno == 0) - errno = ENOSPC; - - save_errno = errno; - XLogFileName(xlogfname, recvFileTLI, recvSegNo, wal_segment_size); - errno = save_errno; - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write to log segment %s " - "at offset %u, length %lu: %m", - xlogfname, startoff, (unsigned long) segbytes))); - } + byteswritten = XLogWriteFile(recvFile, buf, segbytes, (off_t) startoff, + recvFileTLI, recvSegNo); + Assert(byteswritten == segbytes); /* Update state for write */ recptr += byteswritten; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 77187c12be..c1f3ddab89 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -298,6 +298,8 @@ extern bool XLogBackgroundFlush(void); extern bool XLogNeedsFlush(XLogRecPtr RecPtr); extern int XLogFileInit(XLogSegNo segno, bool *use_existent, bool use_lock); extern int XLogFileOpen(XLogSegNo segno); +extern int XLogWriteFile(int fd, char *buf, Size nbyte, off_t offset, + TimeLineID tli, XLogSegNo segno); extern void CheckXLogRemoved(XLogSegNo segno, TimeLineID tli); extern XLogSegNo XLogGetLastRemovedSegno(void);