The cfbot showed issues compiling on Linux and Windows: http://cfbot.cputube.org/takashi-menjo.html
https://cirrus-ci.com/task/6125740327436288 [02:30:06.538] In file included from xlog.c:38: [02:30:06.538] ../../../../src/include/access/xlogpmem.h:32:42: error: unknown type name ‘tli’ [02:30:06.538] 32 | PmemXLogEnsurePrevMapped(XLogRecPtr ptr, tli) [02:30:06.538] | ^~~ [02:30:06.538] xlog.c: In function ‘GetXLogBuffer’: [02:30:06.538] xlog.c:1959:19: warning: implicit declaration of function ‘PmemXLogEnsurePrevMapped’ [-Wimplicit-function-declaration] [02:30:06.538] 1959 | openLogSegNo = PmemXLogEnsurePrevMapped(endptr, tli); https://cirrus-ci.com/task/6688690280857600?logs=build#L379 [02:33:25.752] c:\cirrus\src\include\access\xlogpmem.h(33,1): error C2081: 'tli': name in formal parameter list illegal (compiling source file src/backend/access/transam/xlog.c) [c:\cirrus\postgres.vcxproj] I'm attaching a probable fix. Unfortunately, for patches like this, most of the functionality isn't exercised unless the library is installed and compilation and runtime are enabled by default. In 0009: recaluculated => recalculated. 0010-Update-document should be squashed with 0003-Add-wal_pmem_map-to-GUC (and maybe 0002 and 0001). I believe the patches after 0005 are more WIP, so it's fine if they're not squashed yet. I'm not sure what the point is of this one: 0008-Let-wal_pmem_map-be-constant-unl + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not pmem_map_file \"%s\": %m", path))); => The outer parentheses are not needed since e3a87b4.
>From e5614f2ea3ff6aaf016343f81f74366440e18f6f Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Tue, 23 Mar 2021 13:32:27 +0900 Subject: [PATCH 01/13] Add --with-libpmem option for PMEM support --- configure | 99 ++++++++++++++++++++++++++++++++++++++ configure.ac | 17 +++++++ src/include/pg_config.h.in | 6 +++ 3 files changed, 122 insertions(+) diff --git a/configure b/configure index 3b19105328d..22c364fac4f 100755 --- a/configure +++ b/configure @@ -699,6 +699,7 @@ with_gnu_ld LD LDFLAGS_SL LDFLAGS_EX +with_libpmem LZ4_LIBS LZ4_CFLAGS with_lz4 @@ -868,6 +869,7 @@ with_libxslt with_system_tzdata with_zlib with_lz4 +with_libpmem with_gnu_ld with_ssl with_openssl @@ -1576,6 +1578,7 @@ Optional Packages: use system time zone data in DIR --without-zlib do not use Zlib --with-lz4 build with LZ4 support + --with-libpmem build with PMEM support --with-gnu-ld assume the C compiler uses GNU ld [default=no] --with-ssl=LIB use LIB for SSL/TLS support (openssl) --with-openssl obsolete spelling of --with-ssl=openssl @@ -9033,6 +9036,41 @@ fi done fi +# +# libpmem +# +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to build with PMEM support" >&5 +$as_echo_n "checking whether to build with PMEM support... " >&6; } + + + +# Check whether --with-libpmem was given. +if test "${with_libpmem+set}" = set; then : + withval=$with_libpmem; + case $withval in + yes) + +$as_echo "#define USE_LIBPMEM 1" >>confdefs.h + + ;; + no) + : + ;; + *) + as_fn_error $? "no argument expected for --with-libpmem option" "$LINENO" 5 + ;; + esac + +else + with_libpmem=no + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $with_libpmem" >&5 +$as_echo "$with_libpmem" >&6; } + + # # Assignments # @@ -13504,6 +13542,56 @@ fi fi +if test "$with_libpmem" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for pmem_memcpy in -lpmem" >&5 +$as_echo_n "checking for pmem_memcpy in -lpmem... 
" >&6; } +if ${ac_cv_lib_pmem_pmem_memcpy+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lpmem $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char pmem_memcpy (); +int +main () +{ +return pmem_memcpy (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_pmem_pmem_memcpy=yes +else + ac_cv_lib_pmem_pmem_memcpy=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_pmem_pmem_memcpy" >&5 +$as_echo "$ac_cv_lib_pmem_pmem_memcpy" >&6; } +if test "x$ac_cv_lib_pmem_pmem_memcpy" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBPMEM 1 +_ACEOF + + LIBS="-lpmem $LIBS" + +else + as_fn_error $? "library 'libpmem' (version >= 1.5) is required for PMEM support" "$LINENO" 5 +fi + +fi + ## ## Header files @@ -14215,6 +14303,17 @@ fi done +fi + +if test "$with_libpmem" = yes ; then + ac_fn_c_check_header_mongrel "$LINENO" "libpmem.h" "ac_cv_header_libpmem_h" "$ac_includes_default" +if test "x$ac_cv_header_libpmem_h" = xyes; then : + +else + as_fn_error $? "header file <libpmem.h> is required for PMEM support" "$LINENO" 5 +fi + + fi ## diff --git a/configure.ac b/configure.ac index e77d4dcf2d2..2bea131375a 100644 --- a/configure.ac +++ b/configure.ac @@ -1056,6 +1056,15 @@ if test "$with_lz4" = yes; then done fi +# +# libpmem +# +AC_MSG_CHECKING([whether to build with PMEM support]) +PGAC_ARG_BOOL(with, libpmem, no, [build with PMEM support], + [AC_DEFINE([USE_LIBPMEM], 1, [Define to 1 to build with PMEM support. 
(--with-libpmem)])]) +AC_MSG_RESULT([$with_libpmem]) +AC_SUBST(with_libpmem) + # # Assignments # @@ -1385,6 +1394,10 @@ elif test "$with_uuid" = ossp ; then fi AC_SUBST(UUID_LIBS) +if test "$with_libpmem" = yes; then + AC_CHECK_LIB(pmem, pmem_memcpy, [], [AC_MSG_ERROR([library 'libpmem' (version >= 1.5) is required for PMEM support])]) +fi + ## ## Header files @@ -1571,6 +1584,10 @@ if test "$PORTNAME" = "win32" ; then AC_CHECK_HEADERS(crtdefs.h) fi +if test "$with_libpmem" = yes ; then + AC_CHECK_HEADER(libpmem.h, [], [AC_MSG_ERROR([header file <libpmem.h> is required for PMEM support])]) +fi + ## ## Types, structures, compiler characteristics ## diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index 7525c165974..96604aa130d 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -331,6 +331,9 @@ /* Define to 1 if you have the `pam' library (-lpam). */ #undef HAVE_LIBPAM +/* Define to 1 if you have the `pmem' library (-lpmem). */ +#undef HAVE_LIBPMEM + /* Define if you have a function readline library */ #undef HAVE_LIBREADLINE @@ -898,6 +901,9 @@ /* Define to 1 to build with LDAP support. (--with-ldap) */ #undef USE_LDAP +/* Define to 1 to build with PMEM support. (--with-libpmem) */ +#undef USE_LIBPMEM + /* Define to 1 to build with XML support. (--with-libxml) */ #undef USE_LIBXML -- 2.17.1
>From f014b7923e9cee1f1f3cfd3ea23fc91ace3f1474 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Fri, 5 Nov 2021 14:16:33 +0900 Subject: [PATCH 02/13] Support build with MSVC on Windows --- src/tools/msvc/Solution.pm | 13 +++++++++++++ src/tools/msvc/config_default.pl | 3 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm index 2c8cd521e94..d843391050b 100644 --- a/src/tools/msvc/Solution.pm +++ b/src/tools/msvc/Solution.pm @@ -304,6 +304,7 @@ sub GenerateFiles HAVE_LIBLZ4 => undef, HAVE_LIBM => undef, HAVE_LIBPAM => undef, + HAVE_LIBPMEM => undef, HAVE_LIBREADLINE => undef, HAVE_LIBSELINUX => undef, HAVE_LIBSSL => undef, @@ -488,6 +489,7 @@ sub GenerateFiles USE_BONJOUR => undef, USE_BSD_AUTH => undef, USE_ICU => $self->{options}->{icu} ? 1 : undef, + USE_LIBPMEM => undef, USE_LIBXML => undef, USE_LIBXSLT => undef, USE_LZ4 => undef, @@ -538,6 +540,11 @@ sub GenerateFiles $define{HAVE_LZ4_H} = 1; $define{USE_LZ4} = 1; } + if ($self->{options}->{pmem}) + { + $define{HAVE_LIBPMEM} = 1; + $define{USE_LIBPMEM} = 1; + } if ($self->{options}->{openssl}) { $define{USE_OPENSSL} = 1; @@ -1085,6 +1092,11 @@ sub AddProject $proj->AddIncludeDir($self->{options}->{uuid} . '\include'); $proj->AddLibrary($self->{options}->{uuid} . '\lib\uuid.lib'); } + if ($self->{options}->{pmem}) + { + $proj->AddIncludeDir($self->{options}->{pmem} . '\include'); + $proj->AddLibrary($self->{options}->{pmem} . 
'\lib\libpmem.lib'); + } return $proj; } @@ -1197,6 +1209,7 @@ sub GetFakeConfigure $cfg .= ' --with-tcl' if ($self->{options}->{tcl}); $cfg .= ' --with-perl' if ($self->{options}->{perl}); $cfg .= ' --with-python' if ($self->{options}->{python}); + $cfg .= ' --with-libpmem' if ($self->{options}->{pmem}); my $port = $self->{options}->{'--with-pgport'}; $cfg .= " --with-pgport=$port" if defined($port); diff --git a/src/tools/msvc/config_default.pl b/src/tools/msvc/config_default.pl index 460c0375d4b..774730c9a8f 100644 --- a/src/tools/msvc/config_default.pl +++ b/src/tools/msvc/config_default.pl @@ -25,7 +25,8 @@ our $config = { xml => undef, # --with-libxml=<path> xslt => undef, # --with-libxslt=<path> iconv => undef, # (not in configure, path to iconv) - zlib => undef # --with-zlib=<path> + zlib => undef, # --with-zlib=<path> + pmem => undef # --with-libpmem=<path> }; 1; -- 2.17.1
>From 7c2d6665a925dc5615f6ffd999374e10c3ea2199 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Thu, 11 Mar 2021 17:55:53 +0900 Subject: [PATCH 03/13] Add wal_pmem_map to GUC --- src/backend/access/transam/xlog.c | 51 ++++++++++++++++++++++++------- src/backend/utils/misc/guc.c | 14 +++++++++ src/include/access/xlog.h | 1 + 3 files changed, 55 insertions(+), 11 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 87cd05c9454..02f63c31387 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -115,6 +115,7 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; +bool wal_pmem_map = false; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -5194,13 +5195,28 @@ XLOGShmemSize(void) { Size size; + /* + * If we use WAL segment files as WAL buffers, we don't use the given + * value of wal_buffers. Instead, we set it to the value based on the + * segment size and the page size. This should be done before calculating + * the size of xlblocks array. + */ + if (wal_pmem_map) + { + int npages; + char buf[32]; + + npages = wal_segment_size / XLOG_BLCKSZ; + snprintf(buf, sizeof(buf), "%d", (int) npages); + SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE); + } /* * If the value of wal_buffers is -1, use the preferred auto-tune value. * This isn't an amazingly clean place to do this, but we must wait till * NBuffers has received its final value, and must do it before using the * value of XLOGbuffers to do anything important. 
*/ - if (XLOGbuffers == -1) + else if (XLOGbuffers == -1) { char buf[32]; @@ -5216,10 +5232,17 @@ XLOGShmemSize(void) size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1)); /* xlblocks array */ size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers)); - /* extra alignment padding for XLOG I/O buffers */ - size = add_size(size, XLOG_BLCKSZ); - /* and the buffers themselves */ - size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + + /* + * If we use WAL segment files as WAL buffers, we don't need volatile ones. + */ + if (!wal_pmem_map) + { + /* extra alignment padding for XLOG I/O buffers */ + size = add_size(size, XLOG_BLCKSZ); + /* and the buffers themselves */ + size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers)); + } /* * Note: we don't count ControlFileData, it comes out of the "slop factor" @@ -5313,13 +5336,19 @@ XLOGShmemInit(void) } /* - * Align the start of the page buffers to a full xlog block size boundary. - * This simplifies some calculations in XLOG insertion. It is also - * required for O_DIRECT. + * If we use WAL segment files as WAL buffers, we don't need volatile ones. */ - allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); - XLogCtl->pages = allocptr; - memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + if (!wal_pmem_map) + { + /* + * Align the start of the page buffers to a full xlog block size boundary. + * This simplifies some calculations in XLOG insertion. It is also + * required for O_DIRECT. + */ + allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr); + XLogCtl->pages = allocptr; + memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers); + } /* * Do basic initialization of XLogCtl shared data. 
(StartupXLOG will fill diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index f9504d3aec4..ee18a9cf338 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -1344,6 +1344,20 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, +#ifdef USE_LIBPMEM + { + {"wal_pmem_map", PGC_POSTMASTER, WAL_SETTINGS, + gettext_noop("Map WAL segment files on PMEM as WAL buffers."), + gettext_noop("If true, postgres will memory-map WAL segment files " + "on PMEM to use them as WAL buffers instead of the " + "traditional volatile ones."), + }, + &wal_pmem_map, + false, + NULL, NULL, NULL + }, +#endif + { {"log_checkpoints", PGC_SIGHUP, LOGGING_WHAT, gettext_noop("Logs each checkpoint."), diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 34f6c89f067..73900cbc9e7 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -88,6 +88,7 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; +extern bool wal_pmem_map; /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; -- 2.17.1
>From a09110f5932cac18d7822d296d189de47698f70a Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Tue, 23 Mar 2021 07:32:05 +0900 Subject: [PATCH 04/13] Export InstallXLogFileSegment --- src/backend/access/transam/xlog.c | 5 +---- src/include/access/xlog_internal.h | 4 ++++ 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 02f63c31387..73a3477be04 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -923,9 +923,6 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic); static bool XLogCheckpointNeeded(XLogSegNo new_segno); static void XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible); -static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, - bool find_free, XLogSegNo max_segno, - TimeLineID tli); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, XLogSource source, bool notfoundOk); static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source); @@ -3700,7 +3697,7 @@ XLogFileCopy(TimeLineID destTLI, XLogSegNo destsegno, * max_segno limit was exceeded, the startup process has disabled this * function for now, or an error occurred while renaming the file into place. 
*/ -static bool +bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, bool find_free, XLogSegNo max_segno, TimeLineID tli) { diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index c0da76cab49..328128b48d5 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -324,6 +324,10 @@ extern XLogRecPtr RequestXLogSwitch(bool mark_unimportant); extern void GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli); +extern bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, + bool find_free, XLogSegNo max_segno, + TimeLineID tli); + /* * Exported for the functions in timeline.c and xlogarchive.c. Only valid * in the startup process. -- 2.17.1
>From 443e47112de2d8c735be0ce9bddb01e7e77de672 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Tue, 23 Mar 2021 11:45:44 +0900 Subject: [PATCH 05/13] Map WAL segment files on PMEM as WAL buffers Fixes introduced in patchset v2: - Keep openLogSegNo even if wal_pmem_map=true - Fix sync issue of PmemXLogCreate - Fix unmapping issue of PmemXLogUnmap - Remove unused XLogPageOffset --- src/backend/access/transam/Makefile | 1 + src/backend/access/transam/xlog.c | 153 +++++++++---- src/backend/access/transam/xlogpmem.c | 297 ++++++++++++++++++++++++++ src/include/access/xlogpmem.h | 59 +++++ 4 files changed, 474 insertions(+), 36 deletions(-) create mode 100644 src/backend/access/transam/xlogpmem.c create mode 100644 src/include/access/xlogpmem.h diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile index 595e02de722..3a29583bc03 100644 --- a/src/backend/access/transam/Makefile +++ b/src/backend/access/transam/Makefile @@ -31,6 +31,7 @@ OBJS = \ xlogarchive.o \ xlogfuncs.o \ xloginsert.o \ + xlogpmem.o \ xlogreader.o \ xlogutils.o diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 73a3477be04..be56599f9fd 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -35,6 +35,7 @@ #include "access/xlog_internal.h" #include "access/xlogarchive.h" #include "access/xloginsert.h" +#include "access/xlogpmem.h" #include "access/xlogreader.h" #include "access/xlogutils.h" #include "catalog/catversion.h" @@ -2024,7 +2025,14 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) * offset within the page. 
*/ cachedPage = ptr / XLOG_BLCKSZ; - cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; + if (wal_pmem_map) + { + openLogTLI = tli; + openLogSegNo = PmemXLogEnsurePrevMapped(endptr, tli); + cachedPos = PmemXLogGetBufferPages() + idx * (Size) XLOG_BLCKSZ; + } + else + cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); @@ -2258,7 +2266,14 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx); - NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); + if (wal_pmem_map) + { + openLogTLI = tli; + openLogSegNo = PmemXLogEnsurePrevMapped(NewPageEndPtr, tli); + NewPage = (XLogPageHeader) (PmemXLogGetBufferPages() + nextidx * (Size) XLOG_BLCKSZ); + } + else + NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); /* * Be sure to re-zero the buffer so that bytes beyond what we've @@ -2477,6 +2492,8 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) int npages; int startidx; uint32 startoffset; + bool isfirstpage; + XLogRecPtr startpageptr; /* We should always be inside a critical section here */ Assert(CritSectionCount > 0); @@ -2499,6 +2516,10 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) startidx = 0; startoffset = 0; + /* Those are used actually only if wal_pmem_map=true */ + isfirstpage = true; + startpageptr = 0; + /* * Within the loop, curridx is the cache block index of the page to * consider writing. Begin at the buffer containing the next unwritten @@ -2524,33 +2545,36 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) LogwrtResult.Write = EndPtr; ispartialpage = WriteRqst.Write < LogwrtResult.Write; - if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size)) + if (!wal_pmem_map) { - /* - * Switch to new logfile segment. 
We cannot have any pending - * pages here (since we dump what we have at segment end). - */ - Assert(npages == 0); - if (openLogFile >= 0) - XLogFileClose(); - XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size); - openLogTLI = tli; + if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size)) + { + /* + * Switch to new logfile segment. We cannot have any pending + * pages here (since we dump what we have at segment end). + */ + Assert(npages == 0); + if (openLogFile >= 0) + XLogFileClose(); + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; - /* create/use new log file */ - openLogFile = XLogFileInit(openLogSegNo, tli); - ReserveExternalFD(); - } + /* create/use new log file */ + openLogFile = XLogFileInit(openLogSegNo, tli); + ReserveExternalFD(); + } - /* Make sure we have the current logfile open */ - if (openLogFile < 0) - { - XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, - wal_segment_size); - openLogTLI = tli; - openLogFile = XLogFileOpen(openLogSegNo, tli); - ReserveExternalFD(); + /* Make sure we have the current logfile open */ + if (openLogFile < 0) + { + XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo, + wal_segment_size); + openLogTLI = tli; + openLogFile = XLogFileOpen(openLogSegNo, tli); + ReserveExternalFD(); + } } /* Add current page to the set of pending pages-to-dump */ @@ -2558,8 +2582,8 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) { /* first of group */ startidx = curridx; - startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ, - wal_segment_size); + startpageptr = LogwrtResult.Write - XLOG_BLCKSZ; + startoffset = XLogSegmentOffset(startpageptr, wal_segment_size); } npages++; @@ -2597,7 +2621,38 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) INSTR_TIME_SET_CURRENT(start); pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); - written = pg_pwrite(openLogFile, from, nleft, startoffset); + + /* + * If we use a WAL 
segment file as WAL buffers, we cache-flush + * records on the buffers byte by byte, not page by page. To do + * so, here we fix the range being cache-flushed. + */ + if (wal_pmem_map) + { + XLogRecPtr startbyteptr; + XLogRecPtr endbyteptr; + + startbyteptr = (isfirstpage) + ? XLogCtl->LogwrtResult.Write + : startpageptr; + + endbyteptr = (ispartialpage) + ? WriteRqst.Write + : LogwrtResult.Write; + + /* Now we cache-flush records */ + openLogTLI = tli; + openLogSegNo = PmemXLogEnsurePrevMapped(endbyteptr, tli); + PmemXLogFlush(startbyteptr, endbyteptr); + + /* Mark the first page is consumed */ + isfirstpage = false; + + /* Tell all the "pages" have been written successfully */ + written = nleft; + } + else + written = pg_pwrite(openLogFile, from, nleft, startoffset); pgstat_report_wait_end(); /* @@ -2655,7 +2710,10 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) */ if (finishing_seg) { - issue_xlog_fsync(openLogFile, openLogSegNo, tli); + if (wal_pmem_map) + PmemXLogSync(); + else + issue_xlog_fsync(openLogFile, openLogSegNo, tli); /* signal that we need to wakeup walsenders later */ WalSndWakeupRequest(); @@ -2706,12 +2764,14 @@ XLogWrite(XLogwrtRqst WriteRqst, TimeLineID tli, bool flexible) LogwrtResult.Flush < LogwrtResult.Write) { + if (wal_pmem_map) + PmemXLogSync(); /* * Could get here without iterating above loop, in which case we might * have no open file or the wrong one. However, we do not need to * fsync more than one file. 
*/ - if (sync_method != SYNC_METHOD_OPEN && + else if (sync_method != SYNC_METHOD_OPEN && sync_method != SYNC_METHOD_OPEN_DSYNC) { if (openLogFile >= 0 && @@ -8099,11 +8159,32 @@ StartupXLOG(void) firstIdx = XLogRecPtrToBufIdx(EndOfLog); - /* Copy the valid part of the last block, and zero the rest */ - page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; - len = EndOfLog % XLOG_BLCKSZ; - memcpy(page, xlogreader->readBuf, len); - memset(page + len, 0, XLOG_BLCKSZ - len); + if (wal_pmem_map) + { + /* + * Keep the valid part of the last block, and zero the rest. + * Note that "len" indicates the size of the valid part. + * + * TODO how about if (newTLI != replayTLI) ? + */ + openLogTLI = newTLI; + openLogSegNo = PmemXLogEnsurePrevMapped(EndOfLog, newTLI); + page = PmemXLogGetBufferPages() + firstIdx * (Size) XLOG_BLCKSZ; + len = EndOfLog % XLOG_BLCKSZ; + memset(page + len, 0, XLOG_BLCKSZ - len); + + /* Cache-flush and sync now */ + PmemXLogFlush(EndOfLog, pageBeginPtr + XLOG_BLCKSZ); + PmemXLogSync(); + } + else + { + /* Copy the valid part of the last block, and zero the rest */ + page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ]; + len = EndOfLog % XLOG_BLCKSZ; + memcpy(page, xlogreader->readBuf, len); + memset(page + len, 0, XLOG_BLCKSZ - len); + } XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ; XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ; diff --git a/src/backend/access/transam/xlogpmem.c b/src/backend/access/transam/xlogpmem.c new file mode 100644 index 00000000000..5b50ba80a7a --- /dev/null +++ b/src/backend/access/transam/xlogpmem.c @@ -0,0 +1,297 @@ +#include "postgres.h" + +#ifdef USE_LIBPMEM + +#include <errno.h> +#include <limits.h> /* INT_MAX */ +#include <stddef.h> /* size_t */ +#include <stdint.h> /* uintptr_t */ +#include <unistd.h> /* getpid, unlink */ + +#include <libpmem.h> + +#include "c.h" /* bool, Size */ +#include "access/xlog.h" +#include "access/xlog_internal.h" /* XLogFilePath, XLByteToSeg */ +#include "access/xlogpmem.h" 
+#include "common/file_perm.h" /* pg_file_create_mode */ +#include "miscadmin.h" /* enableFsync */ +#include "pgstat.h" + +static char *mappedPages = NULL; +static XLogSegNo mappedSegNo = 0; + +#define PG_DAX_HUGEPAGE_SIZE (((uintptr_t) 1) << 21) +#define PG_DAX_HUGEPAGE_MASK (~(PG_DAX_HUGEPAGE_SIZE - 1)) + +static XLogSegNo PmemXLogMap(XLogSegNo segno, TimeLineID tli); +static void PmemXLogCreate(XLogSegNo segno, TimeLineID tli); +static void PmemXLogUnmap(void); + +static void *PmemCreateMapFile(const char *path, size_t len); +static void *PmemOpenMapFile(const char *path, size_t expected_len); +static void *PmemTryOpenMapFile(const char *path, size_t expected_len); +static void *PmemMapFile(const char *path, size_t expected_len, int flags, + bool try_open); +static void PmemUnmapForError(void *addr, size_t len); + +/* + * Ensures the WAL segment containg {ptr-1} to be mapped. + * + * Returns mapped XLogSegNo. + */ +XLogSegNo +PmemXLogEnsurePrevMapped(XLogRecPtr ptr, TimeLineID tli) +{ + XLogSegNo segno; + + Assert(wal_pmem_map); + + XLByteToPrevSeg(ptr, segno, wal_segment_size); + + if (mappedPages != NULL) + { + /* Fast return: The segment we need is already mapped */ + if (mappedSegNo == segno) + return mappedSegNo; + + /* Unmap the current segment we don't need */ + PmemXLogUnmap(); + } + + return PmemXLogMap(segno, tli); +} + +/* + * Creates a new XLOG file segment, or open a pre-existing one, for WAL buffers. + * + * Returns mapped XLogSegNo. + * + * See also XLogFileInit in xlog.c. 
+ */ +static XLogSegNo +PmemXLogMap(XLogSegNo segno, TimeLineID tli) +{ + char path[MAXPGPATH]; + + Assert(mappedPages == NULL); + + XLogFilePath(path, tli, segno, wal_segment_size); + + /* PmemTryOpenMapFile will handle error except ENOENT */ + mappedPages = PmemTryOpenMapFile(path, wal_segment_size); + + /* Fast return if already exists */ + if (mappedPages != NULL) + { + mappedSegNo = segno; + return mappedSegNo; + } + + elog(DEBUG2, "creating and filling new WAL file"); + PmemXLogCreate(segno, tli); + + /* PmemCreateMapFile will handle error */ + mappedPages = PmemOpenMapFile(path, wal_segment_size); + mappedSegNo = segno; + + elog(DEBUG2, "done creating and filling new WAL file"); + return mappedSegNo; +} + +/* + * Creates a new XLOG file segment. + * + * See also XLogFileInit in xlog.c. + */ +static void +PmemXLogCreate(XLogSegNo segno, TimeLineID tli) +{ + char *addr; + char tmppath[MAXPGPATH]; + XLogSegNo inst_segno; + XLogSegNo max_segno; + + snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid()); + unlink(tmppath); + + /* PmemCreateMapFile will handle error */ + addr = PmemCreateMapFile(tmppath, wal_segment_size); + + /* + * Initialize whole the buffers. + * + * Note that we don't put any single byte if not wal_init_zero. It's okay + * because we already have a new segment file truncated to the proper size. + */ + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE); + if (wal_init_zero) + pmem_memset_nodrain(addr, 0, wal_segment_size); + pgstat_report_wait_end(); + + pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC); + if (enableFsync) + pmem_drain(); + pgstat_report_wait_end(); + + if (pmem_unmap(addr, wal_segment_size) < 0) + elog(ERROR, "could not pmem_unmap temporal WAL buffers: %m"); + + inst_segno = segno; + max_segno = segno + CheckPointSegments; + if (!InstallXLogFileSegment(&inst_segno, tmppath, true, max_segno, tli)) + unlink(tmppath); +} + +/* + * Unmaps the current WAL segment file if mapped. 
+ */ +static void +PmemXLogUnmap(void) +{ + /* Fast return if not mapped */ + if (mappedPages == NULL) + return; + + if (pmem_unmap(mappedPages, wal_segment_size) < 0) + elog(ERROR, "could not pmem_unmap WAL buffers: %m"); + + mappedPages = NULL; +} + +/* + * Gets the head address of the WAL buffers. + */ +char * +PmemXLogGetBufferPages(void) +{ + Assert(wal_pmem_map); + Assert(mappedPages != NULL); + + return mappedPages; +} + +/* + * Flushes records in the given range [start, end) within a single segment. + */ +void +PmemXLogFlush(XLogRecPtr start, XLogRecPtr end) +{ + Size off; + + Assert(wal_pmem_map); + Assert(start < end); + Assert(mappedPages != NULL); + Assert(XLByteInSeg(start, mappedSegNo, wal_segment_size)); + Assert(XLByteInPrevSeg(end, mappedSegNo, wal_segment_size)); + + off = XLogSegmentOffset(start, wal_segment_size); + pmem_flush(mappedPages + off, end - start); +} + +/* + * Wait for cache-flush to finish. + */ +void +PmemXLogSync(void) +{ + Assert(wal_pmem_map); + + /* Fast return */ + if (!enableFsync) + return; + + pmem_drain(); +} + +/* + * Wrappers for pmem_map_file. + */ +static void * +PmemCreateMapFile(const char *path, size_t len) +{ + return PmemMapFile(path, len, PMEM_FILE_CREATE | PMEM_FILE_EXCL, false); +} + +static void * +PmemOpenMapFile(const char *path, size_t expected_len) +{ + return PmemMapFile(path, expected_len, 0, false); +} + +static void * +PmemTryOpenMapFile(const char *path, size_t expected_len) +{ + return PmemMapFile(path, expected_len, 0, true); +} + +static void * +PmemMapFile(const char *path, size_t expected_len, int flags, bool try_open) +{ + size_t param_len; + int mode; + size_t mapped_len; + int is_pmem; + void *addr; + + Assert(expected_len > 0); + Assert(expected_len <= INT_MAX); + + param_len = (flags & PMEM_FILE_CREATE) ? expected_len : 0; + mode = (flags & PMEM_FILE_CREATE) ? 
pg_file_create_mode : 0; + + mapped_len = 0; + is_pmem = 0; + addr = pmem_map_file(path, param_len, flags, mode, &mapped_len, &is_pmem); + + if (addr == NULL) + { + if (try_open && errno == ENOENT) + return NULL; + + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not pmem_map_file \"%s\": %m", path))); + } + + if (mapped_len > INT_MAX) + { + PmemUnmapForError(addr, mapped_len); + elog(ERROR, + "unexpected file size: path \"%s\" actual (greater than %d) expected %d", + path, INT_MAX, (int) expected_len); + } + + if (mapped_len != expected_len) + { + PmemUnmapForError(addr, mapped_len); + elog(ERROR, + "unexpected file size: path \"%s\" actual %d expected %d", + path, (int) mapped_len, (int) expected_len); + } + + if (!is_pmem) + { + PmemUnmapForError(addr, mapped_len); + elog(ERROR, "file not on PMEM: path \"%s\"", path); + } + + if ((uintptr_t) addr & ~PG_DAX_HUGEPAGE_MASK) + elog(WARNING, + "file not mapped on DAX hugepage boundary: path \"%s\" addr %p", + path, addr); + + return addr; +} + +static void +PmemUnmapForError(void *addr, size_t len) +{ + int saved_errno; + + saved_errno = errno; + (void) pmem_unmap(addr, len); + errno = saved_errno; +} + +#endif /* USE_LIBPMEM */ diff --git a/src/include/access/xlogpmem.h b/src/include/access/xlogpmem.h new file mode 100644 index 00000000000..3978640b82f --- /dev/null +++ b/src/include/access/xlogpmem.h @@ -0,0 +1,59 @@ +/* + * xlogpmem.h + * + * Definitions for PMEM-mapped WAL buffers. 
+ * + * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * + * src/include/access/xlogpmem.h + */ +#ifndef XLOGPMEM_H +#define XLOGPMEM_H + +#include "postgres.h" + +#include "c.h" /* Size */ +#include "access/xlogdefs.h" /* XLogRecPtr, XLogSegNo */ + +#ifdef USE_LIBPMEM + +/* Prototypes */ +extern XLogSegNo PmemXLogEnsurePrevMapped(XLogRecPtr ptr, TimeLineID tli); +extern char *PmemXLogGetBufferPages(void); +extern void PmemXLogFlush(XLogRecPtr start, XLogRecPtr end); +extern void PmemXLogSync(void); + +#else /* USE_LIBPMEM */ + +#include <stdlib.h> /* abort */ + +static inline XLogSegNo +PmemXLogEnsurePrevMapped(XLogRecPtr ptr, tli) +{ + abort(); + return 0; +} + +static inline char * +PmemXLogGetBufferPages(void) +{ + abort(); + return NULL; +} + +static inline void +PmemXLogFlush(XLogRecPtr start, XLogRecPtr end) +{ + abort(); +} + +static inline void +PmemXLogSync(void) +{ + abort(); +} + +#endif /* USE_LIBPMEM */ + +#endif /* XLOGPMEM_H */ -- 2.17.1
>From 90ea943904793c5212cd1fed450e4f3b1f97f8a7 Mon Sep 17 00:00:00 2001 From: Justin Pryzby <pryz...@telsasoft.com> Date: Wed, 5 Jan 2022 21:28:51 -0600 Subject: [PATCH 06/13] compile-fix-without-pmem --- src/include/access/xlogpmem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/access/xlogpmem.h b/src/include/access/xlogpmem.h index 3978640b82f..cd83bffc883 100644 --- a/src/include/access/xlogpmem.h +++ b/src/include/access/xlogpmem.h @@ -29,7 +29,7 @@ extern void PmemXLogSync(void); #include <stdlib.h> /* abort */ static inline XLogSegNo -PmemXLogEnsurePrevMapped(XLogRecPtr ptr, tli) +PmemXLogEnsurePrevMapped(XLogRecPtr ptr, TimeLineID tli) { abort(); return 0; -- 2.17.1
>From 9b8091780f0156b35297beaecc5868a2df835e80 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Fri, 5 Nov 2021 14:16:25 +0900 Subject: [PATCH 07/13] Compatible to Windows --- src/backend/access/transam/xlogpmem.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/backend/access/transam/xlogpmem.c b/src/backend/access/transam/xlogpmem.c index 5b50ba80a7a..8bd990f1cd0 100644 --- a/src/backend/access/transam/xlogpmem.c +++ b/src/backend/access/transam/xlogpmem.c @@ -8,7 +8,24 @@ #include <stdint.h> /* uintptr_t */ #include <unistd.h> /* getpid, unlink */ +/* + * On Windows, we will have two ported but conflicting mode_t: + * + * mode_t in libpmem: + * libpmem.h -> pmemcompat.h -> typedef int mode_t + * mode_t in PostgreSQL: + * c.h -> port.h -> win32_port.h -> typedef unsigned short mode_t + * + * We want to use PostgreSQL's one, so conceal libpmem's one. + */ +#if defined(WIN32) && !defined(__CYGWIN__) +#define mode_t unused_libpmem_mode_t +#include <libpmem.h> +#undef mode_t +/* On other platforms, simply include libpmem.h */ +#else #include <libpmem.h> +#endif #include "c.h" /* bool, Size */ #include "access/xlog.h" @@ -242,7 +259,11 @@ PmemMapFile(const char *path, size_t expected_len, int flags, bool try_open) mapped_len = 0; is_pmem = 0; +#if defined(WIN32) && !defined(__CYGWIN__) + addr = pmem_map_fileU(path, param_len, flags, mode, &mapped_len, &is_pmem); +#else addr = pmem_map_file(path, param_len, flags, mode, &mapped_len, &is_pmem); +#endif if (addr == NULL) { -- 2.17.1
>From 35c77c59d2e4aec95d6c1489b052d4456855f72f Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Wed, 19 May 2021 11:57:49 +0900 Subject: [PATCH 08/13] WAL statistics in cases of wal_pmem_map=true --- src/backend/access/transam/xlogpmem.c | 47 +++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/backend/access/transam/xlogpmem.c b/src/backend/access/transam/xlogpmem.c index 8bd990f1cd0..d91fb2175b7 100644 --- a/src/backend/access/transam/xlogpmem.c +++ b/src/backend/access/transam/xlogpmem.c @@ -195,6 +195,7 @@ void PmemXLogFlush(XLogRecPtr start, XLogRecPtr end) { Size off; + instr_time start_time; Assert(wal_pmem_map); Assert(start < end); @@ -203,22 +204,68 @@ PmemXLogFlush(XLogRecPtr start, XLogRecPtr end) Assert(XLByteInPrevSeg(end, mappedSegNo, wal_segment_size)); off = XLogSegmentOffset(start, wal_segment_size); + + /* Measure I/O timing to write WAL data */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start_time); + + pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE); pmem_flush(mappedPages + off, end - start); + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL data + * were written out to disk. + */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start_time); + WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration); + } + + WalStats.m_wal_write++; } /* * Wait for cache-flush to finish. + * + * See also issue_xlog_fsync in xlog.c. */ void PmemXLogSync(void) { + instr_time start; + Assert(wal_pmem_map); /* Fast return */ if (!enableFsync) return; + /* Measure I/O timing to sync the WAL file */ + if (track_wal_io_timing) + INSTR_TIME_SET_CURRENT(start); + + pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC); pmem_drain(); + pgstat_report_wait_end(); + + /* + * Increment the I/O timing and the number of times WAL files were synced. 
+ */ + if (track_wal_io_timing) + { + instr_time duration; + + INSTR_TIME_SET_CURRENT(duration); + INSTR_TIME_SUBTRACT(duration, start); + WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration); + } + + WalStats.m_wal_sync++; } /* -- 2.17.1
>From 557ae7c84ea24bc6c7c42104144c422b0154d166 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Wed, 13 Oct 2021 11:10:17 +0900 Subject: [PATCH 09/13] Let wal_pmem_map be constant unless --with-libpmem --- src/backend/access/transam/xlog.c | 3 +++ src/include/access/xlog.h | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index be56599f9fd..62f08cb50bb 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -116,7 +116,10 @@ int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ int wal_retrieve_retry_interval = 5000; int max_slot_wal_keep_size_mb = -1; bool track_wal_io_timing = false; + +#ifdef USE_LIBPMEM bool wal_pmem_map = false; +#endif #ifdef WAL_DEBUG bool XLOG_DEBUG = false; diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 73900cbc9e7..ab3eb3887b9 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -88,7 +88,12 @@ extern char *PrimaryConnInfo; extern char *PrimarySlotName; extern bool wal_receiver_create_temp_slot; extern bool track_wal_io_timing; + +#ifdef USE_LIBPMEM extern bool wal_pmem_map; +#else +#define wal_pmem_map false +#endif /* indirectly set via GUC system */ extern TransactionId recoveryTargetXid; -- 2.17.1
>From 133e1281d4b308df6f63edebae94cf1856598af4 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Thu, 28 Oct 2021 13:35:28 +0900 Subject: [PATCH 10/13] Ensure WAL mappings before assertion --- src/backend/access/transam/xlog.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 62f08cb50bb..f0d7a317d23 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -1945,6 +1945,23 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) */ if (ptr / XLOG_BLCKSZ == cachedPage) { + /* + * Ensure WAL mappings before assertion. + * + * cachedPos should be recalculated because it has probably been + * invalidated due to WAL remapping. This should be done even if + * openLogSegNo seems not to change because the address of the + * mapping could have changed (ABA problem). + */ + if (wal_pmem_map) + { + endptr = ptr - ptr % XLOG_BLCKSZ + XLOG_BLCKSZ; + openLogSegNo = PmemXLogEnsurePrevMapped(endptr, tli); + cachedPos = PmemXLogGetBufferPages() + + (Size) XLogSegmentOffset(endptr - XLOG_BLCKSZ, + wal_segment_size); + } + Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC); Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ)); return cachedPos + ptr % XLOG_BLCKSZ; -- 2.17.1
>From f3a91d9198d6498c156b8078b77f0e4c401e224f Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Thu, 30 Sep 2021 12:51:56 +0900 Subject: [PATCH 11/13] Update document --- doc/src/sgml/config.sgml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 10aa18b7636..5e55564f42a 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -3239,6 +3239,33 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-wal-pmem-map" xreflabel="wal_pmem_map"> + <term><varname>wal_pmem_map</varname> (<type>boolean</type>) + <indexterm> + <primary><varname>wal_pmem_map</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + If set to <literal>on</literal>, this parameter causes WAL files to be + memory-mapped and used as WAL buffer pages. The WAL files in the pg_wal + directory (or the directory given by <command>initdb -X</command> + option) should be on <firstterm>persistent memory</firstterm> (PMEM) and + the filesystem for those files should support the <firstterm>Direct + Access</firstterm> (DAX) feature. <varname>wal_sync_method</varname> + for the primary server is ignored and WAL updates are forced out to + PMEM in a more optimal way which avoids calling into the kernel. + <varname>min_wal_size</varname> should be a multiple of the size of a + WAL file, and <varname>wal_buffers</varname> is ignored and set to + the equivalent of <varname>min_wal_size</varname>. + </para> + <para> + This parameter can only be set at server start. The default is + <literal>off</literal>. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-writer-delay" xreflabel="wal_writer_delay"> <term><varname>wal_writer_delay</varname> (<type>integer</type>) <indexterm> -- 2.17.1
>From b5d695e660b2dad66c96c5944d343ba0e825c7c3 Mon Sep 17 00:00:00 2001 From: Takashi Menjo <takashi.menjou...@hco.ntt.co.jp> Date: Tue, 1 Jun 2021 19:29:22 +0900 Subject: [PATCH 12/13] Preallocate and initialize more WAL if wal_pmem_map=true --- src/backend/access/transam/xlog.c | 30 ++++++++++++++++++++++++------ src/backend/utils/misc/guc.c | 2 +- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f0d7a317d23..1196ae21e80 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2049,7 +2049,9 @@ GetXLogBuffer(XLogRecPtr ptr, TimeLineID tli) { openLogTLI = tli; openLogSegNo = PmemXLogEnsurePrevMapped(endptr, tli); - cachedPos = PmemXLogGetBufferPages() + idx * (Size) XLOG_BLCKSZ; + cachedPos = PmemXLogGetBufferPages() + + (Size) XLogSegmentOffset(endptr - XLOG_BLCKSZ, + wal_segment_size); } else cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ; @@ -2290,7 +2292,9 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, TimeLineID tli, bool opportunistic) { openLogTLI = tli; openLogSegNo = PmemXLogEnsurePrevMapped(NewPageEndPtr, tli); - NewPage = (XLogPageHeader) (PmemXLogGetBufferPages() + nextidx * (Size) XLOG_BLCKSZ); + NewPage = (XLogPageHeader) + (PmemXLogGetBufferPages() + + (Size) XLogSegmentOffset(NewPageBeginPtr, wal_segment_size)); } else NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ); @@ -5275,15 +5279,25 @@ XLOGShmemSize(void) /* * If we use WAL segment files as WAL buffers, we don't use the given * value of wal_buffers. Instead, we set it to the value based on the - * segment size and the page size. This should be done before calculating + * min_wal_size and the page size. This should be done before calculating * the size of xlblocks array. + * + * TODO Do not allow changing min_wal_size by SIGHUP if wal_pmem_map=true. + * + * TODO Move validations to check_hook functions. 
*/ if (wal_pmem_map) { int npages; char buf[32]; - npages = wal_segment_size / XLOG_BLCKSZ; + if (min_wal_size_mb % (wal_segment_size / (1024 * 1024)) != 0) + elog(PANIC, "min_wal_size should be multiple of wal_segment_size when wal_pmem_map=true"); + + if (min_wal_size_mb / (XLOG_BLCKSZ / 1024) > INT_MAX / 1024) + elog(PANIC, "too many wal buffer pages"); + + npages = min_wal_size_mb / (XLOG_BLCKSZ / 1024) * 1024; snprintf(buf, sizeof(buf), "%d", (int) npages); SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE); } @@ -8189,7 +8203,8 @@ StartupXLOG(void) */ openLogTLI = newTLI; openLogSegNo = PmemXLogEnsurePrevMapped(EndOfLog, newTLI); - page = PmemXLogGetBufferPages() + firstIdx * (Size) XLOG_BLCKSZ; + page = PmemXLogGetBufferPages() + + (Size) XLogSegmentOffset(pageBeginPtr, wal_segment_size); len = EndOfLog % XLOG_BLCKSZ; memset(page + len, 0, XLOG_BLCKSZ - len); @@ -8229,7 +8244,10 @@ StartupXLOG(void) /* * Preallocate additional log files, if wanted. */ - PreallocXlogFiles(EndOfLog, newTLI); + if (wal_pmem_map) + AdvanceXLInsertBuffer(InvalidXLogRecPtr, newTLI, true); + else + PreallocXlogFiles(EndOfLog, newTLI); /* * Okay, we're officially UP. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index ee18a9cf338..6f667d5c43f 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2866,7 +2866,7 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_XBLOCKS }, &XLOGbuffers, - -1, -1, (INT_MAX / XLOG_BLCKSZ), + -1, -1, INT_MAX, check_wal_buffers, NULL, NULL }, -- 2.17.1