Thanks again Thomas, > Oh, so maybe we need a configure test for them? And if you don't have > it, a runtime error if you try to set the page size to something other > than 0 (like we do for effective_io_concurrency if you don't have a > posix_fadvise() function).
Ahh, yes, that sounds reasonable. Did some fiddling with the configure script to add a check, and think I got it right (but not 100% sure tho.). Added new v3 patch. > If you set it to an unsupported size, that seems reasonable to me. If > you set it to an unsupported size and have huge_pages=try, do we fall > back to using no huge pages? Yes, the "fallback" with huge_pages=try is the same for both huge_page_size=0 and huge_page_size=nMB, and is the same as without this patch. > For what it's worth, here's what I know about this on other operating systems: Thanks for all the background info! > 1. AIX can do huge pages, but only if you use System V shared memory > (not for mmap() anonymous shared). In > https://commitfest.postgresql.org/25/1960/ we got as far as adding > support for shared_memory_type=sysv, but to go further we'll need > someone willing to hack on the patch on an AIX system, preferably with > root access so they can grant the postgres user wired memory > privileges (or whatever they call that over there). But at a glance, > they don't have a way to ask for a specific page size, just "large". Interesting. I might get access to some AIX systems at university this fall, so maybe I will get some time to dive into the patch. Odin
From 8cb876bf73258646044a6a99d72e7c12d1d03e3a Mon Sep 17 00:00:00 2001 From: Odin Ugedal <o...@ugedal.com> Date: Sun, 7 Jun 2020 21:04:57 +0200 Subject: [PATCH v3] Add support for choosing huge page size This adds support for using non-default huge page sizes for shared memory. This is achieved via the new "huge_page_size" config entry. The config value defaults to 0, meaning it will use the system default. --- configure | 26 +++++++ configure.in | 4 ++ doc/src/sgml/config.sgml | 27 ++++++++ doc/src/sgml/runtime.sgml | 41 +++++++----- src/backend/port/sysv_shmem.c | 67 ++++++++++++++----- src/backend/utils/misc/guc.c | 25 +++++++ src/backend/utils/misc/postgresql.conf.sample | 2 + src/include/pg_config.h.in | 8 +++ src/include/pg_config_manual.h | 6 ++ src/include/storage/pg_shmem.h | 1 + 10 files changed, 176 insertions(+), 31 deletions(-) diff --git a/configure b/configure index 2feff37fe3..11e3112ee4 100755 --- a/configure +++ b/configure @@ -15488,6 +15488,32 @@ _ACEOF fi # fi +# Check if system supports mmap flags for allocating huge page memory with page sizes +# other than the default +ac_fn_c_check_decl "$LINENO" "MAP_HUGE_MASK" "ac_cv_have_decl_MAP_HUGE_MASK" "#include <sys/mman.h> +" +if test "x$ac_cv_have_decl_MAP_HUGE_MASK" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_MAP_HUGE_MASK $ac_have_decl +_ACEOF +ac_fn_c_check_decl "$LINENO" "MAP_HUGE_SHIFT" "ac_cv_have_decl_MAP_HUGE_SHIFT" "#include <sys/mman.h> +" +if test "x$ac_cv_have_decl_MAP_HUGE_SHIFT" = xyes; then : + ac_have_decl=1 +else + ac_have_decl=0 +fi + +cat >>confdefs.h <<_ACEOF +#define HAVE_DECL_MAP_HUGE_SHIFT $ac_have_decl +_ACEOF + + ac_fn_c_check_decl "$LINENO" "fdatasync" "ac_cv_have_decl_fdatasync" "#include <unistd.h> " if test "x$ac_cv_have_decl_fdatasync" = xyes; then : diff --git a/configure.in b/configure.in index 0188c6ff07..f56c06eb3d 100644 --- a/configure.in +++ b/configure.in @@ -1687,6 +1687,10 @@ 
AC_CHECK_FUNCS(posix_fadvise) AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>]) ]) # fi +# Check if system supports mmap flags for allocating huge page memory with page sizes +# other than the default +AC_CHECK_DECLS([MAP_HUGE_MASK, MAP_HUGE_SHIFT], [], [], [#include <sys/mman.h>]) + AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>]) AC_CHECK_DECLS([strlcat, strlcpy, strnlen]) # This is probably only present on macOS, but may as well check always diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index aca8f73a50..42f06a41cb 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1582,6 +1582,33 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-huge-page-size" xreflabel="huge_page_size"> + <term><varname>huge_page_size</varname> (<type>integer</type>) + <indexterm> + <primary><varname>huge_page_size</varname> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Controls what size of huge pages is used in conjunction with + <xref linkend="guc-huge-pages"/>. + The default is zero (<literal>0</literal>). + When set to <literal>0</literal>, the default huge page size on the system will + be used. + </para> + <para> + Some commonly available page sizes on modern 64 bit server architectures include: + <literal>2MB</literal> and <literal>1GB</literal> (Intel and AMD), <literal>16MB</literal> and + <literal>16GB</literal> (IBM POWER), and <literal>64kB</literal>, <literal>2MB</literal>, + <literal>32MB</literal> and <literal>1GB</literal> (ARM). For more information + about usage and support, see <xref linkend="linux-huge-pages"/>. + </para> + <para> + Controlling huge page size is currently not supported on Windows. 
+ </para> + </listitem> + </varlistentry> + <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers"> <term><varname>temp_buffers</varname> (<type>integer</type>) <indexterm> diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index 88210c4a5d..cbdbcb4fdf 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1391,41 +1391,50 @@ export PG_OOM_ADJUST_VALUE=0 using large values of <xref linkend="guc-shared-buffers"/>. To use this feature in <productname>PostgreSQL</productname> you need a kernel with <varname>CONFIG_HUGETLBFS=y</varname> and - <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to adjust - the kernel setting <varname>vm.nr_hugepages</varname>. To estimate the - number of huge pages needed, start <productname>PostgreSQL</productname> - without huge pages enabled and check the - postmaster's anonymous shared memory segment size, as well as the system's - huge page size, using the <filename>/proc</filename> file system. This might - look like: + <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to pre-allocate + huge pages with the desired huge page size. To estimate the number of + huge pages needed, start <productname>PostgreSQL</productname> without huge + pages enabled and check the postmaster's anonymous shared memory segment size, + as well as the system's supported huge page sizes, using the + <filename>/sys</filename> file system. This might look like: <programlisting> $ <userinput>head -1 $PGDATA/postmaster.pid</userinput> 4170 $ <userinput>pmap 4170 | awk '/rw-s/ && /zero/ {print $2}'</userinput> 6490428K +$ <userinput>ls /sys/kernel/mm/hugepages</userinput> +hugepages-1048576kB hugepages-2048kB +</programlisting> + + You can now choose between the supported sizes, 2MiB and 1GiB in this case. + By default <productname>PostgreSQL</productname> will use the default huge + page size on the system, but that can be configured via + <xref linkend="guc-huge-page-size"/>. 
+ The default huge page size can be found with: +<programlisting> $ <userinput>grep ^Hugepagesize /proc/meminfo</userinput> Hugepagesize: 2048 kB </programlisting> + + For <literal>2MiB</literal>, <literal>6490428</literal> / <literal>2048</literal> gives approximately <literal>3169.154</literal>, so in this example we need at least <literal>3170</literal> huge pages, which we can set with: <programlisting> -$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput> +$ <userinput>echo 3170 | tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput> </programlisting> A larger setting would be appropriate if other programs on the machine - also need huge pages. Don't forget to add this setting - to <filename>/etc/sysctl.conf</filename> so that it will be reapplied - after reboots. + also need huge pages. It is also possible to pre-allocate huge pages on boot + by adding the kernel parameters <literal>hugepagesz=2M hugepages=3170</literal>. </para> <para> Sometimes the kernel is not able to allocate the desired number of huge - pages immediately, so it might be necessary to repeat the command or to - reboot. (Immediately after a reboot, most of the machine's memory - should be available to convert into huge pages.) To verify the huge - page allocation situation, use: + pages immediately due to external fragmentation, so it might be necessary to + repeat the command or to reboot. 
To verify the huge page allocation situation + for a given size, use: <programlisting> -$ <userinput>grep Huge /proc/meminfo</userinput> +$ <userinput>cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput> </programlisting> </para> diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c index 198a6985bf..91c0135b17 100644 --- a/src/backend/port/sysv_shmem.c +++ b/src/backend/port/sysv_shmem.c @@ -32,6 +32,7 @@ #endif #include "miscadmin.h" +#include "port/pg_bitutils.h" #include "portability/mem.h" #include "storage/dsm.h" #include "storage/fd.h" @@ -464,25 +465,15 @@ PGSharedMemoryAttach(IpcMemoryId shmId, * hugepage sizes, we might want to think about more invasive strategies, * such as increasing shared_buffers to absorb the extra space. * - * Returns the (real or assumed) page size into *hugepagesize, + * Returns the (real, assumed or config provided) page size into *hugepagesize, * and the hugepage-related mmap flags to use into *mmap_flags. - * - * Currently *mmap_flags is always just MAP_HUGETLB. Someday, on systems - * that support it, we might OR in additional bits to specify a particular - * non-default huge page size. */ + + static void GetHugePageSize(Size *hugepagesize, int *mmap_flags) { - /* - * If we fail to find out the system's default huge page size, assume it - * is 2MB. This will work fine when the actual size is less. If it's - * more, we might get mmap() or munmap() failures due to unaligned - * requests; but at this writing, there are no reports of any non-Linux - * systems being picky about that. - */ - *hugepagesize = 2 * 1024 * 1024; - *mmap_flags = MAP_HUGETLB; + Size default_hugepagesize = 0; /* * System-dependent code to find out the default huge page size. @@ -491,6 +482,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) * nnnn kB". Ignore any failures, falling back to the preset default. 
*/ #ifdef __linux__ + { FILE *fp = AllocateFile("/proc/meminfo", "r"); char buf[128]; @@ -505,7 +497,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) { if (ch == 'k') { - *hugepagesize = sz * (Size) 1024; + default_hugepagesize = sz * (Size) 1024; break; } /* We could accept other units besides kB, if needed */ @@ -515,6 +507,51 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags) } } #endif /* __linux__ */ + + if (huge_page_size != 0) + { + /* If huge page size is provided in config we use that size */ + *hugepagesize = (Size) huge_page_size * 1024; + } + else if (default_hugepagesize != 0) + { + *hugepagesize = default_hugepagesize; + } + else + { + /* + * If we fail to find out the system's default huge page size, or no + * huge page size is provided in config, assume it is 2MB. This will + * work fine when the actual size is less. If it's more, we might get + * mmap() or munmap() failures due to unaligned requests; but at this + * writing, there are no reports of any non-Linux systems being picky + * about that. + */ + *hugepagesize = 2 * 1024 * 1024; + } + + + *mmap_flags = MAP_HUGETLB; + + /* + * System-dependent code to configure mmap_flags. + * + * On Linux, configure flags to include page size, since default huge page + * size will be used in case no size is provided. 
+ */ +#ifdef USE_NON_DEFAULT_HUGE_PAGE_SIZES + + /* + * If the selected huge page size is not the default, add flag to mmap to + * specify it + */ + if (*hugepagesize != default_hugepagesize) + { + int shift = pg_ceil_log2_64(*hugepagesize); + + *mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT; + } +#endif /* USE_NON_DEFAULT_HUGE_PAGE_SIZES */ } #endif /* MAP_HUGETLB */ diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2f3e0a70e0..019e2690c3 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -197,6 +197,7 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource so static bool check_max_wal_senders(int *newval, void **extra, GucSource source); static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source); static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source); +static bool check_huge_page_size(int *newval, void **extra, GucSource source); static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source); static void assign_pgstat_temp_directory(const char *newval, void *extra); static bool check_application_name(char **newval, void **extra, GucSource source); @@ -585,6 +586,7 @@ int ssl_renegotiation_limit; * need to be duplicated in all the different implementations of pg_shmem.c. 
*/ int huge_pages; +int huge_page_size; /* * These variables are all dummies that don't do anything, except in some @@ -2269,6 +2271,16 @@ static struct config_int ConfigureNamesInt[] = 1024, 16, INT_MAX / 2, NULL, NULL, NULL }, + { + {"huge_page_size", PGC_POSTMASTER, RESOURCES_MEM, + gettext_noop("The size of huge page that should be used."), + NULL, + GUC_UNIT_KB + }, + &huge_page_size, + 0, 0, INT_MAX, + check_huge_page_size, NULL, NULL + }, { {"temp_buffers", PGC_USERSET, RESOURCES_MEM, @@ -11573,6 +11585,19 @@ check_effective_io_concurrency(int *newval, void **extra, GucSource source) return true; } +static bool +check_huge_page_size(int *newval, void **extra, GucSource source) +{ +#ifndef USE_NON_DEFAULT_HUGE_PAGE_SIZES + if (*newval != 0) + { + GUC_check_errdetail("huge_page_size must be set to 0 on platforms that lack support for choosing huge page size."); + return false; + } +#endif /* USE_NON_DEFAULT_HUGE_PAGE_SIZES */ + return true; +} + static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ac02bd0c00..750d3f6245 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -122,6 +122,8 @@ # (change requires restart) #huge_pages = try # on, off, or try # (change requires restart) +#huge_page_size = 0 # use default huge page size when set to zero + # (change requires restart) #temp_buffers = 8MB # min 800kB #max_prepared_transactions = 0 # zero disables the feature # (change requires restart) diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in index c199cd46d2..4ee8e23b47 100644 --- a/src/include/pg_config.h.in +++ b/src/include/pg_config.h.in @@ -138,6 +138,14 @@ to 0 if you don't. */ #undef HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN +/* Define to 1 if you have the declaration of `MAP_HUGE_MASK', and to 0 if you + don't. 
*/ +#undef HAVE_DECL_MAP_HUGE_MASK + +/* Define to 1 if you have the declaration of `MAP_HUGE_SHIFT', and to 0 if + you don't. */ +#undef HAVE_DECL_MAP_HUGE_SHIFT + /* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you don't. */ #undef HAVE_DECL_POSIX_FADVISE diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index 8f3ec6bde1..f994652190 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -156,6 +156,12 @@ #define USE_PREFETCH #endif +/* + * USE_NON_DEFAULT_HUGE_PAGE_SIZES: HAVE_DECL_* is always defined (to 0 or 1), so test the value, not defined(). */ +#if HAVE_DECL_MAP_HUGE_SHIFT && HAVE_DECL_MAP_HUGE_MASK +#define USE_NON_DEFAULT_HUGE_PAGE_SIZES +#endif + /* * Default and maximum values for backend_flush_after, bgwriter_flush_after * and checkpoint_flush_after; measured in blocks. Currently, these are diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h index 0de26b3427..9992932a00 100644 --- a/src/include/storage/pg_shmem.h +++ b/src/include/storage/pg_shmem.h @@ -44,6 +44,7 @@ typedef struct PGShmemHeader /* standard header for all Postgres shmem */ /* GUC variables */ extern int shared_memory_type; extern int huge_pages; +extern int huge_page_size; /* Possible values for huge_pages */ typedef enum -- 2.27.0