Thanks again Thomas,

> Oh, so maybe we need a configure test for them?  And if you don't have
> it, a runtime error if you try to set the page size to something other
> than 0 (like we do for effective_io_concurrency if you don't have a
> posix_fadvise() function).

Ahh, yes, that sounds reasonable. Did some fiddling with the configure
script to add a check, and think I got it right (but not 100% sure
tho.). Added new v3 patch.

> If you set it to an unsupported size, that seems reasonable to me.  If
> you set it to an unsupported size and have huge_pages=try, do we fall
> back to using no huge pages?

Yes, the "fallback" with huge_pages=try is the same for both
huge_page_size=0 and huge_page_size=nMB, and is the same as without
this patch.

> For what it's worth, here's what I know about this on other operating systems:

Thanks for all the background info!

> 1.  AIX can do huge pages, but only if you use System V shared memory
> (not for mmap() anonymous shared).  In
> https://commitfest.postgresql.org/25/1960/ we got as far as adding
> support for shared_memory_type=sysv, but to go further we'll need
> someone willing to hack on the patch on an AIX system, preferably with
> root access so they can grant the postgres user wired memory
> privileges (or whatever they call that over there).  But at a glance,
> they don't have a way to ask for a specific page size, just "large".

Interesting. I might get access to some AIX systems at university this fall,
so maybe I will get some time to dive into the patch.


Odin
From 8cb876bf73258646044a6a99d72e7c12d1d03e3a Mon Sep 17 00:00:00 2001
From: Odin Ugedal <o...@ugedal.com>
Date: Sun, 7 Jun 2020 21:04:57 +0200
Subject: [PATCH v3] Add support for choosing huge page size

This adds support for using non-default huge page sizes for shared
memory. This is achieved via the new "huge_page_size" config entry.
The config value defaults to 0, meaning it will use the system default.
---
 configure                                     | 26 +++++++
 configure.in                                  |  4 ++
 doc/src/sgml/config.sgml                      | 27 ++++++++
 doc/src/sgml/runtime.sgml                     | 41 +++++++-----
 src/backend/port/sysv_shmem.c                 | 67 ++++++++++++++-----
 src/backend/utils/misc/guc.c                  | 25 +++++++
 src/backend/utils/misc/postgresql.conf.sample |  2 +
 src/include/pg_config.h.in                    |  8 +++
 src/include/pg_config_manual.h                |  6 ++
 src/include/storage/pg_shmem.h                |  1 +
 10 files changed, 176 insertions(+), 31 deletions(-)

diff --git a/configure b/configure
index 2feff37fe3..11e3112ee4 100755
--- a/configure
+++ b/configure
@@ -15488,6 +15488,32 @@ _ACEOF
 
 fi # fi
 
+# Check if system supports mmap flags for allocating huge page memory with page sizes
+# other than the default
+ac_fn_c_check_decl "$LINENO" "MAP_HUGE_MASK" "ac_cv_have_decl_MAP_HUGE_MASK" "#include <sys/mman.h>
+"
+if test "x$ac_cv_have_decl_MAP_HUGE_MASK" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_MAP_HUGE_MASK $ac_have_decl
+_ACEOF
+ac_fn_c_check_decl "$LINENO" "MAP_HUGE_SHIFT" "ac_cv_have_decl_MAP_HUGE_SHIFT" "#include <sys/mman.h>
+"
+if test "x$ac_cv_have_decl_MAP_HUGE_SHIFT" = xyes; then :
+  ac_have_decl=1
+else
+  ac_have_decl=0
+fi
+
+cat >>confdefs.h <<_ACEOF
+#define HAVE_DECL_MAP_HUGE_SHIFT $ac_have_decl
+_ACEOF
+
+
 ac_fn_c_check_decl "$LINENO" "fdatasync" "ac_cv_have_decl_fdatasync" "#include <unistd.h>
 "
 if test "x$ac_cv_have_decl_fdatasync" = xyes; then :
diff --git a/configure.in b/configure.in
index 0188c6ff07..f56c06eb3d 100644
--- a/configure.in
+++ b/configure.in
@@ -1687,6 +1687,10 @@ AC_CHECK_FUNCS(posix_fadvise)
 AC_CHECK_DECLS(posix_fadvise, [], [], [#include <fcntl.h>])
 ]) # fi
 
+# Check if system supports mmap flags for allocating huge page memory with page sizes
+# other than the default
+AC_CHECK_DECLS([MAP_HUGE_MASK, MAP_HUGE_SHIFT], [], [], [#include <sys/mman.h>])
+
 AC_CHECK_DECLS(fdatasync, [], [], [#include <unistd.h>])
 AC_CHECK_DECLS([strlcat, strlcpy, strnlen])
 # This is probably only present on macOS, but may as well check always
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index aca8f73a50..42f06a41cb 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1582,6 +1582,33 @@ include_dir 'conf.d'
       </listitem>
      </varlistentry>
 
+     <varlistentry id="guc-huge-page-size" xreflabel="huge_page_size">
+      <term><varname>huge_page_size</varname> (<type>integer</type>)
+      <indexterm>
+       <primary><varname>huge_page_size</varname> configuration parameter</primary>
+      </indexterm>
+      </term>
+      <listitem>
+       <para>
+        Controls what size of huge pages is used in conjunction with
+        <xref linkend="guc-huge-pages"/>.
+        The default is zero (<literal>0</literal>).
+        When set to <literal>0</literal>, the default huge page size on the system will
+        be used.
+       </para>
+       <para>
+        Some commonly available page sizes on modern 64 bit server architectures include:
+        <literal>2MB</literal> and <literal>1GB</literal> (Intel and AMD), <literal>16MB</literal> and
+        <literal>16GB</literal> (IBM POWER), and <literal>64kB</literal>, <literal>2MB</literal>,
+        <literal>32MB</literal> and <literal>1GB</literal> (ARM). For more information
+        about usage and support, see <xref linkend="linux-huge-pages"/>.
+       </para>
+       <para>
+        Controlling huge page size is currently not supported on Windows.
+       </para>
+      </listitem>
+     </varlistentry>
+
      <varlistentry id="guc-temp-buffers" xreflabel="temp_buffers">
       <term><varname>temp_buffers</varname> (<type>integer</type>)
       <indexterm>
diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml
index 88210c4a5d..cbdbcb4fdf 100644
--- a/doc/src/sgml/runtime.sgml
+++ b/doc/src/sgml/runtime.sgml
@@ -1391,41 +1391,50 @@ export PG_OOM_ADJUST_VALUE=0
     using large values of <xref linkend="guc-shared-buffers"/>.  To use this
     feature in <productname>PostgreSQL</productname> you need a kernel
     with <varname>CONFIG_HUGETLBFS=y</varname> and
-    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to adjust
-    the kernel setting <varname>vm.nr_hugepages</varname>. To estimate the
-    number of huge pages needed, start <productname>PostgreSQL</productname>
-    without huge pages enabled and check the
-    postmaster's anonymous shared memory segment size, as well as the system's
-    huge page size, using the <filename>/proc</filename> file system.  This might
-    look like:
+    <varname>CONFIG_HUGETLB_PAGE=y</varname>. You will also have to pre-allocate
+    huge pages with the desired huge page size. To estimate the number of
+    huge pages needed, start <productname>PostgreSQL</productname> without huge
+    pages enabled and check the postmaster's anonymous shared memory segment size,
+    as well as the system's supported huge page sizes, using the
+    <filename>/sys</filename> file system.  This might look like:
 <programlisting>
 $ <userinput>head -1 $PGDATA/postmaster.pid</userinput>
 4170
 $ <userinput>pmap 4170 | awk '/rw-s/ &amp;&amp; /zero/ {print $2}'</userinput>
 6490428K
+$ <userinput>ls /sys/kernel/mm/hugepages</userinput>
+hugepages-1048576kB  hugepages-2048kB
+</programlisting>
+
+     You can now choose between the supported sizes, 2MiB and 1GiB in this case.
+     By default <productname>PostgreSQL</productname> will use the default huge
+     page size on the system, but that can be configured via
+     <xref linkend="guc-huge-page-size"/>.
+     The default huge page size can be found with:
+<programlisting>
 $ <userinput>grep ^Hugepagesize /proc/meminfo</userinput>
 Hugepagesize:       2048 kB
 </programlisting>
+
+     For <literal>2MiB</literal>,
      <literal>6490428</literal> / <literal>2048</literal> gives approximately
      <literal>3169.154</literal>, so in this example we need at
      least <literal>3170</literal> huge pages, which we can set with:
 <programlisting>
-$ <userinput>sysctl -w vm.nr_hugepages=3170</userinput>
+$ <userinput>echo 3170 | tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput>
 </programlisting>
     A larger setting would be appropriate if other programs on the machine
-    also need huge pages.  Don't forget to add this setting
-    to <filename>/etc/sysctl.conf</filename> so that it will be reapplied
-    after reboots.
+    also need huge pages. It is also possible to pre-allocate huge pages on boot
+    by adding the kernel parameters <literal>hugepagesz=2M hugepages=3170</literal>.
    </para>
 
    <para>
     Sometimes the kernel is not able to allocate the desired number of huge
-    pages immediately, so it might be necessary to repeat the command or to
-    reboot.  (Immediately after a reboot, most of the machine's memory
-    should be available to convert into huge pages.)  To verify the huge
-    page allocation situation, use:
+    pages immediately due to external fragmentation, so it might be necessary to
+    repeat the command or to reboot. To verify the huge page allocation situation
+    for a given size, use:
 <programlisting>
-$ <userinput>grep Huge /proc/meminfo</userinput>
+$ <userinput>cat /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages</userinput>
 </programlisting>
    </para>
 
diff --git a/src/backend/port/sysv_shmem.c b/src/backend/port/sysv_shmem.c
index 198a6985bf..91c0135b17 100644
--- a/src/backend/port/sysv_shmem.c
+++ b/src/backend/port/sysv_shmem.c
@@ -32,6 +32,7 @@
 #endif
 
 #include "miscadmin.h"
+#include "port/pg_bitutils.h"
 #include "portability/mem.h"
 #include "storage/dsm.h"
 #include "storage/fd.h"
@@ -464,25 +465,15 @@ PGSharedMemoryAttach(IpcMemoryId shmId,
  * hugepage sizes, we might want to think about more invasive strategies,
  * such as increasing shared_buffers to absorb the extra space.
  *
- * Returns the (real or assumed) page size into *hugepagesize,
+ * Returns the (real, assumed or config provided) page size into *hugepagesize,
  * and the hugepage-related mmap flags to use into *mmap_flags.
- *
- * Currently *mmap_flags is always just MAP_HUGETLB.  Someday, on systems
- * that support it, we might OR in additional bits to specify a particular
- * non-default huge page size.
  */
+
+
 static void
 GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 {
-	/*
-	 * If we fail to find out the system's default huge page size, assume it
-	 * is 2MB.  This will work fine when the actual size is less.  If it's
-	 * more, we might get mmap() or munmap() failures due to unaligned
-	 * requests; but at this writing, there are no reports of any non-Linux
-	 * systems being picky about that.
-	 */
-	*hugepagesize = 2 * 1024 * 1024;
-	*mmap_flags = MAP_HUGETLB;
+	Size		default_hugepagesize = 0;
 
 	/*
 	 * System-dependent code to find out the default huge page size.
@@ -491,6 +482,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 	 * nnnn kB".  Ignore any failures, falling back to the preset default.
 	 */
 #ifdef __linux__
+
 	{
 		FILE	   *fp = AllocateFile("/proc/meminfo", "r");
 		char		buf[128];
@@ -505,7 +497,7 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 				{
 					if (ch == 'k')
 					{
-						*hugepagesize = sz * (Size) 1024;
+						default_hugepagesize = sz * (Size) 1024;
 						break;
 					}
 					/* We could accept other units besides kB, if needed */
@@ -515,6 +507,51 @@ GetHugePageSize(Size *hugepagesize, int *mmap_flags)
 		}
 	}
 #endif							/* __linux__ */
+
+	if (huge_page_size != 0)
+	{
+		/* If a huge page size is provided in the config, use that size */
+		*hugepagesize = (Size) huge_page_size * 1024;
+	}
+	else if (default_hugepagesize != 0)
+	{
+		*hugepagesize = default_hugepagesize;
+	}
+	else
+	{
+		/*
+		 * If we fail to find out the system's default huge page size, or no
+		 * huge page size is provided in config, assume it is 2MB. This will
+		 * work fine when the actual size is less.  If it's more, we might get
+		 * mmap() or munmap() failures due to unaligned requests; but at this
+		 * writing, there are no reports of any non-Linux systems being picky
+		 * about that.
+		 */
+		*hugepagesize = 2 * 1024 * 1024;
+	}
+
+
+	*mmap_flags = MAP_HUGETLB;
+
+	/*
+	 * System-dependent code to configure mmap_flags.
+	 *
+	 * On Linux, configure flags to include page size, since default huge page
+	 * size will be used in case no size is provided.
+	 */
+#ifdef USE_NON_DEFAULT_HUGE_PAGE_SIZES
+
+	/*
+	 * If the selected huge page size is not the default, add flag to mmap to
+	 * specify it
+	 */
+	if (*hugepagesize != default_hugepagesize)
+	{
+		int			shift = pg_ceil_log2_64(*hugepagesize);
+
+		*mmap_flags |= (shift & MAP_HUGE_MASK) << MAP_HUGE_SHIFT;
+	}
+#endif							/* USE_NON_DEFAULT_HUGE_PAGE_SIZES */
 }
 
 #endif							/* MAP_HUGETLB */
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index 2f3e0a70e0..019e2690c3 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -197,6 +197,7 @@ static bool check_autovacuum_max_workers(int *newval, void **extra, GucSource so
 static bool check_max_wal_senders(int *newval, void **extra, GucSource source);
 static bool check_autovacuum_work_mem(int *newval, void **extra, GucSource source);
 static bool check_effective_io_concurrency(int *newval, void **extra, GucSource source);
+static bool check_huge_page_size(int *newval, void **extra, GucSource source);
 static bool check_maintenance_io_concurrency(int *newval, void **extra, GucSource source);
 static void assign_pgstat_temp_directory(const char *newval, void *extra);
 static bool check_application_name(char **newval, void **extra, GucSource source);
@@ -585,6 +586,7 @@ int			ssl_renegotiation_limit;
  * need to be duplicated in all the different implementations of pg_shmem.c.
  */
 int			huge_pages;
+int			huge_page_size;
 
 /*
  * These variables are all dummies that don't do anything, except in some
@@ -2269,6 +2271,16 @@ static struct config_int ConfigureNamesInt[] =
 		1024, 16, INT_MAX / 2,
 		NULL, NULL, NULL
 	},
+	{
+		{"huge_page_size", PGC_POSTMASTER, RESOURCES_MEM,
+			gettext_noop("The size of huge page that should be used."),
+			NULL,
+			GUC_UNIT_KB
+		},
+		&huge_page_size,
+		0, 0, INT_MAX,
+		check_huge_page_size, NULL, NULL
+	},
 
 	{
 		{"temp_buffers", PGC_USERSET, RESOURCES_MEM,
@@ -11573,6 +11585,19 @@ check_effective_io_concurrency(int *newval, void **extra, GucSource source)
 	return true;
 }
 
+static bool						/* validates a proposed huge_page_size setting */
+check_huge_page_size(int *newval, void **extra, GucSource source)
+{
+#ifndef USE_NON_DEFAULT_HUGE_PAGE_SIZES
+	if (*newval != 0)			/* without the macro, only 0 is accepted */
+	{
+		GUC_check_errdetail("huge_page_size must be set to 0 on platforms that lack support for choosing huge page size.");
+		return false;			/* reject; the detail message was set just above */
+	}
+#endif							/* USE_NON_DEFAULT_HUGE_PAGE_SIZES */
+	return true;				/* reached for 0, or for any value when the macro is defined */
+}
+
 static bool
 check_maintenance_io_concurrency(int *newval, void **extra, GucSource source)
 {
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index ac02bd0c00..750d3f6245 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -122,6 +122,8 @@
 					# (change requires restart)
 #huge_pages = try			# on, off, or try
 					# (change requires restart)
+#huge_page_size = 0			# use default huge page size when set to zero
+					# (change requires restart)
 #temp_buffers = 8MB			# min 800kB
 #max_prepared_transactions = 0		# zero disables the feature
 					# (change requires restart)
diff --git a/src/include/pg_config.h.in b/src/include/pg_config.h.in
index c199cd46d2..4ee8e23b47 100644
--- a/src/include/pg_config.h.in
+++ b/src/include/pg_config.h.in
@@ -138,6 +138,14 @@
    to 0 if you don't. */
 #undef HAVE_DECL_LLVMORCGETSYMBOLADDRESSIN
 
+/* Define to 1 if you have the declaration of `MAP_HUGE_MASK', and to 0 if you
+   don't. */
+#undef HAVE_DECL_MAP_HUGE_MASK
+
+/* Define to 1 if you have the declaration of `MAP_HUGE_SHIFT', and to 0 if
+   you don't. */
+#undef HAVE_DECL_MAP_HUGE_SHIFT
+
 /* Define to 1 if you have the declaration of `posix_fadvise', and to 0 if you
    don't. */
 #undef HAVE_DECL_POSIX_FADVISE
diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h
index 8f3ec6bde1..f994652190 100644
--- a/src/include/pg_config_manual.h
+++ b/src/include/pg_config_manual.h
@@ -156,6 +156,12 @@
 #define USE_PREFETCH
 #endif
 
+/* USE_NON_DEFAULT_HUGE_PAGE_SIZES: mmap() huge page size flags are declared.
+ * HAVE_DECL_* is always defined (as 0 or 1), so test the value, not defined(). */
+#if HAVE_DECL_MAP_HUGE_SHIFT && HAVE_DECL_MAP_HUGE_MASK
+#define USE_NON_DEFAULT_HUGE_PAGE_SIZES
+#endif
+
 /*
  * Default and maximum values for backend_flush_after, bgwriter_flush_after
  * and checkpoint_flush_after; measured in blocks.  Currently, these are
diff --git a/src/include/storage/pg_shmem.h b/src/include/storage/pg_shmem.h
index 0de26b3427..9992932a00 100644
--- a/src/include/storage/pg_shmem.h
+++ b/src/include/storage/pg_shmem.h
@@ -44,6 +44,7 @@ typedef struct PGShmemHeader	/* standard header for all Postgres shmem */
 /* GUC variables */
 extern int	shared_memory_type;
 extern int	huge_pages;
+extern int	huge_page_size;
 
 /* Possible values for huge_pages */
 typedef enum
-- 
2.27.0

Reply via email to