Here's a new version that also adjusts the code that just landed in da722699:

-       /*
-        * Each IO handle can have an PG_IOV_MAX long iovec.
-        *
-        * XXX: Right now the amount of space available for each IO is
PG_IOV_MAX.
-        * While it's tempting to use the io_combine_limit GUC, that's
-        * PGC_USERSET, so we can't allocate shared memory based on that.
-        */
+       /* each IO handle can have up to io_max_combine_limit iovec objects */
        return mul_size(sizeof(struct iovec),
-                                       mul_size(mul_size(PG_IOV_MAX,
AioProcs()),
+
mul_size(mul_size(io_max_combine_limit, AioProcs()),
                                                         io_max_concurrency));

While doing that, this juxtaposition jumped out at me:

+       uint32          per_backend_iovecs = io_max_concurrency *
max_io_combine_limit;

Surely one of these names has to give!  First commit wins, so the new
name in this version is "io_max_combine_limit".

I was contemplating making the same change to the MAX_IO_COMBINE_LIMIT
macro when it occurred to me that we could just delete it.  It has few
references and they could just use PG_IOV_MAX instead; it's perhaps a
little less clear as names go, but it's also pretty confusing that
there's a macro with the same name as a GUC...
From 1200802f3d647a8c9107b2148595dab4b6a1bd82 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Wed, 12 Feb 2025 13:52:22 +1300
Subject: [PATCH v4 1/3] Introduce io_max_combine_limit.

The existing io_combine_limit setting can be changed by users.  The new
io_max_combine_limit setting is defined only at server startup time, and
applies as a silent cap.

This allows aio_init.c to know what the maximum could possibly be when
allocating shared memory at startup, instead of having to assume
PG_IOV_MAX (which a later commit will quadruple).

Since our GUC system doesn't allow dependencies between GUCs, the
user-settable one now assigns a "raw" value to io_combine_limit_guc, and
the lower of io_combine_limit_guc and io_max_combine_limit is maintained
in io_combine_limit.  Most code should continue to refer to
io_combine_limit directly, and io_combine_limit_guc is an implementation
detail that could eventually be removed if the GUC system ever learns
how to deal with dependencies.

Reviewed-by: Andres Freund <and...@anarazel.de>
Discussion: https://postgr.es/m/CA%2BhUKG%2B2T9p-%2BzM6Eeou-RAJjTML6eit1qn26f9twznX59qtCA%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 23 ++++++++++++++++++-
 src/backend/commands/variable.c               | 18 +++++++++++++++
 src/backend/storage/aio/aio_init.c            | 17 +++++---------
 src/backend/storage/buffer/bufmgr.c           |  5 +++-
 src/backend/utils/misc/guc_tables.c           | 16 ++++++++++++-
 src/backend/utils/misc/postgresql.conf.sample |  2 ++
 src/include/storage/bufmgr.h                  |  4 +++-
 src/include/utils/guc_hooks.h                 |  2 ++
 8 files changed, 72 insertions(+), 15 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 7ec18bb7627..d1080dac97f 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2625,6 +2625,24 @@ include_dir 'conf.d'
        </listitem>
       </varlistentry>
 
+      <varlistentry id="guc-io-max-combine-limit" xreflabel="io_max_combine_limit">
+       <term><varname>io_max_combine_limit</varname> (<type>integer</type>)
+       <indexterm>
+        <primary><varname>io_max_combine_limit</varname> configuration parameter</primary>
+       </indexterm>
+       </term>
+       <listitem>
+        <para>
+         Controls the largest I/O size in operations that combine I/O, and silently
+         limits the user-settable parameter <varname>io_combine_limit</varname>.
+         This parameter can only be set in
+         the <filename>postgresql.conf</filename> file or on the server
+         command line.
+         The default is 128kB.
+        </para>
+       </listitem>
+      </varlistentry>
+
       <varlistentry id="guc-io-combine-limit" xreflabel="io_combine_limit">
        <term><varname>io_combine_limit</varname> (<type>integer</type>)
        <indexterm>
@@ -2633,7 +2651,10 @@ include_dir 'conf.d'
        </term>
        <listitem>
         <para>
-         Controls the largest I/O size in operations that combine I/O.
+         Controls the largest I/O size in operations that combine I/O.  If set
+         higher than the <varname>io_max_combine_limit</varname> parameter, the
+         smaller value will silently be used instead, so both may need to be raised
+         to increase the I/O size.
          The default is 128kB.
         </para>
        </listitem>
diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c
index 4ad6e236d69..f550a3c0c63 100644
--- a/src/backend/commands/variable.c
+++ b/src/backend/commands/variable.c
@@ -1156,6 +1156,24 @@ assign_maintenance_io_concurrency(int newval, void *extra)
 #endif
 }
 
+/*
+ * GUC assign hooks that recompute io_combine_limit whenever
+ * io_combine_limit_guc and io_max_combine_limit are changed.  These are needed
+ * because the GUC subsystem doesn't support dependencies between GUCs, and
+ * they may be assigned in either order.
+ */
+void
+assign_io_max_combine_limit(int newval, void *extra)
+{
+	io_max_combine_limit = newval;
+	io_combine_limit = Min(io_max_combine_limit, io_combine_limit_guc);
+}
+void
+assign_io_combine_limit(int newval, void *extra)
+{
+	io_combine_limit_guc = newval;
+	io_combine_limit = Min(io_max_combine_limit, io_combine_limit_guc);
+}
 
 /*
  * These show hooks just exist because we want to show the values in octal.
diff --git a/src/backend/storage/aio/aio_init.c b/src/backend/storage/aio/aio_init.c
index 6fe55510fae..93aba6e03ba 100644
--- a/src/backend/storage/aio/aio_init.c
+++ b/src/backend/storage/aio/aio_init.c
@@ -18,6 +18,7 @@
 #include "storage/aio.h"
 #include "storage/aio_internal.h"
 #include "storage/aio_subsys.h"
+#include "storage/bufmgr.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
 #include "storage/shmem.h"
@@ -66,15 +67,9 @@ AioHandleShmemSize(void)
 static Size
 AioHandleIOVShmemSize(void)
 {
-	/*
-	 * Each IO handle can have an PG_IOV_MAX long iovec.
-	 *
-	 * XXX: Right now the amount of space available for each IO is PG_IOV_MAX.
-	 * While it's tempting to use the io_combine_limit GUC, that's
-	 * PGC_USERSET, so we can't allocate shared memory based on that.
-	 */
+	/* each IO handle can have up to io_max_combine_limit iovec objects */
 	return mul_size(sizeof(struct iovec),
-					mul_size(mul_size(PG_IOV_MAX, AioProcs()),
+					mul_size(mul_size(io_max_combine_limit, AioProcs()),
 							 io_max_concurrency));
 }
 
@@ -83,7 +78,7 @@ AioHandleDataShmemSize(void)
 {
 	/* each buffer referenced by an iovec can have associated data */
 	return mul_size(sizeof(uint64),
-					mul_size(mul_size(PG_IOV_MAX, AioProcs()),
+					mul_size(mul_size(io_max_combine_limit, AioProcs()),
 							 io_max_concurrency));
 }
 
@@ -154,7 +149,7 @@ AioShmemInit(void)
 	bool		found;
 	uint32		io_handle_off = 0;
 	uint32		iovec_off = 0;
-	uint32		per_backend_iovecs = io_max_concurrency * PG_IOV_MAX;
+	uint32		per_backend_iovecs = io_max_concurrency * io_max_combine_limit;
 
 	pgaio_ctl = (PgAioCtl *)
 		ShmemInitStruct("AioCtl", AioCtlShmemSize(), &found);
@@ -207,7 +202,7 @@ AioShmemInit(void)
 			ConditionVariableInit(&ioh->cv);
 
 			dclist_push_tail(&bs->idle_ios, &ioh->node);
-			iovec_off += PG_IOV_MAX;
+			iovec_off += io_max_combine_limit;
 		}
 	}
 
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 79ca9d18d07..d04afa5ab9c 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -160,9 +160,12 @@ int			maintenance_io_concurrency = DEFAULT_MAINTENANCE_IO_CONCURRENCY;
 /*
  * Limit on how many blocks should be handled in single I/O operations.
  * StartReadBuffers() callers should respect it, as should other operations
- * that call smgr APIs directly.
+ * that call smgr APIs directly.  It is computed as the minimum of underlying
+ * GUCs io_combine_limit_guc and io_max_combine_limit.
  */
 int			io_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
+int			io_combine_limit_guc = DEFAULT_IO_COMBINE_LIMIT;
+int			io_max_combine_limit = DEFAULT_IO_COMBINE_LIMIT;
 
 /*
  * GUC variables about triggering kernel writeback for buffers written; OS
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 0d3ebf06a95..720f28a5180 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3241,6 +3241,20 @@ struct config_int ConfigureNamesInt[] =
 		NULL
 	},
 
+	{
+		{"io_max_combine_limit",
+			PGC_POSTMASTER,
+			RESOURCES_IO,
+			gettext_noop("System-wide limit applied to io_combine_limit."),
+			NULL,
+			GUC_UNIT_BLOCKS
+		},
+		&io_max_combine_limit,
+		DEFAULT_IO_COMBINE_LIMIT,
+		1, MAX_IO_COMBINE_LIMIT,
+		NULL, assign_io_max_combine_limit, NULL
+	},
+
 	{
 		{"io_combine_limit",
 			PGC_USERSET,
@@ -3252,7 +3266,7 @@ struct config_int ConfigureNamesInt[] =
 		&io_combine_limit,
 		DEFAULT_IO_COMBINE_LIMIT,
 		1, MAX_IO_COMBINE_LIMIT,
-		NULL, NULL, NULL
+		NULL, assign_io_combine_limit, NULL
 	},
 
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 43c2ec2153e..bd9a3507135 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,6 +200,8 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 16		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
+#io_max_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+					# (change requires restart)
 #io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
 
 #io_method = sync			# sync (change requires restart)
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 79a89f87fcc..71b3af61b6a 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -163,7 +163,9 @@ extern PGDLLIMPORT int maintenance_io_concurrency;
 
 #define MAX_IO_COMBINE_LIMIT PG_IOV_MAX
 #define DEFAULT_IO_COMBINE_LIMIT Min(MAX_IO_COMBINE_LIMIT, (128 * 1024) / BLCKSZ)
-extern PGDLLIMPORT int io_combine_limit;
+extern PGDLLIMPORT int io_combine_limit;	/* min of the two GUCs below */
+extern PGDLLIMPORT int io_combine_limit_guc;
+extern PGDLLIMPORT int io_max_combine_limit;
 
 extern PGDLLIMPORT int checkpoint_flush_after;
 extern PGDLLIMPORT int backend_flush_after;
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index a3eba8fbe21..0f1e74f96c9 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -86,6 +86,8 @@ extern const char *show_log_timezone(void);
 extern bool check_maintenance_io_concurrency(int *newval, void **extra,
 											 GucSource source);
 extern void assign_maintenance_io_concurrency(int newval, void *extra);
+extern void assign_io_max_combine_limit(int newval, void *extra);
+extern void assign_io_combine_limit(int newval, void *extra);
 extern bool check_max_slot_wal_keep_size(int *newval, void **extra,
 										 GucSource source);
 extern void assign_max_wal_size(int newval, void *extra);
-- 
2.39.5

From 4abac6442061744b791031a25a31cd45563cc111 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Tue, 21 Jan 2025 11:44:35 +1300
Subject: [PATCH v4 2/3] Increase the maximum I/O combine size to 1MB.

The default value of 128kB is not changed, but the upper limit is
changed from 32 blocks to 128 blocks (1MB with 8kB blocks), assuming the
operating system's IOV_MAX doesn't limit us to a smaller size.  This is
around where some other RDBMSes seem to cap their buffer pool I/O size,
and it seems like to good idea to allow experiments with that.

The concrete change is to our definition of PG_IOV_MAX, which provides
the maximum limit for io_combine_limit and io_max_combine_limit.  It
also affects a couple of other places that work with arrays of struct
iovec or smaller objects on the stack, so we still don't want to use the
system IOV_MAX directly without a clamp: it is not under our control and
likely to be 1024.  128 seems acceptable for all our current use cases.

The last Unix on our target list known to have a low IOV_MAX was Solaris
before 11.4 SRU72 (it was 16, the minimum requirement for POSIX
conformance, but is now 1024, matching all other systems I looked at).

For Windows, we can't use real scatter/gather yet (though it's possible,
in later work), so we continue to define our own IOV_MAX value of 16 and
emulate preadv()/pwritev() with loops there.  Someone would need to
research the trade-off.

This change also makes it possible for read_stream.c's internal cap of
INT16_MAX to be hit, so adjust comments about that.  With
*_io_concurrent and io_combine_limit set to their maximum, it would want
to be able to pin 128K buffers at once (= 1GB of data), but the choice
of data type limits streams to 32K buffers.  That could be revisited in
future, but you'll probably hit other limits long before that one in
your quest to run 1,000 concurrent I/Os of size 1MB.

Suggested-by: Tomas Vondra <to...@vondra.me>
Discussion: https://postgr.es/m/CA%2BhUKG%2B2T9p-%2BzM6Eeou-RAJjTML6eit1qn26f9twznX59qtCA%40mail.gmail.com
---
 doc/src/sgml/config.sgml                      | 4 ++++
 src/backend/storage/aio/read_stream.c         | 7 ++++---
 src/backend/utils/misc/postgresql.conf.sample | 4 ++--
 src/include/port/pg_iovec.h                   | 8 ++++++--
 4 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index d1080dac97f..93eea7f96d2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -2638,6 +2638,8 @@ include_dir 'conf.d'
          This parameter can only be set in
          the <filename>postgresql.conf</filename> file or on the server
          command line.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
@@ -2655,6 +2657,8 @@ include_dir 'conf.d'
          higher than the <varname>io_max_combine_limit</varname> parameter, the
          smaller value will silently be used instead, so both may need to be raised
          to increase the I/O size.
+         The maximum possible size depends on the operating system and block
+         size, but is typically 1MB on Unix and 128kB on Windows.
          The default is 128kB.
         </para>
        </listitem>
diff --git a/src/backend/storage/aio/read_stream.c b/src/backend/storage/aio/read_stream.c
index d65fa07b44c..45bdf819d57 100644
--- a/src/backend/storage/aio/read_stream.c
+++ b/src/backend/storage/aio/read_stream.c
@@ -515,9 +515,10 @@ read_stream_begin_impl(int flags,
 	 * finishes we don't want to have to wait for its buffers to be consumed
 	 * before starting a new one.
 	 *
-	 * Be careful not to allow int16 to overflow (even though that's not
-	 * possible with the current GUC range limits), allowing also for the
-	 * spare entry and the overflow space.
+	 * Be careful not to allow int16 to overflow.  That is possible with the
+	 * current GUC range limits, so this is an artificial limit of ~32k
+	 * buffers and we'd need to adjust the types to exceed that.  We also have
+	 * to allow for the spare entry and the overflow space.
 	 */
 	max_pinned_buffers = (max_ios + 1) * io_combine_limit;
 	max_pinned_buffers = Min(max_pinned_buffers,
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index bd9a3507135..e43d803b278 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -200,9 +200,9 @@
 #backend_flush_after = 0		# measured in pages, 0 disables
 #effective_io_concurrency = 16		# 1-1000; 0 disables prefetching
 #maintenance_io_concurrency = 10	# 1-1000; 0 disables prefetching
-#io_max_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_max_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 					# (change requires restart)
-#io_combine_limit = 128kB		# usually 1-32 blocks (depends on OS)
+#io_combine_limit = 128kB		# usually 1-128 blocks (depends on OS)
 
 #io_method = sync			# sync (change requires restart)
 #io_max_concurrency = -1		# Max number of IOs that one process
diff --git a/src/include/port/pg_iovec.h b/src/include/port/pg_iovec.h
index d9891d3805d..df40c7208be 100644
--- a/src/include/port/pg_iovec.h
+++ b/src/include/port/pg_iovec.h
@@ -33,8 +33,12 @@ struct iovec
 
 #endif
 
-/* Define a reasonable maximum that is safe to use on the stack. */
-#define PG_IOV_MAX Min(IOV_MAX, 32)
+/*
+ * Define a reasonable maximum that is safe to use on the stack in arrays of
+ * struct iovec and other small types.  The operating system could limit us to
+ * a number as low as 16, but most systems have 1024.
+ */
+#define PG_IOV_MAX Min(IOV_MAX, 128)
 
 /*
  * Like preadv(), but with a prefix to remind us of a side-effect: on Windows
-- 
2.39.5

From c33e3b50be396819a73aca805d33f32d88560ace Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.mu...@gmail.com>
Date: Tue, 18 Mar 2025 16:12:43 +1300
Subject: [PATCH v4 3/3] Remove MAX_IO_COMBINE_LIMIT macro.

Now that we have a setting io_max_combine_limit, the macro is a bit
confusing.  It only has a few references, and they can be replaced with
PG_IOV_MAX.
---
 src/backend/storage/buffer/bufmgr.c | 6 +++---
 src/backend/utils/misc/guc_tables.c | 4 ++--
 src/include/storage/bufmgr.h        | 3 +--
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index d04afa5ab9c..5e4701d6f19 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1259,7 +1259,7 @@ StartReadBuffersImpl(ReadBuffersOperation *operation,
 	int			maxcombine = 0;
 
 	Assert(*nblocks > 0);
-	Assert(*nblocks <= MAX_IO_COMBINE_LIMIT);
+	Assert(*nblocks <= PG_IOV_MAX);
 
 	for (int i = 0; i < actual_nblocks; ++i)
 	{
@@ -1452,8 +1452,8 @@ WaitReadBuffers(ReadBuffersOperation *operation)
 	for (int i = 0; i < nblocks; ++i)
 	{
 		int			io_buffers_len;
-		Buffer		io_buffers[MAX_IO_COMBINE_LIMIT];
-		void	   *io_pages[MAX_IO_COMBINE_LIMIT];
+		Buffer		io_buffers[PG_IOV_MAX];
+		void	   *io_pages[PG_IOV_MAX];
 		instr_time	io_start;
 		BlockNumber io_first_block;
 
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 720f28a5180..d9540d31c8e 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -3251,7 +3251,7 @@ struct config_int ConfigureNamesInt[] =
 		},
 		&io_max_combine_limit,
 		DEFAULT_IO_COMBINE_LIMIT,
-		1, MAX_IO_COMBINE_LIMIT,
+		1, PG_IOV_MAX,
 		NULL, assign_io_max_combine_limit, NULL
 	},
 
@@ -3265,7 +3265,7 @@ struct config_int ConfigureNamesInt[] =
 		},
 		&io_combine_limit,
 		DEFAULT_IO_COMBINE_LIMIT,
-		1, MAX_IO_COMBINE_LIMIT,
+		1, PG_IOV_MAX,
 		NULL, assign_io_combine_limit, NULL
 	},
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 71b3af61b6a..0452fe1f1e8 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -161,8 +161,7 @@ extern PGDLLIMPORT bool track_io_timing;
 extern PGDLLIMPORT int effective_io_concurrency;
 extern PGDLLIMPORT int maintenance_io_concurrency;
 
-#define MAX_IO_COMBINE_LIMIT PG_IOV_MAX
-#define DEFAULT_IO_COMBINE_LIMIT Min(MAX_IO_COMBINE_LIMIT, (128 * 1024) / BLCKSZ)
+#define DEFAULT_IO_COMBINE_LIMIT Min(PG_IOV_MAX, (128 * 1024) / BLCKSZ)
 extern PGDLLIMPORT int io_combine_limit;	/* min of the two GUCs below */
 extern PGDLLIMPORT int io_combine_limit_guc;
 extern PGDLLIMPORT int io_max_combine_limit;
-- 
2.39.5

Reply via email to