Thanks Zsolt

I changed it to use the configured BLCKSZ in the attached patch.

But you may be right that current_setting('block_size')::int is the
better way to go, as we are interested in the page size of the target
database, not the one pg_dump was compiled with.
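
If we go that way, pg_dump could also read the server's block size once
after connecting, roughly like this (untested sketch; assumes we are
somewhere with the Archive *fout at hand, e.g. getTables()):

    PGresult   *res;
    int         server_blcksz;

    /* ask the server for its block size instead of trusting our own BLCKSZ */
    res = ExecuteSqlQueryForSingleRow(fout, "SELECT current_setting('block_size')::int");
    server_blcksz = atoi(PQgetvalue(res, 0, 0));
    PQclear(res);

and then divide pg_relation_size() by server_blcksz instead of BLCKSZ.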

I'll wait for other feedback as well and then send the next patch with the changes.
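
For anyone skimming the attached patch, the per-chunk data query it
generates ends up looking like this (table name just for illustration):

    COPY (SELECT * FROM public.big_table WHERE ctid BETWEEN '(0,1)' AND '(4,32000)') TO stdout;

with the final chunk using ctid >= '(N,1)' so it also picks up any pages
added after relpages was measured.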




On Mon, Jan 19, 2026 at 10:15 PM Zsolt Parragi
<[email protected]> wrote:
>
> Hello
>
> pgdump.c:7174
>
> + appendPQExpBufferStr(query, "pg_relation_size(c.tableoid)/8192 AS
> relpages, ");
>
> Shouldn't this be something like
>
> + appendPQExpBufferStr(query,
> "pg_relation_size(c.oid)/current_setting('block_size')::int AS
> relpages, ");
>
> instead?
From 6577b964a4b4b85aa51cc7ba12f785ed5567894a Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Mon, 19 Jan 2026 23:56:49 +0100
Subject: [PATCH v8] * changed flag name to max-table-segment-pages * added
 check for amname = "heap" * added simple chunked dump and restore test *
 switched to using pg_relation_size()/BLCKSZ when --max-table-segment-pages
 is set

* added a WARNING with row count and table data hash to the source and chunked-restore databases
---
 src/bin/pg_dump/pg_backup.h               |   1 +
 src/bin/pg_dump/pg_backup_archiver.c      |   1 +
 src/bin/pg_dump/pg_dump.c                 | 172 +++++++++++++++++-----
 src/bin/pg_dump/pg_dump.h                 |   5 +
 src/bin/pg_dump/t/004_pg_dump_parallel.pl |  52 +++++++
 5 files changed, 193 insertions(+), 38 deletions(-)

diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..28df18fd993 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
 	bool		aclsSkip;
 	const char *lockWaitTimeout;
 	int			dump_inserts;	/* 0 = COPY, otherwise rows per INSERT */
+	int			max_table_segment_pages; /* chunk when relpages is above this */
 
 	/* flags for various command-line long options */
 	int			disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 4a63f7392ae..70e4da9a970 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
 	opts->dumpSchema = true;
 	opts->dumpData = true;
 	opts->dumpStatistics = false;
+	opts->max_table_segment_pages = UINT32_MAX; /* == InvalidBlockNumber, disable chunking by default */
 }
 
 /*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 687dc98e46d..747b396c788 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -539,6 +539,7 @@ main(int argc, char **argv)
 		{"exclude-extension", required_argument, NULL, 17},
 		{"sequence-data", no_argument, &dopt.sequence_data, 1},
 		{"restrict-key", required_argument, NULL, 25},
+		{"max-table-segment-pages", required_argument, NULL, 26},
 
 		{NULL, 0, NULL, 0}
 	};
@@ -803,6 +804,13 @@ main(int argc, char **argv)
 				dopt.restrict_key = pg_strdup(optarg);
 				break;
 
+			case 26:			/* --max-table-segment-pages */
+				if (!option_parse_int(optarg, "--max-table-segment-pages", 1, INT32_MAX,
+									  &dopt.max_table_segment_pages))
+					exit_nicely(1);
+				pg_log_warning("CHUNKING: set dopt.max_table_segment_pages to [%u]", (BlockNumber) dopt.max_table_segment_pages);
+				break;
+
 			default:
 				/* getopt_long already emitted a complaint */
 				pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1372,6 +1380,9 @@ help(const char *progname)
 	printf(_("  --extra-float-digits=NUM     override default setting for extra_float_digits\n"));
 	printf(_("  --filter=FILENAME            include or exclude objects and data from dump\n"
 			 "                               based on expressions in FILENAME\n"));
+	printf(_("  --max-table-segment-pages=NUMPAGES\n"
+			 "                               number of table pages above which table data is\n"
+			 "                               dumped in chunks of that many pages\n"));
 	printf(_("  --if-exists                  use IF EXISTS when dropping objects\n"));
 	printf(_("  --include-foreign-data=PATTERN\n"
 			 "                               include data of foreign tables on foreign\n"
@@ -2412,7 +2423,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 	 * a filter condition was specified.  For other cases a simple COPY
 	 * suffices.
 	 */
-	if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+	if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
 	{
 		/* Temporary allows to access to foreign tables to dump data */
 		if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2428,9 +2439,23 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 		else
 			appendPQExpBufferStr(q, "* ");
 
-		appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+		appendPQExpBuffer(q, "FROM %s %s",
 						  fmtQualifiedDumpable(tbinfo),
 						  tdinfo->filtercond ? tdinfo->filtercond : "");
+		if (tdinfo->chunking)
+		{
+			if (tdinfo->endPage != InvalidBlockNumber)
+				appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'",	/* there is no (*,0) tuple */
+								  tdinfo->filtercond ? " AND " : " WHERE ",
+								  tdinfo->startPage, tdinfo->endPage);
+			else
+				appendPQExpBuffer(q, "%s ctid >= '(%u,1)'",	/* there is no (*,0) tuple */
+								  tdinfo->filtercond ? " AND " : " WHERE ",
+								  tdinfo->startPage);
+			pg_log_warning("CHUNKING: pages [%u:%u]", tdinfo->startPage, tdinfo->endPage);
+		}
+
+		appendPQExpBufferStr(q, ") TO stdout;");
 	}
 	else
 	{
@@ -2438,6 +2463,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
 						  fmtQualifiedDumpable(tbinfo),
 						  column_list);
 	}
+
+	pg_log_warning("CHUNKING: data query: %s", q->data);
+
 	res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
 	PQclear(res);
 	destroyPQExpBuffer(clistBuf);
@@ -2933,42 +2961,100 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
 	{
 		TocEntry   *te;
 
-		te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
-						  ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
-									   .namespace = tbinfo->dobj.namespace->dobj.name,
-									   .owner = tbinfo->rolname,
-									   .description = "TABLE DATA",
-									   .section = SECTION_DATA,
-									   .createStmt = tdDefn,
-									   .copyStmt = copyStmt,
-									   .deps = &(tbinfo->dobj.dumpId),
-									   .nDeps = 1,
-									   .dumpFn = dumpFn,
-									   .dumpArg = tdinfo));
-
-		/*
-		 * Set the TocEntry's dataLength in case we are doing a parallel dump
-		 * and want to order dump jobs by table size.  We choose to measure
-		 * dataLength in table pages (including TOAST pages) during dump, so
-		 * no scaling is needed.
-		 *
-		 * However, relpages is declared as "integer" in pg_class, and hence
-		 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
-		 * Cast so that we get the right interpretation of table sizes
-		 * exceeding INT_MAX pages.
+		/* Chunking works off relpages, which may be slightly stale, but it
+		 * is the best estimate we have without doing our own page count.
+		 * That should be good enough for the typical use case of huge
+		 * tables, whose relpages should be kept up to date by autovacuum.
+		 *
+		 * For now we only do chunking when the table access method is
+		 * heap; other chunking methods may be added later.
+		 */
-		te->dataLength = (BlockNumber) tbinfo->relpages;
-		te->dataLength += (BlockNumber) tbinfo->toastpages;
+		if ((BlockNumber) tbinfo->relpages < dopt->max_table_segment_pages ||
+			strcmp(tbinfo->amname, "heap") != 0)
+		{
+			te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = "TABLE DATA",
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = tdinfo));
 
-		/*
-		 * If pgoff_t is only 32 bits wide, the above refinement is useless,
-		 * and instead we'd better worry about integer overflow.  Clamp to
-		 * INT_MAX if the correct result exceeds that.
-		 */
-		if (sizeof(te->dataLength) == 4 &&
-			(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
-			 te->dataLength < 0))
-			te->dataLength = INT_MAX;
+			/*
+			 * Set the TocEntry's dataLength in case we are doing a parallel dump
+			 * and want to order dump jobs by table size.  We choose to measure
+			 * dataLength in table pages (including TOAST pages) during dump, so
+			 * no scaling is needed.
+			 *
+			 * However, relpages is declared as "integer" in pg_class, and hence
+			 * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+			 * Cast so that we get the right interpretation of table sizes
+			 * exceeding INT_MAX pages.
+			 */
+			te->dataLength = (BlockNumber) tbinfo->relpages;
+			te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+			/*
+			 * If pgoff_t is only 32 bits wide, the above refinement is useless,
+			 * and instead we'd better worry about integer overflow.  Clamp to
+			 * INT_MAX if the correct result exceeds that.
+			 */
+			if (sizeof(te->dataLength) == 4 &&
+				(tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+				 te->dataLength < 0))
+				te->dataLength = INT_MAX;
+		}
+		else
+		{
+			BlockNumber current_chunk_start = 0;
+			PQExpBuffer chunk_desc = createPQExpBuffer();
+
+			pg_log_warning("CHUNKING: toc for chunked relpages [%u]", (BlockNumber) tbinfo->relpages);
+
+			while (current_chunk_start < (BlockNumber) tbinfo->relpages)
+			{
+				TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+				memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+				AssignDumpId(&chunk_tdinfo->dobj);
+				/* XXX do we need addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId) here? */
+				chunk_tdinfo->chunking = true;
+				chunk_tdinfo->startPage = current_chunk_start;
+				chunk_tdinfo->endPage = current_chunk_start + dopt->max_table_segment_pages - 1;
+
+				pg_log_warning("CHUNKING: toc for pages [%u:%u]", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				current_chunk_start += dopt->max_table_segment_pages;
+				if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+					chunk_tdinfo->endPage = InvalidBlockNumber;	/* last chunk takes "all the rest" */
+
+				printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+				te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+							ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+										.namespace = tbinfo->dobj.namespace->dobj.name,
+										.owner = tbinfo->rolname,
+										.description = chunk_desc->data,
+										.section = SECTION_DATA,
+										.createStmt = tdDefn,
+										.copyStmt = copyStmt,
+										.deps = &(tbinfo->dobj.dumpId),
+										.nDeps = 1,
+										.dumpFn = dumpFn,
+										.dumpArg = chunk_tdinfo));
+
+				te->dataLength = dopt->max_table_segment_pages;
+				/* assume TOAST pages are distributed evenly among the chunks */
+				te->dataLength += (off_t) dopt->max_table_segment_pages * tbinfo->toastpages / tbinfo->relpages;
+			}
+
+			destroyPQExpBuffer(chunk_desc);
+		}
 	}
 
 	destroyPQExpBuffer(copyBuf);
@@ -3092,6 +3178,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
 	tdinfo->dobj.namespace = tbinfo->dobj.namespace;
 	tdinfo->tdtable = tbinfo;
 	tdinfo->filtercond = NULL;	/* might get set later */
+	tdinfo->chunking = false; /* defaults */
+	tdinfo->startPage = 0;
+	tdinfo->endPage = InvalidBlockNumber;
 	addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
 
 	/* A TableDataInfo contains data, of course */
@@ -7254,8 +7343,15 @@ getTables(Archive *fout, int *numTables)
 						 "c.relnamespace, c.relkind, c.reltype, "
 						 "c.relowner, "
 						 "c.relchecks, "
-						 "c.relhasindex, c.relhasrules, c.relpages, "
-						 "c.reltuples, c.relallvisible, ");
+						 "c.relhasindex, c.relhasrules, ");
+
+	/* Use the real relation size if chunking is requested */
+	if (dopt->max_table_segment_pages != InvalidBlockNumber)
+		appendPQExpBuffer(query, "pg_relation_size(c.oid)/%d AS relpages, ", BLCKSZ);
+	else
+		appendPQExpBufferStr(query, "c.relpages, ");
+
+	appendPQExpBufferStr(query, "c.reltuples, c.relallvisible, ");
 
 	if (fout->remoteVersion >= 180000)
 		appendPQExpBufferStr(query, "c.relallfrozen, ");
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 4c4b14e5fc7..ddaf341bb3b 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
 
 #include "pg_backup.h"
 #include "catalog/pg_publication_d.h"
+#include "storage/block.h"
 
 
 #define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ?  1 : 0) )
@@ -413,6 +414,10 @@ typedef struct _tableDataInfo
 	DumpableObject dobj;
 	TableInfo  *tdtable;		/* link to table to dump */
 	char	   *filtercond;		/* WHERE condition to limit rows dumped */
+	bool		chunking;		/* dump only a page range of the table */
+	BlockNumber	startPage;		/* starting table page */
+	BlockNumber	endPage;		/* ending table page for page-range dump,
+								 * normally startPage + max_table_segment_pages - 1 */
 } TableDataInfo;
 
 typedef struct _indxInfo
diff --git a/src/bin/pg_dump/t/004_pg_dump_parallel.pl b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
index 738f34b1c1b..9094352e29f 100644
--- a/src/bin/pg_dump/t/004_pg_dump_parallel.pl
+++ b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
@@ -11,6 +11,7 @@ use Test::More;
 my $dbname1 = 'regression_src';
 my $dbname2 = 'regression_dest1';
 my $dbname3 = 'regression_dest2';
+my $dbname4 = 'regression_dest3';
 
 my $node = PostgreSQL::Test::Cluster->new('main');
 $node->init;
@@ -21,6 +22,7 @@ my $backupdir = $node->backup_dir;
 $node->run_log([ 'createdb', $dbname1 ]);
 $node->run_log([ 'createdb', $dbname2 ]);
 $node->run_log([ 'createdb', $dbname3 ]);
+$node->run_log([ 'createdb', $dbname4 ]);
 
 $node->safe_psql(
 	$dbname1,
@@ -44,6 +46,18 @@ create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
 create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
 create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
 insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
+
+-- raise a warning so the row count and table data hash can be checked in the log
+DO \$\$
+DECLARE
+    thash_rec RECORD;
+BEGIN
+    SELECT 'tplain', count(*), sum(hashtext(t::text)) AS tablehash
+      INTO thash_rec
+      FROM tplain AS t;
+    RAISE WARNING 'thash: %', thash_rec;
+END;
+\$\$;
 	});
 
 $node->command_ok(
@@ -87,4 +101,42 @@ $node->command_ok(
 	],
 	'parallel restore as inserts');
 
+$node->command_ok(
+	[
+		'pg_dump',
+		'--format' => 'directory',
+		'--max-table-segment-pages' => 5,
+		'--no-sync',
+		'--jobs' => 2,
+		'--file' => "$backupdir/dump3",
+		$node->connstr($dbname1),
+	],
+	'parallel dump with chunks of five heap pages');
+
+$node->command_ok(
+	[
+		'pg_restore', '--verbose',
+		'--dbname' => $node->connstr($dbname4),
+		'--jobs' => 3,
+		"$backupdir/dump3",
+	],
+	'parallel restore with chunks of five heap pages');
+
+$node->safe_psql(
+	$dbname4,
+	qq{
+
+-- raise a warning so the row count and table data hash can be checked in the log
+DO \$\$
+DECLARE
+    thash_rec RECORD;
+BEGIN
+    SELECT 'tplain', count(*), sum(hashtext(t::text)) AS tablehash
+      INTO thash_rec
+      FROM tplain AS t;
+    RAISE WARNING 'thash after parallel chunked restore: %', thash_rec;
+END;
+\$\$;
+	});
+
 done_testing();
-- 
2.43.0
