Thanks, Zsolt!
I changed it to use c.oid and the configured BLCKSZ in the attached patch.
But you may be right that current_setting('block_size')::int is the
better way to go, as we are interested in the page size of the database
being dumped, not the one pg_dump was compiled with.
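Something like this, presumably (an untested sketch of what the
getTables() query change would look like, not part of the attached v8):

    /* use the block size of the server we dump from, not our compiled-in BLCKSZ */
    appendPQExpBufferStr(query,
                         "pg_relation_size(c.oid)"
                         "/current_setting('block_size')::int AS relpages, ");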
I'll wait for other feedback as well and then send the next patch with the changes.
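For reference, with the attached patch a chunked parallel dump is invoked
roughly like this (an illustrative example; 131072 pages is 1 GB at the
default 8 kB block size):

    pg_dump --format=directory --jobs=4 \
            --max-table-segment-pages=131072 \
            --file=/path/to/dumpdir dbname

Each chunk then gets its own TOC entry whose data query is of the form
COPY (SELECT * FROM tab WHERE ctid BETWEEN '(0,1)' AND '(131071,32000)') TO stdout;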
On Mon, Jan 19, 2026 at 10:15 PM Zsolt Parragi
<[email protected]> wrote:
>
> Hello
>
> pg_dump.c:7174
>
> + appendPQExpBufferStr(query, "pg_relation_size(c.tableoid)/8192 AS
> relpages, ");
>
> Shouldn't this be something like
>
> + appendPQExpBufferStr(query,
> "pg_relation_size(c.oid)/current_setting('block_size')::int AS
> relpages, ");
>
> instead?
From 6577b964a4b4b85aa51cc7ba12f785ed5567894a Mon Sep 17 00:00:00 2001
From: Hannu Krosing <[email protected]>
Date: Mon, 19 Jan 2026 23:56:49 +0100
Subject: [PATCH v8] * changed flag name to max-table-segment-pages * added a
 check for amname = "heap" * added a simple chunked dump and restore test *
 switched to using pg_relation_size()/BLCKSZ when --max-table-segment-pages
 is set
* added a WARNING with row count and table data hash to the source and chunked-restore databases
---
src/bin/pg_dump/pg_backup.h | 1 +
src/bin/pg_dump/pg_backup_archiver.c | 1 +
src/bin/pg_dump/pg_dump.c | 172 +++++++++++++++++-----
src/bin/pg_dump/pg_dump.h | 5 +
src/bin/pg_dump/t/004_pg_dump_parallel.pl | 52 +++++++
5 files changed, 193 insertions(+), 38 deletions(-)
diff --git a/src/bin/pg_dump/pg_backup.h b/src/bin/pg_dump/pg_backup.h
index d9041dad720..28df18fd993 100644
--- a/src/bin/pg_dump/pg_backup.h
+++ b/src/bin/pg_dump/pg_backup.h
@@ -178,6 +178,7 @@ typedef struct _dumpOptions
bool aclsSkip;
const char *lockWaitTimeout;
int dump_inserts; /* 0 = COPY, otherwise rows per INSERT */
+ int max_table_segment_pages; /* chunk when relpages is above this */
/* flags for various command-line long options */
int disable_dollar_quoting;
diff --git a/src/bin/pg_dump/pg_backup_archiver.c b/src/bin/pg_dump/pg_backup_archiver.c
index 4a63f7392ae..70e4da9a970 100644
--- a/src/bin/pg_dump/pg_backup_archiver.c
+++ b/src/bin/pg_dump/pg_backup_archiver.c
@@ -154,6 +154,7 @@ InitDumpOptions(DumpOptions *opts)
opts->dumpSchema = true;
opts->dumpData = true;
opts->dumpStatistics = false;
+ opts->max_table_segment_pages = UINT32_MAX; /* == InvalidBlockNumber; chunking disabled by default */
}
/*
diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c
index 687dc98e46d..747b396c788 100644
--- a/src/bin/pg_dump/pg_dump.c
+++ b/src/bin/pg_dump/pg_dump.c
@@ -539,6 +539,7 @@ main(int argc, char **argv)
{"exclude-extension", required_argument, NULL, 17},
{"sequence-data", no_argument, &dopt.sequence_data, 1},
{"restrict-key", required_argument, NULL, 25},
+ {"max-table-segment-pages", required_argument, NULL, 26},
{NULL, 0, NULL, 0}
};
@@ -803,6 +804,13 @@ main(int argc, char **argv)
dopt.restrict_key = pg_strdup(optarg);
break;
+ case 26: /* max-table-segment-pages */
+ if (!option_parse_int(optarg, "--max-table-segment-pages", 1, INT32_MAX,
+ &dopt.max_table_segment_pages))
+ exit_nicely(1);
+ pg_log_warning("CHUNKING: set dopt.max_table_segment_pages to [%u]", (BlockNumber) dopt.max_table_segment_pages);
+ break;
+
default:
/* getopt_long already emitted a complaint */
pg_log_error_hint("Try \"%s --help\" for more information.", progname);
@@ -1372,6 +1380,9 @@ help(const char *progname)
printf(_(" --extra-float-digits=NUM override default setting for extra_float_digits\n"));
printf(_(" --filter=FILENAME include or exclude objects and data from dump\n"
" based on expressions in FILENAME\n"));
+ printf(_(" --max-table-segment-pages=NUMPAGES\n"
+ " Number of main table pages above which data is \n"
+ " copied out in chunks, also determines the chunk size\n"));
printf(_(" --if-exists use IF EXISTS when dropping objects\n"));
printf(_(" --include-foreign-data=PATTERN\n"
" include data of foreign tables on foreign\n"
@@ -2412,7 +2423,7 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
* a filter condition was specified. For other cases a simple COPY
* suffices.
*/
- if (tdinfo->filtercond || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
+ if (tdinfo->filtercond || tdinfo->chunking || tbinfo->relkind == RELKIND_FOREIGN_TABLE)
{
/* Temporary allows to access to foreign tables to dump data */
if (tbinfo->relkind == RELKIND_FOREIGN_TABLE)
@@ -2428,9 +2439,23 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
else
appendPQExpBufferStr(q, "* ");
- appendPQExpBuffer(q, "FROM %s %s) TO stdout;",
+ appendPQExpBuffer(q, "FROM %s %s",
fmtQualifiedDumpable(tbinfo),
tdinfo->filtercond ? tdinfo->filtercond : "");
+ if (tdinfo->chunking)
+ {
+ if (tdinfo->endPage != InvalidBlockNumber)
+ appendPQExpBuffer(q, "%s ctid BETWEEN '(%u,1)' AND '(%u,32000)'", /* there is no (*,0) tuple */
+ tdinfo->filtercond ? " AND " : " WHERE ",
+ tdinfo->startPage, tdinfo->endPage);
+ else
+ appendPQExpBuffer(q, "%s ctid >= '(%u,1)'", /* there is no (*,0) tuple */
+ tdinfo->filtercond ? " AND " : " WHERE ",
+ tdinfo->startPage);
+ pg_log_warning("CHUNKING: pages [%u:%u]", tdinfo->startPage, tdinfo->endPage);
+ }
+
+ appendPQExpBufferStr(q, ") TO stdout;");
}
else
{
@@ -2438,6 +2463,9 @@ dumpTableData_copy(Archive *fout, const void *dcontext)
fmtQualifiedDumpable(tbinfo),
column_list);
}
+
+ pg_log_warning("CHUNKING: data query: %s", q->data);
+
res = ExecuteSqlQuery(fout, q->data, PGRES_COPY_OUT);
PQclear(res);
destroyPQExpBuffer(clistBuf);
@@ -2933,42 +2961,100 @@ dumpTableData(Archive *fout, const TableDataInfo *tdinfo)
{
TocEntry *te;
- te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
- ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
- .namespace = tbinfo->dobj.namespace->dobj.name,
- .owner = tbinfo->rolname,
- .description = "TABLE DATA",
- .section = SECTION_DATA,
- .createStmt = tdDefn,
- .copyStmt = copyStmt,
- .deps = &(tbinfo->dobj.dumpId),
- .nDeps = 1,
- .dumpFn = dumpFn,
- .dumpArg = tdinfo));
-
- /*
- * Set the TocEntry's dataLength in case we are doing a parallel dump
- * and want to order dump jobs by table size. We choose to measure
- * dataLength in table pages (including TOAST pages) during dump, so
- * no scaling is needed.
- *
- * However, relpages is declared as "integer" in pg_class, and hence
- * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
- * Cast so that we get the right interpretation of table sizes
- * exceeding INT_MAX pages.
+ /* Chunking works off relpages, which may be slightly off, but it is
+ * the best we have without doing our own page count.  It should be
+ * good enough for the typical use case of huge tables, whose relpages
+ * autovacuum keeps reasonably up to date.
+ *
+ * For now we only chunk tables whose access method is heap; other
+ * chunking methods may be added later.
+ */
- te->dataLength = (BlockNumber) tbinfo->relpages;
- te->dataLength += (BlockNumber) tbinfo->toastpages;
+ if ((BlockNumber) tbinfo->relpages < dopt->max_table_segment_pages ||
+ strcmp(tbinfo->amname, "heap") != 0)
+ {
+ te = ArchiveEntry(fout, tdinfo->dobj.catId, tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = "TABLE DATA",
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = tdinfo));
- /*
- * If pgoff_t is only 32 bits wide, the above refinement is useless,
- * and instead we'd better worry about integer overflow. Clamp to
- * INT_MAX if the correct result exceeds that.
- */
- if (sizeof(te->dataLength) == 4 &&
- (tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
- te->dataLength < 0))
- te->dataLength = INT_MAX;
+ /*
+ * Set the TocEntry's dataLength in case we are doing a parallel dump
+ * and want to order dump jobs by table size. We choose to measure
+ * dataLength in table pages (including TOAST pages) during dump, so
+ * no scaling is needed.
+ *
+ * However, relpages is declared as "integer" in pg_class, and hence
+ * also in TableInfo, but it's really BlockNumber a/k/a unsigned int.
+ * Cast so that we get the right interpretation of table sizes
+ * exceeding INT_MAX pages.
+ */
+ te->dataLength = (BlockNumber) tbinfo->relpages;
+ te->dataLength += (BlockNumber) tbinfo->toastpages;
+
+ /*
+ * If pgoff_t is only 32 bits wide, the above refinement is useless,
+ * and instead we'd better worry about integer overflow. Clamp to
+ * INT_MAX if the correct result exceeds that.
+ */
+ if (sizeof(te->dataLength) == 4 &&
+ (tbinfo->relpages < 0 || tbinfo->toastpages < 0 ||
+ te->dataLength < 0))
+ te->dataLength = INT_MAX;
+ }
+ else
+ {
+ BlockNumber current_chunk_start = 0;
+ PQExpBuffer chunk_desc = createPQExpBuffer();
+
+ pg_log_warning("CHUNKING: toc for chunked relpages [%u]",(BlockNumber) tbinfo->relpages);
+
+ while (current_chunk_start < (BlockNumber) tbinfo->relpages)
+ {
+ TableDataInfo *chunk_tdinfo = (TableDataInfo *) pg_malloc(sizeof(TableDataInfo));
+
+ memcpy(chunk_tdinfo, tdinfo, sizeof(TableDataInfo));
+ AssignDumpId(&chunk_tdinfo->dobj);
+ /* XXX do we need addObjectDependency(&chunk_tdinfo->dobj, tbinfo->dobj.dumpId) here? */
+ chunk_tdinfo->chunking = true;
+ chunk_tdinfo->startPage = current_chunk_start;
+ chunk_tdinfo->endPage = current_chunk_start + dopt->max_table_segment_pages - 1;
+
+ pg_log_warning("CHUNKING: toc for pages [%u:%u]",chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ current_chunk_start += dopt->max_table_segment_pages;
+ if (current_chunk_start >= (BlockNumber) tbinfo->relpages)
+ chunk_tdinfo->endPage = InvalidBlockNumber; /* last chunk takes all the rest */
+
+ printfPQExpBuffer(chunk_desc, "TABLE DATA (pages %u:%u)", chunk_tdinfo->startPage, chunk_tdinfo->endPage);
+
+ te = ArchiveEntry(fout, chunk_tdinfo->dobj.catId, chunk_tdinfo->dobj.dumpId,
+ ARCHIVE_OPTS(.tag = tbinfo->dobj.name,
+ .namespace = tbinfo->dobj.namespace->dobj.name,
+ .owner = tbinfo->rolname,
+ .description = chunk_desc->data,
+ .section = SECTION_DATA,
+ .createStmt = tdDefn,
+ .copyStmt = copyStmt,
+ .deps = &(tbinfo->dobj.dumpId),
+ .nDeps = 1,
+ .dumpFn = dumpFn,
+ .dumpArg = chunk_tdinfo));
+
+ te->dataLength = dopt->max_table_segment_pages;
+ /* assume TOAST pages are distributed evenly across the chunks */
+ te->dataLength += (off_t) dopt->max_table_segment_pages * tbinfo->toastpages / tbinfo->relpages;
+ }
+
+ destroyPQExpBuffer(chunk_desc);
+ }
}
destroyPQExpBuffer(copyBuf);
@@ -3092,6 +3178,9 @@ makeTableDataInfo(DumpOptions *dopt, TableInfo *tbinfo)
tdinfo->dobj.namespace = tbinfo->dobj.namespace;
tdinfo->tdtable = tbinfo;
tdinfo->filtercond = NULL; /* might get set later */
+ tdinfo->chunking = false; /* defaults */
+ tdinfo->startPage = 0;
+ tdinfo->endPage = InvalidBlockNumber;
addObjectDependency(&tdinfo->dobj, tbinfo->dobj.dumpId);
/* A TableDataInfo contains data, of course */
@@ -7254,8 +7343,15 @@ getTables(Archive *fout, int *numTables)
"c.relnamespace, c.relkind, c.reltype, "
"c.relowner, "
"c.relchecks, "
- "c.relhasindex, c.relhasrules, c.relpages, "
- "c.reltuples, c.relallvisible, ");
+ "c.relhasindex, c.relhasrules, ");
+
+ /* use the real relation size if chunking is requested */
+ if (dopt->max_table_segment_pages != InvalidBlockNumber)
+ appendPQExpBuffer(query, "pg_relation_size(c.oid)/%d AS relpages, ", BLCKSZ);
+ else
+ appendPQExpBufferStr(query, "c.relpages, ");
+
+ appendPQExpBufferStr(query, "c.reltuples, c.relallvisible, ");
if (fout->remoteVersion >= 180000)
appendPQExpBufferStr(query, "c.relallfrozen, ");
diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h
index 4c4b14e5fc7..ddaf341bb3b 100644
--- a/src/bin/pg_dump/pg_dump.h
+++ b/src/bin/pg_dump/pg_dump.h
@@ -16,6 +16,7 @@
#include "pg_backup.h"
#include "catalog/pg_publication_d.h"
+#include "storage/block.h"
#define oidcmp(x,y) ( ((x) < (y) ? -1 : ((x) > (y)) ? 1 : 0) )
@@ -413,6 +414,10 @@ typedef struct _tableDataInfo
DumpableObject dobj;
TableInfo *tdtable; /* link to table to dump */
char *filtercond; /* WHERE condition to limit rows dumped */
+ bool chunking; /* dump only a page range of this table */
+ BlockNumber startPage; /* starting table page */
+ BlockNumber endPage; /* ending table page for a page-range dump,
+ * normally startPage + max_table_segment_pages - 1 */
} TableDataInfo;
typedef struct _indxInfo
diff --git a/src/bin/pg_dump/t/004_pg_dump_parallel.pl b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
index 738f34b1c1b..9094352e29f 100644
--- a/src/bin/pg_dump/t/004_pg_dump_parallel.pl
+++ b/src/bin/pg_dump/t/004_pg_dump_parallel.pl
@@ -11,6 +11,7 @@ use Test::More;
my $dbname1 = 'regression_src';
my $dbname2 = 'regression_dest1';
my $dbname3 = 'regression_dest2';
+my $dbname4 = 'regression_dest3';
my $node = PostgreSQL::Test::Cluster->new('main');
$node->init;
@@ -21,6 +22,7 @@ my $backupdir = $node->backup_dir;
$node->run_log([ 'createdb', $dbname1 ]);
$node->run_log([ 'createdb', $dbname2 ]);
$node->run_log([ 'createdb', $dbname3 ]);
+$node->run_log([ 'createdb', $dbname4 ]);
$node->safe_psql(
$dbname1,
@@ -44,6 +46,18 @@ create table tht_p1 partition of tht for values with (modulus 3, remainder 0);
create table tht_p2 partition of tht for values with (modulus 3, remainder 1);
create table tht_p3 partition of tht for values with (modulus 3, remainder 2);
insert into tht select (x%10)::text::digit, x from generate_series(1,1000) x;
+
+-- raise a warning so I can check in the .log whether the data was correct
+DO \$\$
+DECLARE
+ thash_rec RECORD;
+BEGIN
+ SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+ INTO thash_rec
+ FROM tplain AS t;
+ RAISE WARNING 'thash: %', thash_rec;
+END;
+\$\$;
});
$node->command_ok(
@@ -87,4 +101,42 @@ $node->command_ok(
],
'parallel restore as inserts');
+$node->command_ok(
+ [
+ 'pg_dump',
+ '--format' => 'directory',
+ '--max-table-segment-pages' => 5,
+ '--no-sync',
+ '--jobs' => 2,
+ '--file' => "$backupdir/dump3",
+ $node->connstr($dbname1),
+ ],
+ 'parallel dump with chunks of five heap pages');
+
+$node->command_ok(
+ [
+ 'pg_restore', '--verbose',
+ '--dbname' => $node->connstr($dbname4),
+ '--jobs' => 3,
+ "$backupdir/dump3",
+ ],
+ 'parallel restore with chunks of five heap pages');
+
+$node->safe_psql(
+ $dbname4,
+ qq{
+
+-- raise a warning so I can check in the .log whether the data was correct
+DO \$\$
+DECLARE
+ thash_rec RECORD;
+BEGIN
+ SELECT 'tplain', count(*), sum(hashtext(t::text)) as tablehash
+ INTO thash_rec
+ FROM tplain AS t;
+ RAISE WARNING 'thash after parallel chunked restore: %', thash_rec;
+END;
+\$\$;
+ });
+
done_testing();
--
2.43.0