Hi Hackers, I noticed several additional paths in contrib modules, beyond [1], that are potentially suitable for streamification:
1) pgstattuple — pgstatapprox.c and parts of pgstattuple_approx_internal 2) Bloom — scan paths in blgetbitmap() and maintenance paths in blbulkdelete(). The following patches streamify those code paths. No benchmarks have been run yet. [1] https://www.postgresql.org/message-id/flat/CABPTF7UeN2o-trr9r7K76rZExnO2M4SLfvTfbUY2CwQjCekgnQ%40mail.gmail.com Feedback welcome. -- Best, Xuneng
From 8444107113a3b9a237520b41d322f7202e8c1502 Mon Sep 17 00:00:00 2001 From: alterego655 <[email protected]> Date: Thu, 25 Dec 2025 13:40:13 +0800 Subject: [PATCH v1 3/3] Streamify heap bloat estimation scan. Introduce a read-stream callback to skip all-visible pages via VM/FSM lookup and stream-read the rest, reducing page reads and improving pgstattuple_approx execution time on large relations. --- contrib/pgstattuple/pgstatapprox.c | 125 ++++++++++++++++++++++------- 1 file changed, 94 insertions(+), 31 deletions(-) diff --git a/contrib/pgstattuple/pgstatapprox.c b/contrib/pgstattuple/pgstatapprox.c index a59ff4e9d4f..eb5c26ffc10 100644 --- a/contrib/pgstattuple/pgstatapprox.c +++ b/contrib/pgstattuple/pgstatapprox.c @@ -23,6 +23,7 @@ #include "storage/bufmgr.h" #include "storage/freespace.h" #include "storage/procarray.h" +#include "storage/read_stream.h" PG_FUNCTION_INFO_V1(pgstattuple_approx); PG_FUNCTION_INFO_V1(pgstattuple_approx_v1_5); @@ -45,6 +46,61 @@ typedef struct output_type #define NUM_OUTPUT_COLUMNS 10 +/* + * Struct for statapprox_heap read stream callback. + */ +typedef struct StatApproxReadStreamPrivate +{ + Relation rel; + output_type *stat; + BlockNumber current_blocknum; + BlockNumber nblocks; + BlockNumber scanned; /* count of pages actually read */ + Buffer vmbuffer; /* for VM lookups */ +} StatApproxReadStreamPrivate; + +/* + * Read stream callback for statapprox_heap. + * + * This callback checks the visibility map for each block. If the block is + * all-visible, we can get the free space from the FSM without reading the + * actual page, and skip to the next block. Only blocks that are not + * all-visible are returned for actual reading. 
+ */ +static BlockNumber +statapprox_heap_read_stream_next(ReadStream *stream, + void *callback_private_data, + void *per_buffer_data) +{ + StatApproxReadStreamPrivate *p = callback_private_data; + + while (p->current_blocknum < p->nblocks) + { + BlockNumber blkno = p->current_blocknum++; + Size freespace; + + CHECK_FOR_INTERRUPTS(); + + /* + * If the page has only visible tuples, then we can find out the free + * space from the FSM and move on without reading the page. + */ + if (VM_ALL_VISIBLE(p->rel, blkno, &p->vmbuffer)) + { + freespace = GetRecordedFreeSpace(p->rel, blkno); + p->stat->tuple_len += BLCKSZ - freespace; + p->stat->free_space += freespace; + continue; + } + + /* This block needs to be read */ + p->scanned++; + return blkno; + } + + return InvalidBlockNumber; +} + /* * This function takes an already open relation and scans its pages, * skipping those that have the corresponding visibility map bit set. @@ -58,53 +114,58 @@ typedef struct output_type static void statapprox_heap(Relation rel, output_type *stat) { - BlockNumber scanned, - nblocks, - blkno; - Buffer vmbuffer = InvalidBuffer; + BlockNumber nblocks; BufferAccessStrategy bstrategy; TransactionId OldestXmin; + StatApproxReadStreamPrivate p; + ReadStream *stream; OldestXmin = GetOldestNonRemovableTransactionId(rel); bstrategy = GetAccessStrategy(BAS_BULKREAD); nblocks = RelationGetNumberOfBlocks(rel); - scanned = 0; - for (blkno = 0; blkno < nblocks; blkno++) + /* Initialize read stream private data */ + p.rel = rel; + p.stat = stat; + p.current_blocknum = 0; + p.nblocks = nblocks; + p.scanned = 0; + p.vmbuffer = InvalidBuffer; + + /* + * Create the read stream. We don't use READ_STREAM_USE_BATCHING because + * the callback accesses the visibility map which may need to read VM + * pages. While this shouldn't cause deadlocks, we err on the side of + * caution. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_FULL, + bstrategy, + rel, + MAIN_FORKNUM, + statapprox_heap_read_stream_next, + &p, + 0); + + for (;;) { Buffer buf; Page page; OffsetNumber offnum, maxoff; - Size freespace; - - CHECK_FOR_INTERRUPTS(); - - /* - * If the page has only visible tuples, then we can find out the free - * space from the FSM and move on. - */ - if (VM_ALL_VISIBLE(rel, blkno, &vmbuffer)) - { - freespace = GetRecordedFreeSpace(rel, blkno); - stat->tuple_len += BLCKSZ - freespace; - stat->free_space += freespace; - continue; - } + BlockNumber blkno; - buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, - RBM_NORMAL, bstrategy); + buf = read_stream_next_buffer(stream, NULL); + if (buf == InvalidBuffer) + break; LockBuffer(buf, BUFFER_LOCK_SHARE); page = BufferGetPage(buf); + blkno = BufferGetBlockNumber(buf); stat->free_space += PageGetExactFreeSpace(page); - /* We may count the page as scanned even if it's new/empty */ - scanned++; - if (PageIsNew(page) || PageIsEmpty(page)) { UnlockReleaseBuffer(buf); @@ -169,6 +230,8 @@ statapprox_heap(Relation rel, output_type *stat) UnlockReleaseBuffer(buf); } + read_stream_end(stream); + stat->table_len = (uint64) nblocks * BLCKSZ; /* @@ -179,7 +242,7 @@ statapprox_heap(Relation rel, output_type *stat) * tuples in all-visible pages, so no correction is needed for that, and * we already accounted for the space in those pages, too. */ - stat->tuple_count = vac_estimate_reltuples(rel, nblocks, scanned, + stat->tuple_count = vac_estimate_reltuples(rel, nblocks, p.scanned, stat->tuple_count); /* It's not clear if we could get -1 here, but be safe. 
*/ @@ -190,16 +253,16 @@ statapprox_heap(Relation rel, output_type *stat) */ if (nblocks != 0) { - stat->scanned_percent = 100.0 * scanned / nblocks; + stat->scanned_percent = 100.0 * p.scanned / nblocks; stat->tuple_percent = 100.0 * stat->tuple_len / stat->table_len; stat->dead_tuple_percent = 100.0 * stat->dead_tuple_len / stat->table_len; stat->free_percent = 100.0 * stat->free_space / stat->table_len; } - if (BufferIsValid(vmbuffer)) + if (BufferIsValid(p.vmbuffer)) { - ReleaseBuffer(vmbuffer); - vmbuffer = InvalidBuffer; + ReleaseBuffer(p.vmbuffer); + p.vmbuffer = InvalidBuffer; } } -- 2.51.0
From d3b15792ee09d7aba4df76273d8883a739d215ce Mon Sep 17 00:00:00 2001 From: alterego655 <[email protected]> Date: Thu, 25 Dec 2025 13:40:02 +0800 Subject: [PATCH v1 2/3] Streamify Bloom VACUUM paths. Use streaming reads in blbulkdelete() and blvacuumcleanup() to iterate index pages without repeated ReadBuffer calls, improving VACUUM performance and reducing buffer manager overhead during maintenance operations. --- contrib/bloom/blvacuum.c | 55 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index e68a9008f56..7452302f022 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -17,6 +17,7 @@ #include "commands/vacuum.h" #include "storage/bufmgr.h" #include "storage/indexfsm.h" +#include "storage/read_stream.h" /* @@ -40,6 +41,8 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, Page page; BloomMetaPageData *metaData; GenericXLogState *gxlogState; + BlockRangeReadStreamPrivate p; + ReadStream *stream; if (stats == NULL) stats = palloc0_object(IndexBulkDeleteResult); @@ -51,6 +54,25 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, * they can't contain tuples to delete. */ npages = RelationGetNumberOfBlocks(index); + + /* Scan all blocks except the metapage using streaming reads */ + p.current_blocknum = BLOOM_HEAD_BLKNO; + p.last_exclusive = npages; + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | + READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + info->strategy, + index, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { BloomTuple *itup, @@ -59,8 +81,7 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, vacuum_delay_point(false); - buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, - RBM_NORMAL, info->strategy); + buffer = read_stream_next_buffer(stream, NULL); LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); gxlogState = GenericXLogStart(index); @@ -133,6 +154,9 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, UnlockReleaseBuffer(buffer); } + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + read_stream_end(stream); + /* * Update the metapage's notFullPage list with whatever we found. Our * info could already be out of date at this point, but blinsert() will @@ -166,6 +190,8 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) Relation index = info->index; BlockNumber npages, blkno; + BlockRangeReadStreamPrivate p; + ReadStream *stream; if (info->analyze_only) return stats; @@ -181,6 +207,25 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) stats->num_pages = npages; stats->pages_free = 0; stats->num_index_tuples = 0; + + /* Scan all blocks except the metapage using streaming reads */ + p.current_blocknum = BLOOM_HEAD_BLKNO; + p.last_exclusive = npages; + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_MAINTENANCE | + READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + info->strategy, + index, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; @@ -188,8 +233,7 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) vacuum_delay_point(false); - buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, - RBM_NORMAL, info->strategy); + buffer = read_stream_next_buffer(stream, NULL); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); @@ -206,6 +250,9 @@ blvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) UnlockReleaseBuffer(buffer); } + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + read_stream_end(stream); + IndexFreeSpaceMapVacuum(info->index); return stats; -- 2.51.0
From 0a211e788d964aebd876dc4472440e8f234ce38a Mon Sep 17 00:00:00 2001 From: alterego655 <[email protected]> Date: Thu, 25 Dec 2025 13:39:42 +0800 Subject: [PATCH v1 1/3] Switch Bloom scan paths to streaming read. Replace per-page ReadBuffer loops in blgetbitmap() with read_stream_begin_relation() and sequential buffer iteration, reducing buffer churn and improving scan efficiency on large Bloom indexes. --- contrib/bloom/blscan.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/contrib/bloom/blscan.c b/contrib/bloom/blscan.c index 0d71edbe91c..b1fdabaab74 100644 --- a/contrib/bloom/blscan.c +++ b/contrib/bloom/blscan.c @@ -17,6 +17,7 @@ #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" +#include "storage/read_stream.h" /* * Begin scan of bloom index. @@ -75,11 +76,13 @@ int64 blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) { int64 ntids = 0; - BlockNumber blkno = BLOOM_HEAD_BLKNO, + BlockNumber blkno, npages; int i; BufferAccessStrategy bas; BloomScanOpaque so = (BloomScanOpaque) scan->opaque; + BlockRangeReadStreamPrivate p; + ReadStream *stream; if (so->sign == NULL) { @@ -119,14 +122,29 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) if (scan->instrument) scan->instrument->nsearches++; + /* Scan all blocks except the metapage using streaming reads */ + p.current_blocknum = BLOOM_HEAD_BLKNO; + p.last_exclusive = npages; + + /* + * It is safe to use batchmode as block_range_read_stream_cb takes no + * locks. 
+ */ + stream = read_stream_begin_relation(READ_STREAM_FULL | + READ_STREAM_USE_BATCHING, + bas, + scan->indexRelation, + MAIN_FORKNUM, + block_range_read_stream_cb, + &p, + 0); + for (blkno = BLOOM_HEAD_BLKNO; blkno < npages; blkno++) { Buffer buffer; Page page; - buffer = ReadBufferExtended(scan->indexRelation, MAIN_FORKNUM, - blkno, RBM_NORMAL, bas); - + buffer = read_stream_next_buffer(stream, NULL); LockBuffer(buffer, BUFFER_LOCK_SHARE); page = BufferGetPage(buffer); @@ -162,6 +180,9 @@ blgetbitmap(IndexScanDesc scan, TIDBitmap *tbm) UnlockReleaseBuffer(buffer); CHECK_FOR_INTERRUPTS(); } + + Assert(read_stream_next_buffer(stream, NULL) == InvalidBuffer); + read_stream_end(stream); FreeAccessStrategy(bas); return ntids; -- 2.51.0
