Hi all, During index bulk-deletion in lazy vacuum, we currently check the deletability of each index tuple individually using the vac_tid_reaped() function. The attached proof-of-concept patches propose batching multiple TID lookups for deletability checks to reduce overhead. This optimization aims to minimize redundant function calls and repeated TidStore entry retrievals for TIDs on the same page. I have conducted benchmarks across several scenarios to evaluate the performance impact.
# Case-1 (btree tuples are regular tuples and dead TIDs are concentrated): create unlogged table test (c int) with (autovacuum_enabled = off); insert into test select generate_series(1, ${NROWS}); create index on test (c); delete from test where c < ${NROWS} * 0.3; # Case-2 (btree tuples are regular tuples and dead TIDs are sparse): create unlogged table test (c int) with (autovacuum_enabled = off); insert into test select generate_series(1, ${NROWS}); create index on test (c); delete from test where random() < 0.3; # Case-3 (btree tuples are deduplicated tuples): create unlogged table test (c int) with (autovacuum_enabled = off); insert into test select c % 1000 from generate_series(1, ${NROWS}) c; create index on test (c); select pg_relation_size('test') / 8192 as relpages \gset delete from test where (ctid::text::point)[0] < ((:'relpages')::int * 0.3); # Case-4 (btree tuples are deduplicated tuples and table is clustered): create unlogged table test (c int) with (autovacuum_enabled = off); insert into test select c % 1000 from generate_series(1, ${NROWS}) c; create index on test (c); cluster test using test_c_idx; select pg_relation_size('test') / 8192 as relpages \gset delete from test where (ctid::text::point)[0] < ((:'relpages')::int * 0.3); # Case-5 (creating btree index on UUID column) create unlogged table test (c uuid) with (autovacuum_enabled = off); insert into test select uuidv4() from generate_series(1, ${NROWS}) c; create index on test (c); select pg_relation_size('test') / 8192 as relpages \gset delete from test where (ctid::text::point)[0] < ((:'relpages')::int * 0.3); Here are the results (NROWS = 50000000): HEAD PATCHED DIFF case-1: 3,021 ms 2,818 ms 93.29% case-2: 5,697 ms 5,545 ms 97.34% case-3: 2,833 ms 2,790 ms 98.48% case-4: 2,564 ms 2,279 ms 88.86% case-5: 4,657 ms 4,706 ms 101.04% I've measured 6 ~ 11% improvement in btree bulk-deletion. 
Here is a summary of each attached patch: 0001: Introduce TidStoreIsMemberMulti() where we can do IsMember check for multiple TIDs in one function call. If the given TIDs are sorted (at least in block number), we can save radix tree lookups for the same page entry. 0002: Convert IndexBulkDeleteCallback() to batched operation. 0003: Use batch TIDs lookup in btree index bulk-deletion. In patch 0003, we implement batch TID lookups both for each deduplicated index tuple and for all remaining regular index tuples, which seems to be the most straightforward approach. While further optimizations are possible, such as performing batch TID lookups for all index tuples on a single page, these could introduce additional overhead from sorting and re-sorting TIDs. Moreover, considering that radix tree lookups are relatively inexpensive, the benefits of sorting TIDs and using TidStoreIsMemberMulti() might be minimal. Nevertheless, these potential optimizations warrant further evaluation to determine their actual impact on performance. Also, the patch includes the batch TIDs lookup support only for btree indexes but we potentially can improve other index AMs too. Regards, -- Masahiko Sawada Amazon Web Services: https://aws.amazon.com
From ba4e7dc1f06f3dceab896b6ee4bf68ff23db1c09 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada <sawada.mshk@gmail.com> Date: Tue, 22 Apr 2025 12:25:11 -0700 Subject: [PATCH v1 3/3] Use batch TIDs lookup in btree index bulk-deletion. TIDs in the postlist are sorted. But TIDs of the gathered regular index tuples are not sorted. Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch-through: --- src/backend/access/nbtree/nbtree.c | 107 +++++++++++++++++++---------- 1 file changed, 71 insertions(+), 36 deletions(-) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 821e16d5691..c92efecc076 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -23,6 +23,7 @@ #include "access/stratnum.h" #include "commands/progress.h" #include "commands/vacuum.h" +#include "common/int.h" #include "nodes/execnodes.h" #include "pgstat.h" #include "storage/bulk_write.h" @@ -1309,6 +1310,13 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexFreeSpaceMapVacuum(rel); } +/* qsort comparator for sorting OffsetNumbers */ +static int +cmpOffsetNumbers(const void *a, const void *b) +{ + return pg_cmp_u16(*(const OffsetNumber *) a, *(const OffsetNumber *) b); +} + /* * btvacuumpage --- VACUUM one page * @@ -1472,6 +1480,10 @@ backtrack: nhtidslive = 0; if (callback) { + OffsetNumber workbuf_offs[MaxIndexTuplesPerPage]; + ItemPointerData workbuf_htids[MaxIndexTuplesPerPage]; + int workbuf_nitem = 0; + /* btbulkdelete callback tells us what to delete (or update) */ for (offnum = minoff; offnum <= maxoff; @@ -1485,14 +1497,13 @@ backtrack: Assert(!BTreeTupleIsPivot(itup)); if (!BTreeTupleIsPosting(itup)) { - /* Regular tuple, standard table TID representation */ - if (callback(&itup->t_tid, 1, NULL, callback_state) > 0) - { - deletable[ndeletable++] = offnum; - nhtidsdead++; - } - else - nhtidslive++; + /* + * Regular tuple, standard table TID representation. 
Will + * verify them as a whole later. + */ + workbuf_offs[workbuf_nitem] = offnum; + workbuf_htids[workbuf_nitem] = itup->t_tid; + workbuf_nitem++; } else { @@ -1539,6 +1550,38 @@ backtrack: nhtidslive += nremaining; } } + + if (workbuf_nitem > 0) + { + bool workbuf_deletable[MaxIndexTuplesPerPage]; + bool need_sort; + int ndels; + + /* + * We will sort the deletable array if there are existing + * offsets as it should be sorted in ascending order (see + * _bt_delitems_vacuum()). + */ + need_sort = (ndeletable > 0); + + ndels = callback(workbuf_htids, workbuf_nitem, workbuf_deletable, + callback_state); + if (ndels > 0) + { + for (int i = 0; i < workbuf_nitem; i++) + { + if (workbuf_deletable[i]) + deletable[ndeletable++] = workbuf_offs[i]; + } + + if (need_sort) + qsort(deletable, ndeletable, sizeof(OffsetNumber), cmpOffsetNumbers); + + nhtidsdead += ndels; + } + + nhtidslive += workbuf_nitem - ndels; + } } /* @@ -1663,43 +1706,35 @@ static BTVacuumPosting btreevacuumposting(BTVacState *vstate, IndexTuple posting, OffsetNumber updatedoffset, int *nremaining) { - int live = 0; + int ndeletable; int nitem = BTreeTupleGetNPosting(posting); ItemPointer items = BTreeTupleGetPosting(posting); + bool deletable[MaxIndexTuplesPerPage]; BTVacuumPosting vacposting = NULL; - for (int i = 0; i < nitem; i++) + ndeletable = vstate->callback(items, nitem, deletable, vstate->callback_state); + + /* All items are live */ + if (ndeletable == 0) { - if (vstate->callback(items + i, 1, NULL, vstate->callback_state) == 0) - { - /* Live table TID */ - live++; - } - else if (vacposting == NULL) - { - /* - * First dead table TID encountered. - * - * It's now clear that we need to delete one or more dead table - * TIDs, so start maintaining metadata describing how to update - * existing posting list tuple. 
- */ - vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + - nitem * sizeof(uint16)); + *nremaining = nitem; + return NULL; + } - vacposting->itup = posting; - vacposting->updatedoffset = updatedoffset; - vacposting->ndeletedtids = 0; - vacposting->deletetids[vacposting->ndeletedtids++] = i; - } - else - { - /* Second or subsequent dead table TID */ + vacposting = palloc(offsetof(BTVacuumPostingData, deletetids) + + ndeletable * sizeof(uint16)); + vacposting->itup = posting; + vacposting->updatedoffset = updatedoffset; + vacposting->ndeletedtids = 0; + + for (int i = 0; i < nitem; i++) + { + if (deletable[i]) vacposting->deletetids[vacposting->ndeletedtids++] = i; - } } + Assert(ndeletable == vacposting->ndeletedtids); - *nremaining = live; + *nremaining = nitem - ndeletable; return vacposting; } -- 2.43.5
From 236764cdfc211e04e9404ee8a38bd0d4fac9b823 Mon Sep 17 00:00:00 2001 From: Masahiko Sawada <sawada.mshk@gmail.com> Date: Mon, 21 Apr 2025 22:40:57 -0700 Subject: [PATCH v1 1/3] tidstore.c: introduce TidStoreIsMemberMulti. Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch-through: --- src/backend/access/common/tidstore.c | 63 +++++++++++++++++++++++----- src/include/access/tidstore.h | 2 + 2 files changed, 54 insertions(+), 11 deletions(-) diff --git a/src/backend/access/common/tidstore.c b/src/backend/access/common/tidstore.c index 5bd75fb499c..c3ee9db30cc 100644 --- a/src/backend/access/common/tidstore.c +++ b/src/backend/access/common/tidstore.c @@ -416,20 +416,11 @@ TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, local_ts_set(ts->tree.local, blkno, page); } -/* Return true if the given TID is present in the TidStore */ -bool -TidStoreIsMember(TidStore *ts, ItemPointer tid) +static pg_attribute_always_inline int +tidstore_is_member_page(BlocktableEntry *page, OffsetNumber off) { int wordnum; int bitnum; - BlocktableEntry *page; - BlockNumber blk = ItemPointerGetBlockNumber(tid); - OffsetNumber off = ItemPointerGetOffsetNumber(tid); - - if (TidStoreIsShared(ts)) - page = shared_ts_find(ts->tree.shared, blk); - else - page = local_ts_find(ts->tree.local, blk); /* no entry for the blk */ if (page == NULL) @@ -458,6 +449,56 @@ TidStoreIsMember(TidStore *ts, ItemPointer tid) } } +/* Return true if the given TID is present in the TidStore */ +bool +TidStoreIsMember(TidStore *ts, ItemPointer tid) +{ + BlocktableEntry *page; + BlockNumber blk = ItemPointerGetBlockNumber(tid); + OffsetNumber off = ItemPointerGetOffsetNumber(tid); + + if (TidStoreIsShared(ts)) + page = shared_ts_find(ts->tree.shared, blk); + else + page = local_ts_find(ts->tree.local, blk); + + return tidstore_is_member_page(page, off); +} + +/* + * Batched operation of TidStoreIsMember(). 
+ */ +int +TidStoreIsMemberMulti(TidStore *ts, ItemPointer tids, int ntids, bool *ismembers) +{ + BlocktableEntry *page = NULL; + BlockNumber last_blk = InvalidBlockNumber; + int nmembers = 0; + + for (int i = 0; i < ntids; i++) + { + ItemPointer tid = &(tids[i]); + BlockNumber blk = ItemPointerGetBlockNumber(tid); + OffsetNumber off = ItemPointerGetOffsetNumber(tid); + + if (blk != last_blk) + { + if (TidStoreIsShared(ts)) + page = shared_ts_find(ts->tree.shared, blk); + else + page = local_ts_find(ts->tree.local, blk); + } + + ismembers[i] = tidstore_is_member_page(page, off); + if (ismembers[i]) + nmembers++; + + last_blk = blk; + } + + return nmembers; +} + /* * Prepare to iterate through a TidStore. * diff --git a/src/include/access/tidstore.h b/src/include/access/tidstore.h index 041091df278..c3545d3d0fd 100644 --- a/src/include/access/tidstore.h +++ b/src/include/access/tidstore.h @@ -41,6 +41,8 @@ extern void TidStoreDestroy(TidStore *ts); extern void TidStoreSetBlockOffsets(TidStore *ts, BlockNumber blkno, OffsetNumber *offsets, int num_offsets); extern bool TidStoreIsMember(TidStore *ts, ItemPointer tid); +extern int TidStoreIsMemberMulti(TidStore *ts, ItemPointer tids, int ntids, + bool *ismembers); extern TidStoreIter *TidStoreBeginIterate(TidStore *ts); extern TidStoreIterResult *TidStoreIterateNext(TidStoreIter *iter); extern int TidStoreGetBlockOffsets(TidStoreIterResult *result, -- 2.43.5
From e693adff50f09fe791c87a444950b193b642aabb Mon Sep 17 00:00:00 2001 From: Masahiko Sawada <sawada.mshk@gmail.com> Date: Mon, 21 Apr 2025 22:41:33 -0700 Subject: [PATCH v1 2/3] Convert IndexBulkDeleteCallback to process TIDs in batch. Author: Reviewed-by: Discussion: https://postgr.es/m/ Backpatch-through: --- contrib/bloom/blvacuum.c | 4 +++- src/backend/access/gin/ginvacuum.c | 3 ++- src/backend/access/gist/gistvacuum.c | 3 ++- src/backend/access/hash/hash.c | 3 ++- src/backend/access/nbtree/nbtree.c | 4 ++-- src/backend/access/spgist/spgvacuum.c | 13 ++++++++----- src/backend/catalog/index.c | 21 ++++++++++++++------- src/backend/commands/vacuum.c | 8 ++++---- src/include/access/genam.h | 3 ++- 9 files changed, 39 insertions(+), 23 deletions(-) diff --git a/contrib/bloom/blvacuum.c b/contrib/bloom/blvacuum.c index 86b15a75f6f..93089456855 100644 --- a/contrib/bloom/blvacuum.c +++ b/contrib/bloom/blvacuum.c @@ -83,8 +83,10 @@ blbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, OffsetNumberNext(BloomPageGetMaxOffset(page))); while (itup < itupEnd) { + bool dead; + /* Do we have to delete this tuple? 
*/ - if (callback(&itup->heapPtr, callback_state)) + if (callback(&itup->heapPtr, 1, &dead, callback_state) > 0) { /* Yes; adjust count of tuples that will be left on page */ BloomPageGetOpaque(page)->maxoff--; diff --git a/src/backend/access/gin/ginvacuum.c b/src/backend/access/gin/ginvacuum.c index fbbe3a6dd70..336f100261b 100644 --- a/src/backend/access/gin/ginvacuum.c +++ b/src/backend/access/gin/ginvacuum.c @@ -56,7 +56,8 @@ ginVacuumItemPointers(GinVacuumState *gvs, ItemPointerData *items, */ for (i = 0; i < nitem; i++) { - if (gvs->callback(items + i, gvs->callback_state)) + bool deletable; + if (gvs->callback(items + i, 1, &deletable, gvs->callback_state) > 0) { gvs->result->tuples_removed += 1; if (!tmpitems) diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c index 6a359c98c60..0abefb7b664 100644 --- a/src/backend/access/gist/gistvacuum.c +++ b/src/backend/access/gist/gistvacuum.c @@ -383,8 +383,9 @@ restart: { ItemId iid = PageGetItemId(page, off); IndexTuple idxtuple = (IndexTuple) PageGetItem(page, iid); + bool deletable; - if (callback(&(idxtuple->t_tid), callback_state)) + if (callback(&(idxtuple->t_tid), 1, &deletable, callback_state) > 0) todelete[ntodelete++] = off; } } diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index 53061c819fb..e0fffadf698 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -734,6 +734,7 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, IndexTuple itup; Bucket bucket; bool kill_tuple = false; + bool dead; itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offno)); @@ -743,7 +744,7 @@ hashbucketcleanup(Relation rel, Bucket cur_bucket, Buffer bucket_buf, * To remove the dead tuples, we strictly want to rely on results * of callback function. refer btvacuumpage for detailed reason. 
*/ - if (callback && callback(htup, callback_state)) + if (callback && callback(htup, 1, &dead, callback_state) > 0) { kill_tuple = true; if (tuples_removed) diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index accc7fe8bbe..821e16d5691 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1486,7 +1486,7 @@ backtrack: if (!BTreeTupleIsPosting(itup)) { /* Regular tuple, standard table TID representation */ - if (callback(&itup->t_tid, callback_state)) + if (callback(&itup->t_tid, 1, NULL, callback_state) > 0) { deletable[ndeletable++] = offnum; nhtidsdead++; @@ -1670,7 +1670,7 @@ btreevacuumposting(BTVacState *vstate, IndexTuple posting, for (int i = 0; i < nitem; i++) { - if (!vstate->callback(items + i, vstate->callback_state)) + if (vstate->callback(items + i, 1, NULL, vstate->callback_state) == 0) { /* Live table TID */ live++; diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index 81171f35451..a9a61fa152e 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -153,9 +153,11 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { + bool dead; + Assert(ItemPointerIsValid(<->heapPtr)); - if (bds->callback(<->heapPtr, bds->callback_state)) + if (bds->callback(<->heapPtr, 1, &dead, bds->callback_state) > 0) { bds->stats->tuples_removed += 1; deletable[i] = true; @@ -425,9 +427,10 @@ vacuumLeafRoot(spgBulkDeleteState *bds, Relation index, Buffer buffer) PageGetItemId(page, i)); if (lt->tupstate == SPGIST_LIVE) { + bool dead; Assert(ItemPointerIsValid(<->heapPtr)); - if (bds->callback(<->heapPtr, bds->callback_state)) + if (bds->callback(<->heapPtr, 1, &dead, bds->callback_state) > 0) { bds->stats->tuples_removed += 1; toDelete[xlrec.nDelete] = i; @@ -965,10 +968,10 @@ spgbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult 
*stats, } /* Dummy callback to delete no tuples during spgvacuumcleanup */ -static bool -dummy_callback(ItemPointer itemptr, void *state) +static int +dummy_callback(ItemPointer itemptrs, int nitem, bool *deletable, void *state) { - return false; + return 0; } /* diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 739a92bdcc1..fc40317728f 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -126,7 +126,8 @@ static void index_update_stats(Relation rel, static void IndexCheckExclusion(Relation heapRelation, Relation indexRelation, IndexInfo *indexInfo); -static bool validate_index_callback(ItemPointer itemptr, void *opaque); +static int validate_index_callback(ItemPointer itemptrs, int nitem, bool *deletable, + void *opaque); static bool ReindexIsCurrentlyProcessingIndex(Oid indexOid); static void SetReindexProcessing(Oid heapOid, Oid indexOid); static void ResetReindexProcessing(void); @@ -3479,15 +3480,21 @@ validate_index(Oid heapId, Oid indexId, Snapshot snapshot) /* * validate_index_callback - bulkdelete callback to collect the index TIDs */ -static bool -validate_index_callback(ItemPointer itemptr, void *opaque) +static int +validate_index_callback(ItemPointer itemptrs, int nitem, bool *deletable, + void *opaque) { ValidateIndexState *state = (ValidateIndexState *) opaque; - int64 encoded = itemptr_encode(itemptr); - tuplesort_putdatum(state->tuplesort, Int64GetDatum(encoded), false); - state->itups += 1; - return false; /* never actually delete anything */ + for (int i = 0; i < nitem; i++) + { + int64 encoded = itemptr_encode(&(itemptrs[i])); + + tuplesort_putdatum(state->tuplesort, Int64GetDatum(encoded), false); + } + state->itups += nitem; + + return 0; /* never actually delete anything */ } /* diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 33a33bf6b1c..cc8d458ea89 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -127,7 +127,7 @@ static bool 
vacuum_rel(Oid relid, RangeVar *relation, VacuumParams *params, BufferAccessStrategy bstrategy); static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem *def); -static bool vac_tid_reaped(ItemPointer itemptr, void *state); +static int vac_tid_reaped(ItemPointer itemptrs, int nitem, bool *deletable, void *state); /* * GUC check function to ensure GUC value specified is within the allowable @@ -2654,10 +2654,10 @@ vac_cleanup_one_index(IndexVacuumInfo *ivinfo, IndexBulkDeleteResult *istat) * * This has the right signature to be an IndexBulkDeleteCallback. */ -static bool -vac_tid_reaped(ItemPointer itemptr, void *state) +static int +vac_tid_reaped(ItemPointer itemptrs, int nitem, bool *deletable, void *state) { TidStore *dead_items = (TidStore *) state; - return TidStoreIsMember(dead_items, itemptr); + return TidStoreIsMemberMulti(dead_items, itemptrs, nitem, deletable); } diff --git a/src/include/access/genam.h b/src/include/access/genam.h index 5b2ab181b5f..6f559a1209d 100644 --- a/src/include/access/genam.h +++ b/src/include/access/genam.h @@ -107,7 +107,8 @@ typedef struct IndexBulkDeleteResult } IndexBulkDeleteResult; /* Typedef for callback function to determine if a tuple is bulk-deletable */ -typedef bool (*IndexBulkDeleteCallback) (ItemPointer itemptr, void *state); +typedef int (*IndexBulkDeleteCallback) (ItemPointer itemptr, int nitem, + bool *deletable, void *state); /* struct definitions appear in relscan.h */ typedef struct IndexScanDescData *IndexScanDesc; -- 2.43.5