On Mon, Sep 12, 2022 at 2:01 PM Peter Geoghegan <p...@bowt.ie> wrote:
> I'd like to talk about one such technique on this thread. The attached
> WIP patch reduces the size of xl_heap_freeze_page records by applying
> a simple deduplication process.
Attached is v2, which I'm just posting to keep CFTester happy. No real
changes here.

--
Peter Geoghegan
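To make the idea easier to review without reading the whole patch: most
tuples frozen on any given page end up with exactly the same freeze plan
(same xmax, infomask bits, and frzflags), so the record format now stores
each distinct plan once, followed by a flat array of page offset numbers
grouped by plan. Rough arithmetic, assuming typical alignment padding
(sizeof(xl_heap_freeze_tuple) == 12, sizeof(xl_heap_freeze_plan) == 12,
sizeof(OffsetNumber) == 2): a page with 100 frozen tuples that all share
one plan needs about 6 + 100 * 12 = 1206 bytes of record payload today,
versus about 6 + 12 + 100 * 2 = 218 bytes with the patch (ignoring the
generic WAL record header overhead in both cases).

Here is a standalone sketch of the deduplication step, simplified from the
patch's heap_xlog_freeze_plan(). FreezeTuple and FreezePlan are stand-ins
for HeapFreezeTuple and xl_heap_freeze_plan, not the real server types:

/*
 * Standalone sketch of sort-then-group freeze plan deduplication.
 * Compile with any C99 compiler; stand-in types only.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct FreezeTuple
{
    uint32_t    xmax;
    uint16_t    t_infomask2;
    uint16_t    t_infomask;
    uint8_t     frzflags;
    uint16_t    offset;         /* page offset number, sort tiebreaker */
} FreezeTuple;

typedef struct FreezePlan
{
    uint32_t    xmax;
    uint16_t    ntuples;
    uint16_t    t_infomask2;
    uint16_t    t_infomask;
    uint8_t     frzflags;
} FreezePlan;

/* Sort so that tuples with equal plans are adjacent, offsets ascending */
static int
freeze_cmp(const void *a1, const void *a2)
{
    const FreezeTuple *t1 = a1;
    const FreezeTuple *t2 = a2;

#define CMP(field) \
    do { \
        if (t1->field < t2->field) return -1; \
        if (t1->field > t2->field) return 1; \
    } while (0)
    CMP(t_infomask2);
    CMP(t_infomask);
    CMP(xmax);
    CMP(frzflags);
    CMP(offset);
#undef CMP
    return 0;
}

static int
plan_eq(const FreezePlan *plan, const FreezeTuple *frz)
{
    return plan->xmax == frz->xmax &&
        plan->t_infomask2 == frz->t_infomask2 &&
        plan->t_infomask == frz->t_infomask &&
        plan->frzflags == frz->frzflags;
}

/* Returns number of distinct plans; offsets_out stays grouped by plan */
static int
dedup_plans(FreezeTuple *tuples, int ntuples,
            FreezePlan *plans_out, uint16_t *offsets_out)
{
    int         nplans = 0;

    qsort(tuples, ntuples, sizeof(FreezeTuple), freeze_cmp);

    for (int i = 0; i < ntuples; i++)
    {
        if (nplans == 0 || !plan_eq(&plans_out[nplans - 1], &tuples[i]))
        {
            /* open a new plan for this tuple's freeze requirements */
            FreezePlan *p = &plans_out[nplans++];

            p->xmax = tuples[i].xmax;
            p->t_infomask2 = tuples[i].t_infomask2;
            p->t_infomask = tuples[i].t_infomask;
            p->frzflags = tuples[i].frzflags;
            p->ntuples = 1;
        }
        else
            plans_out[nplans - 1].ntuples++;

        offsets_out[i] = tuples[i].offset;
    }
    return nplans;
}

int
main(void)
{
    /* three tuples, two of which freeze identically */
    FreezeTuple tuples[] = {
        {.xmax = 0, .t_infomask = 0x0900, .offset = 2},
        {.xmax = 7, .t_infomask = 0x1000, .offset = 3},
        {.xmax = 0, .t_infomask = 0x0900, .offset = 1},
    };
    FreezePlan  plans[3];
    uint16_t    offsets[3];
    int         nplans = dedup_plans(tuples, 3, plans, offsets);

    printf("%d plans cover 3 tuples\n", nplans);    /* prints 2 */
    return 0;
}

The sort makes equal plans adjacent, so a single linear pass suffices, and
nplans can never exceed ntuples -- which is why fixed-size arrays with
MaxHeapTuplesPerPage entries are enough on the caller's side.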
From c555f716a79c7de1f26a5e1e87265f2702b980ee Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Sat, 30 Jul 2022 10:47:59 -0700
Subject: [PATCH v2] Shrink freeze WAL records via deduplication.

---
 src/include/access/heapam.h            |  25 ++
 src/include/access/heapam_xlog.h       |  34 +--
 src/backend/access/heap/heapam.c       | 310 ++++++++++++++++++++-----
 src/backend/access/heap/vacuumlazy.c   |  39 +---
 src/backend/access/rmgrdesc/heapdesc.c |   4 +-
 5 files changed, 289 insertions(+), 123 deletions(-)

diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index 9dab35551..579ef32b0 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -99,6 +99,19 @@ typedef enum
 	HEAPTUPLE_DELETE_IN_PROGRESS	/* deleting xact is still in progress */
 } HTSV_Result;
 
+/* heap_prepare_freeze_tuple state describing how to freeze a tuple */
+typedef struct HeapFreezeTuple
+{
+	/* Fields describing how to process tuple */
+	uint16		t_infomask2;
+	uint16		t_infomask;
+	TransactionId xmax;
+	uint8		frzflags;
+
+	/* Page offset number for tuple */
+	OffsetNumber offset;
+} HeapFreezeTuple;
+
 /* ----------------
  *		function prototypes for heap access method
  *
@@ -164,6 +177,18 @@ extern TM_Result heap_lock_tuple(Relation relation, HeapTuple tuple,
 								 Buffer *buffer, struct TM_FailureData *tmfd);
 
 extern void heap_inplace_update(Relation relation, HeapTuple tuple);
+extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
+									  TransactionId relfrozenxid,
+									  TransactionId relminmxid,
+									  TransactionId cutoff_xid,
+									  TransactionId cutoff_multi,
+									  HeapFreezeTuple *frz,
+									  bool *totally_frozen,
+									  TransactionId *relfrozenxid_out,
+									  MultiXactId *relminmxid_out);
+extern void heap_freeze_prepared_tuples(Relation rel, Buffer buffer,
+										TransactionId latestRemovedXid,
+										HeapFreezeTuple *tuples, int ntuples);
 extern bool heap_freeze_tuple(HeapTupleHeader tuple,
 							  TransactionId relfrozenxid, TransactionId relminmxid,
 							  TransactionId cutoff_xid, TransactionId cutoff_multi);
diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h
index 34220d93c..4e3a023eb 100644
--- a/src/include/access/heapam_xlog.h
+++ b/src/include/access/heapam_xlog.h
@@ -315,34 +315,36 @@ typedef struct xl_heap_inplace
 
 /*
  * This struct represents a 'freeze plan', which is what we need to know about
- * a single tuple being frozen during vacuum.
+ * a group of tuples being frozen on the same page.
  */
 /* 0x01 was XLH_FREEZE_XMIN */
 #define XLH_FREEZE_XVAC		0x02
 #define XLH_INVALID_XVAC	0x04
 
-typedef struct xl_heap_freeze_tuple
+typedef struct xl_heap_freeze_plan
 {
 	TransactionId xmax;
-	OffsetNumber offset;
+	uint16		ntuples;
 	uint16		t_infomask2;
 	uint16		t_infomask;
 	uint8		frzflags;
-} xl_heap_freeze_tuple;
+} xl_heap_freeze_plan;
 
 /*
  * This is what we need to know about a block being frozen during vacuum
  *
- * Backup block 0's data contains an array of xl_heap_freeze_tuple structs,
- * one for each tuple.
+ * Backup block 0's data contains an array of xl_heap_freeze_plan structs.
  */
 typedef struct xl_heap_freeze_page
 {
-	TransactionId cutoff_xid;
-	uint16		ntuples;
+	TransactionId latestRemovedXid;
+	uint16		nplans;
+
+	/* FREEZE PLANS FOLLOW */
+	/* OFFSET NUMBERS FOLLOW */
 } xl_heap_freeze_page;
 
-#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, ntuples) + sizeof(uint16))
+#define SizeOfHeapFreezePage (offsetof(xl_heap_freeze_page, nplans) + sizeof(uint16))
 
 /*
  * This is what we need to know about setting a visibility map bit
@@ -401,20 +403,6 @@ extern void heap2_desc(StringInfo buf, XLogReaderState *record);
 extern const char *heap2_identify(uint8 info);
 extern void heap_xlog_logical_rewrite(XLogReaderState *r);
 
-extern XLogRecPtr log_heap_freeze(Relation reln, Buffer buffer,
-								  TransactionId cutoff_xid, xl_heap_freeze_tuple *tuples,
-								  int ntuples);
-extern bool heap_prepare_freeze_tuple(HeapTupleHeader tuple,
-									  TransactionId relfrozenxid,
-									  TransactionId relminmxid,
-									  TransactionId cutoff_xid,
-									  TransactionId cutoff_multi,
-									  xl_heap_freeze_tuple *frz,
-									  bool *totally_frozen,
-									  TransactionId *relfrozenxid_out,
-									  MultiXactId *relminmxid_out);
-extern void heap_execute_freeze_tuple(HeapTupleHeader tuple,
-									  xl_heap_freeze_tuple *frz);
 extern XLogRecPtr log_heap_visible(RelFileLocator rlocator, Buffer heap_buffer,
 								   Buffer vm_buffer, TransactionId cutoff_xid,
 								   uint8 vmflags);
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index eb811d751..df2105e3a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -110,6 +110,9 @@ static int	bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate);
 static XLogRecPtr log_heap_new_cid(Relation relation, HeapTuple tup);
 static HeapTuple ExtractReplicaIdentity(Relation relation, HeapTuple tp,
 										bool key_required, bool *copy);
+static int	heap_xlog_freeze_plan(HeapFreezeTuple *tuples, int ntuples,
+								  xl_heap_freeze_plan *plans_out,
+								  OffsetNumber *offsets_out);
 
 
 /*
@@ -6431,7 +6434,9 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
  * will be totally frozen after these operations are performed and false if
  * more freezing will eventually be required.
  *
- * Caller must set frz->offset itself, before heap_execute_freeze_tuple call.
+ * The VACUUM caller must assemble a HeapFreezeTuple entry for each tuple for
+ * which we return true.  A later heap_freeze_prepared_tuples call will
+ * execute freezing for the caller's page as a whole.
  *
  * It is assumed that the caller has checked the tuple with
  * HeapTupleSatisfiesVacuum() and determined that it is not HEAPTUPLE_DEAD
@@ -6455,15 +6460,12 @@ FreezeMultiXactId(MultiXactId multi, uint16 t_infomask,
  * It will be set as tuple's new xmax when our *frz output is processed within
  * heap_execute_freeze_tuple later on.  If the tuple is in a shared buffer
  * then caller had better have an exclusive lock on it already.
- *
- * NB: It is not enough to set hint bits to indicate an XID committed/aborted.
- * The *frz WAL record we output completely removes all old XIDs during REDO.
  */
 bool
 heap_prepare_freeze_tuple(HeapTupleHeader tuple,
 						  TransactionId relfrozenxid, TransactionId relminmxid,
 						  TransactionId cutoff_xid, TransactionId cutoff_multi,
-						  xl_heap_freeze_tuple *frz, bool *totally_frozen,
+						  HeapFreezeTuple *frz, bool *totally_frozen,
 						  TransactionId *relfrozenxid_out,
 						  MultiXactId *relminmxid_out)
 {
@@ -6754,10 +6756,12 @@ heap_prepare_freeze_tuple(HeapTupleHeader tuple,
  * tuple status.  Also, getting exclusive lock makes it safe to adjust the
  * infomask bits.
  *
- * NB: All code in here must be safe to execute during crash recovery!
+ * NB: All code in here must be safe to execute during crash recovery!  It is
+ * not enough to set hint bits to indicate an XID committed/aborted, either.
+ * Caller's REDO routine must reliably remove all old XIDs.
  */
-void
-heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
+static void
+heap_execute_freeze_tuple(HeapTupleHeader tuple, HeapFreezeTuple *frz)
 {
 	HeapTupleHeaderSetXmax(tuple, frz->xmax);
 
@@ -6771,6 +6775,82 @@ heap_execute_freeze_tuple(HeapTupleHeader tuple, xl_heap_freeze_tuple *frz)
 	tuple->t_infomask2 = frz->t_infomask2;
 }
 
+/*
+ * heap_freeze_prepared_tuples
+ *		Execute freezing of prepared tuple plans.
+ *
+ * If we need to freeze any tuples we'll mark the buffer dirty, and write a
+ * WAL record recording the changes.  We must log the changes to be crash-safe
+ * against future truncation of CLOG.
+ *
+ * Caller must have set the 'offset' field of each tuple plan beforehand.
+ *
+ * NB: We sort the tuples array.  Caller should be finished with it by now.
+ */
+void
+heap_freeze_prepared_tuples(Relation rel, Buffer buffer,
+							TransactionId latestRemovedXid,
+							HeapFreezeTuple *tuples, int ntuples)
+{
+	Page		page = BufferGetPage(buffer);
+
+	/* Caller should not call me when there are no tuples to freeze */
+	Assert(ntuples > 0);
+	Assert(TransactionIdIsValid(latestRemovedXid));
+
+	START_CRIT_SECTION();
+
+	MarkBufferDirty(buffer);
+
+	/* execute collected freezes */
+	for (int i = 0; i < ntuples; i++)
+	{
+		HeapTupleHeader htup;
+		ItemId		itemid = PageGetItemId(page, tuples[i].offset);
+
+		htup = (HeapTupleHeader) PageGetItem(page, itemid);
+		heap_execute_freeze_tuple(htup, &tuples[i]);
+	}
+
+	/* Now WAL-log freezing if necessary */
+	if (RelationNeedsWAL(rel))
+	{
+		xl_heap_freeze_plan plans[MaxHeapTuplesPerPage];
+		OffsetNumber offsets[MaxHeapTuplesPerPage];
+		int			nplans;
+		xl_heap_freeze_page xlrec;
+		XLogRecPtr	recptr;
+
+		/* Prepare deduplicated representation for use in WAL record */
+		nplans = heap_xlog_freeze_plan(tuples, ntuples, plans, offsets);
+
+		Assert(nplans > 0 && nplans <= ntuples);
+
+		xlrec.latestRemovedXid = latestRemovedXid;
+		xlrec.nplans = nplans;
+
+		XLogBeginInsert();
+		XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
+
+		/*
+		 * The freeze plan array and offset array are not actually in the
+		 * buffer, but pretend that they are.  When XLogInsert stores the
+		 * whole buffer, the arrays need not be stored too.
+		 */
+		XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+		XLogRegisterBufData(0, (char *) plans,
+							nplans * sizeof(xl_heap_freeze_plan));
+		XLogRegisterBufData(0, (char *) offsets,
+							ntuples * sizeof(OffsetNumber));
+
+		recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
+
+		PageSetLSN(page, recptr);
+	}
+
+	END_CRIT_SECTION();
+}
+
 /*
  * heap_freeze_tuple
  *		Freeze tuple in place, without WAL logging.
@@ -6782,7 +6862,7 @@ heap_freeze_tuple(HeapTupleHeader tuple,
 				  TransactionId relfrozenxid, TransactionId relminmxid,
 				  TransactionId cutoff_xid, TransactionId cutoff_multi)
 {
-	xl_heap_freeze_tuple frz;
+	HeapFreezeTuple frz;
 	bool		do_freeze;
 	bool		tuple_totally_frozen;
 	TransactionId relfrozenxid_out = cutoff_xid;
@@ -8143,42 +8223,6 @@ bottomup_sort_and_shrink(TM_IndexDeleteOp *delstate)
 	return nblocksfavorable;
 }
 
-/*
- * Perform XLogInsert for a heap-freeze operation.  Caller must have already
- * modified the buffer and marked it dirty.
- */
-XLogRecPtr
-log_heap_freeze(Relation reln, Buffer buffer, TransactionId cutoff_xid,
-				xl_heap_freeze_tuple *tuples, int ntuples)
-{
-	xl_heap_freeze_page xlrec;
-	XLogRecPtr	recptr;
-
-	/* Caller should not call me on a non-WAL-logged relation */
-	Assert(RelationNeedsWAL(reln));
-	/* nor when there are no tuples to freeze */
-	Assert(ntuples > 0);
-
-	xlrec.cutoff_xid = cutoff_xid;
-	xlrec.ntuples = ntuples;
-
-	XLogBeginInsert();
-	XLogRegisterData((char *) &xlrec, SizeOfHeapFreezePage);
-
-	/*
-	 * The freeze plan array is not actually in the buffer, but pretend that
-	 * it is.  When XLogInsert stores the whole buffer, the freeze plan need
-	 * not be stored too.
-	 */
-	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
-	XLogRegisterBufData(0, (char *) tuples,
-						ntuples * sizeof(xl_heap_freeze_tuple));
-
-	recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_FREEZE_PAGE);
-
-	return recptr;
-}
-
 /*
  * Perform XLogInsert for a heap-visible operation.  'block' is the block
  * being marked all-visible, and vm_buffer is the buffer containing the
@@ -8915,6 +8959,133 @@ heap_xlog_visible(XLogReaderState *record)
 		UnlockReleaseBuffer(vmbuffer);
 }
 
+/*
+ * qsort comparator used to deduplicate tuple-based freeze plans for WAL
+ */
+static int
+heap_xlog_freeze_cmp(const void *arg1, const void *arg2)
+{
+	HeapFreezeTuple *frz1 = (HeapFreezeTuple *) arg1;
+	HeapFreezeTuple *frz2 = (HeapFreezeTuple *) arg2;
+
+	/* Compare fields that describe actions required to freeze this tuple */
+	if (frz1->t_infomask2 < frz2->t_infomask2)
+		return -1;
+	else if (frz1->t_infomask2 > frz2->t_infomask2)
+		return 1;
+
+	if (frz1->t_infomask < frz2->t_infomask)
+		return -1;
+	else if (frz1->t_infomask > frz2->t_infomask)
+		return 1;
+
+	if (frz1->xmax < frz2->xmax)
+		return -1;
+	else if (frz1->xmax > frz2->xmax)
+		return 1;
+
+	if (frz1->frzflags < frz2->frzflags)
+		return -1;
+	else if (frz1->frzflags > frz2->frzflags)
+		return 1;
+
+	/* Tiebreak on an individual tuple's page offset number */
+	if (frz1->offset < frz2->offset)
+		return -1;
+	else if (frz1->offset > frz2->offset)
+		return 1;
+
+	Assert(false);
+	return 0;
+}
+
+/*
+ * Compare fields that describe actions required to freeze a tuple with the
+ * caller's open plan.  If everything matches then the frz tuple plan is
+ * equivalent to caller's plan.
+ */
+static bool
+heap_xlog_freeze_eq(xl_heap_freeze_plan *plan, HeapFreezeTuple *frz)
+{
+	if (plan->t_infomask2 == frz->t_infomask2 &&
+		plan->t_infomask == frz->t_infomask &&
+		plan->xmax == frz->xmax && plan->frzflags == frz->frzflags)
+		return true;
+
+	/* Caller must call heap_xlog_new_freeze_plan again for frz */
+	return false;
+}
+
+/*
+ * Start a new plan, initialized from one tuple's freeze requirements.  At
+ * least one tuple's freezing will be described by the plan during REDO.
+ */
+static void
+heap_xlog_new_freeze_plan(xl_heap_freeze_plan *plan, HeapFreezeTuple *frz)
+{
+	plan->xmax = frz->xmax;
+	plan->t_infomask = frz->t_infomask;
+	plan->t_infomask2 = frz->t_infomask2;
+	plan->frzflags = frz->frzflags;
+	plan->ntuples = 1;
+}
+
+/*
+ * Deduplicate tuple-based freeze plans so that each distinct set of
+ * processing steps is only stored once in the final WAL record.
+ *
+ * Returns the number of plans set in *plans_out.  *offsets_out receives each
+ * tuple's page offset number, grouped by freeze plan, for use in the record.
+ */
+static int
+heap_xlog_freeze_plan(HeapFreezeTuple *tuples, int ntuples,
+					  xl_heap_freeze_plan *plans_out,
+					  OffsetNumber *offsets_out)
+{
+	int			nplans = 0;
+
+	/* Sort tuple-based freeze plans in the order required to deduplicate */
+	qsort(tuples, ntuples, sizeof(HeapFreezeTuple), heap_xlog_freeze_cmp);
+
+	for (int i = 0; i < ntuples; i++)
+	{
+		HeapFreezeTuple *frz = tuples + i;
+
+		if (i == 0)
+		{
+			/* New canonical freeze plan starting with first tup */
+			heap_xlog_new_freeze_plan(plans_out, frz);
+			nplans++;
+		}
+		else if (heap_xlog_freeze_eq(plans_out, frz))
+		{
+			/* tup matches open canonical plan -- include tup in it */
+			Assert(offsets_out[i - 1] < frz->offset);
+			plans_out->ntuples++;
+		}
+		else
+		{
+			/* Tup doesn't match current plan -- done with it now */
+			plans_out++;
+
+			/* New canonical freeze plan starting with this tup */
+			heap_xlog_new_freeze_plan(plans_out, frz);
+			nplans++;
+		}
+
+		/*
+		 * Save page offset number in dedicated buffer in passing.
+		 *
+		 * REDO routine relies on the record's offset numbers array grouping
+		 * offset numbers by freeze plan.  The sort order within each grouping
+		 * is ascending offset number order, just to keep things tidy.
+		 */
+		offsets_out[i] = frz->offset;
+	}
+
+	return nplans;
+}
+
 /*
  * Replay XLOG_HEAP2_FREEZE_PAGE records
  */
@@ -8923,20 +9094,17 @@ heap_xlog_freeze_page(XLogReaderState *record)
 {
 	XLogRecPtr	lsn = record->EndRecPtr;
 	xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) XLogRecGetData(record);
-	TransactionId cutoff_xid = xlrec->cutoff_xid;
+	TransactionId latestRemovedXid = xlrec->latestRemovedXid;
 	Buffer		buffer;
-	int			ntup;
 
 	/*
 	 * In Hot Standby mode, ensure that there's no queries running which still
 	 * consider the frozen xids as running.
 	 */
+	Assert(TransactionIdIsValid(latestRemovedXid));
 	if (InHotStandby)
 	{
 		RelFileLocator rlocator;
-		TransactionId latestRemovedXid = cutoff_xid;
-
-		TransactionIdRetreat(latestRemovedXid);
 
 		XLogRecGetBlockTag(record, 0, &rlocator, NULL, NULL);
 		ResolveRecoveryConflictWithSnapshot(latestRemovedXid, rlocator);
@@ -8945,22 +9113,40 @@ heap_xlog_freeze_page(XLogReaderState *record)
 	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
 	{
 		Page		page = BufferGetPage(buffer);
-		xl_heap_freeze_tuple *tuples;
+		xl_heap_freeze_plan *plans;
+		OffsetNumber *offsets;
+		int			curoff = 0;
 
-		tuples = (xl_heap_freeze_tuple *) XLogRecGetBlockData(record, 0, NULL);
-
-		/* now execute freeze plan for each frozen tuple */
-		for (ntup = 0; ntup < xlrec->ntuples; ntup++)
+		/*
+		 * Convert plan/tuple grouping representation from WAL record into
+		 * per-tuple representation for heap_execute_freeze_tuple call
		 */
+		plans = (xl_heap_freeze_plan *) XLogRecGetBlockData(record, 0, NULL);
+		offsets = (OffsetNumber *) ((char *) plans +
+									(xlrec->nplans *
+									 sizeof(xl_heap_freeze_plan)));
+		for (int plan = 0; plan < xlrec->nplans; plan++)
 		{
-			xl_heap_freeze_tuple *xlrec_tp;
-			ItemId		lp;
-			HeapTupleHeader tuple;
+			xl_heap_freeze_plan *tuple_grouping = &plans[plan];
+			HeapFreezeTuple frz;
 
-			xlrec_tp = &tuples[ntup];
-			lp = PageGetItemId(page, xlrec_tp->offset); /* offsets are one-based */
-			tuple = (HeapTupleHeader) PageGetItem(page, lp);
+			frz.t_infomask2 = tuple_grouping->t_infomask2;
+			frz.t_infomask = tuple_grouping->t_infomask;
+			frz.xmax = tuple_grouping->xmax;
+			frz.frzflags = tuple_grouping->frzflags;
+			frz.offset = InvalidOffsetNumber;
 
-			heap_execute_freeze_tuple(tuple, xlrec_tp);
+			/* Iterate through tuple grouping in offset number order */
+			for (int i = 0; i < tuple_grouping->ntuples; i++)
+			{
+				ItemId		lp;
+				HeapTupleHeader tuple;
+				OffsetNumber offset = offsets[curoff++];
+
+				lp = PageGetItemId(page, offset);
+				tuple = (HeapTupleHeader) PageGetItem(page, lp);
+				heap_execute_freeze_tuple(tuple, &frz);
+			}
 		}
 
 		PageSetLSN(page, lsn);
diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index dfbe37472..0623923c4 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1566,7 +1566,7 @@ lazy_scan_prune(LVRelState *vacrel,
 	TransactionId NewRelfrozenXid;
 	MultiXactId NewRelminMxid;
 	OffsetNumber deadoffsets[MaxHeapTuplesPerPage];
-	xl_heap_freeze_tuple frozen[MaxHeapTuplesPerPage];
+	HeapFreezeTuple frozen[MaxHeapTuplesPerPage];
 
 	Assert(BufferGetBlockNumber(buf) == blkno);
 
@@ -1824,41 +1824,8 @@ retry:
 		Assert(prunestate->hastup);
 
 		vacrel->frozen_pages++;
-
-		/*
-		 * At least one tuple with storage needs to be frozen -- execute that
-		 * now.
-		 *
-		 * If we need to freeze any tuples we'll mark the buffer dirty, and
-		 * write a WAL record recording the changes.  We must log the changes
-		 * to be crash-safe against future truncation of CLOG.
-		 */
-		START_CRIT_SECTION();
-
-		MarkBufferDirty(buf);
-
-		/* execute collected freezes */
-		for (int i = 0; i < tuples_frozen; i++)
-		{
-			HeapTupleHeader htup;
-
-			itemid = PageGetItemId(page, frozen[i].offset);
-			htup = (HeapTupleHeader) PageGetItem(page, itemid);
-
-			heap_execute_freeze_tuple(htup, &frozen[i]);
-		}
-
-		/* Now WAL-log freezing if necessary */
-		if (RelationNeedsWAL(vacrel->rel))
-		{
-			XLogRecPtr	recptr;
-
-			recptr = log_heap_freeze(vacrel->rel, buf, vacrel->FreezeLimit,
-									 frozen, tuples_frozen);
-			PageSetLSN(page, recptr);
-		}
-
-		END_CRIT_SECTION();
+		heap_freeze_prepared_tuples(vacrel->rel, buf, vacrel->FreezeLimit,
+									frozen, tuples_frozen);
 	}
 
 	/*
diff --git a/src/backend/access/rmgrdesc/heapdesc.c b/src/backend/access/rmgrdesc/heapdesc.c
index 923d3bc43..3f8c5e63f 100644
--- a/src/backend/access/rmgrdesc/heapdesc.c
+++ b/src/backend/access/rmgrdesc/heapdesc.c
@@ -140,8 +140,8 @@ heap2_desc(StringInfo buf, XLogReaderState *record)
 	{
 		xl_heap_freeze_page *xlrec = (xl_heap_freeze_page *) rec;
 
-		appendStringInfo(buf, "cutoff xid %u ntuples %u",
-						 xlrec->cutoff_xid, xlrec->ntuples);
+		appendStringInfo(buf, "latestRemovedXid %u nplans %u",
+						 xlrec->latestRemovedXid, xlrec->nplans);
 	}
 	else if (info == XLOG_HEAP2_VISIBLE)
 	{
-- 
2.34.1
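P.S. For anybody eyeballing the REDO changes: the record payload is just the
plans array immediately followed by the offsets array, with each plan
consuming the next 'ntuples' offsets. Here is a minimal self-contained
sketch of that walk -- FreezePlan is a stand-in for xl_heap_freeze_plan, and
apply_freeze() is a printf stub standing in for what heap_xlog_freeze_page
does against real line pointers with heap_execute_freeze_tuple:

/*
 * Sketch of the REDO-side walk over a deduplicated freeze record payload.
 * Stand-in types; the real code gets the payload via XLogRecGetBlockData.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct FreezePlan
{
    uint32_t    xmax;
    uint16_t    ntuples;
    uint16_t    t_infomask2;
    uint16_t    t_infomask;
    uint8_t     frzflags;
} FreezePlan;

/* stub for executing one tuple's freeze plan against the page */
static void
apply_freeze(uint16_t offset, const FreezePlan *plan)
{
    printf("freeze offset %u with xmax %u\n",
           (unsigned) offset, (unsigned) plan->xmax);
}

/* Walk the payload: plans array, then offsets grouped by plan */
static void
replay_freeze_plans(const char *payload, int nplans)
{
    const FreezePlan *plans = (const FreezePlan *) payload;
    const uint16_t *offsets =
        (const uint16_t *) (payload + nplans * sizeof(FreezePlan));
    int         curoff = 0;

    for (int p = 0; p < nplans; p++)
        for (int i = 0; i < plans[p].ntuples; i++)
            apply_freeze(offsets[curoff++], &plans[p]);
}

int
main(void)
{
    uint64_t    storage[8];     /* keeps the fake payload aligned */
    char       *payload = (char *) storage;
    FreezePlan  plan = {.xmax = 0, .ntuples = 2};
    uint16_t    offsets[2] = {1, 2};

    memcpy(payload, &plan, sizeof(plan));
    memcpy(payload + sizeof(plan), offsets, sizeof(offsets));
    replay_freeze_plans(payload, 1);
    return 0;
}

Because the offsets within each group are stored in ascending order, the
replay loop touches line pointers in page order, one sequential pass.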