Hi,

To start with: I think this is an extremely helpful and important feature,
both for checking production systems and for finding problems during
development.
> From 08fe01f5073c0a850541265494bb4a875bec7d3f Mon Sep 17 00:00:00 2001
> From: Himanshu Upadhyaya <himanshu.upadhy...@enterprisedb.com>
> Date: Fri, 30 Sep 2022 17:44:56 +0530
> Subject: [PATCH v6] Implement HOT chain validation in verify_heapam()
>
> Himanshu Upadhyaya, reviewed by Robert Haas, Aleksander Alekseev
>
> Discussion:
> https://postgr.es/m/CAPF61jBBR2-iE-EmN_9v0hcQEfyz_17e5Lbb0%2Bu2%3D9ukA9sWmQ%40mail.gmail.com
> ---
>  contrib/amcheck/verify_heapam.c           | 207 ++++++++++++++++++++++
>  src/bin/pg_amcheck/t/004_verify_heapam.pl | 192 ++++++++++++++++++--
>  2 files changed, 388 insertions(+), 11 deletions(-)
>
> diff --git a/contrib/amcheck/verify_heapam.c b/contrib/amcheck/verify_heapam.c
> index c875f3e5a2..007f7b2f37 100644
> --- a/contrib/amcheck/verify_heapam.c
> +++ b/contrib/amcheck/verify_heapam.c
> @@ -399,6 +399,9 @@ verify_heapam(PG_FUNCTION_ARGS)
>      for (ctx.blkno = first_block; ctx.blkno <= last_block; ctx.blkno++)
>      {
>          OffsetNumber maxoff;
> +        OffsetNumber predecessor[MaxOffsetNumber] = {0};
> +        OffsetNumber successor[MaxOffsetNumber] = {0};
> +        bool         lp_valid[MaxOffsetNumber] = {false};
>
>          CHECK_FOR_INTERRUPTS();
>
> @@ -433,6 +436,8 @@ verify_heapam(PG_FUNCTION_ARGS)
>          for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff;
>               ctx.offnum = OffsetNumberNext(ctx.offnum))
>          {
> +            OffsetNumber nextoffnum;
> +
>              ctx.itemid = PageGetItemId(ctx.page, ctx.offnum);
>
>              /* Skip over unused/dead line pointers */
>
> @@ -469,6 +474,13 @@ verify_heapam(PG_FUNCTION_ARGS)
>                  report_corruption(&ctx,
>                                    psprintf("line pointer redirection to unused item at offset %u",
>                                             (unsigned) rdoffnum));
> +
> +                /*
> +                 * make entry in successor array, redirected tuple will be
> +                 * validated at the time when we loop over successor array
> +                 */
> +                successor[ctx.offnum] = rdoffnum;
> +                lp_valid[ctx.offnum] = true;
>                  continue;
>              }
>
> @@ -504,9 +516,197 @@ verify_heapam(PG_FUNCTION_ARGS)
>              /* It should be safe to examine the tuple's header, at least */
>              ctx.tuphdr = (HeapTupleHeader) PageGetItem(ctx.page, ctx.itemid);
>              ctx.natts = HeapTupleHeaderGetNatts(ctx.tuphdr);
> +            lp_valid[ctx.offnum] = true;
>
>              /* Ok, ready to check this next tuple */
>              check_tuple(&ctx);
> +
> +            /*
> +             * Add the data to the successor array if next updated tuple is in
> +             * the same page. It will be used later to generate the
> +             * predecessor array.
> +             *
> +             * We need to access the tuple's header to populate the
> +             * predecessor array. However the tuple is not necessarily sanity
> +             * checked yet so delaying construction of predecessor array until
> +             * all tuples are sanity checked.
> +             */
> +            nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid);
> +            if (ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid) == ctx.blkno &&
> +                nextoffnum != ctx.offnum)
> +            {
> +                successor[ctx.offnum] = nextoffnum;
> +            }

I don't really understand this logic - why can't we populate the
predecessor array, if we can construct a successor entry?
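To illustrate what I mean, here is a rough, untested sketch that fills both
arrays in this first loop, using the same variables as the hunk above; the
xmax/xmin matching and the duplicate-predecessor check would then have to
move into the later validation pass, once lp_valid[] is fully populated:

    nextoffnum = ItemPointerGetOffsetNumber(&(ctx.tuphdr)->t_ctid);
    if (ItemPointerGetBlockNumber(&(ctx.tuphdr)->t_ctid) == ctx.blkno &&
        nextoffnum != ctx.offnum)
    {
        successor[ctx.offnum] = nextoffnum;
        /* record the reverse link right away as well */
        predecessor[nextoffnum] = ctx.offnum;
    }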
> +        }
> +
> +        /*
> +         * Loop over offset and populate predecessor array from all entries
> +         * that are present in successor array.
> +         */
> +        ctx.attnum = -1;
> +        for (ctx.offnum = FirstOffsetNumber; ctx.offnum <= maxoff;
> +             ctx.offnum = OffsetNumberNext(ctx.offnum))
> +        {
> +            ItemId          curr_lp;
> +            ItemId          next_lp;
> +            HeapTupleHeader curr_htup;
> +            HeapTupleHeader next_htup;
> +            TransactionId   curr_xmax;
> +            TransactionId   next_xmin;
> +
> +            OffsetNumber nextoffnum = successor[ctx.offnum];
> +
> +            curr_lp = PageGetItemId(ctx.page, ctx.offnum);

Why do we get the item when nextoffnum is 0?

> +            if (nextoffnum == 0 || !lp_valid[ctx.offnum] || !lp_valid[nextoffnum])
> +            {
> +                /*
> +                 * This is either the last updated tuple in the chain or a
> +                 * corruption raised for this tuple.
> +                 */

"or a corruption raised" isn't quite right grammatically.

> +                continue;
> +            }
> +            if (ItemIdIsRedirected(curr_lp))
> +            {
> +                next_lp = PageGetItemId(ctx.page, nextoffnum);
> +                if (ItemIdIsRedirected(next_lp))
> +                {
> +                    report_corruption(&ctx,
> +                                      psprintf("redirected line pointer pointing to another redirected line pointer at offset %u",
> +                                               (unsigned) nextoffnum));
> +                    continue;
> +                }
> +                next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
> +                if (!HeapTupleHeaderIsHeapOnly(next_htup))
> +                {
> +                    report_corruption(&ctx,
> +                                      psprintf("redirected tuple at line pointer offset %u is not heap only tuple",
> +                                               (unsigned) nextoffnum));
> +                }
> +                if ((next_htup->t_infomask & HEAP_UPDATED) == 0)
> +                {
> +                    report_corruption(&ctx,
> +                                      psprintf("redirected tuple at line pointer offset %u is not heap updated tuple",
> +                                               (unsigned) nextoffnum));
> +                }
> +                continue;
> +            }
> +
> +            /*
> +             * Add a line pointer offset to the predecessor array if xmax is
> +             * matching with xmin of next tuple (reaching via its t_ctid).
> +             * Prior to PostgreSQL 9.4, we actually changed the xmin to
> +             * FrozenTransactionId

I'm doubtful it's a good idea to try to validate the 9.4 case. The
likelihood of getting that right seems low and I don't see us gaining
much by even trying.

> +             * so we must add offset to predecessor
> +             * array(irrespective of xmax-xmin matching) if updated tuple xmin
> +             * is frozen, so that we can later do validation related to frozen
> +             * xmin. Raise corruption if we have two tuples having the same
> +             * predecessor.
> +             * We add the offset to the predecessor array irrespective of the
> +             * transaction (t_xmin) status. We will do validation related to
> +             * the transaction status (and also all other validations) when we
> +             * loop over the predecessor array.
> +             */
> +            curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp);
> +            curr_xmax = HeapTupleHeaderGetUpdateXid(curr_htup);
> +            next_lp = PageGetItemId(ctx.page, nextoffnum);
> +            next_htup = (HeapTupleHeader) PageGetItem(ctx.page, next_lp);
> +            next_xmin = HeapTupleHeaderGetXmin(next_htup);
> +            if (TransactionIdIsValid(curr_xmax) &&
> +                (TransactionIdEquals(curr_xmax, next_xmin) ||
> +                 next_xmin == FrozenTransactionId))
> +            {
> +                if (predecessor[nextoffnum] != 0)
> +                {
> +                    report_corruption(&ctx,
> +                                      psprintf("updated version at offset %u is also the updated version of tuple at offset %u",
> +                                               (unsigned) nextoffnum, (unsigned) predecessor[nextoffnum]));
> +                    continue;

I doubt it is correct to enter this path with next_xmin ==
FrozenTransactionId. This is following a ctid chain that we normally
wouldn't follow, because it doesn't satisfy the t_self->xmax ==
t_ctid->xmin condition. I don't immediately see what prevents the frozen
tuple being from an entirely different HOT chain than the two tuples
pointing to it.
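To be concrete about the condition I'd expect here, a rough, untested sketch;
it simply drops the FrozenTransactionId escape hatch and assumes the part of
this branch that isn't quoted above records predecessor[nextoffnum] =
ctx.offnum:

    /*
     * Only treat nextoffnum as the next version of ctx.offnum when the
     * normal update-chain condition holds; a frozen xmin in the next tuple
     * tells us nothing about which chain it belongs to.
     */
    if (TransactionIdIsValid(curr_xmax) &&
        TransactionIdEquals(curr_xmax, next_xmin))
    {
        if (predecessor[nextoffnum] != 0)
        {
            report_corruption(&ctx,
                              psprintf("updated version at offset %u is also the updated version of tuple at offset %u",
                                       (unsigned) nextoffnum,
                                       (unsigned) predecessor[nextoffnum]));
            continue;
        }
        predecessor[nextoffnum] = ctx.offnum;
    }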
> +                }
> +
> +        /* Loop over offsets and validate the data in the predecessor array. */
> +        for (OffsetNumber currentoffnum = FirstOffsetNumber; currentoffnum <= maxoff;
> +             currentoffnum = OffsetNumberNext(currentoffnum))
> +        {
> +            HeapTupleHeader pred_htup;
> +            HeapTupleHeader curr_htup;
> +            TransactionId   pred_xmin;
> +            TransactionId   curr_xmin;
> +            ItemId          pred_lp;
> +            ItemId          curr_lp;
> +
> +            ctx.offnum = predecessor[currentoffnum];
> +            ctx.attnum = -1;
> +
> +            if (ctx.offnum == 0)
> +            {
> +                /*
> +                 * Either the root of the chain or an xmin-aborted tuple from
> +                 * an abandoned portion of the HOT chain.
> +                 */

Hm - couldn't we check that the tuple could conceivably be at the root of
a chain? I.e. isn't HEAP_HOT_UPDATED? Or alternatively has an aborted
xmin?

> +                continue;
> +            }
> +
> +            curr_lp = PageGetItemId(ctx.page, currentoffnum);
> +            curr_htup = (HeapTupleHeader) PageGetItem(ctx.page, curr_lp);
> +            curr_xmin = HeapTupleHeaderGetXmin(curr_htup);
> +
> +            ctx.itemid = pred_lp = PageGetItemId(ctx.page, ctx.offnum);
> +            pred_htup = (HeapTupleHeader) PageGetItem(ctx.page, pred_lp);
> +            pred_xmin = HeapTupleHeaderGetXmin(pred_htup);
> +
> +            /*
> +             * If the predecessor's xmin is aborted or in progress, the
> +             * current tuples xmin should be aborted or in progress
> +             * respectively. Also both xmin's must be equal.
> +             */
> +            if (!TransactionIdEquals(pred_xmin, curr_xmin) &&
> +                !TransactionIdDidCommit(pred_xmin))
> +            {
> +                report_corruption(&ctx,
> +                                  psprintf("tuple with uncommitted xmin %u was updated to produce a tuple at offset %u with differing xmin %u",
> +                                           (unsigned) pred_xmin, (unsigned) currentoffnum, (unsigned) curr_xmin));

Is this necessarily true? What about a tuple that was inserted in a
subtransaction and then updated in another subtransaction of the same
toplevel transaction?

> +            }
> +
> +            /*
> +             * If the predecessor's xmin is not frozen, then current tuple's
> +             * shouldn't be either.
> +             */
> +            if (pred_xmin != FrozenTransactionId && curr_xmin == FrozenTransactionId)
> +            {
> +                report_corruption(&ctx,
> +                                  psprintf("unfrozen tuple was updated to produce a tuple at offset %u which is frozen",
> +                                           (unsigned) currentoffnum));
> +            }

Can't we have an update chain that is e.g. xmin 10, xmax 5 -> xmin 5,
xmax invalid, and a vacuum cutoff of 7? That'd prevent the first tuple
from being removed, but would allow 5 to be frozen. I think there were
recent patches proposing we don't freeze in that case, but we'll have done
that in the past...

Greetings,

Andres Freund