On 25/03/2019 15:20, Heikki Linnakangas wrote:
On 24/03/2019 18:50, Andrey Borodin wrote:
I was working on new version of gist check in amcheck and understand one more
thing:
/* Can this page be recycled yet? */
bool
gistPageRecyclable(Page page)
{
return PageIsNew(page) ||
(GistPageIsDeleted(page) &&
TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalXmin));
}
Here RecentGlobalXmin can wraparound and page will become unrecyclable for half
of xid cycle. Can we prevent it by resetting PageDeleteXid to
InvalidTransactionId before doing RecordFreeIndexPage()?
(Seems like same applies to GIN...)
True, and B-tree has the same issue. I thought I saw a comment somewhere
in the B-tree code about that earlier, but now I can't find it. I
must've imagined it.
We could reset it, but that would require dirtying the page. That would
be just extra I/O overhead, if the page gets reused before XID
wraparound. We could avoid that if we stored the full XID+epoch, not
just XID. I think we should do that in GiST, at least, where this is
new. In the B-tree, it would require some extra code to deal with
backwards-compatibility, but maybe it would be worth it even there.
I suggest that we do the attached. It fixes this for GiST. The patch
changes expands the "deletion XID" to 64-bits, and changes where it's
stored. Instead of storing it pd_prune_xid, it's stored in the page
contents. Luckily, a deleted page has no real content.
I think we should fix this in a similar manner in B-tree, too, but that
can be done separately. For B-tree, we need to worry about
backwards-compatibility, but that seems simple enough; we just need to
continue to understand old deleted pages, where the deletion XID is
stored in the page opaque field.
- Heikki
>From b7897577c83a81ec04394ce7113d1d8a47804086 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 4 Apr 2019 18:06:48 +0300
Subject: [PATCH 1/2] Refactor checks for deleted GiST pages.
The explicit check in gistScanPage() isn't currently really necessary, as
a deleted page is always empty, so the loop would fall through without
doing anything, anyway. But it's a marginal optimization, and it gives a
nice place to attach a comment to explain how it works.
---
src/backend/access/gist/gist.c | 40 ++++++++++++-------------------
src/backend/access/gist/gistget.c | 14 +++++++++++
2 files changed, 29 insertions(+), 25 deletions(-)
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 2db790c840..028b06b264 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -693,14 +693,15 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
continue;
}
- if (stack->blkno != GIST_ROOT_BLKNO &&
- stack->parent->lsn < GistPageGetNSN(stack->page))
+ if ((stack->blkno != GIST_ROOT_BLKNO &&
+ stack->parent->lsn < GistPageGetNSN(stack->page)) ||
+ GistPageIsDeleted(stack->page))
{
/*
- * Concurrent split detected. There's no guarantee that the
- * downlink for this page is consistent with the tuple we're
- * inserting anymore, so go back to parent and rechoose the best
- * child.
+ * Concurrent split or page deletion detected. There's no
+ * guarantee that the downlink for this page is consistent with
+ * the tuple we're inserting anymore, so go back to parent and
+ * rechoose the best child.
*/
UnlockReleaseBuffer(stack->buffer);
xlocked = false;
@@ -719,9 +720,6 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
GISTInsertStack *item;
OffsetNumber downlinkoffnum;
- /* currently, internal pages are never deleted */
- Assert(!GistPageIsDeleted(stack->page));
-
downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
iid = PageGetItemId(stack->page, downlinkoffnum);
idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -842,12 +840,13 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
* leaf/inner is enough to recognize split for root
*/
}
- else if (GistFollowRight(stack->page) ||
- stack->parent->lsn < GistPageGetNSN(stack->page))
+ else if ((GistFollowRight(stack->page) ||
+ stack->parent->lsn < GistPageGetNSN(stack->page)) &&
+ GistPageIsDeleted(stack->page))
{
/*
- * The page was split while we momentarily unlocked the
- * page. Go back to parent.
+ * The page was split or deleted while we momentarily
+ * unlocked the page. Go back to parent.
*/
UnlockReleaseBuffer(stack->buffer);
xlocked = false;
@@ -856,18 +855,6 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace,
}
}
- /*
- * The page might have been deleted after we scanned the parent
- * and saw the downlink.
- */
- if (GistPageIsDeleted(stack->page))
- {
- UnlockReleaseBuffer(stack->buffer);
- xlocked = false;
- state.stack = stack = stack->parent;
- continue;
- }
-
/* now state.stack->(page, buffer and blkno) points to leaf page */
gistinserttuple(&state, stack, giststate, itup,
@@ -931,6 +918,9 @@ gistFindPath(Relation r, BlockNumber child, OffsetNumber *downlinkoffnum)
break;
}
+ /* currently, internal pages are never deleted */
+ Assert(!GistPageIsDeleted(page));
+
top->lsn = BufferGetLSNAtomic(buffer);
/*
diff --git a/src/backend/access/gist/gistget.c b/src/backend/access/gist/gistget.c
index 8108fbb7d8..77ae2fb339 100644
--- a/src/backend/access/gist/gistget.c
+++ b/src/backend/access/gist/gistget.c
@@ -377,6 +377,20 @@ gistScanPage(IndexScanDesc scan, GISTSearchItem *pageItem, double *myDistances,
MemoryContextSwitchTo(oldcxt);
}
+ /*
+ * Check if the page was deleted after we saw the downlink. There's
+ * nothing of interest on a deleted page. Note that we must do this
+ * after checking the NSN for concurrent splits! It's possible that
+ * the page originally contained some tuples that are visible to use,
+ * but was split so that all the visible tuples were moved to another
+ * page, and then this page was deleted.
+ */
+ if (GistPageIsDeleted(page))
+ {
+ UnlockReleaseBuffer(buffer);
+ return;
+ }
+
so->nPageData = so->curPageData = 0;
scan->xs_hitup = NULL; /* might point into pageDataCxt */
if (so->pageDataCxt)
--
2.20.1
>From 38e676b20ab57370e3761898f3657dc64329f211 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakan...@iki.fi>
Date: Thu, 4 Apr 2019 18:05:48 +0300
Subject: [PATCH 2/2] Use full 64-bit XID for checking if a deleted GiST page
is old enough.
Otherwise, after a deleted page gets even older, it becomes unrecyclable
again. B-tree has the same problem, and has had since time immemorial,
but let's at least fix this in GiST, where this is new.
---
src/backend/access/gist/gistutil.c | 63 ++++++++++++++++++++++----
src/backend/access/gist/gistvacuum.c | 4 +-
src/backend/access/gist/gistxlog.c | 20 ++++++--
src/backend/access/rmgrdesc/gistdesc.c | 6 ++-
src/include/access/gist.h | 20 +++++++-
src/include/access/gist_private.h | 4 +-
src/include/access/gistxlog.h | 2 +-
7 files changed, 98 insertions(+), 21 deletions(-)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 94b6ad6a59..f6bbc0426c 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -839,16 +839,16 @@ gistNewBuffer(Relation r)
gistcheckpage(r, buffer);
/*
- * Otherwise, recycle it if deleted, and too old to have any processes
- * interested in it.
+ * Otherwise, recycle it if deleted, and too old to have any
+ * processes interested in it.
*/
if (gistPageRecyclable(page))
{
/*
- * If we are generating WAL for Hot Standby then create a
- * WAL record that will allow us to conflict with queries
- * running on standby, in case they have snapshots older
- * than the page's deleteXid.
+ * If we are generating WAL for Hot Standby then create a WAL
+ * record that will allow us to conflict with queries running
+ * on standby, in case they have snapshots older than the
+ * page's deleteXid.
*/
if (XLogStandbyInfoActive() && RelationNeedsWAL(r))
gistXLogPageReuse(r, blkno, GistPageGetDeleteXid(page));
@@ -882,9 +882,54 @@ gistNewBuffer(Relation r)
bool
gistPageRecyclable(Page page)
{
- return PageIsNew(page) ||
- (GistPageIsDeleted(page) &&
- TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalXmin));
+ if (PageIsNew(page))
+ return true;
+ if (GistPageIsDeleted(page))
+ {
+ /*
+ * The page was deleted, but when? If it was just deleted, a scan
+ * might have seen the downlink to it, and will read the page later.
+ * As long as that can happen, we must keep the deleted page around as
+ * a tombstone.
+ *
+ * Compare the deletion XID with RecentGlobalXmin. If deleteXid <
+ * RecentGlobalXmin, then no scan that's still in progress could have
+ * seen its downlink, and we can recycle it.
+ *
+ * One complication here is that the delete XID is a "full" 64-bit
+ * transaction ID, but RecentGlobalXmin doesn't include the epoch. So
+ * we first have to form a full 64-bit version of RecentGlobalXmin to
+ * compare with.
+ */
+ FullTransactionId deletexid_full = GistPageGetDeleteXid(page);
+ FullTransactionId nextxid_full;
+ uint32 nextxid_epoch;
+ TransactionId nextxid_xid;
+ FullTransactionId recentxmin_full;
+ uint32 recentxmin_epoch;
+ TransactionId recentxmin_xid;
+
+ nextxid_full = ReadNextFullTransactionId();
+ nextxid_epoch = EpochFromFullTransactionId(nextxid_full);
+ nextxid_xid = XidFromFullTransactionId(nextxid_full);
+
+ recentxmin_xid = RecentGlobalXmin;
+ if (recentxmin_xid > nextxid_xid)
+ recentxmin_epoch = nextxid_epoch - 1;
+ else
+ recentxmin_epoch = nextxid_epoch;
+ recentxmin_full =
+ FullTransactionIdFromEpochAndXid(recentxmin_epoch,
+ recentxmin_xid);
+
+ /*
+ * Now we have a 64-bit version of RecentGlobalXmin. Compare deletion
+ * XID against it.
+ */
+ if (FullTransactionIdPrecedes(deletexid_full, recentxmin_full))
+ return true;
+ }
+ return false;
}
bytea *
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index e2029d842c..07096194b2 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -595,7 +595,7 @@ gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
ItemId iid;
IndexTuple idxtuple;
XLogRecPtr recptr;
- TransactionId txid;
+ FullTransactionId txid;
/*
* Check that the leaf is still empty and deletable.
@@ -648,7 +648,7 @@ gistdeletepage(IndexVacuumInfo *info, GistBulkDeleteResult *stats,
* currently in progress must have ended. (That's much more conservative
* than needed, but let's keep it safe and simple.)
*/
- txid = ReadNewTransactionId();
+ txid = ReadNextFullTransactionId();
START_CRIT_SECTION();
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 4fb1855e89..37a190219e 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -701,7 +701,7 @@ gistXLogSplit(bool page_is_leaf,
* downlink from the parent page.
*/
XLogRecPtr
-gistXLogPageDelete(Buffer buffer, TransactionId xid,
+gistXLogPageDelete(Buffer buffer, FullTransactionId xid,
Buffer parentBuffer, OffsetNumber downlinkOffset)
{
gistxlogPageDelete xlrec;
@@ -725,9 +725,23 @@ gistXLogPageDelete(Buffer buffer, TransactionId xid,
* Write XLOG record about reuse of a deleted page.
*/
void
-gistXLogPageReuse(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+gistXLogPageReuse(Relation rel, BlockNumber blkno, FullTransactionId latestRemovedXid)
{
gistxlogPageReuse xlrec_reuse;
+ FullTransactionId nextxid;
+ uint64 diff;
+
+ /*
+ * We can skip this if the page was deleted so long ago, that no scan can possibly
+ * still see it, even in a standby. One measure might be anything older than the
+ * table's frozen-xid, but we don't have that at hand here. But anything older than
+ * 2 billion, from the next XID, is surely old enough, because you would hit XID
+ * wraparound at that point.
+ */
+ nextxid = ReadNextFullTransactionId();
+ diff = U64FromFullTransactionId(nextxid) - U64FromFullTransactionId(latestRemovedXid);
+ if (diff < 0x7fffffff)
+ return;
/*
* Note that we don't register the buffer with the record, because this
@@ -738,7 +752,7 @@ gistXLogPageReuse(Relation rel, BlockNumber blkno, TransactionId latestRemovedXi
/* XLOG stuff */
xlrec_reuse.node = rel->rd_node;
xlrec_reuse.block = blkno;
- xlrec_reuse.latestRemovedXid = latestRemovedXid;
+ xlrec_reuse.latestRemovedXid = XidFromFullTransactionId(latestRemovedXid);
XLogBeginInsert();
XLogRegisterData((char *) &xlrec_reuse, SizeOfGistxlogPageReuse);
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index eb308c72d6..ba00315260 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -47,8 +47,10 @@ out_gistxlogPageSplit(StringInfo buf, gistxlogPageSplit *xlrec)
static void
out_gistxlogPageDelete(StringInfo buf, gistxlogPageDelete *xlrec)
{
- appendStringInfo(buf, "deleteXid %u; downlink %u",
- xlrec->deleteXid, xlrec->downlinkOffset);
+ appendStringInfo(buf, "deleteXid %u:%u; downlink %u",
+ EpochFromFullTransactionId(xlrec->deleteXid),
+ XidFromFullTransactionId(xlrec->deleteXid),
+ xlrec->downlinkOffset);
}
void
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 6902f4115b..8cdc1f0fa9 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -16,6 +16,7 @@
#ifndef GIST_H
#define GIST_H
+#include "access/transam.h"
#include "access/xlog.h"
#include "access/xlogdefs.h"
#include "storage/block.h"
@@ -159,8 +160,23 @@ typedef struct GISTENTRY
#define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
/* For deleted pages we store last xid which could see the page in scan */
-#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
-#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+static inline FullTransactionId
+GistPageGetDeleteXid(Page page)
+{
+ Assert(GistPageIsDeleted(page));
+ Assert(((PageHeader) page)->pd_lower == MAXALIGN(SizeOfPageHeaderData) + sizeof(FullTransactionId));
+
+ return *(FullTransactionId *) PageGetContents(page);
+}
+
+static inline void
+GistPageSetDeleteXid(Page page, FullTransactionId deletexid)
+{
+ Assert(PageIsEmpty(page));
+ ((PageHeader) page)->pd_lower = MAXALIGN(SizeOfPageHeaderData) + sizeof(FullTransactionId);
+
+ *((FullTransactionId *) PageGetContents(page)) = deletexid;
+}
/*
* Vector of GISTENTRY structs; user-defined methods union and picksplit
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 78e2e3fb31..2c54f208ca 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -419,11 +419,11 @@ extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
/* gistxlog.c */
extern XLogRecPtr gistXLogPageDelete(Buffer buffer,
- TransactionId xid, Buffer parentBuffer,
+ FullTransactionId xid, Buffer parentBuffer,
OffsetNumber downlinkOffset);
extern void gistXLogPageReuse(Relation rel, BlockNumber blkno,
- TransactionId latestRemovedXid);
+ FullTransactionId latestRemovedXid);
extern XLogRecPtr gistXLogUpdate(Buffer buffer,
OffsetNumber *todelete, int ntodelete,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 9990d97cbd..9967bdcf3f 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -83,7 +83,7 @@ typedef struct gistxlogPageSplit
*/
typedef struct gistxlogPageDelete
{
- TransactionId deleteXid; /* last Xid which could see page in scan */
+ FullTransactionId deleteXid; /* last Xid which could see page in scan */
OffsetNumber downlinkOffset; /* Offset of downlink referencing this page */
} gistxlogPageDelete;
--
2.20.1