13.08.2019 18:45, Anastasia Lubennikova wrote:
>> I also added a nearby FIXME comment to
>> _bt_insertonpg_in_posting() -- I don't think that the code for
>> splitting a posting list in two is currently crash-safe.
> Good catch. It seems that I need to rearrange the code.
> I'll send updated patch this week.
Attached is v7.
In this version of the patch, I heavily refactored the code for insertion
into a posting tuple. The _bt_split() logic is quite complex, so I omitted a
couple of optimizations; they are mentioned in TODO comments.
Now the algorithm is the following:

- If _bt_findinsertloc() finds that the new tuple belongs to an existing
posting tuple's TID interval, it sets the 'in_posting_offset' variable and
passes it to _bt_insertonpg().

- If 'in_posting_offset' is valid and origtup is valid, merge our itup into
origtup (see the sketch below). This can result in a single tuple,
neworigtup, that must replace origtup; or in two tuples, neworigtup and
newrighttup, if the result exceeds BTMaxItemSize.

- If the new tuple(s) fit on the old page, we're lucky: call
_bt_delete_and_insert(..., neworigtup, newrighttup, newitemoff) to atomically
replace origtup with the new tuple(s) and generate the xlog record.

- If a page split is needed, pass both tuples to _bt_split().
_bt_findsplitloc() is now aware of the upcoming replacement of origtup with
neworigtup, so it uses the correct item size where needed.
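
To make the second and third steps a bit more concrete, here is a rough,
self-contained sketch of the merge/split decision. This is only an
illustration under simplified assumptions: FakeTid, merge_into_posting() and
the split_off parameter are invented for this example and stand in for the
ItemPointerData / in_posting_offset machinery in the patch; the real code of
course works on IndexTuples and must also respect BTMaxItemSize and page
free space.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for ItemPointerData; not a PostgreSQL type. */
typedef struct FakeTid
{
    unsigned int block;
    unsigned int offset;
} FakeTid;

/*
 * Merge 'newtid' into the sorted posting list 'posting' of length n.
 * 'split_off' plays the role of in_posting_offset: the index of the first
 * existing TID that is greater than newtid.  On return, *left/*nleft hold
 * the list that replaces the original tuple; *right/*nright are set only
 * when newtid falls strictly inside the existing TID range and the list
 * has to be split in two.
 */
static void
merge_into_posting(const FakeTid *posting, int n, FakeTid newtid, int split_off,
                   FakeTid **left, int *nleft, FakeTid **right, int *nright)
{
    *right = NULL;
    *nright = 0;

    if (split_off == 0 || split_off == n)
    {
        /* newtid is before the min or after the max: one bigger posting list */
        *nleft = n + 1;
        *left = malloc(sizeof(FakeTid) * (*nleft));
        memcpy(*left, posting, sizeof(FakeTid) * split_off);
        (*left)[split_off] = newtid;
        memcpy(*left + split_off + 1, posting + split_off,
               sizeof(FakeTid) * (n - split_off));
        return;
    }

    /* newtid is inside the range: left keeps [0, split_off) ... */
    *nleft = split_off;
    *left = malloc(sizeof(FakeTid) * (*nleft));
    memcpy(*left, posting, sizeof(FakeTid) * split_off);

    /* ... and right gets newtid followed by the remaining TIDs */
    *nright = n - split_off + 1;
    *right = malloc(sizeof(FakeTid) * (*nright));
    (*right)[0] = newtid;
    memcpy(*right + 1, posting + split_off, sizeof(FakeTid) * (n - split_off));
}

int
main(void)
{
    FakeTid posting[] = {{1, 1}, {1, 3}, {2, 5}};
    FakeTid newtid = {1, 7};    /* sorts between (1,3) and (2,5) */
    FakeTid *left;
    FakeTid *right;
    int nleft;
    int nright;

    merge_into_posting(posting, 3, newtid, 2, &left, &nleft, &right, &nright);
    printf("left gets %d TIDs, right gets %d TIDs\n", nleft, nright);

    free(left);
    free(right);
    return 0;
}

The point is just that the "left" tuple always replaces the original one,
while the "right" tuple exists only when the new TID lands strictly inside
the existing TID range.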
It seems that all replace operations are now crash-safe. The new patch passes
all regression tests, so I think it's ready for review again.
In the meantime, I'll run more stress-tests.
--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 05e7d67..504bca2 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -924,6 +924,7 @@ bt_target_page_check(BtreeCheckState *state)
size_t tupsize;
BTScanInsert skey;
bool lowersizelimit;
+ ItemPointer scantid;
CHECK_FOR_INTERRUPTS();
@@ -994,29 +995,73 @@ bt_target_page_check(BtreeCheckState *state)
/*
* Readonly callers may optionally verify that non-pivot tuples can
- * each be found by an independent search that starts from the root
+ * each be found by an independent search that starts from the root.
+ * Note that we deliberately don't do individual searches for each
+ * "logical" posting list tuple, since the posting list itself is
+ * validated by other checks.
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
char *itid,
*htid;
+ ItemPointer tid = BTreeTupleGetHeapTID(itup);
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumber(&(itup->t_tid)),
- ItemPointerGetOffsetNumber(&(itup->t_tid)));
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("could not find tuple using search from root page in index \"%s\"",
RelationGetRelationName(state->rel)),
- errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
+ errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.",
itid, htid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ /*
+ * If tuple is actually a posting list, make sure posting list TIDs
+ * are in order.
+ */
+ if (BTreeTupleIsPosting(itup))
+ {
+ ItemPointerData last;
+ ItemPointer current;
+
+ ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+
+ current = BTreeTupleGetPostingN(itup, i);
+
+ if (ItemPointerCompare(current, &last) <= 0)
+ {
+ char *itid,
+ *htid;
+
+ itid = psprintf("(%u,%u)", state->targetblock, offset);
+ htid = psprintf("(%u,%u)",
+ ItemPointerGetBlockNumberNoCheck(current),
+ ItemPointerGetOffsetNumberNoCheck(current));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("posting list heap TIDs out of order in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.",
+ itid, htid,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ }
+
+ ItemPointerCopy(current, &last);
+ }
+ }
+
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
@@ -1074,12 +1119,33 @@ bt_target_page_check(BtreeCheckState *state)
{
IndexTuple norm;
- norm = bt_normalize_tuple(state, itup);
- bloom_add_element(state->filter, (unsigned char *) norm,
- IndexTupleSize(norm));
- /* Be tidy */
- if (norm != itup)
- pfree(norm);
+ if (BTreeTupleIsPosting(itup))
+ {
+ IndexTuple onetup;
+
+ /* Fingerprint all elements of posting tuple one by one */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ onetup = BTreeGetNthTupleOfPosting(itup, i);
+
+ norm = bt_normalize_tuple(state, onetup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != onetup)
+ pfree(norm);
+ pfree(onetup);
+ }
+ }
+ else
+ {
+ norm = bt_normalize_tuple(state, itup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != itup)
+ pfree(norm);
+ }
}
/*
@@ -1087,7 +1153,8 @@ bt_target_page_check(BtreeCheckState *state)
*
* If there is a high key (if this is not the rightmost page on its
* entire level), check that high key actually is upper bound on all
- * page items.
+ * page items. If this is a posting list tuple, we'll need to set
+ * scantid to be highest TID in posting list.
*
* We prefer to check all items against high key rather than checking
* just the last and trusting that the operator class obeys the
@@ -1127,6 +1194,9 @@ bt_target_page_check(BtreeCheckState *state)
* tuple. (See also: "Notes About Data Representation" in the nbtree
* README.)
*/
+ scantid = skey->scantid;
+ if (!BTreeTupleIsPivot(itup))
+ skey->scantid = BTreeTupleGetMaxTID(itup);
if (!P_RIGHTMOST(topaque) &&
!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
invariant_l_offset(state, skey, P_HIKEY)))
@@ -1150,6 +1220,7 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ skey->scantid = scantid;
/*
* * Item order check *
@@ -1164,11 +1235,13 @@ bt_target_page_check(BtreeCheckState *state)
*htid,
*nitid,
*nhtid;
+ ItemPointer tid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetHeapTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
nitid = psprintf("(%u,%u)", state->targetblock,
OffsetNumberNext(offset));
@@ -1177,9 +1250,11 @@ bt_target_page_check(BtreeCheckState *state)
state->target,
OffsetNumberNext(offset));
itup = (IndexTuple) PageGetItem(state->target, itemid);
+
+ tid = BTreeTupleGetHeapTID(itup);
nhtid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1189,10 +1264,10 @@ bt_target_page_check(BtreeCheckState *state)
"higher index tid=%s (points to %s tid=%s) "
"page lsn=%X/%X.",
itid,
- P_ISLEAF(topaque) ? "heap" : "index",
+ P_ISLEAF(topaque) ? "min heap" : "index",
htid,
nitid,
- P_ISLEAF(topaque) ? "heap" : "index",
+ P_ISLEAF(topaque) ? "min heap" : "index",
nhtid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
@@ -1953,10 +2028,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
* verification. In particular, it won't try to normalize opclass-equal
* datums with potentially distinct representations (e.g., btree/numeric_ops
* index datums will not get their display scale normalized-away here).
- * Normalization may need to be expanded to handle more cases in the future,
- * though. For example, it's possible that non-pivot tuples could in the
- * future have alternative logically equivalent representations due to using
- * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
+ * Caller does normalization for non-pivot tuples that have their own posting
+ * list, since dummy CREATE INDEX callback code generates new tuples with the
+ * same normalized representation. Compression is performed
+ * opportunistically, and in general there is no guarantee about how or when
+ * compression will be applied.
*/
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
@@ -2560,14 +2636,16 @@ static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
bool nonpivot)
{
- ItemPointer result = BTreeTupleGetHeapTID(itup);
+ ItemPointer result;
BlockNumber targetblock = state->targetblock;
- if (result == NULL && nonpivot)
+ if (BTreeTupleIsPivot(itup) == nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
targetblock, RelationGetRelationName(state->rel))));
+ result = BTreeTupleGetHeapTID(itup);
+
return result;
}
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5890f39..fed1e86 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -41,21 +41,28 @@ static OffsetNumber _bt_findinsertloc(Relation rel,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
+static void _bt_delete_and_insert(Relation rel,
+ Buffer buf,
+ Page page,
+ IndexTuple newitup, IndexTuple newitupright,
+ OffsetNumber newitemoff);
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
- bool split_only_page);
+ bool split_only_page, int in_posting_offset);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem);
+ IndexTuple newitem, IndexTuple lefttup, IndexTuple righttup);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
+static void insert_itupprev_to_page(Page page, BTCompressState *compressState);
+static void _bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -297,10 +304,13 @@ top:
* search bounds established within _bt_check_unique when insertion is
* checkingunique.
*/
+ insertstate.in_posting_offset = 0;
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
- _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
- itup, newitemoff, false);
+
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer,
+ stack, itup, newitemoff, false,
+ insertstate.in_posting_offset);
}
else
{
@@ -435,6 +445,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/* okay, we gotta fetch the heap tuple ... */
curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPosting(curitup));
htid = curitup->t_tid;
/*
@@ -759,6 +770,26 @@ _bt_findinsertloc(Relation rel,
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
insertstate->bounds_valid = false;
}
+
+ /*
+ * If the target page is full, try to compress the page
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz && !checkingunique)
+ {
+ _bt_compress_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false; /* paranoia */
+
+ /*
+ * FIXME: _bt_vacuum_one_page() won't have cleared the
+ * BTP_HAS_GARBAGE flag when it didn't kill items. Maybe we
+ * should clear the BTP_HAS_GARBAGE flag bit from the page when
+ * compression avoids a page split -- _bt_vacuum_one_page() is
+ * expecting a page split that takes care of it.
+ *
+ * (On the other hand, maybe it doesn't matter very much. A
+ * comment update seems like the bare minimum we should do.)
+ */
+ }
}
else
{
@@ -900,6 +931,77 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
insertstate->bounds_valid = false;
}
+/*
+ * Delete the tuple at offset newitemoff and insert newitup at the same offset.
+ *
+ * If the original posting tuple was split, 'newitup' represents the left part
+ * of the original tuple and 'newitupright' is its right part, which must be
+ * inserted next to newitemoff.
+ * It's essential to do this atomically to be crash-safe.
+ *
+ * NOTE: All free space checks must be done before calling this function.
+ *
+ * For use when updating a posting tuple.
+ */
+static void
+_bt_delete_and_insert(Relation rel,
+ Buffer buf,
+ Page page,
+ IndexTuple newitup, IndexTuple newitupright,
+ OffsetNumber newitemoff)
+{
+ Size newitupsz = IndexTupleSize(newitup);
+
+ newitupsz = MAXALIGN(newitupsz);
+
+ elog(DEBUG4, "_bt_delete_and_insert %s newitemoff %d",
+ RelationGetRelationName(rel), newitemoff);
+ START_CRIT_SECTION();
+
+ PageIndexTupleDelete(page, newitemoff);
+
+ if (!_bt_pgaddtup(page, newitupsz, newitup, newitemoff))
+ elog(ERROR, "failed to insert compressed item in index \"%s\"",
+ RelationGetRelationName(rel));
+
+ if (newitupright)
+ {
+ if (!_bt_pgaddtup(page, MAXALIGN(IndexTupleSize(newitupright)),
+ newitupright, OffsetNumberNext(newitemoff)))
+ elog(ERROR, "failed to insert compressed item in index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+
+ if (BufferIsValid(buf))
+ {
+ MarkBufferDirty(buf);
+
+ /* Xlog stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.offnum = newitemoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+ Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
+
+ /*
+ * Force full page write to keep code simple
+ *
+ * TODO: think of using XLOG_BTREE_INSERT_LEAF with a new tuple's data
+ */
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD | REGBUF_FORCE_IMAGE);
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF);
+ PageSetLSN(page, recptr);
+ }
+ }
+ END_CRIT_SECTION();
+}
+
/*----------
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
*
@@ -936,11 +1038,17 @@ _bt_insertonpg(Relation rel,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
- bool split_only_page)
+ bool split_only_page,
+ int in_posting_offset)
{
Page page;
BTPageOpaque lpageop;
Size itemsz;
+ IndexTuple origtup;
+ int nipd;
+ IndexTuple neworigtup = NULL;
+ IndexTuple newrighttup = NULL;
+ bool need_split = false;
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -965,13 +1073,184 @@ _bt_insertonpg(Relation rel,
* need to be consistent */
/*
+ * If the new tuple's key is equal to the key of a posting tuple that already
+ * exists on the page, and its TID falls inside the min/max range of the
+ * existing posting list, update the posting tuple.
+ *
+ * TODO Think of moving this to a separate function.
+ *
+ * TODO possible optimization:
+ * if original posting tuple is dead,
+ * reset in_posting_offset and handle itup as a regular tuple
+ */
+ if (in_posting_offset)
+ {
+ /* get old posting tuple */
+ ItemId itemid = PageGetItemId(page, newitemoff);
+ ItemPointerData *ipd;
+ int nipd, nipd_right;
+ bool need_posting_split = false;
+
+ origtup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPosting(origtup));
+ nipd = BTreeTupleGetNPosting(origtup);
+ Assert(in_posting_offset < nipd);
+ Assert(itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace);
+
+ elog(DEBUG4, "(%u,%u) is min, (%u,%u) is max, (%u,%u) is new",
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(itup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(itup)));
+
+ /* check if the posting tuple must be split */
+ if (BTMaxItemSize(page) < MAXALIGN(IndexTupleSize(origtup)) + sizeof(ItemPointerData))
+ need_posting_split = true;
+
+ /*
+ * If a page split is needed, always split the posting tuple.
+ * That is probably not optimal,
+ * but it allows us to simplify the _bt_split() code.
+ *
+ * TODO Does this decision have any significant drawbacks?
+ */
+ if (PageGetFreeSpace(page) < sizeof(ItemPointerData))
+ need_posting_split = true;
+
+ /*
+ * Handle corner case (1):
+ * - itup TID is smaller than the leftmost origtup TID
+ */
+ if (ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ BTreeTupleGetHeapTID(origtup)) < 0)
+ {
+ if (need_posting_split)
+ {
+ /*
+ * cannot avoid a split, so there is no need to try to fit itup into
+ * the posting list; handle itup as a regular tuple insertion
+ */
+ elog(DEBUG4, "split posting tuple. itup is to the left of origtup");
+ in_posting_offset = InvalidOffsetNumber;
+ newitemoff = OffsetNumberPrev(newitemoff);
+ }
+ else
+ {
+ ipd = palloc0(sizeof(ItemPointerData) * (nipd + 1));
+ /* insert new item pointer */
+ memcpy(ipd, itup, sizeof(ItemPointerData));
+ /* copy item pointers from original tuple that belong on right */
+ memcpy(ipd + 1, BTreeTupleGetPosting(origtup), sizeof(ItemPointerData) * nipd);
+ neworigtup = BTreeFormPostingTuple(origtup, ipd, nipd+1);
+ pfree(ipd);
+
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(neworigtup),
+ BTreeTupleGetMaxTID(neworigtup)) < 0);
+ }
+ }
+
+ /*
+ * Handle corner case (2):
+ * - itup TID is larger than the rightmost origtup TID
+ */
+ if (ItemPointerCompare(BTreeTupleGetMaxTID(origtup),
+ BTreeTupleGetHeapTID(itup)) < 0)
+ {
+ if (need_posting_split)
+ {
+ /*
+ * cannot avoid a split, so there is no need to try to fit itup into
+ * the posting list; handle itup as a regular tuple insertion
+ */
+ elog(DEBUG4, "split posting tuple. itup is to the right of origtup");
+ in_posting_offset = InvalidOffsetNumber;
+ }
+ else
+ {
+ ipd = palloc0(sizeof(ItemPointerData) * (nipd + 1));
+ /* copy item pointers from the original tuple */
+ memcpy(ipd, BTreeTupleGetPosting(origtup), sizeof(ItemPointerData) * nipd);
+ /* append the new item pointer */
+ memcpy(ipd + nipd, itup, sizeof(ItemPointerData));
+
+ neworigtup = BTreeFormPostingTuple(origtup, ipd, nipd+1);
+ pfree(ipd);
+
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(neworigtup),
+ BTreeTupleGetMaxTID(neworigtup)) < 0);
+ }
+ }
+
+ /*
+ * itup TID belongs to the TID range of the origtup posting list.
+ *
+ * Split the posting tuple into two halves:
+ *
+ * neworigtup (left) contains all item pointers less than the new one,
+ * and newrighttup contains the new item pointer and all those to its right.
+ */
+ if (ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ BTreeTupleGetHeapTID(origtup)) > 0
+ &&
+ ItemPointerCompare(BTreeTupleGetMaxTID(origtup),
+ BTreeTupleGetHeapTID(itup)) > 0)
+ {
+ neworigtup = BTreeFormPostingTuple(origtup, BTreeTupleGetPosting(origtup),
+ in_posting_offset);
+
+ nipd_right = nipd - in_posting_offset + 1;
+
+ elog(DEBUG4, "split posting tuple in_posting_offset %d nipd %d nipd_right %d",
+ in_posting_offset, nipd, nipd_right);
+
+ ipd = palloc0(sizeof(ItemPointerData) * nipd_right);
+ /* insert new item pointer */
+ memcpy(ipd, itup, sizeof(ItemPointerData));
+ /* copy item pointers from original tuple that belong on right */
+ memcpy(ipd + 1,
+ BTreeTupleGetPostingN(origtup, in_posting_offset),
+ sizeof(ItemPointerData) * (nipd - in_posting_offset));
+
+ newrighttup = BTreeFormPostingTuple(origtup, ipd, nipd_right);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxTID(neworigtup),
+ BTreeTupleGetHeapTID(newrighttup)) < 0);
+ pfree(ipd);
+
+ elog(DEBUG4, "left N %d (%u,%u) to (%u,%u), right N %d (%u,%u) to (%u,%u) ",
+ BTreeTupleIsPosting(neworigtup)?BTreeTupleGetNPosting(neworigtup):0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(neworigtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(neworigtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(neworigtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(neworigtup)),
+ BTreeTupleIsPosting(newrighttup)?BTreeTupleGetNPosting(newrighttup):0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(newrighttup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(newrighttup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(newrighttup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(newrighttup)));
+
+ /*
+ * check if the split tuples still fit into the original page
+ * TODO should we add sizeof(ItemIdData) in this check?
+ */
+ if (PageGetFreeSpace(page) < (MAXALIGN(IndexTupleSize(neworigtup))
+ + MAXALIGN(IndexTupleSize(newrighttup))
+ - MAXALIGN(IndexTupleSize(origtup))))
+ need_split = true;
+ }
+ }
+
+ /*
* Do we need to split the page to fit the item on it?
*
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
* so this comparison is correct even though we appear to be accounting
* only for the item and not for its line pointer.
*/
- if (PageGetFreeSpace(page) < itemsz)
+ if (PageGetFreeSpace(page) < itemsz || need_split)
{
bool is_root = P_ISROOT(lpageop);
bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
@@ -996,7 +1275,8 @@ _bt_insertonpg(Relation rel,
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ neworigtup, newrighttup);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@@ -1033,142 +1313,159 @@ _bt_insertonpg(Relation rel,
itup_off = newitemoff;
itup_blkno = BufferGetBlockNumber(buf);
- /*
- * If we are doing this insert because we split a page that was the
- * only one on its tree level, but was not the root, it may have been
- * the "fast root". We need to ensure that the fast root link points
- * at or above the current page. We can safely acquire a lock on the
- * metapage here --- see comments for _bt_newroot().
- */
- if (split_only_page)
+ if (!in_posting_offset)
{
- Assert(!P_ISLEAF(lpageop));
-
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
- metapg = BufferGetPage(metabuf);
- metad = BTPageGetMeta(metapg);
-
- if (metad->btm_fastlevel >= lpageop->btpo.level)
+ /*
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
+ */
+ if (split_only_page)
{
- /* no update wanted */
- _bt_relbuf(rel, metabuf);
- metabuf = InvalidBuffer;
- }
- }
-
- /*
- * Every internal page should have exactly one negative infinity item
- * at all times. Only _bt_split() and _bt_newroot() should add items
- * that become negative infinity items through truncation, since
- * they're the only routines that allocate new internal pages. Do not
- * allow a retail insertion of a new item at the negative infinity
- * offset.
- */
- if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
- elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
- itup_blkno, RelationGetRelationName(rel));
+ Assert(!P_ISLEAF(lpageop));
- /* Do the update. No ereport(ERROR) until changes are logged */
- START_CRIT_SECTION();
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
- if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
- elog(PANIC, "failed to add new item to block %u in index \"%s\"",
- itup_blkno, RelationGetRelationName(rel));
+ if (metad->btm_fastlevel >= lpageop->btpo.level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
- MarkBufferDirty(buf);
+ /*
+ * Every internal page should have exactly one negative infinity item
+ * at all times. Only _bt_split() and _bt_newroot() should add items
+ * that become negative infinity items through truncation, since
+ * they're the only routines that allocate new internal pages. Do not
+ * allow a retail insertion of a new item at the negative infinity
+ * offset.
+ */
+ if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
+ elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
+ itup_blkno, RelationGetRelationName(rel));
+
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ itup_blkno, RelationGetRelationName(rel));
+
+ MarkBufferDirty(buf);
- if (BufferIsValid(metabuf))
- {
- /* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_NOVAC_VERSION)
- _bt_upgrademetapage(metapg);
- metad->btm_fastroot = itup_blkno;
- metad->btm_fastlevel = lpageop->btpo.level;
- MarkBufferDirty(metabuf);
- }
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = itup_blkno;
+ metad->btm_fastlevel = lpageop->btpo.level;
+ MarkBufferDirty(metabuf);
+ }
- /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
- if (BufferIsValid(cbuf))
- {
- Page cpage = BufferGetPage(cbuf);
- BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+ /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
+ if (BufferIsValid(cbuf))
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
- Assert(P_INCOMPLETE_SPLIT(cpageop));
- cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
- MarkBufferDirty(cbuf);
- }
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
- /*
- * Cache the block information if we just inserted into the rightmost
- * leaf page of the index and it's not the root page. For very small
- * index where root is also the leaf, there is no point trying for any
- * optimization.
- */
- if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
- cachedBlock = BufferGetBlockNumber(buf);
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
- /* XLOG stuff */
- if (RelationNeedsWAL(rel))
- {
- xl_btree_insert xlrec;
- xl_btree_metadata xlmeta;
- uint8 xlinfo;
- XLogRecPtr recptr;
+ xlrec.offnum = itup_off;
- xlrec.offnum = itup_off;
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
- XLogBeginInsert();
- XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+ if (P_ISLEAF(lpageop))
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ else
+ {
+ /*
+ * Register the left child whose INCOMPLETE_SPLIT flag was
+ * cleared.
+ */
+ XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
- if (P_ISLEAF(lpageop))
- xlinfo = XLOG_BTREE_INSERT_LEAF;
- else
- {
- /*
- * Register the left child whose INCOMPLETE_SPLIT flag was
- * cleared.
- */
- XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
+ }
- xlinfo = XLOG_BTREE_INSERT_UPPER;
- }
+ if (BufferIsValid(metabuf))
+ {
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_heap_tuples =
+ metad->btm_last_cleanup_num_heap_tuples;
+
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
+
+ xlinfo = XLOG_BTREE_INSERT_META;
+ }
- if (BufferIsValid(metabuf))
- {
- Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
- xlmeta.version = metad->btm_version;
- xlmeta.root = metad->btm_root;
- xlmeta.level = metad->btm_level;
- xlmeta.fastroot = metad->btm_fastroot;
- xlmeta.fastlevel = metad->btm_fastlevel;
- xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
- xlmeta.last_cleanup_num_heap_tuples =
- metad->btm_last_cleanup_num_heap_tuples;
-
- XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
- XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
-
- xlinfo = XLOG_BTREE_INSERT_META;
- }
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
- XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
- recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+ if (BufferIsValid(metabuf))
+ {
+ PageSetLSN(metapg, recptr);
+ }
+ if (BufferIsValid(cbuf))
+ {
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+ }
- if (BufferIsValid(metabuf))
- {
- PageSetLSN(metapg, recptr);
- }
- if (BufferIsValid(cbuf))
- {
- PageSetLSN(BufferGetPage(cbuf), recptr);
+ PageSetLSN(page, recptr);
}
- PageSetLSN(page, recptr);
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /*
+ * Insert the new tuple in place of the existing posting tuple:
+ * delete the old posting tuple and insert the updated tuple instead.
+ *
+ * If a split was needed, both neworigtup and newrighttup are initialized
+ * and both will be inserted; otherwise newrighttup is NULL.
+ *
+ * This can only happen on a leaf page.
+ */
+ elog(DEBUG4, "_bt_insertonpg. _bt_delete_and_insert %s", RelationGetRelationName(rel));
+ _bt_delete_and_insert(rel, buf, page, neworigtup, newrighttup, newitemoff);
}
- END_CRIT_SECTION();
+ /*
+ * Cache the block information if we just inserted into the rightmost
+ * leaf page of the index and it's not the root page. For very small
+ * index where root is also the leaf, there is no point trying for any
+ * optimization.
+ */
+ if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
+ cachedBlock = BufferGetBlockNumber(buf);
/* release buffers */
if (BufferIsValid(metabuf))
@@ -1214,7 +1511,8 @@ _bt_insertonpg(Relation rel,
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple lefttup, IndexTuple righttup)
{
Buffer rbuf;
Page origpage;
@@ -1236,6 +1534,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber firstright;
OffsetNumber maxoff;
OffsetNumber i;
+ OffsetNumber replaceitemoff = InvalidOffsetNumber;
+ Size replaceitemsz;
bool newitemonleft,
isleaf;
IndexTuple lefthikey;
@@ -1243,6 +1543,24 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
/*
+ * If we're working with a split posting tuple,
+ * the new tuple is actually contained in the righttup posting list.
+ */
+ if (righttup)
+ {
+ newitem = righttup;
+ newitemsz = MAXALIGN(IndexTupleSize(righttup));
+
+ /*
+ * actual insertion is a replacement of origtup with lefttup
+ * and insertion of righttup (as newitem) next to it.
+ */
+ replaceitemoff = newitemoff;
+ replaceitemsz = MAXALIGN(IndexTupleSize(lefttup));
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
+ /*
* origpage is the original page to be split. leftpage is a temporary
* buffer that receives the left-sibling data, which will be copied back
* into origpage on success. rightpage is the new page that will receive
@@ -1275,7 +1593,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* (but not always) redundant information.
*/
firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
- newitem, &newitemonleft);
+ newitem, replaceitemoff, replaceitemsz,
+ lefttup, &newitemonleft);
/* Allocate temp buffer for leftpage */
leftpage = PageGetTempPage(origpage);
@@ -1364,6 +1683,17 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
/* incoming tuple will become last on left page */
lastleft = newitem;
}
+ else if (!newitemonleft && newitemoff == firstright && lefttup)
+ {
+ /*
+ * if newitem is first on the right page
+ * and split posting tuple handling is required,
+ * lastleft will be replaced with lefttup,
+ * so use it here
+ */
+ elog(DEBUG4, "lastleft = lefttup firstright %d", firstright);
+ lastleft = lefttup;
+ }
else
{
OffsetNumber lastleftoff;
@@ -1480,6 +1810,39 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (i == replaceitemoff)
+ {
+ if (replaceitemoff <= firstright)
+ {
+ elog(DEBUG4, "_bt_split left. replaceitem block %u %s replaceitemoff %d leftoff %d",
+ origpagenumber, RelationGetRelationName(rel), replaceitemoff, leftoff);
+ if (!_bt_pgaddtup(leftpage, MAXALIGN(IndexTupleSize(lefttup)), lefttup, leftoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ leftoff = OffsetNumberNext(leftoff);
+ }
+ else
+ {
+ elog(DEBUG4, "_bt_split right. replaceitem block %u %s replaceitemoff %d newitemoff %d",
+ origpagenumber, RelationGetRelationName(rel), replaceitemoff, newitemoff);
+ elog(DEBUG4, "_bt_split right. i %d, maxoff %d, rightoff %d", i, maxoff, rightoff);
+
+ if (!_bt_pgaddtup(rightpage, MAXALIGN(IndexTupleSize(lefttup)), lefttup, rightoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\", rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
+ }
+ rightoff = OffsetNumberNext(rightoff);
+ }
+ continue;
+ }
+
/* does new item belong before this one? */
if (i == newitemoff)
{
@@ -1497,13 +1860,14 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
}
else
{
+ elog(DEBUG4, "insert newitem to the right. i %d, maxoff %d, rightoff %d", i, maxoff, rightoff);
Assert(newitemoff >= firstright);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
- " while splitting block %u of index \"%s\"",
- origpagenumber, RelationGetRelationName(rel));
+ " while splitting block %u of index \"%s\", rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
}
rightoff = OffsetNumberNext(rightoff);
}
@@ -1547,8 +1911,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
- " while splitting block %u of index \"%s\"",
- origpagenumber, RelationGetRelationName(rel));
+ " while splitting block %u of index \"%s\" rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
}
rightoff = OffsetNumberNext(rightoff);
}
@@ -1837,7 +2201,7 @@ _bt_insert_parent(Relation rel,
/* Recursively update the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
- is_only);
+ is_only, 0);
/* be tidy */
pfree(new_item);
@@ -2290,3 +2654,206 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
* the page.
*/
}
+
+/*
+ * Add the new item (compressed or not) to the page while compressing it.
+ * If the insertion fails, an ERROR is raised; since only the temporary
+ * page copy has been modified at that point, the caller's original page
+ * is left uncompressed.
+ */
+static void
+insert_itupprev_to_page(Page page, BTCompressState *compressState)
+{
+ IndexTuple to_insert;
+ OffsetNumber offnum = PageGetMaxOffsetNumber(page);
+
+ if (compressState->ntuples == 0)
+ to_insert = compressState->itupprev;
+ else
+ {
+ IndexTuple postingtuple;
+
+ /* form a tuple with a posting list */
+ postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+ compressState->ipd,
+ compressState->ntuples);
+ to_insert = postingtuple;
+ pfree(compressState->ipd);
+ }
+
+ /* Add the new item into the page */
+ offnum = OffsetNumberNext(offnum);
+
+ elog(DEBUG4, "insert_itupprev_to_page. compressState->ntuples %d IndexTupleSize %zu free %zu",
+ compressState->ntuples, IndexTupleSize(to_insert), PageGetFreeSpace(page));
+
+ if (PageAddItem(page, (Item) to_insert, IndexTupleSize(to_insert),
+ offnum, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add tuple to page while compresing it");
+
+ if (compressState->ntuples > 0)
+ pfree(to_insert);
+ compressState->ntuples = 0;
+}
+
+/*
+ * Before splitting the page, try to compress items to free some space.
+ * If compression doesn't succeed, the buffer will contain the old state
+ * of the page. This function should be called after LP_DEAD items
+ * have been removed by _bt_vacuum_one_page().
+ */
+static void
+_bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buffer);
+ Page newpage;
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ bool use_compression = false;
+ BTCompressState *compressState = NULL;
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+
+ /*
+ * Don't use compression for indexes with INCLUDEd columns and unique
+ * indexes.
+ */
+ use_compression = (IndexRelationGetNumberOfKeyAttributes(rel) ==
+ IndexRelationGetNumberOfAttributes(rel) &&
+ !rel->rd_index->indisunique);
+ if (!use_compression)
+ return;
+
+ /* init compress state needed to build posting tuples */
+ compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+ compressState->ipd = NULL;
+ compressState->ntuples = 0;
+ compressState->itupprev = NULL;
+ compressState->maxitemsize = BTMaxItemSize(page);
+ compressState->maxpostingsize = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+
+ /*
+ * Delete dead tuples, if any.
+ * We cannot simply skip them in the loop below, because it's necessary
+ * to generate a special xlog record containing such tuples to compute
+ * latestRemovedXid on a standby server later.
+ *
+ * This should not affect performance, since it can only happen in the rare
+ * situation where the BTP_HAS_GARBAGE flag was not set and _bt_vacuum_one_page
+ * was not called, or _bt_vacuum_one_page didn't remove all dead items.
+ */
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemid))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+
+ /*
+ * Scan over all items to see which ones can be compressed
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ newpage = PageGetTempPageCopySpecial(page);
+ elog(DEBUG4, "_bt_compress_one_page rel: %s,blkno: %u",
+ RelationGetRelationName(rel), BufferGetBlockNumber(buffer));
+
+ /* Copy High Key if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId itemid = PageGetItemId(page, P_HIKEY);
+ Size itemsz = ItemIdGetLength(itemid);
+ IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add highkey during compression");
+ }
+
+ /*
+ * Iterate over tuples on the page, try to compress them into posting
+ * lists and insert into new page.
+ */
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemId);
+
+ if (compressState->itupprev != NULL)
+ {
+ int n_equal_atts =
+ _bt_keep_natts_fast(rel, compressState->itupprev, itup);
+ int itup_ntuples = BTreeTupleIsPosting(itup) ?
+ BTreeTupleGetNPosting(itup) : 1;
+
+ if (n_equal_atts > natts)
+ {
+ /*
+ * When tuples are equal, create or update posting.
+ *
+ * If posting is too big, insert it on page and continue.
+ */
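+ /*
+ * Rough upper bound on itupprev's size once the pending TIDs and the
+ * TIDs carried by itup are all added to its posting list.
+ */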
+ if (compressState->maxitemsize >
+ MAXALIGN(((IndexTupleSize(compressState->itupprev)
+ + (compressState->ntuples + itup_ntuples + 1) * sizeof(ItemPointerData)))))
+ {
+ _bt_add_posting_item(compressState, itup);
+ }
+ else
+ {
+ insert_itupprev_to_page(newpage, compressState);
+ }
+ }
+ else
+ {
+ insert_itupprev_to_page(newpage, compressState);
+ }
+ }
+
+ /*
+ * Copy the tuple into temp variable itupprev to compare it with the
+ * following tuple and maybe unite them into a posting tuple
+ */
+ if (compressState->itupprev)
+ pfree(compressState->itupprev);
+ compressState->itupprev = CopyIndexTuple(itup);
+
+ Assert(IndexTupleSize(compressState->itupprev) <= compressState->maxitemsize);
+ }
+
+ /* Handle the last item. */
+ insert_itupprev_to_page(newpage, compressState);
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buffer);
+
+ /* Log full page write */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = log_newpage_buffer(buffer, true);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ elog(DEBUG4, "_bt_compress_one_page. success");
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 9c1f7de..86c662d 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -983,14 +983,52 @@ _bt_page_recyclable(Page page)
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems,
+ OffsetNumber *remainingoffset,
+ IndexTuple *remaining, int nremaining,
BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
+ Size itemsz;
+ Size remaining_sz = 0;
+ char *remaining_buf = NULL;
+
+ /* XLOG stuff, buffer for remainings */
+ if (nremaining && RelationNeedsWAL(rel))
+ {
+ Size offset = 0;
+
+ for (int i = 0; i < nremaining; i++)
+ remaining_sz += MAXALIGN(IndexTupleSize(remaining[i]));
+
+ remaining_buf = palloc0(remaining_sz);
+ for (int i = 0; i < nremaining; i++)
+ {
+ itemsz = IndexTupleSize(remaining[i]);
+ memcpy(remaining_buf + offset, (char *) remaining[i], itemsz);
+ offset += MAXALIGN(itemsz);
+ }
+ Assert(offset == remaining_sz);
+ }
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
+ /* Handle posting tuples here */
+ for (int i = 0; i < nremaining; i++)
+ {
+ /* At first, delete the old tuple. */
+ PageIndexTupleDelete(page, remainingoffset[i]);
+
+ itemsz = IndexTupleSize(remaining[i]);
+ itemsz = MAXALIGN(itemsz);
+
+ /* Add tuple with remaining ItemPointers to the page. */
+ if (PageAddItem(page, (Item) remaining[i], itemsz, remainingoffset[i],
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to rewrite compressed item in index while doing vacuum");
+ }
+
/* Fix the page */
if (nitems > 0)
PageIndexMultiDelete(page, itemnos, nitems);
@@ -1020,6 +1058,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+ xlrec_vacuum.nremaining = nremaining;
+ xlrec_vacuum.ndeleted = nitems;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
@@ -1033,6 +1073,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
if (nitems > 0)
XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
+ /*
+ * Here we should save offnums and remaining tuples themselves. It's
+ * important to restore them in correct order. At first, we must
+ * handle remaining tuples and only after that other deleted items.
+ */
+ if (nremaining > 0)
+ {
+ Assert(remaining_buf != NULL);
+ XLogRegisterBufData(0, (char *) remainingoffset,
+ nremaining * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, remaining_buf, remaining_sz);
+ }
+
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
PageSetLSN(page, recptr);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4cfd528..22fb228 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
+static ItemPointer btreevacuumPosting(BTVacState *vstate, IndexTuple itup,
+ int *nremaining);
/*
@@ -1069,7 +1071,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
_bt_checkpage(rel, buf);
- _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
+ _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0,
+ vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
@@ -1193,6 +1196,9 @@ restart:
OffsetNumber offnum,
minoff,
maxoff;
+ IndexTuple remaining[MaxOffsetNumber];
+ OffsetNumber remainingoffset[MaxOffsetNumber];
+ int nremaining;
/*
* Trade in the initial read lock for a super-exclusive write lock on
@@ -1229,6 +1235,7 @@ restart:
* callback function.
*/
ndeletable = 0;
+ nremaining = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
if (callback)
@@ -1242,31 +1249,78 @@ restart:
itup = (IndexTuple) PageGetItem(page,
PageGetItemId(page, offnum));
- htup = &(itup->t_tid);
- /*
- * During Hot Standby we currently assume that
- * XLOG_BTREE_VACUUM records do not produce conflicts. That is
- * only true as long as the callback function depends only
- * upon whether the index tuple refers to heap tuples removed
- * in the initial heap scan. When vacuum starts it derives a
- * value of OldestXmin. Backends taking later snapshots could
- * have a RecentGlobalXmin with a later xid than the vacuum's
- * OldestXmin, so it is possible that row versions deleted
- * after OldestXmin could be marked as killed by other
- * backends. The callback function *could* look at the index
- * tuple state in isolation and decide to delete the index
- * tuple, though currently it does not. If it ever did, we
- * would need to reconsider whether XLOG_BTREE_VACUUM records
- * should cause conflicts. If they did cause conflicts they
- * would be fairly harsh conflicts, since we haven't yet
- * worked out a way to pass a useful value for
- * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
- * applies to *any* type of index that marks index tuples as
- * killed.
- */
- if (callback(htup, callback_state))
- deletable[ndeletable++] = offnum;
+ if (BTreeTupleIsPosting(itup))
+ {
+ int nnewipd = 0;
+ ItemPointer newipd = NULL;
+
+ newipd = btreevacuumPosting(vstate, itup, &nnewipd);
+
+ if (nnewipd == 0)
+ {
+ /*
+ * All TIDs from posting list must be deleted, we can
+ * delete whole tuple in a regular way.
+ */
+ deletable[ndeletable++] = offnum;
+ }
+ else if (nnewipd == BTreeTupleGetNPosting(itup))
+ {
+ /*
+ * All TIDs from posting tuple must remain. Do
+ * nothing, just cleanup.
+ */
+ pfree(newipd);
+ }
+ else if (nnewipd < BTreeTupleGetNPosting(itup))
+ {
+ /* Some TIDs from posting tuple must remain. */
+ Assert(nnewipd > 0);
+ Assert(newipd != NULL);
+
+ /*
+ * Form new tuple that contains only remaining TIDs.
+ * Remember this tuple and the offset of the old tuple
+ * to update it in place.
+ */
+ remainingoffset[nremaining] = offnum;
+ remaining[nremaining] = BTreeFormPostingTuple(itup, newipd, nnewipd);
+ nremaining++;
+ pfree(newipd);
+
+ Assert(IndexTupleSize(itup) <= BTMaxItemSize(page));
+ }
+ }
+ else
+ {
+ htup = &(itup->t_tid);
+
+ /*
+ * During Hot Standby we currently assume that
+ * XLOG_BTREE_VACUUM records do not produce conflicts.
+ * That is only true as long as the callback function
+ * depends only upon whether the index tuple refers to
+ * heap tuples removed in the initial heap scan. When
+ * vacuum starts it derives a value of OldestXmin.
+ * Backends taking later snapshots could have a
+ * RecentGlobalXmin with a later xid than the vacuum's
+ * OldestXmin, so it is possible that row versions deleted
+ * after OldestXmin could be marked as killed by other
+ * backends. The callback function *could* look at the
+ * index tuple state in isolation and decide to delete the
+ * index tuple, though currently it does not. If it ever
+ * did, we would need to reconsider whether
+ * XLOG_BTREE_VACUUM records should cause conflicts. If
+ * they did cause conflicts they would be fairly harsh
+ * conflicts, since we haven't yet worked out a way to
+ * pass a useful value for latestRemovedXid on the
+ * XLOG_BTREE_VACUUM records. This applies to *any* type
+ * of index that marks index tuples as killed.
+ */
+ if (callback(htup, callback_state))
+ deletable[ndeletable++] = offnum;
+ }
}
}
@@ -1274,7 +1328,7 @@ restart:
* Apply any needed deletes. We issue just one _bt_delitems_vacuum()
* call per page, so as to minimize WAL traffic.
*/
- if (ndeletable > 0)
+ if (ndeletable > 0 || nremaining > 0)
{
/*
* Notice that the issued XLOG_BTREE_VACUUM WAL record includes
@@ -1291,6 +1345,7 @@ restart:
* that.
*/
_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
+ remainingoffset, remaining, nremaining,
vstate->lastBlockVacuumed);
/*
@@ -1376,6 +1431,41 @@ restart:
}
/*
+ * btreevacuumPosting() -- vacuums a posting tuple.
+ *
+ * Returns new palloc'd posting list with remaining items.
+ * Posting list size is returned via nremaining.
+ *
+ * If all items are dead,
+ * nremaining is 0 and resulting posting list is NULL.
+ */
+static ItemPointer
+btreevacuumPosting(BTVacState *vstate, IndexTuple itup, int *nremaining)
+{
+ int remaining = 0;
+ int nitem = BTreeTupleGetNPosting(itup);
+ ItemPointer tmpitems = NULL,
+ items = BTreeTupleGetPosting(itup);
+
+ /*
+ * Check each tuple in the posting list, save alive tuples into tmpitems
+ */
+ for (int i = 0; i < nitem; i++)
+ {
+ if (vstate->callback(items + i, vstate->callback_state))
+ continue;
+
+ if (tmpitems == NULL)
+ tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+
+ tmpitems[remaining++] = items[i];
+ }
+
+ *nremaining = remaining;
+ return tmpitems;
+}
+
+/*
* btcanreturn() -- Check whether btree indexes support index-only scans.
*
* btrees always do, so this is trivial.
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 19735bf..de0af9e 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -30,6 +30,9 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
+static void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer iptr,
+ IndexTuple itup, int i);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
@@ -504,7 +507,8 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
/* We have low <= mid < high, so mid points at a real slot */
- result = _bt_compare(rel, key, page, mid);
+ result = _bt_compare_posting(rel, key, page, mid,
+ &(insertstate->in_posting_offset));
if (result >= cmpval)
low = mid + 1;
@@ -533,6 +537,60 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
return low;
}
+/*
+ * Compare insertion-type scankey to tuple on a page,
+ * taking into account posting tuples.
+ * If the key of the posting tuple is equal to the scankey,
+ * find the exact position inside the posting list,
+ * using the TID as an extra attribute.
+ */
+int32
+_bt_compare_posting(Relation rel,
+ BTScanInsert key,
+ Page page,
+ OffsetNumber offnum,
+ int *in_posting_offset)
+{
+ IndexTuple itup;
+ int result;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ result = _bt_compare(rel, key, page, offnum);
+
+ if (BTreeTupleIsPosting(itup) && result == 0)
+ {
+ int low,
+ high,
+ mid,
+ res;
+
+ low = 0;
+ /* "high" is past end of posting list for loop invariant */
+ high = BTreeTupleGetNPosting(itup);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res >= 1)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
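+ /*
+ * 'high' is now the offset of the first posting-list entry whose TID
+ * is >= scantid; this becomes in_posting_offset.
+ */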
+ *in_posting_offset = high;
+ elog(DEBUG4, "_bt_compare_posting in_posting_offset %d", *in_posting_offset);
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ key->scantid) < 0);
+ Assert(ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxTID(itup)) < 0);
+ }
+
+ return result;
+}
+
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
@@ -665,61 +723,120 @@ _bt_compare(Relation rel,
* Use the heap TID attribute and scantid to try to break the tie. The
* rules are the same as any other key attribute -- only the
* representation differs.
+ *
+ * When itup is a posting tuple, the check becomes more complex. It is
+ * possible that the scankey belongs to the tuple's posting list TID
+ * range.
+ *
+ * _bt_compare() is multipurpose, so it just returns 0 for a fact that key
+ * matches tuple at this offset.
+ *
+ * Use special _bt_compare_posting() wrapper function to handle this case
+ * and perform recheck for posting tuple, finding exact position of the
+ * scankey.
*/
- heapTid = BTreeTupleGetHeapTID(itup);
- if (key->scantid == NULL)
+ if (!BTreeTupleIsPosting(itup))
{
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values
+ * for attributes up to and including the least significant
+ * untruncated attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high
+ * key is ('foo', -inf), and scankey is ('foo', <omitted>), the
+ * search will not descend to the page to the left. The search
+ * will descend right instead. The truncated attribute in pivot
+ * tuple means that all non-pivot tuples on the page to the left
+ * are strictly < 'foo', so it isn't necessary to descend left. In
+ * other words, search doesn't have to descend left because it
+ * isn't interested in a match that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require
+ * that we descend left when this happens. -inf is treated as a
+ * possible match for omitted scankey attribute(s). This is
+ * needed by page deletion, which must re-find leaf pages that are
+ * targets for deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is
+ * being compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to
+ * the left here, since they have no heap TID attribute (and
+ * cannot have any -inf key values in any case, since truncation
+ * can only remove non-key attributes). !heapkeyspace searches
+ * must always be prepared to deal with matches on both sides of
+ * the pivot once the leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
/*
- * Most searches have a scankey that is considered greater than a
- * truncated pivot tuple if and when the scankey has equal values for
- * attributes up to and including the least significant untruncated
- * attribute in tuple.
- *
- * For example, if an index has the minimum two attributes (single
- * user key attribute, plus heap TID attribute), and a page's high key
- * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
- * will not descend to the page to the left. The search will descend
- * right instead. The truncated attribute in pivot tuple means that
- * all non-pivot tuples on the page to the left are strictly < 'foo',
- * so it isn't necessary to descend left. In other words, search
- * doesn't have to descend left because it isn't interested in a match
- * that has a heap TID value of -inf.
- *
- * However, some searches (pivotsearch searches) actually require that
- * we descend left when this happens. -inf is treated as a possible
- * match for omitted scankey attribute(s). This is needed by page
- * deletion, which must re-find leaf pages that are targets for
- * deletion using their high keys.
- *
- * Note: the heap TID part of the test ensures that scankey is being
- * compared to a pivot tuple with one or more truncated key
- * attributes.
- *
- * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
- * left here, since they have no heap TID attribute (and cannot have
- * any -inf key values in any case, since truncation can only remove
- * non-key attributes). !heapkeyspace searches must always be
- * prepared to deal with matches on both sides of the pivot once the
- * leaf level is reached.
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
*/
- if (key->heapkeyspace && !key->pivotsearch &&
- key->keysz == ntupatts && heapTid == NULL)
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
return 1;
- /* All provided scankey arguments found to be equal */
- return 0;
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ return ItemPointerCompare(key->scantid, heapTid);
}
+ else
+ {
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid != NULL && heapTid != NULL)
+ {
+ int cmp = ItemPointerCompare(key->scantid, heapTid);
- /*
- * Treat truncated heap TID as minus infinity, since scankey has a key
- * attribute value (scantid) that would otherwise be compared directly
- */
- Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
- if (heapTid == NULL)
- return 1;
+ if (cmp == -1 || cmp == 0)
+ {
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is less than or equal to posting tuple (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return cmp;
+ }
- Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
- return ItemPointerCompare(key->scantid, heapTid);
+ heapTid = BTreeTupleGetMaxTID(itup);
+ cmp = ItemPointerCompare(key->scantid, heapTid);
+ if (cmp == 1)
+ {
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is greater than posting tuple (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return cmp;
+ }
+
+ /*
+ * If we got here, scantid falls between the posting items of the
+ * tuple.
+ */
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is between posting items (%u,%u) and (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(itup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(itup)),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return 0;
+ }
+ }
+
+ return 0;
}
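
For reviewers, here is a tiny standalone sketch of the three-way outcome the posting branch above is meant to produce (scankey <= minimum TID, scankey > maximum TID, or scankey somewhere inside the posting list). The simplified tid type and helper names are mine, not the patch's:

/* Standalone model of classifying a scan TID against a posting tuple's
 * TID range; illustrative names only, not patch code. */
#include <stdio.h>

typedef struct { unsigned block; unsigned offset; } tid;

static int
tid_cmp(tid a, tid b)
{
    if (a.block != b.block)
        return a.block < b.block ? -1 : 1;
    if (a.offset != b.offset)
        return a.offset < b.offset ? -1 : 1;
    return 0;
}

/* Mirrors the branch above: <= min TID returns the first comparison,
 * > max TID returns 1, and anything in between is treated as "equal" (0),
 * meaning the scan TID lands inside the posting list. */
static int
compare_to_posting_range(tid scantid, tid mintid, tid maxtid)
{
    int cmp = tid_cmp(scantid, mintid);

    if (cmp <= 0)
        return cmp;
    if (tid_cmp(scantid, maxtid) > 0)
        return 1;
    return 0;
}

int
main(void)
{
    tid min = {10, 1}, max = {10, 9};
    tid probe = {10, 5};

    printf("%d\n", compare_to_posting_range(probe, min, max)); /* prints 0 */
    return 0;
}
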
/*
@@ -1456,6 +1573,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
/* initialize tuple workspace to empty */
so->currPos.nextTupleOffset = 0;
+ so->currPos.prevTupleOffset = 0;
/*
* Now that the current page has been made consistent, the macro should be
@@ -1490,8 +1608,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
/* tuple passes all scan key conditions, so remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
- itemIndex++;
+ if (!BTreeTupleIsPosting(itup))
+ {
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ /* Return posting list "logical" tuples */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ itup, i);
+ itemIndex++;
+ }
+ }
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
@@ -1524,7 +1656,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (!continuescan)
so->currPos.moreRight = false;
- Assert(itemIndex <= MaxIndexTuplesPerPage);
+ Assert(itemIndex <= MaxPostingIndexTuplesPerPage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
@@ -1532,7 +1664,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
else
{
/* load items[] in descending order */
- itemIndex = MaxIndexTuplesPerPage;
+ itemIndex = MaxPostingIndexTuplesPerPage;
offnum = Min(offnum, maxoff);
@@ -1574,8 +1706,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions, so remember it */
- itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ if (!BTreeTupleIsPosting(itup))
+ {
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ /* Return posting list "logical" tuples */
+ /* XXX: Maybe this loop should be backwards? */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ itup, i);
+ }
+ }
}
if (!continuescan)
{
@@ -1589,8 +1736,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Assert(itemIndex >= 0);
so->currPos.firstItem = itemIndex;
- so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
- so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+ so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1;
+ so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1;
}
return (so->currPos.firstItem <= so->currPos.lastItem);
@@ -1603,6 +1750,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ Assert(!BTreeTupleIsPosting(itup));
+
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
@@ -1615,6 +1764,33 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
}
}
+/* Save an index item into so->currPos.items[itemIndex] for posting tuples. */
+static void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer iptr, IndexTuple itup, int i)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *iptr;
+ currItem->indexOffset = offnum;
+
+ if (so->currTuples)
+ {
+ if (i == 0)
+ {
+ /* Save the key; it is the same for all tuples in the posting list */
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ so->currPos.prevTupleOffset = currItem->tupleOffset;
+ }
+ else
+ currItem->tupleOffset = so->currPos.prevTupleOffset;
+ }
+}
+
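
A side note on _bt_savepostingitem() above: the key bytes are copied into the tuple workspace only once per posting tuple, and every other "logical" item reuses that copy through prevTupleOffset. A toy model of that sharing, with made-up types (nothing here is patch code):

/* Toy model of the one-key-copy-per-posting-tuple idea; not patch code. */
#include <stdio.h>
#include <string.h>

#define WORKSPACE_SIZE 1024
#define MAX_ITEMS 32

typedef struct
{
    int     heaptid;        /* stand-in for the real ItemPointerData */
    int     tupleoffset;    /* offset of the shared key copy */
} scan_item;

static char workspace[WORKSPACE_SIZE];
static int  next_offset = 0;
static int  prev_offset = 0;

static void
save_posting_item(scan_item *items, int idx, int heaptid,
                  const char *key, int keylen, int i)
{
    items[idx].heaptid = heaptid;
    if (i == 0)
    {
        /* first TID of the posting tuple: copy the key once */
        items[idx].tupleoffset = next_offset;
        memcpy(workspace + next_offset, key, keylen);
        prev_offset = next_offset;
        next_offset += keylen;
    }
    else
        items[idx].tupleoffset = prev_offset;   /* reuse the shared copy */
}

int
main(void)
{
    scan_item items[MAX_ITEMS];
    int heaptids[] = {101, 102, 103};

    for (int i = 0; i < 3; i++)
        save_posting_item(items, i, heaptids[i], "key-A", 6, i);

    /* all three logical items point at the same key copy (prints "0 0 0") */
    printf("%d %d %d\n", items[0].tupleoffset,
           items[1].tupleoffset, items[2].tupleoffset);
    return 0;
}
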
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index b30cf9e..b058599 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -288,6 +288,8 @@ static void _bt_sortaddtup(Page page, Size itemsize,
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void _bt_buildadd_posting(BTWriteState *wstate, BTPageState *state,
+ BTCompressState *compressState);
static void _bt_load(BTWriteState *wstate,
BTSpool *btspool, BTSpool *btspool2);
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
@@ -972,6 +974,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* only shift the line pointer array back and forth, and overwrite
* the tuple space previously occupied by oitup. This is fairly
* cheap.
+ *
+ * If the lastleft tuple is a posting tuple, its posting list is
+ * truncated away in _bt_truncate() as well. Note that this applies
+ * only to leaf pages, since internal pages never contain posting
+ * tuples.
*/
ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
lastleft = (IndexTuple) PageGetItem(opage, ii);
@@ -1011,6 +1018,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* the minimum key for the new page.
*/
state->btps_minkey = CopyIndexTuple(oitup);
+ Assert(BTreeTupleIsPivot(state->btps_minkey));
/*
* Set the sibling links for both pages.
@@ -1052,6 +1060,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
Assert(state->btps_minkey == NULL);
state->btps_minkey = CopyIndexTuple(itup);
/* _bt_sortaddtup() will perform full truncation later */
+ BTreeTupleClearBtIsPosting(state->btps_minkey);
BTreeTupleSetNAtts(state->btps_minkey, 0);
}
@@ -1137,6 +1146,91 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
}
/*
+ * Add new tuple (posting or non-posting) to the page while building index.
+ */
+static void
+_bt_buildadd_posting(BTWriteState *wstate, BTPageState *state,
+ BTCompressState *compressState)
+{
+ IndexTuple to_insert;
+
+ /* Return if there is no tuple to insert */
+ if (state == NULL)
+ return;
+
+ if (compressState->ntuples == 0)
+ to_insert = compressState->itupprev;
+ else
+ {
+ IndexTuple postingtuple;
+
+ /* form a tuple with a posting list */
+ postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+ compressState->ipd,
+ compressState->ntuples);
+ to_insert = postingtuple;
+ pfree(compressState->ipd);
+ }
+
+ _bt_buildadd(wstate, state, to_insert);
+
+ if (compressState->ntuples > 0)
+ pfree(to_insert);
+ compressState->ntuples = 0;
+}
+
+/*
+ * Save item pointer(s) of itup to the posting list in compressState.
+ *
+ * Helper function for _bt_load() and _bt_compress_one_page().
+ *
+ * Note: caller is responsible for size check to ensure that resulting tuple
+ * won't exceed BTMaxItemSize.
+ */
+void
+_bt_add_posting_item(BTCompressState *compressState, IndexTuple itup)
+{
+ int nposting = 0;
+
+ if (compressState->ntuples == 0)
+ {
+ compressState->ipd = palloc0(compressState->maxitemsize);
+
+ if (BTreeTupleIsPosting(compressState->itupprev))
+ {
+ /* if itupprev is posting, add all its TIDs to the posting list */
+ nposting = BTreeTupleGetNPosting(compressState->itupprev);
+ memcpy(compressState->ipd,
+ BTreeTupleGetPosting(compressState->itupprev),
+ sizeof(ItemPointerData) * nposting);
+ compressState->ntuples += nposting;
+ }
+ else
+ {
+ memcpy(compressState->ipd, compressState->itupprev,
+ sizeof(ItemPointerData));
+ compressState->ntuples++;
+ }
+ }
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ /* if tuple is posting, add all its TIDs to the posting list */
+ nposting = BTreeTupleGetNPosting(itup);
+ memcpy(compressState->ipd + compressState->ntuples,
+ BTreeTupleGetPosting(itup),
+ sizeof(ItemPointerData) * nposting);
+ compressState->ntuples += nposting;
+ }
+ else
+ {
+ memcpy(compressState->ipd + compressState->ntuples, itup,
+ sizeof(ItemPointerData));
+ compressState->ntuples++;
+ }
+}
+
+/*
* Read tuples in correct sort order from tuplesort, and load them into
* btree leaves.
*/
@@ -1150,9 +1244,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
bool load1;
TupleDesc tupdes = RelationGetDescr(wstate->index);
int i,
- keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
+ keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index),
+ natts = IndexRelationGetNumberOfAttributes(wstate->index);
SortSupport sortKeys;
int64 tuples_done = 0;
+ bool use_compression = false;
+ BTCompressState *compressState = NULL;
+
+ /*
+ * Don't use compression for indexes with INCLUDEd columns or for
+ * unique indexes.
+ */
+ use_compression = (IndexRelationGetNumberOfKeyAttributes(wstate->index) ==
+ IndexRelationGetNumberOfAttributes(wstate->index) &&
+ !wstate->index->rd_index->indisunique);
if (merge)
{
@@ -1266,19 +1371,89 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
}
else
{
- /* merge is unnecessary */
- while ((itup = tuplesort_getindextuple(btspool->sortstate,
- true)) != NULL)
+ if (!use_compression)
{
- /* When we see first tuple, create first index page */
- if (state == NULL)
- state = _bt_pagestate(wstate, 0);
+ /* merge is unnecessary */
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
- _bt_buildadd(wstate, state, itup);
+ _bt_buildadd(wstate, state, itup);
- /* Report progress */
- pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
- ++tuples_done);
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ }
+ else
+ {
+ /* init compress state needed to build posting tuples */
+ compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+ compressState->ipd = NULL;
+ compressState->ntuples = 0;
+ compressState->itupprev = NULL;
+ compressState->maxitemsize = 0;
+ compressState->maxpostingsize = 0;
+
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ {
+ state = _bt_pagestate(wstate, 0);
+ compressState->maxitemsize = BTMaxItemSize(state->btps_page);
+ }
+
+ if (compressState->itupprev != NULL)
+ {
+ int n_equal_atts = _bt_keep_natts_fast(wstate->index,
+ compressState->itupprev, itup);
+
+ if (n_equal_atts > natts)
+ {
+ /*
+ * Tuples are equal. If the pending posting list still has
+ * room, add this TID to it. Otherwise the posting tuple is
+ * already as large as allowed, so insert it on the page and
+ * continue.
+ */
+ if ((compressState->ntuples + 1) * sizeof(ItemPointerData) <
+ compressState->maxpostingsize)
+ _bt_add_posting_item(compressState, itup);
+ else
+ _bt_buildadd_posting(wstate, state,
+ compressState);
+ }
+ else
+ {
+ /*
+ * Tuples are not equal. Insert itupprev into index.
+ * Save current tuple for the next iteration.
+ */
+ _bt_buildadd_posting(wstate, state, compressState);
+ }
+ }
+
+ /*
+ * Save the tuple so that we can compare it with the next one and
+ * possibly merge them into a posting tuple.
+ */
+ if (compressState->itupprev)
+ pfree(compressState->itupprev);
+ compressState->itupprev = CopyIndexTuple(itup);
+
+ /* compute max size of posting list */
+ compressState->maxpostingsize = compressState->maxitemsize -
+ IndexInfoFindDataOffset(compressState->itupprev->t_info) -
+ MAXALIGN(IndexTupleSize(compressState->itupprev));
+ }
+
+ /* Handle the last item */
+ _bt_buildadd_posting(wstate, state, compressState);
}
}
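
The _bt_load() changes above amount to a simple accumulate-and-flush loop: keep collecting TIDs while the key stays the same and the pending posting list is within its size budget, otherwise emit what has been collected so far. A condensed, self-contained sketch of that control flow with toy types (none of these names come from the patch):

/* Condensed sketch of the dedup accumulate-and-flush loop; toy types only. */
#include <stdio.h>
#include <string.h>

#define MAX_POSTING 4           /* stand-in for the maxpostingsize check */

typedef struct
{
    char    key[16];
    int     tid;
} toy_tuple;

static void
flush(const char *key, const int *tids, int ntids)
{
    printf("add tuple key=%s ntids=%d first_tid=%d\n", key, ntids, tids[0]);
}

int
main(void)
{
    toy_tuple input[] = {
        {"apple", 1}, {"apple", 2}, {"apple", 3},
        {"pear", 4}, {"pear", 5}
    };
    int     ntuples = (int) (sizeof(input) / sizeof(input[0]));
    char    prevkey[16] = "";
    int     tids[MAX_POSTING];
    int     ntids = 0;

    for (int i = 0; i < ntuples; i++)
    {
        if (ntids > 0 &&
            (strcmp(prevkey, input[i].key) != 0 || ntids == MAX_POSTING))
        {
            /* key changed or posting list full: flush what we have */
            flush(prevkey, tids, ntids);
            ntids = 0;
        }
        strcpy(prevkey, input[i].key);
        tids[ntids++] = input[i].tid;
    }
    if (ntids > 0)
        flush(prevkey, tids, ntids);    /* handle the last pending item */
    return 0;
}
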
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index a7882fd..c492b04 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -62,6 +62,11 @@ typedef struct
int nsplits; /* current number of splits */
SplitPoint *splits; /* all candidate split points for page */
int interval; /* current range of acceptable split points */
+
+ /* fields only valid when the insert also replaces an existing posting tuple */
+ OffsetNumber replaceitemoff;
+ IndexTuple replaceitem;
+ Size replaceitemsz;
} FindSplitData;
static void _bt_recsplitloc(FindSplitData *state,
@@ -129,6 +134,9 @@ _bt_findsplitloc(Relation rel,
OffsetNumber newitemoff,
Size newitemsz,
IndexTuple newitem,
+ OffsetNumber replaceitemoff,
+ Size replaceitemsz,
+ IndexTuple replaceitem,
bool *newitemonleft)
{
BTPageOpaque opaque;
@@ -183,6 +191,10 @@ _bt_findsplitloc(Relation rel,
state.minfirstrightsz = SIZE_MAX;
state.newitemoff = newitemoff;
+ state.replaceitemoff = replaceitemoff;
+ state.replaceitemsz = replaceitemsz;
+ state.replaceitem = replaceitem;
+
/*
* maxsplits should never exceed maxoff because there will be at most as
* many candidate split points as there are points _between_ tuples, once
@@ -207,7 +219,17 @@ _bt_findsplitloc(Relation rel,
Size itemsz;
itemid = PageGetItemId(page, offnum);
- itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+ /* use size of replacing item for calculations */
+ if (offnum == replaceitemoff)
+ {
+ itemsz = replaceitemsz + sizeof(ItemIdData);
+ olddataitemstotal = state.olddataitemstotal = state.olddataitemstotal
+ - MAXALIGN(ItemIdGetLength(itemid))
+ + replaceitemsz;
+ }
+ else
+ itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/*
* When item offset number is not newitemoff, neither side of the
@@ -466,9 +488,13 @@ _bt_recsplitloc(FindSplitData *state,
&& !newitemonleft);
if (newitemisfirstonright)
+ {
firstrightitemsz = state->newitemsz;
+ }
else
+ {
firstrightitemsz = firstoldonrightsz;
+ }
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
@@ -492,12 +518,12 @@ _bt_recsplitloc(FindSplitData *state,
* adding a heap TID to the left half's new high key when splitting at the
* leaf level. In practice the new high key will often be smaller and
* will rarely be larger, but conservatively assume the worst case.
+ * Truncation always truncates away any posting list that appears in the
+ * first right tuple, though, so it's safe to subtract that overhead
+ * (while still conservatively assuming that truncation might have to add
+ * back a single heap TID using the pivot tuple heap TID representation).
*/
- if (state->is_leaf)
- leftfree -= (int16) (firstrightitemsz +
- MAXALIGN(sizeof(ItemPointerData)));
- else
- leftfree -= (int16) firstrightitemsz;
+ leftfree -= (int16) firstrightitemsz;
/* account for the new item */
if (newitemonleft)
@@ -1066,13 +1092,20 @@ static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
ItemId itemid;
+ OffsetNumber offset;
if (split->newitemonleft && split->firstoldonright == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page,
- OffsetNumberPrev(split->firstoldonright));
- return (IndexTuple) PageGetItem(state->page, itemid);
+ offset = OffsetNumberPrev(split->firstoldonright);
+ if (offset == state->replaceitemoff)
+ return state->replaceitem;
+ else
+ {
+ itemid = PageGetItemId(state->page, offset);
+ return (IndexTuple) PageGetItem(state->page, itemid);
+ }
}
/*
@@ -1086,6 +1119,11 @@ _bt_split_firstright(FindSplitData *state, SplitPoint *split)
if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page, split->firstoldonright);
- return (IndexTuple) PageGetItem(state->page, itemid);
+ if (split->firstoldonright == state->replaceitemoff)
+ return state->replaceitem;
+ else
+ {
+ itemid = PageGetItemId(state->page, split->firstoldonright);
+ return (IndexTuple) PageGetItem(state->page, itemid);
+ }
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 9b172c1..c56f5ab 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -111,8 +111,12 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
key->nextkey = false;
key->pivotsearch = false;
key->keysz = Min(indnkeyatts, tupnatts);
- key->scantid = key->heapkeyspace && itup ?
- BTreeTupleGetHeapTID(itup) : NULL;
+
+ if (itup && key->heapkeyspace)
+ key->scantid = BTreeTupleGetHeapTID(itup);
+ else
+ key->scantid = NULL;
+
skey = key->scankeys;
for (i = 0; i < indnkeyatts; i++)
{
@@ -1787,7 +1791,9 @@ _bt_killitems(IndexScanDesc scan)
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
- if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ /* No microvacuum for posting tuples */
+ if (!BTreeTupleIsPosting(ituple) &&
+ (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)))
{
/* found the item */
ItemIdMarkDead(iid);
@@ -2112,6 +2118,7 @@ btbuildphasename(int64 phasenum)
* returning an enlarged tuple to caller when truncation + TOAST compression
* ends up enlarging the final datum.
*/
+
IndexTuple
_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
BTScanInsert itup_key)
@@ -2124,6 +2131,17 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
ItemPointer pivotheaptid;
Size newsize;
+ elog(DEBUG4, "_bt_truncate left N %d (%u,%u) to (%u,%u), right N %d (%u,%u) to (%u,%u) ",
+ BTreeTupleIsPosting(lastleft) ? BTreeTupleGetNPosting(lastleft) : 0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(lastleft)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(lastleft)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(lastleft)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(lastleft)),
+ BTreeTupleIsPosting(firstright) ? BTreeTupleGetNPosting(firstright) : 0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(firstright)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(firstright)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(firstright)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(firstright)));
/*
* We should only ever truncate leaf index tuples. It's never okay to
* truncate a second time.
@@ -2145,6 +2163,16 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
+ if (BTreeTupleIsPosting(firstright))
+ {
+ BTreeTupleClearBtIsPosting(pivot);
+ BTreeTupleSetNAtts(pivot, keepnatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= BTreeTupleGetPostingOffset(firstright);
+ }
+
+ Assert(!BTreeTupleIsPosting(pivot));
+
/*
* If there is a distinguishing key attribute within new pivot tuple,
* there is no need to add an explicit heap TID attribute
@@ -2161,6 +2189,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute to the new pivot tuple.
*/
Assert(natts != nkeyatts);
+ Assert(!BTreeTupleIsPosting(lastleft));
+ Assert(!BTreeTupleIsPosting(firstright));
newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
tidpivot = palloc0(newsize);
memcpy(tidpivot, pivot, IndexTupleSize(pivot));
@@ -2168,6 +2198,27 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pfree(pivot);
pivot = tidpivot;
}
+ else if (BTreeTupleIsPosting(firstright))
+ {
+ /*
+ * No truncation was possible, since key attributes are all equal. But
+ * the tuple is a compressed tuple with a posting list, so we still
+ * must truncate it.
+ *
+ * It's necessary to add a heap TID attribute to the new pivot tuple.
+ */
+ newsize = BTreeTupleGetPostingOffset(firstright) +
+ MAXALIGN(sizeof(ItemPointerData));
+ pivot = palloc0(newsize);
+ memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright));
+
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= newsize;
+ BTreeTupleClearBtIsPosting(pivot);
+ BTreeTupleSetAltHeapTID(pivot);
+
+ Assert(!BTreeTupleIsPosting(pivot));
+ }
else
{
/*
@@ -2205,7 +2256,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
sizeof(ItemPointerData));
- ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetMaxTID(lastleft), pivotheaptid);
/*
* Lehman and Yao require that the downlink to the right page, which is to
@@ -2216,9 +2267,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* tiebreaker.
*/
#ifndef DEBUG_NO_TRUNCATE
- Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
- Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#else
/*
@@ -2231,7 +2285,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute values along with lastleft's heap TID value when lastleft's
* TID happens to be greater than firstright's TID.
*/
- ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
/*
* Pivot heap TID should never be fully equal to firstright. Note that
@@ -2240,7 +2294,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
ItemPointerSetOffsetNumber(pivotheaptid,
OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#endif
BTreeTupleSetNAtts(pivot, nkeyatts);
@@ -2330,6 +2385,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* leaving excessive amounts of free space on either side of page split.
* Callers can rely on the fact that attributes considered equal here are
* definitely also equal according to _bt_keep_natts.
+ *
+ * To build a posting tuple we need to ensure that all attributes
+ * of both tuples are equal. Use this function to compare them.
+ * TODO: maybe it's worth renaming the function.
+ *
+ * XXX: Obviously we need infrastructure for making sure it is okay to use
+ * this for posting list stuff. For example, non-deterministic collations
+ * cannot use compression, and will not work with what we have now.
+ *
+ * XXX: Even then, we probably also need to worry about TOAST as a special
+ * case. Don't repeat bugs like the amcheck bug that was fixed in commit
+ * eba775345d23d2c999bbb412ae658b6dab36e3e8. As the test case added in that
+ * commit shows, we need to worry about pg_attribute.attstorage changing in
+ * the underlying table due to an ALTER TABLE (and maybe a few other things
+ * like that). In general, the "TOAST input state" of a TOASTable datum isn't
+ * something that we make many guarantees about today, so even with C
+ * collation text we could in theory get different answers from
+ * _bt_keep_natts_fast() and _bt_keep_natts(). This needs to be nailed down
+ * in some way.
*/
int
_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@@ -2415,7 +2489,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* Non-pivot tuples currently never use alternative heap TID
* representation -- even those within heapkeyspace indexes
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+ if (BTreeTupleIsPivot(itup))
return false;
/*
@@ -2470,7 +2544,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* that to decide if the tuple is a pre-v11 tuple.
*/
return tupnatts == 0 ||
- ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
+ (!BTreeTupleIsPivot(itup) &&
ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
}
else
@@ -2497,7 +2571,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* heapkeyspace index pivot tuples, regardless of whether or not there are
* non-key attributes.
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ if (!BTreeTupleIsPivot(itup))
return false;
/*
@@ -2549,6 +2623,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
return;
+ /* TODO: correct the error messages for posting tuples */
+
/*
* Internal page insertions cannot fail here, because that would mean that
* an earlier leaf level insertion that should have failed didn't
@@ -2575,3 +2651,79 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
"or use full text indexing."),
errtableconstraint(heap, RelationGetRelationName(rel))));
}
+
+/*
+ * Given a base tuple that contains the key datum(s), build a posting tuple.
+ *
+ * The base tuple may itself be a posting tuple, but only its key part is
+ * used; all ItemPointers must be passed via ipd.
+ *
+ * If nipd == 1, fall back to building a regular non-posting tuple. This
+ * avoids storage overhead after a posting tuple has been vacuumed down to
+ * a single TID.
+ */
+IndexTuple
+BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, int nipd)
+{
+ uint32 keysize,
+ newsize = 0;
+ IndexTuple itup;
+
+ /* We only need key part of the tuple */
+ if (BTreeTupleIsPosting(tuple))
+ keysize = BTreeTupleGetPostingOffset(tuple);
+ else
+ keysize = IndexTupleSize(tuple);
+
+ Assert(nipd > 0);
+
+ /* Add space needed for posting list */
+ if (nipd > 1)
+ newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nipd;
+ else
+ newsize = keysize;
+
+ newsize = MAXALIGN(newsize);
+ itup = palloc0(newsize);
+ memcpy(itup, tuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nipd > 1)
+ {
+ /* Form posting tuple, fill posting fields */
+
+ /* Set meta info about the posting list */
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ BTreeSetPostingMeta(itup, nipd, SHORTALIGN(keysize));
+
+ /* sort the list to preserve TID order invariant */
+ qsort((void *) ipd, nipd, sizeof(ItemPointerData),
+ (int (*) (const void *, const void *)) ItemPointerCompare);
+
+ /* Copy posting list into the posting tuple */
+ memcpy(BTreeTupleGetPosting(itup), ipd,
+ sizeof(ItemPointerData) * nipd);
+ }
+ else
+ {
+ /* To finish building of a non-posting tuple, copy TID from ipd */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(ipd, &itup->t_tid);
+ }
+
+ return itup;
+}
+
+/*
+ * Opposite of BTreeFormPostingTuple.
+ * Returns a regular tuple that contains the key; the TID of the new tuple
+ * is the n-th TID of the original tuple's posting list. The result tuple
+ * is palloc'd in the caller's memory context.
+ */
+IndexTuple
+BTreeGetNthTupleOfPosting(IndexTuple tuple, int n)
+{
+ Assert(BTreeTupleIsPosting(tuple));
+ return BTreeFormPostingTuple(tuple, BTreeTupleGetPostingN(tuple, n), 1);
+}
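
To make the size arithmetic in BTreeFormPostingTuple() easier to check, here is a worked example with simplified SHORTALIGN/MAXALIGN definitions (assuming 8-byte MAXALIGN and a 6-byte ItemPointerData; the real macros live in c.h):

/* Sketch of the posting-tuple size calculation; alignment macros simplified. */
#include <stdio.h>
#include <stddef.h>

#define SHORTALIGN(LEN)  (((size_t) (LEN) + 1) & ~((size_t) 1))
#define MAXALIGN(LEN)    (((size_t) (LEN) + 7) & ~((size_t) 7))
#define ITEM_POINTER_SIZE 6     /* sizeof(ItemPointerData) on typical builds */

static size_t
posting_tuple_size(size_t keysize, int nipd)
{
    size_t newsize;

    if (nipd > 1)
        newsize = SHORTALIGN(keysize) + ITEM_POINTER_SIZE * (size_t) nipd;
    else
        newsize = keysize;      /* single TID: plain non-posting tuple */

    return MAXALIGN(newsize);
}

int
main(void)
{
    /* a 16-byte key part with 3 TIDs: 16 + 18 = 34, rounded up to 40 */
    printf("%zu\n", posting_tuple_size(16, 3));
    printf("%zu\n", posting_tuple_size(16, 1));  /* 16 stays 16 */
    return 0;
}
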
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index dd5315c..538a6bc 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -386,8 +386,8 @@ btree_xlog_vacuum(XLogReaderState *record)
Buffer buffer;
Page page;
BTPageOpaque opaque;
-#ifdef UNUSED
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+#ifdef UNUSED
/*
* This section of code is thought to be no longer needed, after analysis
@@ -478,14 +478,34 @@ btree_xlog_vacuum(XLogReaderState *record)
if (len > 0)
{
- OffsetNumber *unused;
- OffsetNumber *unend;
+ if (xlrec->nremaining)
+ {
+ OffsetNumber *remainingoffset;
+ IndexTuple remaining;
+ Size itemsz;
+
+ remainingoffset = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ remaining = (IndexTuple) ((char *) remainingoffset +
+ xlrec->nremaining * sizeof(OffsetNumber));
- unused = (OffsetNumber *) ptr;
- unend = (OffsetNumber *) ((char *) ptr + len);
+ /* Handle posting tuples */
+ for (int i = 0; i < xlrec->nremaining; i++)
+ {
+ PageIndexTupleDelete(page, remainingoffset[i]);
+
+ itemsz = MAXALIGN(IndexTupleSize(remaining));
+
+ if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i],
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "btree_xlog_vacuum: failed to add remaining item");
+
+ remaining = (IndexTuple) ((char *) remaining + itemsz);
+ }
+ }
- if ((unend - unused) > 0)
- PageIndexMultiDelete(page, unused, unend - unused);
+ if (xlrec->ndeleted)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
}
/*
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index a14eb79..e4fa99a 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -46,8 +46,10 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
- appendStringInfo(buf, "lastBlockVacuumed %u",
- xlrec->lastBlockVacuumed);
+ appendStringInfo(buf, "lastBlockVacuumed %u; nremaining %u; ndeleted %u",
+ xlrec->lastBlockVacuumed,
+ xlrec->nremaining,
+ xlrec->ndeleted);
break;
}
case XLOG_BTREE_DELETE:
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 744ffb6..b10c0d5 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -141,6 +141,10 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
* On such a page, N tuples could take one MAXALIGN quantum less space than
* estimated here, seemingly allowing one more tuple than estimated here.
* But such a page always has at least MAXALIGN special space, so we're safe.
+ *
+ * Note: btree leaf pages may contain posting tuples, which store duplicates
+ * in a more space-efficient way, so such pages may hold more logical tuples.
+ * Use MaxPostingIndexTuplesPerPage instead.
*/
#define MaxIndexTuplesPerPage \
((int) ((BLCKSZ - SizeOfPageHeaderData) / \
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 83e0e6c..3064afb 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -234,8 +234,7 @@ typedef struct BTMetaPageData
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
*
* All other types of index tuples ("pivot" tuples) only have key columns,
* since pivot tuples only exist to represent how the key space is
@@ -252,6 +251,39 @@ typedef struct BTMetaPageData
* omitted rather than truncated, since its representation is different to
* the non-pivot representation.)
*
+ * Non-pivot posting tuple format:
+ * t_tid | t_info | key values | INCLUDE columns, if any | posting_list[]
+ *
+ * In order to store duplicate keys more efficiently, we use a special
+ * tuple format: posting tuples. posting_list is an array of
+ * ItemPointerData.
+ *
+ * This type of compression never applies to system indexes, unique indexes
+ * or indexes with INCLUDEd columns.
+ *
+ * To distinguish posting tuples, we use the INDEX_ALT_TID_MASK flag in
+ * t_info and the BT_IS_POSTING flag in t_tid.
+ * These flags redefine the content of the posting tuple's t_tid:
+ * - t_tid.ip_blkid contains the offset of the posting list.
+ * - the t_tid offset field contains the number of posting items in this
+ *   tuple.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items in posting tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.
+ * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting
+ * items, which is constrained by BTMaxItemSize.
+ *
+ * If a page contains so many duplicates that they do not fit into one
+ * posting tuple (bounded by BTMaxItemSize), the page may contain several
+ * posting tuples with the same key.
+ * The page can also contain both posting and non-posting tuples with the
+ * same key.
+ * Currently, posting tuples always contain at least two TIDs in the posting
+ * list.
+ *
+ * Posting tuples always have the same number of attributes as the index has
+ * generally.
+ *
* Pivot tuple format:
*
* t_tid | t_info | key values | [heap TID]
@@ -281,23 +313,144 @@ typedef struct BTMetaPageData
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
* future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
* number of columns/attributes <= INDEX_MAX_KEYS.
+ * BT_IS_POSTING bit must be unset for pivot tuples, since we use it
+ * to distinguish posting tuples from pivot tuples.
*
* Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes). They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_N_POSTING_OFFSET_MASK 0x0FFF
#define BT_HEAP_TID_ATTR 0x1000
+#define BT_IS_POSTING 0x2000
+
+#define BTreeTupleIsPosting(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\
+ )
+
+#define BTreeTupleIsPivot(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\
+ )
-/* Get/set downlink block number */
+/*
+ * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples
+ * that can fit on one btree leaf page.
+ *
+ * Btree leaf pages may contain posting tuples, which store duplicates
+ * in a more space-efficient way, so MaxPostingIndexTuplesPerPage is larger than
+ * MaxIndexTuplesPerPage.
+ *
+ * Each leaf page must contain at least three items, so estimate it as
+ * if we have three posting tuples with minimal size keys.
+ */
+#define MaxPostingIndexTuplesPerPage \
+ ((int) ((BLCKSZ - SizeOfPageHeaderData - \
+ 3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \
+ (sizeof(ItemPointerData)))
+
+/*
+ * Btree-private state needed to build posting tuples.
+ * ipd is a posting list - an array of ItemPointerData.
+ *
+ * While iterating over tuples during index build, or while applying
+ * compression to a single page, we remember the previous tuple in itupprev
+ * and compare the next one with it. If the tuples are equal, their TIDs are
+ * accumulated in the posting list. ntuples is the current size of the
+ * posting list.
+ *
+ * Use maxitemsize and maxpostingsize to ensure that resulting posting tuple
+ * will satisfy BTMaxItemSize.
+ */
+typedef struct BTCompressState
+{
+ Size maxitemsize;
+ Size maxpostingsize;
+ IndexTuple itupprev;
+ int ntuples;
+ ItemPointerData *ipd;
+} BTCompressState;
+
+/* macros to work with posting tuples *BEGIN* */
+#define BTreeTupleSetBtIsPosting(itup) \
+ do { \
+ Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+ Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \
+ } while(0)
+
+#define BTreeTupleClearBtIsPosting(itup) \
+ do { \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \
+ } while(0)
+
+#define BTreeTupleGetNPosting(itup) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \
+ )
+
+#define BTreeTupleSetNPosting(itup, n) \
+ do { \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \
+ BTreeTupleSetBtIsPosting(itup); \
+ } while(0)
+
+/*
+ * If the tuple is a posting tuple, t_tid.ip_blkid contains the offset of the
+ * posting list. The caller is responsible for checking BTreeTupleIsPosting()
+ * to ensure that it gets what it expects.
+ */
+#define BTreeTupleGetPostingOffset(itup) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \
+ )
+#define BTreeTupleSetPostingOffset(itup, offset) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerSetBlockNumber(&((itup)->t_tid), (offset)) \
+ )
+#define BTreeSetPostingMeta(itup, nposting, off) \
+ do { \
+ BTreeTupleSetNPosting(itup, nposting); \
+ BTreeTupleSetPostingOffset(itup, off); \
+ } while(0)
+
+#define BTreeTupleGetPosting(itup) \
+ (ItemPointerData*) ((char*)(itup) + BTreeTupleGetPostingOffset(itup))
+#define BTreeTupleGetPostingN(itup,n) \
+ (ItemPointerData*) (BTreeTupleGetPosting(itup) + (n))
+
+/*
+ * Posting tuples always contain more than one TID. The minimum TID can be
+ * accessed using BTreeTupleGetHeapTID(). The maximum is accessed using
+ * BTreeTupleGetMaxTID().
+ */
+#define BTreeTupleGetMaxTID(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING))) ? \
+ ( \
+ (ItemPointer) (BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup)-1)) \
+ ) \
+ : \
+ (ItemPointer) &((itup)->t_tid) \
+ )
+/* macros to work with posting tuples *END* */
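
Since all of the posting metadata is squeezed into the 16-bit offset field of t_tid, here is a small self-contained illustration of that encoding using the mask values defined above (the helper functions are illustrative, not part of the patch):

/* Illustration of packing posting metadata into a 16-bit offset field,
 * using the mask values defined above; helper names are illustrative. */
#include <stdio.h>
#include <stdint.h>

#define BT_RESERVED_OFFSET_MASK   0xF000
#define BT_N_POSTING_OFFSET_MASK  0x0FFF
#define BT_IS_POSTING             0x2000

static uint16_t
encode_nposting(int nposting)
{
    /* low 12 bits: item count; BT_IS_POSTING marks the tuple as posting */
    return (uint16_t) ((nposting & BT_N_POSTING_OFFSET_MASK) | BT_IS_POSTING);
}

static int
decode_nposting(uint16_t offset_field)
{
    return offset_field & BT_N_POSTING_OFFSET_MASK;
}

static int
is_posting(uint16_t offset_field)
{
    return (offset_field & BT_IS_POSTING) != 0;
}

int
main(void)
{
    uint16_t field = encode_nposting(37);

    /* prints: is_posting=1 nposting=37 */
    printf("is_posting=%d nposting=%d\n", is_posting(field),
           decode_nposting(field));
    return 0;
}
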
+
+/* Get/set downlink block number */
#define BTreeInnerTupleGetDownLink(itup) \
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeInnerTupleSetDownLink(itup, blkno) \
@@ -326,7 +479,8 @@ typedef struct BTMetaPageData
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
- (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
) \
@@ -335,6 +489,7 @@ typedef struct BTMetaPageData
)
#define BTreeTupleSetNAtts(itup, n) \
do { \
+ Assert(!BTreeTupleIsPosting(itup)); \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
@@ -342,6 +497,8 @@ typedef struct BTMetaPageData
/*
* Get tiebreaker heap TID attribute, if any. Macro works with both pivot
* and non-pivot tuples, despite differences in how heap TID is represented.
+ *
+ * For non-pivot posting tuples, this returns the first TID from the posting list.
*/
#define BTreeTupleGetHeapTID(itup) \
( \
@@ -351,7 +508,10 @@ typedef struct BTMetaPageData
(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
sizeof(ItemPointerData)) \
) \
- : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+ : (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ (((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0) ? \
+ (ItemPointer) BTreeTupleGetPosting(itup) : NULL) \
+ : (ItemPointer) &((itup)->t_tid) \
)
/*
* Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
@@ -360,6 +520,7 @@ typedef struct BTMetaPageData
#define BTreeTupleSetAltHeapTID(itup) \
do { \
Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+ Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
} while(0)
@@ -501,6 +662,12 @@ typedef struct BTInsertStateData
Buffer buf;
/*
+ * If _bt_binsrch_insert() found the insertion point inside an existing
+ * posting list, the position within that list is saved here.
+ */
+ int in_posting_offset;
+
+ /*
* Cache of bounds within the current buffer. Only used for insertions
* where _bt_check_unique is called. See _bt_binsrch_insert and
* _bt_findinsertloc for details.
@@ -567,6 +734,8 @@ typedef struct BTScanPosData
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
+ /* prevTupleOffset is for posting list handling */
+ int prevTupleOffset;
/*
* The items array is always ordered in index order (ie, increasing
@@ -579,7 +748,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
- BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@@ -739,6 +908,8 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
*/
extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ OffsetNumber replaceitemoff, Size replaceitemsz,
+ IndexTuple replaceitem,
bool *newitemonleft);
/*
@@ -763,6 +934,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems,
+ OffsetNumber *remainingoffset,
+ IndexTuple *remaining, int nremaining,
BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf);
@@ -775,6 +948,8 @@ extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
bool forupdate, BTStack stack, int access, Snapshot snapshot);
extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+extern int32 _bt_compare_posting(Relation rel, BTScanInsert key, Page page,
+ OffsetNumber offnum, int *in_posting_offset);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
@@ -813,6 +988,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
OffsetNumber offnum);
extern void _bt_check_third_page(Relation rel, Relation heap,
bool needheaptidspace, Page page, IndexTuple newtup);
+extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd,
+ int nipd);
+extern IndexTuple BTreeGetNthTupleOfPosting(IndexTuple tuple, int n);
/*
* prototypes for functions in nbtvalidate.c
@@ -825,5 +1003,7 @@ extern bool btvalidate(Oid opclassoid);
extern IndexBuildResult *btbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+extern void _bt_add_posting_item(BTCompressState *compressState,
+ IndexTuple itup);
#endif /* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index afa614d..4b615e0 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -173,10 +173,19 @@ typedef struct xl_btree_vacuum
{
BlockNumber lastBlockVacuumed;
- /* TARGET OFFSET NUMBERS FOLLOW */
+ /*
+ * These fields let us find the beginning of the remaining tuples (rebuilt
+ * posting tuples), which follow the arrays of offset numbers.
+ */
+ uint32 nremaining;
+ uint32 ndeleted;
+
+ /* REMAINING OFFSET NUMBERS FOLLOW (nremaining values) */
+ /* REMAINING TUPLES TO INSERT FOLLOW (if nremaining > 0) */
+ /* TARGET OFFSET NUMBERS FOLLOW (if any) */
} xl_btree_vacuum;
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber))
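
For completeness, a small sketch of the vacuum record payload as btree_xlog_vacuum() reads it (deleted offsets at the start of the data, then the remaining offsets, then the rebuilt posting tuples). The types below are simplified stand-ins, not the real WAL structs:

/* Sketch of the vacuum record payload as btree_xlog_vacuum() reads it;
 * simplified stand-in types, not the real WAL structs. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t OffsetNumber;

int
main(void)
{
    uint32_t     ndeleted = 2,
                 nremaining = 2;
    OffsetNumber deloffs[2] = {2, 3};
    OffsetNumber remoffs[2] = {5, 9};
    uint32_t     tuples[2] = {0xAAAA, 0xBBBB};  /* dummy "index tuples" */
    _Alignas(uint32_t) char payload[64];
    char        *ptr = payload;

    /* writer side: the three arrays are packed back to back */
    memcpy(ptr, deloffs, sizeof(deloffs));
    memcpy(ptr + sizeof(deloffs), remoffs, sizeof(remoffs));
    memcpy(ptr + sizeof(deloffs) + sizeof(remoffs), tuples, sizeof(tuples));

    /* reader side, mirroring the replay code's pointer arithmetic */
    OffsetNumber *deleted = (OffsetNumber *) ptr;
    OffsetNumber *remainingoffset = (OffsetNumber *)
        (ptr + ndeleted * sizeof(OffsetNumber));
    uint32_t     *remaining = (uint32_t *) ((char *) remainingoffset +
                                            nremaining * sizeof(OffsetNumber));

    for (uint32_t i = 0; i < nremaining; i++)
        printf("replace item at offset %u with rebuilt tuple %#x\n",
               (unsigned) remainingoffset[i], (unsigned) remaining[i]);
    for (uint32_t i = 0; i < ndeleted; i++)
        printf("delete item at offset %u\n", (unsigned) deleted[i]);
    return 0;
}
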
/*
* This is what we need to know about marking an empty branch for deletion.