13.08.2019 18:45, Anastasia Lubennikova wrote:
>> I also added a nearby FIXME comment to
>> _bt_insertonpg_in_posting() -- I don't think that the code for
>> splitting a posting list in two is currently crash-safe.
> Good catch. It seems that I need to rearrange the code.
> I'll send updated patch this week.
Attached is v7.
In this version of the patch, I heavily refactored the code for insertion
into a posting tuple. The _bt_split() logic is quite complex, so I omitted a
couple of optimizations; they are mentioned in TODO comments.
Now the algorithm is the following:

- If _bt_findinsertloc() finds that the new tuple belongs to an existing
posting tuple's TID interval, it sets the 'in_posting_offset' variable and
passes it to _bt_insertonpg().

- If 'in_posting_offset' is valid and origtup is valid, merge our itup into
origtup (see the sketch below). This can result in a single tuple,
neworigtup, that must replace origtup; or in two tuples, neworigtup and
newrighttup, if the result exceeds BTMaxItemSize.

- If the new tuple(s) fit on the old page, we're lucky: call
_bt_delete_and_insert(..., neworigtup, newrighttup, newitemoff) to atomically
replace origtup with the new tuple(s) and generate the xlog record.

- If a page split is needed, pass both tuples to _bt_split().
_bt_findsplitloc() is now aware of the upcoming replacement of origtup with
neworigtup, so it uses the correct item size where needed.
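
To make the second and third steps a bit more concrete, here is a rough,
self-contained sketch of the merge/split decision. This is only an
illustration under simplified assumptions: FakeTid, merge_into_posting() and
the split_off parameter are invented for this example and stand in for the
ItemPointerData / in_posting_offset machinery in the patch; the real code of
course works on IndexTuples and must also respect BTMaxItemSize and page
free space.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-in for ItemPointerData; not a PostgreSQL type. */
typedef struct FakeTid
{
    unsigned int block;
    unsigned int offset;
} FakeTid;

/*
 * Merge 'newtid' into the sorted posting list 'posting' of length n.
 * 'split_off' plays the role of in_posting_offset: the index of the first
 * existing TID that is greater than newtid.  On return, *left/*nleft hold
 * the list that replaces the original tuple; *right/*nright are set only
 * when newtid falls strictly inside the existing TID range and the list
 * has to be split in two.
 */
static void
merge_into_posting(const FakeTid *posting, int n, FakeTid newtid, int split_off,
                   FakeTid **left, int *nleft, FakeTid **right, int *nright)
{
    *right = NULL;
    *nright = 0;

    if (split_off == 0 || split_off == n)
    {
        /* newtid is before the min or after the max: one bigger posting list */
        *nleft = n + 1;
        *left = malloc(sizeof(FakeTid) * (*nleft));
        memcpy(*left, posting, sizeof(FakeTid) * split_off);
        (*left)[split_off] = newtid;
        memcpy(*left + split_off + 1, posting + split_off,
               sizeof(FakeTid) * (n - split_off));
        return;
    }

    /* newtid is inside the range: left keeps [0, split_off) ... */
    *nleft = split_off;
    *left = malloc(sizeof(FakeTid) * (*nleft));
    memcpy(*left, posting, sizeof(FakeTid) * split_off);

    /* ... and right gets newtid followed by the remaining TIDs */
    *nright = n - split_off + 1;
    *right = malloc(sizeof(FakeTid) * (*nright));
    (*right)[0] = newtid;
    memcpy(*right + 1, posting + split_off, sizeof(FakeTid) * (n - split_off));
}

int
main(void)
{
    FakeTid posting[] = {{1, 1}, {1, 3}, {2, 5}};
    FakeTid newtid = {1, 7};    /* sorts between (1,3) and (2,5) */
    FakeTid *left;
    FakeTid *right;
    int nleft;
    int nright;

    merge_into_posting(posting, 3, newtid, 2, &left, &nleft, &right, &nright);
    printf("left gets %d TIDs, right gets %d TIDs\n", nleft, nright);

    free(left);
    free(right);
    return 0;
}

The point is just that the "left" tuple always replaces the original one,
while the "right" tuple exists only when the new TID lands strictly inside
the existing TID range.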
It seems that all replace operations are now crash-safe. The new patch passes
all regression tests, so I think it's ready for review again.
In the meantime, I'll run more stress-tests.
--
Anastasia Lubennikova
Postgres Professional: http://www.postgrespro.com
The Russian Postgres Company
diff --git a/contrib/amcheck/verify_nbtree.c b/contrib/amcheck/verify_nbtree.c
index 05e7d67..504bca2 100644
--- a/contrib/amcheck/verify_nbtree.c
+++ b/contrib/amcheck/verify_nbtree.c
@@ -924,6 +924,7 @@ bt_target_page_check(BtreeCheckState *state)
size_t tupsize;
BTScanInsert skey;
bool lowersizelimit;
+ ItemPointer scantid;
CHECK_FOR_INTERRUPTS();
@@ -994,29 +995,73 @@ bt_target_page_check(BtreeCheckState *state)
/*
* Readonly callers may optionally verify that non-pivot tuples can
- * each be found by an independent search that starts from the root
+ * each be found by an independent search that starts from the root.
+ * Note that we deliberately don't do individual searches for each
+ * "logical" posting list tuple, since the posting list itself is
+ * validated by other checks.
*/
if (state->rootdescend && P_ISLEAF(topaque) &&
!bt_rootdescend(state, itup))
{
char *itid,
*htid;
+ ItemPointer tid = BTreeTupleGetHeapTID(itup);
itid = psprintf("(%u,%u)", state->targetblock, offset);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumber(&(itup->t_tid)),
- ItemPointerGetOffsetNumber(&(itup->t_tid)));
+ ItemPointerGetBlockNumber(tid),
+ ItemPointerGetOffsetNumber(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("could not find tuple using search from root page in index \"%s\"",
RelationGetRelationName(state->rel)),
- errdetail_internal("Index tid=%s points to heap tid=%s page lsn=%X/%X.",
+ errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.",
itid, htid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ /*
+ * If tuple is actually a posting list, make sure posting list TIDs
+ * are in order.
+ */
+ if (BTreeTupleIsPosting(itup))
+ {
+ ItemPointerData last;
+ ItemPointer current;
+
+ ItemPointerCopy(BTreeTupleGetHeapTID(itup), &last);
+
+ for (int i = 1; i < BTreeTupleGetNPosting(itup); i++)
+ {
+
+ current = BTreeTupleGetPostingN(itup, i);
+
+ if (ItemPointerCompare(current, &last) <= 0)
+ {
+ char *itid,
+ *htid;
+
+ itid = psprintf("(%u,%u)", state->targetblock, offset);
+ htid = psprintf("(%u,%u)",
+ ItemPointerGetBlockNumberNoCheck(current),
+ ItemPointerGetOffsetNumberNoCheck(current));
+
+ ereport(ERROR,
+ (errcode(ERRCODE_INDEX_CORRUPTED),
+ errmsg("posting list heap TIDs out of order in index \"%s\"",
+ RelationGetRelationName(state->rel)),
+ errdetail_internal("Index tid=%s min heap tid=%s page lsn=%X/%X.",
+ itid, htid,
+ (uint32) (state->targetlsn >> 32),
+ (uint32) state->targetlsn)));
+ }
+
+ ItemPointerCopy(current, &last);
+ }
+ }
+
/* Build insertion scankey for current page offset */
skey = bt_mkscankey_pivotsearch(state->rel, itup);
@@ -1074,12 +1119,33 @@ bt_target_page_check(BtreeCheckState *state)
{
IndexTuple norm;
- norm = bt_normalize_tuple(state, itup);
- bloom_add_element(state->filter, (unsigned char *) norm,
- IndexTupleSize(norm));
- /* Be tidy */
- if (norm != itup)
- pfree(norm);
+ if (BTreeTupleIsPosting(itup))
+ {
+ IndexTuple onetup;
+
+ /* Fingerprint all elements of posting tuple one by one */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ onetup = BTreeGetNthTupleOfPosting(itup, i);
+
+ norm = bt_normalize_tuple(state, onetup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != onetup)
+ pfree(norm);
+ pfree(onetup);
+ }
+ }
+ else
+ {
+ norm = bt_normalize_tuple(state, itup);
+ bloom_add_element(state->filter, (unsigned char *) norm,
+ IndexTupleSize(norm));
+ /* Be tidy */
+ if (norm != itup)
+ pfree(norm);
+ }
}
/*
@@ -1087,7 +1153,8 @@ bt_target_page_check(BtreeCheckState *state)
*
* If there is a high key (if this is not the rightmost page on its
* entire level), check that high key actually is upper bound on all
- * page items.
+ * page items. If this is a posting list tuple, we'll need to set
+ * scantid to be highest TID in posting list.
*
* We prefer to check all items against high key rather than checking
* just the last and trusting that the operator class obeys the
@@ -1127,6 +1194,9 @@ bt_target_page_check(BtreeCheckState *state)
* tuple. (See also: "Notes About Data Representation" in the nbtree
* README.)
*/
+ scantid = skey->scantid;
+ if (!BTreeTupleIsPivot(itup))
+ skey->scantid = BTreeTupleGetMaxTID(itup);
if (!P_RIGHTMOST(topaque) &&
!(P_ISLEAF(topaque) ? invariant_leq_offset(state, skey, P_HIKEY) :
invariant_l_offset(state, skey, P_HIKEY)))
@@ -1150,6 +1220,7 @@ bt_target_page_check(BtreeCheckState *state)
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
}
+ skey->scantid = scantid;
/*
* * Item order check *
@@ -1164,11 +1235,13 @@ bt_target_page_check(BtreeCheckState *state)
*htid,
*nitid,
*nhtid;
+ ItemPointer tid;
itid = psprintf("(%u,%u)", state->targetblock, offset);
+ tid = BTreeTupleGetHeapTID(itup);
htid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
nitid = psprintf("(%u,%u)", state->targetblock,
OffsetNumberNext(offset));
@@ -1177,9 +1250,11 @@ bt_target_page_check(BtreeCheckState *state)
state->target,
OffsetNumberNext(offset));
itup = (IndexTuple) PageGetItem(state->target, itemid);
+
+ tid = BTreeTupleGetHeapTID(itup);
nhtid = psprintf("(%u,%u)",
- ItemPointerGetBlockNumberNoCheck(&(itup->t_tid)),
- ItemPointerGetOffsetNumberNoCheck(&(itup->t_tid)));
+ ItemPointerGetBlockNumberNoCheck(tid),
+ ItemPointerGetOffsetNumberNoCheck(tid));
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
@@ -1189,10 +1264,10 @@ bt_target_page_check(BtreeCheckState *state)
"higher index tid=%s (points to %s tid=%s) "
"page lsn=%X/%X.",
itid,
- P_ISLEAF(topaque) ? "heap" : "index",
+ P_ISLEAF(topaque) ? "min heap" : "index",
htid,
nitid,
- P_ISLEAF(topaque) ? "heap" : "index",
+ P_ISLEAF(topaque) ? "min heap" : "index",
nhtid,
(uint32) (state->targetlsn >> 32),
(uint32) state->targetlsn)));
@@ -1953,10 +2028,11 @@ bt_tuple_present_callback(Relation index, HeapTuple htup, Datum *values,
* verification. In particular, it won't try to normalize opclass-equal
* datums with potentially distinct representations (e.g., btree/numeric_ops
* index datums will not get their display scale normalized-away here).
- * Normalization may need to be expanded to handle more cases in the future,
- * though. For example, it's possible that non-pivot tuples could in the
- * future have alternative logically equivalent representations due to using
- * the INDEX_ALT_TID_MASK bit to implement intelligent deduplication.
+ * Caller does normalization for non-pivot tuples that have their own posting
+ * list, since dummy CREATE INDEX callback code generates new tuples with the
+ * same normalized representation. Compression is performed
+ * opportunistically, and in general there is no guarantee about how or when
+ * compression will be applied.
*/
static IndexTuple
bt_normalize_tuple(BtreeCheckState *state, IndexTuple itup)
@@ -2560,14 +2636,16 @@ static inline ItemPointer
BTreeTupleGetHeapTIDCareful(BtreeCheckState *state, IndexTuple itup,
bool nonpivot)
{
- ItemPointer result = BTreeTupleGetHeapTID(itup);
+ ItemPointer result;
BlockNumber targetblock = state->targetblock;
- if (result == NULL && nonpivot)
+ if (BTreeTupleIsPivot(itup) == nonpivot)
ereport(ERROR,
(errcode(ERRCODE_INDEX_CORRUPTED),
errmsg("block %u or its right sibling block or child block in index \"%s\" contains non-pivot tuple that lacks a heap TID",
targetblock, RelationGetRelationName(state->rel))));
+ result = BTreeTupleGetHeapTID(itup);
+
return result;
}
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5890f39..fed1e86 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -41,21 +41,28 @@ static OffsetNumber _bt_findinsertloc(Relation rel,
BTStack stack,
Relation heapRel);
static void _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack);
+static void _bt_delete_and_insert(Relation rel,
+ Buffer buf,
+ Page page,
+ IndexTuple newitup, IndexTuple newitupright,
+ OffsetNumber newitemoff);
static void _bt_insertonpg(Relation rel, BTScanInsert itup_key,
Buffer buf,
Buffer cbuf,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
- bool split_only_page);
+ bool split_only_page, int in_posting_offset);
static Buffer _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf,
Buffer cbuf, OffsetNumber newitemoff, Size newitemsz,
- IndexTuple newitem);
+ IndexTuple newitem, IndexTuple lefttup, IndexTuple righttup);
static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
BTStack stack, bool is_root, bool is_only);
static bool _bt_pgaddtup(Page page, Size itemsize, IndexTuple itup,
OffsetNumber itup_off);
static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
+static void insert_itupprev_to_page(Page page, BTCompressState *compressState);
+static void _bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel);
/*
* _bt_doinsert() -- Handle insertion of a single index tuple in the tree.
@@ -297,10 +304,13 @@ top:
* search bounds established within _bt_check_unique when insertion is
* checkingunique.
*/
+ insertstate.in_posting_offset = 0;
newitemoff = _bt_findinsertloc(rel, &insertstate, checkingunique,
stack, heapRel);
- _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer, stack,
- itup, newitemoff, false);
+
+ _bt_insertonpg(rel, itup_key, insertstate.buf, InvalidBuffer,
+ stack, itup, newitemoff, false,
+ insertstate.in_posting_offset);
}
else
{
@@ -435,6 +445,7 @@ _bt_check_unique(Relation rel, BTInsertState insertstate, Relation heapRel,
/* okay, we gotta fetch the heap tuple ... */
curitup = (IndexTuple) PageGetItem(page, curitemid);
+ Assert(!BTreeTupleIsPosting(curitup));
htid = curitup->t_tid;
/*
@@ -759,6 +770,26 @@ _bt_findinsertloc(Relation rel,
_bt_vacuum_one_page(rel, insertstate->buf, heapRel);
insertstate->bounds_valid = false;
}
+
+ /*
+ * If the target page is full, try to compress the page
+ */
+ if (PageGetFreeSpace(page) < insertstate->itemsz && !checkingunique)
+ {
+ _bt_compress_one_page(rel, insertstate->buf, heapRel);
+ insertstate->bounds_valid = false; /* paranoia */
+
+ /*
+ * FIXME: _bt_vacuum_one_page() won't have cleared the
+ * BTP_HAS_GARBAGE flag when it didn't kill items. Maybe we
+ * should clear the BTP_HAS_GARBAGE flag bit from the page when
+ * compression avoids a page split -- _bt_vacuum_one_page() is
+ * expecting a page split that takes care of it.
+ *
+ * (On the other hand, maybe it doesn't matter very much. A
+ * comment update seems like the bare minimum we should do.)
+ */
+ }
}
else
{
@@ -900,6 +931,77 @@ _bt_stepright(Relation rel, BTInsertState insertstate, BTStack stack)
insertstate->bounds_valid = false;
}
+/*
+ * Delete the tuple at offset newitemoff and insert newitup at the same offset.
+ *
+ * If the original posting tuple was split, 'newitup' represents the left part
+ * of the original tuple and 'newitupright' is its right part, which must be
+ * inserted next to newitemoff.
+ * It's essential to do this atomically to be crash-safe.
+ *
+ * NOTE: All free space checks must be done before calling this function.
+ *
+ * For use when updating a posting tuple.
+ */
+static void
+_bt_delete_and_insert(Relation rel,
+ Buffer buf,
+ Page page,
+ IndexTuple newitup, IndexTuple newitupright,
+ OffsetNumber newitemoff)
+{
+ Size newitupsz = IndexTupleSize(newitup);
+
+ newitupsz = MAXALIGN(newitupsz);
+
+ elog(DEBUG4, "_bt_delete_and_insert %s newitemoff %d",
+ RelationGetRelationName(rel), newitemoff);
+ START_CRIT_SECTION();
+
+ PageIndexTupleDelete(page, newitemoff);
+
+ if (!_bt_pgaddtup(page, newitupsz, newitup, newitemoff))
+ elog(ERROR, "failed to insert compressed item in index \"%s\"",
+ RelationGetRelationName(rel));
+
+ if (newitupright)
+ {
+ if (!_bt_pgaddtup(page, MAXALIGN(IndexTupleSize(newitupright)),
+ newitupright, OffsetNumberNext(newitemoff)))
+ elog(ERROR, "failed to insert compressed item in index \"%s\"",
+ RelationGetRelationName(rel));
+ }
+
+ if (BufferIsValid(buf))
+ {
+ MarkBufferDirty(buf);
+
+ /* Xlog stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ XLogRecPtr recptr;
+
+ xlrec.offnum = newitemoff;
+
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+
+ Assert(P_ISLEAF((BTPageOpaque) PageGetSpecialPointer(page)));
+
+ /*
+ * Force full page write to keep code simple
+ *
+ * TODO: think of using XLOG_BTREE_INSERT_LEAF with a new tuple's data
+ */
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD | REGBUF_FORCE_IMAGE);
+ recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_INSERT_LEAF);
+ PageSetLSN(page, recptr);
+ }
+ }
+ END_CRIT_SECTION();
+}
+
/*----------
* _bt_insertonpg() -- Insert a tuple on a particular page in the index.
*
@@ -936,11 +1038,17 @@ _bt_insertonpg(Relation rel,
BTStack stack,
IndexTuple itup,
OffsetNumber newitemoff,
- bool split_only_page)
+ bool split_only_page,
+ int in_posting_offset)
{
Page page;
BTPageOpaque lpageop;
Size itemsz;
+ IndexTuple origtup;
+ int nipd;
+ IndexTuple neworigtup = NULL;
+ IndexTuple newrighttup = NULL;
+ bool need_split = false;
page = BufferGetPage(buf);
lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
@@ -965,13 +1073,184 @@ _bt_insertonpg(Relation rel,
* need to be consistent */
/*
+ * If the new tuple's key is equal to the key of a posting tuple that already
+ * exists on the page, and its TID falls inside the min/max range of the
+ * existing posting list, update the posting tuple.
+ *
+ * TODO Think of moving this to a separate function.
+ *
+ * TODO possible optimization:
+ * if original posting tuple is dead,
+ * reset in_posting_offset and handle itup as a regular tuple
+ */
+ if (in_posting_offset)
+ {
+ /* get old posting tuple */
+ ItemId itemid = PageGetItemId(page, newitemoff);
+ ItemPointerData *ipd;
+ int nipd, nipd_right;
+ bool need_posting_split = false;
+
+ origtup = (IndexTuple) PageGetItem(page, itemid);
+ Assert(BTreeTupleIsPosting(origtup));
+ nipd = BTreeTupleGetNPosting(origtup);
+ Assert(in_posting_offset < nipd);
+ Assert(itup_key->scantid != NULL);
+ Assert(itup_key->heapkeyspace);
+
+ elog(DEBUG4, "(%u,%u) is min, (%u,%u) is max, (%u,%u) is new",
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(origtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(origtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(itup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(itup)));
+
+ /* check if the posting tuple must be split */
+ if (BTMaxItemSize(page) < MAXALIGN(IndexTupleSize(origtup)) + sizeof(ItemPointerData))
+ need_posting_split = true;
+
+ /*
+ * If a page split is needed, always split the posting tuple.
+ * That is probably not optimal,
+ * but it allows us to simplify the _bt_split() code.
+ *
+ * TODO Does this decision have any significant drawbacks?
+ */
+ if (PageGetFreeSpace(page) < sizeof(ItemPointerData))
+ need_posting_split = true;
+
+ /*
+ * Handle corner case (1):
+ * - itup TID is smaller than the leftmost origtup TID
+ */
+ if (ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ BTreeTupleGetHeapTID(origtup)) < 0)
+ {
+ if (need_posting_split)
+ {
+ /*
+ * cannot avoid a split, so there is no need to try to fit itup into
+ * the posting list; handle itup as a regular tuple insertion
+ */
+ elog(DEBUG4, "split posting tuple. itup is to the left of origtup");
+ in_posting_offset = InvalidOffsetNumber;
+ newitemoff = OffsetNumberPrev(newitemoff);
+ }
+ else
+ {
+ ipd = palloc0(sizeof(ItemPointerData) * (nipd + 1));
+ /* insert new item pointer */
+ memcpy(ipd, itup, sizeof(ItemPointerData));
+ /* copy item pointers from original tuple that belong on right */
+ memcpy(ipd + 1, BTreeTupleGetPosting(origtup), sizeof(ItemPointerData) * nipd);
+ neworigtup = BTreeFormPostingTuple(origtup, ipd, nipd+1);
+ pfree(ipd);
+
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(neworigtup),
+ BTreeTupleGetMaxTID(neworigtup)) < 0);
+ }
+ }
+
+ /*
+ * Handle corner case (2):
+ * - itup TID is larger than the rightmost origtup TID
+ */
+ if (ItemPointerCompare(BTreeTupleGetMaxTID(origtup),
+ BTreeTupleGetHeapTID(itup)) < 0)
+ {
+ if (need_posting_split)
+ {
+ /*
+ * cannot avoid a split, so there is no need to try to fit itup into
+ * the posting list; handle itup as a regular tuple insertion
+ */
+ elog(DEBUG4, "split posting tuple. itup is to the right of origtup");
+ in_posting_offset = InvalidOffsetNumber;
+ }
+ else
+ {
+ ipd = palloc0(sizeof(ItemPointerData) * (nipd + 1));
+ /* copy item pointers from the original tuple */
+ memcpy(ipd, BTreeTupleGetPosting(origtup), sizeof(ItemPointerData) * nipd);
+ /* append the new item pointer */
+ memcpy(ipd + nipd, itup, sizeof(ItemPointerData));
+
+ neworigtup = BTreeFormPostingTuple(origtup, ipd, nipd+1);
+ pfree(ipd);
+
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(neworigtup),
+ BTreeTupleGetMaxTID(neworigtup)) < 0);
+ }
+ }
+
+ /*
+ * itup TID belongs to the TID range of the origtup posting list.
+ *
+ * Split the posting tuple into two halves:
+ *
+ * neworigtup (left) contains all item pointers less than the new one,
+ * and newrighttup contains the new item pointer and all those to its right.
+ */
+ if (ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ BTreeTupleGetHeapTID(origtup)) > 0
+ &&
+ ItemPointerCompare(BTreeTupleGetMaxTID(origtup),
+ BTreeTupleGetHeapTID(itup)) > 0)
+ {
+ neworigtup = BTreeFormPostingTuple(origtup, BTreeTupleGetPosting(origtup),
+ in_posting_offset);
+
+ nipd_right = nipd - in_posting_offset + 1;
+
+ elog(DEBUG4, "split posting tuple in_posting_offset %d nipd %d nipd_right %d",
+ in_posting_offset, nipd, nipd_right);
+
+ ipd = palloc0(sizeof(ItemPointerData) * nipd_right);
+ /* insert new item pointer */
+ memcpy(ipd, itup, sizeof(ItemPointerData));
+ /* copy item pointers from original tuple that belong on right */
+ memcpy(ipd + 1,
+ BTreeTupleGetPostingN(origtup, in_posting_offset),
+ sizeof(ItemPointerData) * (nipd - in_posting_offset));
+
+ newrighttup = BTreeFormPostingTuple(origtup, ipd, nipd_right);
+
+ Assert(ItemPointerCompare(BTreeTupleGetMaxTID(neworigtup),
+ BTreeTupleGetHeapTID(newrighttup)) < 0);
+ pfree(ipd);
+
+ elog(DEBUG4, "left N %d (%u,%u) to (%u,%u), right N %d (%u,%u) to (%u,%u) ",
+ BTreeTupleIsPosting(neworigtup)?BTreeTupleGetNPosting(neworigtup):0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(neworigtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(neworigtup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(neworigtup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(neworigtup)),
+ BTreeTupleIsPosting(newrighttup)?BTreeTupleGetNPosting(newrighttup):0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(newrighttup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(newrighttup)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(newrighttup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(newrighttup)));
+
+ /*
+ * check if the split tuples still fit into the original page
+ * TODO should we add sizeof(ItemIdData) in this check?
+ */
+ if (PageGetFreeSpace(page) < (MAXALIGN(IndexTupleSize(neworigtup))
+ + MAXALIGN(IndexTupleSize(newrighttup))
+ - MAXALIGN(IndexTupleSize(origtup))))
+ need_split = true;
+ }
+ }
+
+ /*
* Do we need to split the page to fit the item on it?
*
* Note: PageGetFreeSpace() subtracts sizeof(ItemIdData) from its result,
* so this comparison is correct even though we appear to be accounting
* only for the item and not for its line pointer.
*/
- if (PageGetFreeSpace(page) < itemsz)
+ if (PageGetFreeSpace(page) < itemsz || need_split)
{
bool is_root = P_ISROOT(lpageop);
bool is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(lpageop);
@@ -996,7 +1275,8 @@ _bt_insertonpg(Relation rel,
BlockNumberIsValid(RelationGetTargetBlock(rel))));
/* split the buffer into left and right halves */
- rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup);
+ rbuf = _bt_split(rel, itup_key, buf, cbuf, newitemoff, itemsz, itup,
+ neworigtup, newrighttup);
PredicateLockPageSplit(rel,
BufferGetBlockNumber(buf),
BufferGetBlockNumber(rbuf));
@@ -1033,142 +1313,159 @@ _bt_insertonpg(Relation rel,
itup_off = newitemoff;
itup_blkno = BufferGetBlockNumber(buf);
- /*
- * If we are doing this insert because we split a page that was the
- * only one on its tree level, but was not the root, it may have been
- * the "fast root". We need to ensure that the fast root link points
- * at or above the current page. We can safely acquire a lock on the
- * metapage here --- see comments for _bt_newroot().
- */
- if (split_only_page)
+ if (!in_posting_offset)
{
- Assert(!P_ISLEAF(lpageop));
-
- metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
- metapg = BufferGetPage(metabuf);
- metad = BTPageGetMeta(metapg);
-
- if (metad->btm_fastlevel >= lpageop->btpo.level)
+ /*
+ * If we are doing this insert because we split a page that was the
+ * only one on its tree level, but was not the root, it may have been
+ * the "fast root". We need to ensure that the fast root link points
+ * at or above the current page. We can safely acquire a lock on the
+ * metapage here --- see comments for _bt_newroot().
+ */
+ if (split_only_page)
{
- /* no update wanted */
- _bt_relbuf(rel, metabuf);
- metabuf = InvalidBuffer;
- }
- }
-
- /*
- * Every internal page should have exactly one negative infinity item
- * at all times. Only _bt_split() and _bt_newroot() should add items
- * that become negative infinity items through truncation, since
- * they're the only routines that allocate new internal pages. Do not
- * allow a retail insertion of a new item at the negative infinity
- * offset.
- */
- if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
- elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
- itup_blkno, RelationGetRelationName(rel));
+ Assert(!P_ISLEAF(lpageop));
- /* Do the update. No ereport(ERROR) until changes are logged */
- START_CRIT_SECTION();
+ metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+ metapg = BufferGetPage(metabuf);
+ metad = BTPageGetMeta(metapg);
- if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
- elog(PANIC, "failed to add new item to block %u in index \"%s\"",
- itup_blkno, RelationGetRelationName(rel));
+ if (metad->btm_fastlevel >= lpageop->btpo.level)
+ {
+ /* no update wanted */
+ _bt_relbuf(rel, metabuf);
+ metabuf = InvalidBuffer;
+ }
+ }
- MarkBufferDirty(buf);
+ /*
+ * Every internal page should have exactly one negative infinity item
+ * at all times. Only _bt_split() and _bt_newroot() should add items
+ * that become negative infinity items through truncation, since
+ * they're the only routines that allocate new internal pages. Do not
+ * allow a retail insertion of a new item at the negative infinity
+ * offset.
+ */
+ if (!P_ISLEAF(lpageop) && newitemoff == P_FIRSTDATAKEY(lpageop))
+ elog(ERROR, "cannot insert second negative infinity item in block %u of index \"%s\"",
+ itup_blkno, RelationGetRelationName(rel));
+
+ /* Do the update. No ereport(ERROR) until changes are logged */
+ START_CRIT_SECTION();
+
+ if (!_bt_pgaddtup(page, itemsz, itup, newitemoff))
+ elog(PANIC, "failed to add new item to block %u in index \"%s\"",
+ itup_blkno, RelationGetRelationName(rel));
+
+ MarkBufferDirty(buf);
- if (BufferIsValid(metabuf))
- {
- /* upgrade meta-page if needed */
- if (metad->btm_version < BTREE_NOVAC_VERSION)
- _bt_upgrademetapage(metapg);
- metad->btm_fastroot = itup_blkno;
- metad->btm_fastlevel = lpageop->btpo.level;
- MarkBufferDirty(metabuf);
- }
+ if (BufferIsValid(metabuf))
+ {
+ /* upgrade meta-page if needed */
+ if (metad->btm_version < BTREE_NOVAC_VERSION)
+ _bt_upgrademetapage(metapg);
+ metad->btm_fastroot = itup_blkno;
+ metad->btm_fastlevel = lpageop->btpo.level;
+ MarkBufferDirty(metabuf);
+ }
- /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
- if (BufferIsValid(cbuf))
- {
- Page cpage = BufferGetPage(cbuf);
- BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+ /* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
+ if (BufferIsValid(cbuf))
+ {
+ Page cpage = BufferGetPage(cbuf);
+ BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
- Assert(P_INCOMPLETE_SPLIT(cpageop));
- cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
- MarkBufferDirty(cbuf);
- }
+ Assert(P_INCOMPLETE_SPLIT(cpageop));
+ cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+ MarkBufferDirty(cbuf);
+ }
- /*
- * Cache the block information if we just inserted into the rightmost
- * leaf page of the index and it's not the root page. For very small
- * index where root is also the leaf, there is no point trying for any
- * optimization.
- */
- if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
- cachedBlock = BufferGetBlockNumber(buf);
+ /* XLOG stuff */
+ if (RelationNeedsWAL(rel))
+ {
+ xl_btree_insert xlrec;
+ xl_btree_metadata xlmeta;
+ uint8 xlinfo;
+ XLogRecPtr recptr;
- /* XLOG stuff */
- if (RelationNeedsWAL(rel))
- {
- xl_btree_insert xlrec;
- xl_btree_metadata xlmeta;
- uint8 xlinfo;
- XLogRecPtr recptr;
+ xlrec.offnum = itup_off;
- xlrec.offnum = itup_off;
+ XLogBeginInsert();
+ XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
- XLogBeginInsert();
- XLogRegisterData((char *) &xlrec, SizeOfBtreeInsert);
+ if (P_ISLEAF(lpageop))
+ xlinfo = XLOG_BTREE_INSERT_LEAF;
+ else
+ {
+ /*
+ * Register the left child whose INCOMPLETE_SPLIT flag was
+ * cleared.
+ */
+ XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
- if (P_ISLEAF(lpageop))
- xlinfo = XLOG_BTREE_INSERT_LEAF;
- else
- {
- /*
- * Register the left child whose INCOMPLETE_SPLIT flag was
- * cleared.
- */
- XLogRegisterBuffer(1, cbuf, REGBUF_STANDARD);
+ xlinfo = XLOG_BTREE_INSERT_UPPER;
+ }
- xlinfo = XLOG_BTREE_INSERT_UPPER;
- }
+ if (BufferIsValid(metabuf))
+ {
+ Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
+ xlmeta.version = metad->btm_version;
+ xlmeta.root = metad->btm_root;
+ xlmeta.level = metad->btm_level;
+ xlmeta.fastroot = metad->btm_fastroot;
+ xlmeta.fastlevel = metad->btm_fastlevel;
+ xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
+ xlmeta.last_cleanup_num_heap_tuples =
+ metad->btm_last_cleanup_num_heap_tuples;
+
+ XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
+ XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
+
+ xlinfo = XLOG_BTREE_INSERT_META;
+ }
- if (BufferIsValid(metabuf))
- {
- Assert(metad->btm_version >= BTREE_NOVAC_VERSION);
- xlmeta.version = metad->btm_version;
- xlmeta.root = metad->btm_root;
- xlmeta.level = metad->btm_level;
- xlmeta.fastroot = metad->btm_fastroot;
- xlmeta.fastlevel = metad->btm_fastlevel;
- xlmeta.oldest_btpo_xact = metad->btm_oldest_btpo_xact;
- xlmeta.last_cleanup_num_heap_tuples =
- metad->btm_last_cleanup_num_heap_tuples;
-
- XLogRegisterBuffer(2, metabuf, REGBUF_WILL_INIT | REGBUF_STANDARD);
- XLogRegisterBufData(2, (char *) &xlmeta, sizeof(xl_btree_metadata));
-
- xlinfo = XLOG_BTREE_INSERT_META;
- }
+ XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
+ XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
- XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
- XLogRegisterBufData(0, (char *) itup, IndexTupleSize(itup));
+ recptr = XLogInsert(RM_BTREE_ID, xlinfo);
- recptr = XLogInsert(RM_BTREE_ID, xlinfo);
+ if (BufferIsValid(metabuf))
+ {
+ PageSetLSN(metapg, recptr);
+ }
+ if (BufferIsValid(cbuf))
+ {
+ PageSetLSN(BufferGetPage(cbuf), recptr);
+ }
- if (BufferIsValid(metabuf))
- {
- PageSetLSN(metapg, recptr);
- }
- if (BufferIsValid(cbuf))
- {
- PageSetLSN(BufferGetPage(cbuf), recptr);
+ PageSetLSN(page, recptr);
}
- PageSetLSN(page, recptr);
+ END_CRIT_SECTION();
+ }
+ else
+ {
+ /*
+ * Insert the new tuple in place of the existing posting tuple:
+ * delete the old posting tuple and insert the updated tuple instead.
+ *
+ * If a split was needed, both neworigtup and newrighttup are initialized
+ * and both will be inserted; otherwise newrighttup is NULL.
+ *
+ * This can only happen on a leaf page.
+ */
+ elog(DEBUG4, "_bt_insertonpg. _bt_delete_and_insert %s", RelationGetRelationName(rel));
+ _bt_delete_and_insert(rel, buf, page, neworigtup, newrighttup, newitemoff);
}
- END_CRIT_SECTION();
+ /*
+ * Cache the block information if we just inserted into the rightmost
+ * leaf page of the index and it's not the root page. For very small
+ * index where root is also the leaf, there is no point trying for any
+ * optimization.
+ */
+ if (P_RIGHTMOST(lpageop) && P_ISLEAF(lpageop) && !P_ISROOT(lpageop))
+ cachedBlock = BufferGetBlockNumber(buf);
/* release buffers */
if (BufferIsValid(metabuf))
@@ -1214,7 +1511,8 @@ _bt_insertonpg(Relation rel,
*/
static Buffer
_bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
- OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem)
+ OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ IndexTuple lefttup, IndexTuple righttup)
{
Buffer rbuf;
Page origpage;
@@ -1236,6 +1534,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
OffsetNumber firstright;
OffsetNumber maxoff;
OffsetNumber i;
+ OffsetNumber replaceitemoff = InvalidOffsetNumber;
+ Size replaceitemsz;
bool newitemonleft,
isleaf;
IndexTuple lefthikey;
@@ -1243,6 +1543,24 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
int indnkeyatts = IndexRelationGetNumberOfKeyAttributes(rel);
/*
+ * If we're working with a split posting tuple,
+ * the new tuple is actually contained in the righttup posting list.
+ */
+ if (righttup)
+ {
+ newitem = righttup;
+ newitemsz = MAXALIGN(IndexTupleSize(righttup));
+
+ /*
+ * actual insertion is a replacement of origtup with lefttup
+ * and insertion of righttup (as newitem) next to it.
+ */
+ replaceitemoff = newitemoff;
+ replaceitemsz = MAXALIGN(IndexTupleSize(lefttup));
+ newitemoff = OffsetNumberNext(newitemoff);
+ }
+
+ /*
* origpage is the original page to be split. leftpage is a temporary
* buffer that receives the left-sibling data, which will be copied back
* into origpage on success. rightpage is the new page that will receive
@@ -1275,7 +1593,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
* (but not always) redundant information.
*/
firstright = _bt_findsplitloc(rel, origpage, newitemoff, newitemsz,
- newitem, &newitemonleft);
+ newitem, replaceitemoff, replaceitemsz,
+ lefttup, &newitemonleft);
/* Allocate temp buffer for leftpage */
leftpage = PageGetTempPage(origpage);
@@ -1364,6 +1683,17 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
/* incoming tuple will become last on left page */
lastleft = newitem;
}
+ else if (!newitemonleft && newitemoff == firstright && lefttup)
+ {
+ /*
+ * if newitem is first on the right page
+ * and split posting tuple handling is required,
+ * lastleft will be replaced with lefttup,
+ * so use it here
+ */
+ elog(DEBUG4, "lastleft = lefttup firstright %d", firstright);
+ lastleft = lefttup;
+ }
else
{
OffsetNumber lastleftoff;
@@ -1480,6 +1810,39 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
itemsz = ItemIdGetLength(itemid);
item = (IndexTuple) PageGetItem(origpage, itemid);
+ if (i == replaceitemoff)
+ {
+ if (replaceitemoff <= firstright)
+ {
+ elog(DEBUG4, "_bt_split left. replaceitem block %u %s replaceitemoff %d leftoff %d",
+ origpagenumber, RelationGetRelationName(rel), replaceitemoff, leftoff);
+ if (!_bt_pgaddtup(leftpage, MAXALIGN(IndexTupleSize(lefttup)), lefttup, leftoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the left sibling"
+ " while splitting block %u of index \"%s\"",
+ origpagenumber, RelationGetRelationName(rel));
+ }
+ leftoff = OffsetNumberNext(leftoff);
+ }
+ else
+ {
+ elog(DEBUG4, "_bt_split right. replaceitem block %u %s replaceitemoff %d newitemoff %d",
+ origpagenumber, RelationGetRelationName(rel), replaceitemoff, newitemoff);
+ elog(DEBUG4, "_bt_split right. i %d, maxoff %d, rightoff %d", i, maxoff, rightoff);
+
+ if (!_bt_pgaddtup(rightpage, MAXALIGN(IndexTupleSize(lefttup)), lefttup, rightoff))
+ {
+ memset(rightpage, 0, BufferGetPageSize(rbuf));
+ elog(ERROR, "failed to add new item to the right sibling"
+ " while splitting block %u of index \"%s\", rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
+ }
+ rightoff = OffsetNumberNext(rightoff);
+ }
+ continue;
+ }
+
/* does new item belong before this one? */
if (i == newitemoff)
{
@@ -1497,13 +1860,14 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
}
else
{
+ elog(DEBUG4, "insert newitem to the right. i %d, maxoff %d, rightoff %d", i, maxoff, rightoff);
Assert(newitemoff >= firstright);
if (!_bt_pgaddtup(rightpage, newitemsz, newitem, rightoff))
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
- " while splitting block %u of index \"%s\"",
- origpagenumber, RelationGetRelationName(rel));
+ " while splitting block %u of index \"%s\", rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
}
rightoff = OffsetNumberNext(rightoff);
}
@@ -1547,8 +1911,8 @@ _bt_split(Relation rel, BTScanInsert itup_key, Buffer buf, Buffer cbuf,
{
memset(rightpage, 0, BufferGetPageSize(rbuf));
elog(ERROR, "failed to add new item to the right sibling"
- " while splitting block %u of index \"%s\"",
- origpagenumber, RelationGetRelationName(rel));
+ " while splitting block %u of index \"%s\" rightoff %d",
+ origpagenumber, RelationGetRelationName(rel), rightoff);
}
rightoff = OffsetNumberNext(rightoff);
}
@@ -1837,7 +2201,7 @@ _bt_insert_parent(Relation rel,
/* Recursively update the parent */
_bt_insertonpg(rel, NULL, pbuf, buf, stack->bts_parent,
new_item, stack->bts_offset + 1,
- is_only);
+ is_only, 0);
/* be tidy */
pfree(new_item);
@@ -2290,3 +2654,206 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
* the page.
*/
}
+
+/*
+ * Add the new item (compressed or not) to the page while compressing it.
+ * If the insertion fails, an ERROR is raised; since only the temporary
+ * page copy has been modified at that point, the caller's original page
+ * is left uncompressed.
+ */
+static void
+insert_itupprev_to_page(Page page, BTCompressState *compressState)
+{
+ IndexTuple to_insert;
+ OffsetNumber offnum = PageGetMaxOffsetNumber(page);
+
+ if (compressState->ntuples == 0)
+ to_insert = compressState->itupprev;
+ else
+ {
+ IndexTuple postingtuple;
+
+ /* form a tuple with a posting list */
+ postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+ compressState->ipd,
+ compressState->ntuples);
+ to_insert = postingtuple;
+ pfree(compressState->ipd);
+ }
+
+ /* Add the new item into the page */
+ offnum = OffsetNumberNext(offnum);
+
+ elog(DEBUG4, "insert_itupprev_to_page. compressState->ntuples %d IndexTupleSize %zu free %zu",
+ compressState->ntuples, IndexTupleSize(to_insert), PageGetFreeSpace(page));
+
+ if (PageAddItem(page, (Item) to_insert, IndexTupleSize(to_insert),
+ offnum, false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add tuple to page while compresing it");
+
+ if (compressState->ntuples > 0)
+ pfree(to_insert);
+ compressState->ntuples = 0;
+}
+
+/*
+ * Before splitting the page, try to compress items to free some space.
+ * If compression doesn't succeed, the buffer will contain the old state
+ * of the page. This function should be called after LP_DEAD items
+ * have been removed by _bt_vacuum_one_page().
+ */
+static void
+_bt_compress_one_page(Relation rel, Buffer buffer, Relation heapRel)
+{
+ OffsetNumber offnum,
+ minoff,
+ maxoff;
+ Page page = BufferGetPage(buffer);
+ Page newpage;
+ BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+ bool use_compression = false;
+ BTCompressState *compressState = NULL;
+ int natts = IndexRelationGetNumberOfAttributes(rel);
+ OffsetNumber deletable[MaxOffsetNumber];
+ int ndeletable = 0;
+
+ /*
+ * Don't use compression for indexes with INCLUDEd columns and unique
+ * indexes.
+ */
+ use_compression = (IndexRelationGetNumberOfKeyAttributes(rel) ==
+ IndexRelationGetNumberOfAttributes(rel) &&
+ !rel->rd_index->indisunique);
+ if (!use_compression)
+ return;
+
+ /* init compress state needed to build posting tuples */
+ compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+ compressState->ipd = NULL;
+ compressState->ntuples = 0;
+ compressState->itupprev = NULL;
+ compressState->maxitemsize = BTMaxItemSize(page);
+ compressState->maxpostingsize = 0;
+
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+
+
+ /*
+ * Delete dead tuples, if any.
+ * We cannot simply skip them in the loop below, because it's necessary
+ * to generate a special xlog record containing such tuples to compute
+ * latestRemovedXid on a standby server later.
+ *
+ * This should not affect performance, since it can only happen in the rare
+ * situation where the BTP_HAS_GARBAGE flag was not set and _bt_vacuum_one_page
+ * was not called, or _bt_vacuum_one_page didn't remove all dead items.
+ */
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemid = PageGetItemId(page, offnum);
+
+ if (ItemIdIsDead(itemid))
+ deletable[ndeletable++] = offnum;
+ }
+
+ if (ndeletable > 0)
+ _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
+
+ /*
+ * Scan over all items to see which ones can be compressed
+ */
+ minoff = P_FIRSTDATAKEY(opaque);
+ maxoff = PageGetMaxOffsetNumber(page);
+ newpage = PageGetTempPageCopySpecial(page);
+ elog(DEBUG4, "_bt_compress_one_page rel: %s,blkno: %u",
+ RelationGetRelationName(rel), BufferGetBlockNumber(buffer));
+
+ /* Copy High Key if any */
+ if (!P_RIGHTMOST(opaque))
+ {
+ ItemId itemid = PageGetItemId(page, P_HIKEY);
+ Size itemsz = ItemIdGetLength(itemid);
+ IndexTuple item = (IndexTuple) PageGetItem(page, itemid);
+
+ if (PageAddItem(newpage, (Item) item, itemsz, P_HIKEY,
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to add highkey during compression");
+ }
+
+ /*
+ * Iterate over tuples on the page, try to compress them into posting
+ * lists and insert into new page.
+ */
+ for (offnum = minoff;
+ offnum <= maxoff;
+ offnum = OffsetNumberNext(offnum))
+ {
+ ItemId itemId = PageGetItemId(page, offnum);
+ IndexTuple itup = (IndexTuple) PageGetItem(page, itemId);
+
+ if (compressState->itupprev != NULL)
+ {
+ int n_equal_atts =
+ _bt_keep_natts_fast(rel, compressState->itupprev, itup);
+ int itup_ntuples = BTreeTupleIsPosting(itup) ?
+ BTreeTupleGetNPosting(itup) : 1;
+
+ if (n_equal_atts > natts)
+ {
+ /*
+ * When tuples are equal, create or update posting.
+ *
+ * If posting is too big, insert it on page and continue.
+ */
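+ /*
+ * Rough upper bound on itupprev's size once the pending TIDs and the
+ * TIDs carried by itup are all added to its posting list.
+ */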
+ if (compressState->maxitemsize >
+ MAXALIGN(((IndexTupleSize(compressState->itupprev)
+ + (compressState->ntuples + itup_ntuples + 1) * sizeof(ItemPointerData)))))
+ {
+ _bt_add_posting_item(compressState, itup);
+ }
+ else
+ {
+ insert_itupprev_to_page(newpage, compressState);
+ }
+ }
+ else
+ {
+ insert_itupprev_to_page(newpage, compressState);
+ }
+ }
+
+ /*
+ * Copy the tuple into temp variable itupprev to compare it with the
+ * following tuple and maybe unite them into a posting tuple
+ */
+ if (compressState->itupprev)
+ pfree(compressState->itupprev);
+ compressState->itupprev = CopyIndexTuple(itup);
+
+ Assert(IndexTupleSize(compressState->itupprev) <= compressState->maxitemsize);
+ }
+
+ /* Handle the last item. */
+ insert_itupprev_to_page(newpage, compressState);
+
+ START_CRIT_SECTION();
+
+ PageRestoreTempPage(newpage, page);
+ MarkBufferDirty(buffer);
+
+ /* Log full page write */
+ if (RelationNeedsWAL(rel))
+ {
+ XLogRecPtr recptr;
+
+ recptr = log_newpage_buffer(buffer, true);
+ PageSetLSN(page, recptr);
+ }
+
+ END_CRIT_SECTION();
+
+ elog(DEBUG4, "_bt_compress_one_page. success");
+}
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 9c1f7de..86c662d 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -983,14 +983,52 @@ _bt_page_recyclable(Page page)
void
_bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems,
+ OffsetNumber *remainingoffset,
+ IndexTuple *remaining, int nremaining,
BlockNumber lastBlockVacuumed)
{
Page page = BufferGetPage(buf);
BTPageOpaque opaque;
+ Size itemsz;
+ Size remaining_sz = 0;
+ char *remaining_buf = NULL;
+
+ /* XLOG stuff, buffer for remainings */
+ if (nremaining && RelationNeedsWAL(rel))
+ {
+ Size offset = 0;
+
+ for (int i = 0; i < nremaining; i++)
+ remaining_sz += MAXALIGN(IndexTupleSize(remaining[i]));
+
+ remaining_buf = palloc0(remaining_sz);
+ for (int i = 0; i < nremaining; i++)
+ {
+ itemsz = IndexTupleSize(remaining[i]);
+ memcpy(remaining_buf + offset, (char *) remaining[i], itemsz);
+ offset += MAXALIGN(itemsz);
+ }
+ Assert(offset == remaining_sz);
+ }
/* No ereport(ERROR) until changes are logged */
START_CRIT_SECTION();
+ /* Handle posting tuples here */
+ for (int i = 0; i < nremaining; i++)
+ {
+ /* At first, delete the old tuple. */
+ PageIndexTupleDelete(page, remainingoffset[i]);
+
+ itemsz = IndexTupleSize(remaining[i]);
+ itemsz = MAXALIGN(itemsz);
+
+ /* Add tuple with remaining ItemPointers to the page. */
+ if (PageAddItem(page, (Item) remaining[i], itemsz, remainingoffset[i],
+ false, false) == InvalidOffsetNumber)
+ elog(ERROR, "failed to rewrite compressed item in index while doing vacuum");
+ }
+
/* Fix the page */
if (nitems > 0)
PageIndexMultiDelete(page, itemnos, nitems);
@@ -1020,6 +1058,8 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
xl_btree_vacuum xlrec_vacuum;
xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+ xlrec_vacuum.nremaining = nremaining;
+ xlrec_vacuum.ndeleted = nitems;
XLogBeginInsert();
XLogRegisterBuffer(0, buf, REGBUF_STANDARD);
@@ -1033,6 +1073,19 @@ _bt_delitems_vacuum(Relation rel, Buffer buf,
if (nitems > 0)
XLogRegisterBufData(0, (char *) itemnos, nitems * sizeof(OffsetNumber));
+ /*
+ * Here we should save offnums and remaining tuples themselves. It's
+ * important to restore them in correct order. At first, we must
+ * handle remaining tuples and only after that other deleted items.
+ */
+ if (nremaining > 0)
+ {
+ Assert(remaining_buf != NULL);
+ XLogRegisterBufData(0, (char *) remainingoffset,
+ nremaining * sizeof(OffsetNumber));
+ XLogRegisterBufData(0, remaining_buf, remaining_sz);
+ }
+
recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM);
PageSetLSN(page, recptr);
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 4cfd528..22fb228 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -97,6 +97,8 @@ static void btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
BTCycleId cycleid, TransactionId *oldestBtpoXact);
static void btvacuumpage(BTVacState *vstate, BlockNumber blkno,
BlockNumber orig_blkno);
+static ItemPointer btreevacuumPosting(BTVacState *vstate, IndexTuple itup,
+ int *nremaining);
/*
@@ -1069,7 +1071,8 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
RBM_NORMAL, info->strategy);
LockBufferForCleanup(buf);
_bt_checkpage(rel, buf);
- _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
+ _bt_delitems_vacuum(rel, buf, NULL, 0, NULL, NULL, 0,
+ vstate.lastBlockVacuumed);
_bt_relbuf(rel, buf);
}
@@ -1193,6 +1196,9 @@ restart:
OffsetNumber offnum,
minoff,
maxoff;
+ IndexTuple remaining[MaxOffsetNumber];
+ OffsetNumber remainingoffset[MaxOffsetNumber];
+ int nremaining;
/*
* Trade in the initial read lock for a super-exclusive write lock on
@@ -1229,6 +1235,7 @@ restart:
* callback function.
*/
ndeletable = 0;
+ nremaining = 0;
minoff = P_FIRSTDATAKEY(opaque);
maxoff = PageGetMaxOffsetNumber(page);
if (callback)
@@ -1242,31 +1249,78 @@ restart:
itup = (IndexTuple) PageGetItem(page,
PageGetItemId(page, offnum));
- htup = &(itup->t_tid);
- /*
- * During Hot Standby we currently assume that
- * XLOG_BTREE_VACUUM records do not produce conflicts. That is
- * only true as long as the callback function depends only
- * upon whether the index tuple refers to heap tuples removed
- * in the initial heap scan. When vacuum starts it derives a
- * value of OldestXmin. Backends taking later snapshots could
- * have a RecentGlobalXmin with a later xid than the vacuum's
- * OldestXmin, so it is possible that row versions deleted
- * after OldestXmin could be marked as killed by other
- * backends. The callback function *could* look at the index
- * tuple state in isolation and decide to delete the index
- * tuple, though currently it does not. If it ever did, we
- * would need to reconsider whether XLOG_BTREE_VACUUM records
- * should cause conflicts. If they did cause conflicts they
- * would be fairly harsh conflicts, since we haven't yet
- * worked out a way to pass a useful value for
- * latestRemovedXid on the XLOG_BTREE_VACUUM records. This
- * applies to *any* type of index that marks index tuples as
- * killed.
- */
- if (callback(htup, callback_state))
- deletable[ndeletable++] = offnum;
+ if (BTreeTupleIsPosting(itup))
+ {
+ int nnewipd = 0;
+ ItemPointer newipd = NULL;
+
+ newipd = btreevacuumPosting(vstate, itup, &nnewipd);
+
+ if (nnewipd == 0)
+ {
+ /*
+ * All TIDs from posting list must be deleted, we can
+ * delete whole tuple in a regular way.
+ */
+ deletable[ndeletable++] = offnum;
+ }
+ else if (nnewipd == BTreeTupleGetNPosting(itup))
+ {
+ /*
+ * All TIDs from posting tuple must remain. Do
+ * nothing, just cleanup.
+ */
+ pfree(newipd);
+ }
+ else if (nnewipd < BTreeTupleGetNPosting(itup))
+ {
+ /* Some TIDs from posting tuple must remain. */
+ Assert(nnewipd > 0);
+ Assert(newipd != NULL);
+
+ /*
+ * Form new tuple that contains only remaining TIDs.
+ * Remember this tuple and the offset of the old tuple
+ * to update it in place.
+ */
+ remainingoffset[nremaining] = offnum;
+ remaining[nremaining] = BTreeFormPostingTuple(itup, newipd, nnewipd);
+ nremaining++;
+ pfree(newipd);
+
+ Assert(IndexTupleSize(itup) <= BTMaxItemSize(page));
+ }
+ }
+ else
+ {
+ htup = &(itup->t_tid);
+
+ /*
+ * During Hot Standby we currently assume that
+ * XLOG_BTREE_VACUUM records do not produce conflicts.
+ * That is only true as long as the callback function
+ * depends only upon whether the index tuple refers to
+ * heap tuples removed in the initial heap scan. When
+ * vacuum starts it derives a value of OldestXmin.
+ * Backends taking later snapshots could have a
+ * RecentGlobalXmin with a later xid than the vacuum's
+ * OldestXmin, so it is possible that row versions deleted
+ * after OldestXmin could be marked as killed by other
+ * backends. The callback function *could* look at the
+ * index tuple state in isolation and decide to delete the
+ * index tuple, though currently it does not. If it ever
+ * did, we would need to reconsider whether
+ * XLOG_BTREE_VACUUM records should cause conflicts. If
+ * they did cause conflicts they would be fairly harsh
+ * conflicts, since we haven't yet worked out a way to
+ * pass a useful value for latestRemovedXid on the
+ * XLOG_BTREE_VACUUM records. This applies to *any* type
+ * of index that marks index tuples as killed.
+ */
+ if (callback(htup, callback_state))
+ deletable[ndeletable++] = offnum;
+ }
}
}
@@ -1274,7 +1328,7 @@ restart:
* Apply any needed deletes. We issue just one _bt_delitems_vacuum()
* call per page, so as to minimize WAL traffic.
*/
- if (ndeletable > 0)
+ if (ndeletable > 0 || nremaining > 0)
{
/*
* Notice that the issued XLOG_BTREE_VACUUM WAL record includes
@@ -1291,6 +1345,7 @@ restart:
* that.
*/
_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
+ remainingoffset, remaining, nremaining,
vstate->lastBlockVacuumed);
/*
@@ -1376,6 +1431,41 @@ restart:
}
/*
+ * btreevacuumPosting() -- vacuums a posting tuple.
+ *
+ * Returns new palloc'd posting list with remaining items.
+ * Posting list size is returned via nremaining.
+ *
+ * If all items are dead,
+ * nremaining is 0 and resulting posting list is NULL.
+ */
+static ItemPointer
+btreevacuumPosting(BTVacState *vstate, IndexTuple itup, int *nremaining)
+{
+ int remaining = 0;
+ int nitem = BTreeTupleGetNPosting(itup);
+ ItemPointer tmpitems = NULL,
+ items = BTreeTupleGetPosting(itup);
+
+ /*
+ * Check each tuple in the posting list, save alive tuples into tmpitems
+ */
+ for (int i = 0; i < nitem; i++)
+ {
+ if (vstate->callback(items + i, vstate->callback_state))
+ continue;
+
+ if (tmpitems == NULL)
+ tmpitems = palloc(sizeof(ItemPointerData) * nitem);
+
+ tmpitems[remaining++] = items[i];
+ }
+
+ *nremaining = remaining;
+ return tmpitems;
+}
+
+/*
* btcanreturn() -- Check whether btree indexes support index-only scans.
*
* btrees always do, so this is trivial.
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 19735bf..de0af9e 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -30,6 +30,9 @@ static bool _bt_readpage(IndexScanDesc scan, ScanDirection dir,
OffsetNumber offnum);
static void _bt_saveitem(BTScanOpaque so, int itemIndex,
OffsetNumber offnum, IndexTuple itup);
+static void _bt_savepostingitem(BTScanOpaque so, int itemIndex,
+ OffsetNumber offnum, ItemPointer iptr,
+ IndexTuple itup, int i);
static bool _bt_steppage(IndexScanDesc scan, ScanDirection dir);
static bool _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno, ScanDirection dir);
static bool _bt_parallel_readpage(IndexScanDesc scan, BlockNumber blkno,
@@ -504,7 +507,8 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
/* We have low <= mid < high, so mid points at a real slot */
- result = _bt_compare(rel, key, page, mid);
+ result = _bt_compare_posting(rel, key, page, mid,
+ &(insertstate->in_posting_offset));
if (result >= cmpval)
low = mid + 1;
@@ -533,6 +537,60 @@ _bt_binsrch_insert(Relation rel, BTInsertState insertstate)
return low;
}
+/*
+ * Compare insertion-type scankey to tuple on a page,
+ * taking into account posting tuples.
+ * If the key of the posting tuple is equal to the scankey,
+ * find the exact position inside the posting list,
+ * using the TID as an extra attribute.
+ */
+int32
+_bt_compare_posting(Relation rel,
+ BTScanInsert key,
+ Page page,
+ OffsetNumber offnum,
+ int *in_posting_offset)
+{
+ IndexTuple itup;
+ int result;
+
+ itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, offnum));
+ result = _bt_compare(rel, key, page, offnum);
+
+ if (BTreeTupleIsPosting(itup) && result == 0)
+ {
+ int low,
+ high,
+ mid,
+ res;
+
+ low = 0;
+ /* "high" is past end of posting list for loop invariant */
+ high = BTreeTupleGetNPosting(itup);
+
+ while (high > low)
+ {
+ mid = low + ((high - low) / 2);
+ res = ItemPointerCompare(key->scantid,
+ BTreeTupleGetPostingN(itup, mid));
+
+ if (res >= 1)
+ low = mid + 1;
+ else
+ high = mid;
+ }
+
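+ /*
+ * 'high' is now the offset of the first posting-list entry whose TID
+ * is >= scantid; this becomes in_posting_offset.
+ */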
+ *in_posting_offset = high;
+ elog(DEBUG4, "_bt_compare_posting in_posting_offset %d", *in_posting_offset);
+ Assert(ItemPointerCompare(BTreeTupleGetHeapTID(itup),
+ key->scantid) < 0);
+ Assert(ItemPointerCompare(key->scantid,
+ BTreeTupleGetMaxTID(itup)) < 0);
+ }
+
+ return result;
+}
+
/*----------
* _bt_compare() -- Compare insertion-type scankey to tuple on a page.
*
@@ -665,61 +723,120 @@ _bt_compare(Relation rel,
* Use the heap TID attribute and scantid to try to break the tie. The
* rules are the same as any other key attribute -- only the
* representation differs.
+ *
+ * When itup is a posting tuple, the check becomes more complex. It is
+ * possible that the scankey belongs to the tuple's posting list TID
+ * range.
+ *
+ * _bt_compare() is multipurpose, so it just returns 0 for a fact that key
+ * matches tuple at this offset.
+ *
+ * Use special _bt_compare_posting() wrapper function to handle this case
+ * and perform recheck for posting tuple, finding exact position of the
+ * scankey.
*/
- heapTid = BTreeTupleGetHeapTID(itup);
- if (key->scantid == NULL)
+ if (!BTreeTupleIsPosting(itup))
{
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid == NULL)
+ {
+ /*
+ * Most searches have a scankey that is considered greater than a
+ * truncated pivot tuple if and when the scankey has equal values
+ * for attributes up to and including the least significant
+ * untruncated attribute in tuple.
+ *
+ * For example, if an index has the minimum two attributes (single
+ * user key attribute, plus heap TID attribute), and a page's high
+ * key is ('foo', -inf), and scankey is ('foo', <omitted>), the
+ * search will not descend to the page to the left. The search
+ * will descend right instead. The truncated attribute in pivot
+ * tuple means that all non-pivot tuples on the page to the left
+ * are strictly < 'foo', so it isn't necessary to descend left. In
+ * other words, search doesn't have to descend left because it
+ * isn't interested in a match that has a heap TID value of -inf.
+ *
+ * However, some searches (pivotsearch searches) actually require
+ * that we descend left when this happens. -inf is treated as a
+ * possible match for omitted scankey attribute(s). This is
+ * needed by page deletion, which must re-find leaf pages that are
+ * targets for deletion using their high keys.
+ *
+ * Note: the heap TID part of the test ensures that scankey is
+ * being compared to a pivot tuple with one or more truncated key
+ * attributes.
+ *
+ * Note: pg_upgrade'd !heapkeyspace indexes must always descend to
+ * the left here, since they have no heap TID attribute (and
+ * cannot have any -inf key values in any case, since truncation
+ * can only remove non-key attributes). !heapkeyspace searches
+ * must always be prepared to deal with matches on both sides of
+ * the pivot once the leaf level is reached.
+ */
+ if (key->heapkeyspace && !key->pivotsearch &&
+ key->keysz == ntupatts && heapTid == NULL)
+ return 1;
+
+ /* All provided scankey arguments found to be equal */
+ return 0;
+ }
+
/*
- * Most searches have a scankey that is considered greater than a
- * truncated pivot tuple if and when the scankey has equal values for
- * attributes up to and including the least significant untruncated
- * attribute in tuple.
- *
- * For example, if an index has the minimum two attributes (single
- * user key attribute, plus heap TID attribute), and a page's high key
- * is ('foo', -inf), and scankey is ('foo', <omitted>), the search
- * will not descend to the page to the left. The search will descend
- * right instead. The truncated attribute in pivot tuple means that
- * all non-pivot tuples on the page to the left are strictly < 'foo',
- * so it isn't necessary to descend left. In other words, search
- * doesn't have to descend left because it isn't interested in a match
- * that has a heap TID value of -inf.
- *
- * However, some searches (pivotsearch searches) actually require that
- * we descend left when this happens. -inf is treated as a possible
- * match for omitted scankey attribute(s). This is needed by page
- * deletion, which must re-find leaf pages that are targets for
- * deletion using their high keys.
- *
- * Note: the heap TID part of the test ensures that scankey is being
- * compared to a pivot tuple with one or more truncated key
- * attributes.
- *
- * Note: pg_upgrade'd !heapkeyspace indexes must always descend to the
- * left here, since they have no heap TID attribute (and cannot have
- * any -inf key values in any case, since truncation can only remove
- * non-key attributes). !heapkeyspace searches must always be
- * prepared to deal with matches on both sides of the pivot once the
- * leaf level is reached.
+ * Treat truncated heap TID as minus infinity, since scankey has a key
+ * attribute value (scantid) that would otherwise be compared directly
*/
- if (key->heapkeyspace && !key->pivotsearch &&
- key->keysz == ntupatts && heapTid == NULL)
+ Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
+ if (heapTid == NULL)
return 1;
- /* All provided scankey arguments found to be equal */
- return 0;
+ Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
+ return ItemPointerCompare(key->scantid, heapTid);
}
+ else
+ {
+ heapTid = BTreeTupleGetHeapTID(itup);
+ if (key->scantid != NULL && heapTid != NULL)
+ {
+ int cmp = ItemPointerCompare(key->scantid, heapTid);
- /*
- * Treat truncated heap TID as minus infinity, since scankey has a key
- * attribute value (scantid) that would otherwise be compared directly
- */
- Assert(key->keysz == IndexRelationGetNumberOfKeyAttributes(rel));
- if (heapTid == NULL)
- return 1;
+ if (cmp == -1 || cmp == 0)
+ {
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is less than or equal to posting tuple (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return cmp;
+ }
- Assert(ntupatts >= IndexRelationGetNumberOfKeyAttributes(rel));
- return ItemPointerCompare(key->scantid, heapTid);
+ heapTid = BTreeTupleGetMaxTID(itup);
+ cmp = ItemPointerCompare(key->scantid, heapTid);
+ if (cmp == 1)
+ {
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is greater than posting tuple (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return cmp;
+ }
+
+ /*
+ * If we got here, scantid falls between the posting items of the
+ * tuple.
+ */
+ elog(DEBUG4, "offnum %d Scankey (%u,%u) is between posting items (%u,%u) and (%u,%u)",
+ offnum, ItemPointerGetBlockNumberNoCheck(key->scantid),
+ ItemPointerGetOffsetNumberNoCheck(key->scantid),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(itup)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(itup)),
+ ItemPointerGetBlockNumberNoCheck(heapTid),
+ ItemPointerGetOffsetNumberNoCheck(heapTid));
+ return 0;
+ }
+ }
+
+ return 0;
}
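
For reviewers, here is a tiny standalone sketch of the three-way outcome the posting branch above is meant to produce (scankey <= minimum TID, scankey > maximum TID, or scankey somewhere inside the posting list). The simplified tid type and helper names are mine, not the patch's:

/* Standalone model of classifying a scan TID against a posting tuple's
 * TID range; illustrative names only, not patch code. */
#include <stdio.h>

typedef struct { unsigned block; unsigned offset; } tid;

static int
tid_cmp(tid a, tid b)
{
    if (a.block != b.block)
        return a.block < b.block ? -1 : 1;
    if (a.offset != b.offset)
        return a.offset < b.offset ? -1 : 1;
    return 0;
}

/* Mirrors the branch above: <= min TID returns the first comparison,
 * > max TID returns 1, and anything in between is treated as "equal" (0),
 * meaning the scan TID lands inside the posting list. */
static int
compare_to_posting_range(tid scantid, tid mintid, tid maxtid)
{
    int cmp = tid_cmp(scantid, mintid);

    if (cmp <= 0)
        return cmp;
    if (tid_cmp(scantid, maxtid) > 0)
        return 1;
    return 0;
}

int
main(void)
{
    tid min = {10, 1}, max = {10, 9};
    tid probe = {10, 5};

    printf("%d\n", compare_to_posting_range(probe, min, max)); /* prints 0 */
    return 0;
}
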
/*
@@ -1456,6 +1573,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
/* initialize tuple workspace to empty */
so->currPos.nextTupleOffset = 0;
+ so->currPos.prevTupleOffset = 0;
/*
* Now that the current page has been made consistent, the macro should be
@@ -1490,8 +1608,22 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (_bt_checkkeys(scan, itup, indnatts, dir, &continuescan))
{
/* tuple passes all scan key conditions, so remember it */
- _bt_saveitem(so, itemIndex, offnum, itup);
- itemIndex++;
+ if (!BTreeTupleIsPosting(itup))
+ {
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ itemIndex++;
+ }
+ else
+ {
+ /* Return posting list "logical" tuples */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ itup, i);
+ itemIndex++;
+ }
+ }
}
/* When !continuescan, there can't be any more matches, so stop */
if (!continuescan)
@@ -1524,7 +1656,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (!continuescan)
so->currPos.moreRight = false;
- Assert(itemIndex <= MaxIndexTuplesPerPage);
+ Assert(itemIndex <= MaxPostingIndexTuplesPerPage);
so->currPos.firstItem = 0;
so->currPos.lastItem = itemIndex - 1;
so->currPos.itemIndex = 0;
@@ -1532,7 +1664,7 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
else
{
/* load items[] in descending order */
- itemIndex = MaxIndexTuplesPerPage;
+ itemIndex = MaxPostingIndexTuplesPerPage;
offnum = Min(offnum, maxoff);
@@ -1574,8 +1706,23 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
if (passes_quals && tuple_alive)
{
/* tuple passes all scan key conditions, so remember it */
- itemIndex--;
- _bt_saveitem(so, itemIndex, offnum, itup);
+ if (!BTreeTupleIsPosting(itup))
+ {
+ itemIndex--;
+ _bt_saveitem(so, itemIndex, offnum, itup);
+ }
+ else
+ {
+ /* Return posting list "logical" tuples */
+ /* XXX: Maybe this loop should be backwards? */
+ for (int i = 0; i < BTreeTupleGetNPosting(itup); i++)
+ {
+ itemIndex--;
+ _bt_savepostingitem(so, itemIndex, offnum,
+ BTreeTupleGetPostingN(itup, i),
+ itup, i);
+ }
+ }
}
if (!continuescan)
{
@@ -1589,8 +1736,8 @@ _bt_readpage(IndexScanDesc scan, ScanDirection dir, OffsetNumber offnum)
Assert(itemIndex >= 0);
so->currPos.firstItem = itemIndex;
- so->currPos.lastItem = MaxIndexTuplesPerPage - 1;
- so->currPos.itemIndex = MaxIndexTuplesPerPage - 1;
+ so->currPos.lastItem = MaxPostingIndexTuplesPerPage - 1;
+ so->currPos.itemIndex = MaxPostingIndexTuplesPerPage - 1;
}
return (so->currPos.firstItem <= so->currPos.lastItem);
@@ -1603,6 +1750,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
{
BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+ Assert(!BTreeTupleIsPosting(itup));
+
currItem->heapTid = itup->t_tid;
currItem->indexOffset = offnum;
if (so->currTuples)
@@ -1615,6 +1764,33 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
}
}
+/* Save an index item into so->currPos.items[itemIndex] for posting tuples. */
+static void
+_bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
+ ItemPointer iptr, IndexTuple itup, int i)
+{
+ BTScanPosItem *currItem = &so->currPos.items[itemIndex];
+
+ currItem->heapTid = *iptr;
+ currItem->indexOffset = offnum;
+
+ if (so->currTuples)
+ {
+ if (i == 0)
+ {
+ /* Save the key; it is the same for all tuples in the posting list */
+ Size itupsz = BTreeTupleGetPostingOffset(itup);
+
+ currItem->tupleOffset = so->currPos.nextTupleOffset;
+ memcpy(so->currTuples + so->currPos.nextTupleOffset, itup, itupsz);
+ so->currPos.nextTupleOffset += MAXALIGN(itupsz);
+ so->currPos.prevTupleOffset = currItem->tupleOffset;
+ }
+ else
+ currItem->tupleOffset = so->currPos.prevTupleOffset;
+ }
+}
+
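
A side note on _bt_savepostingitem() above: the key bytes are copied into the tuple workspace only once per posting tuple, and every other "logical" item reuses that copy through prevTupleOffset. A toy model of that sharing, with made-up types (nothing here is patch code):

/* Toy model of the one-key-copy-per-posting-tuple idea; not patch code. */
#include <stdio.h>
#include <string.h>

#define WORKSPACE_SIZE 1024
#define MAX_ITEMS 32

typedef struct
{
    int     heaptid;        /* stand-in for the real ItemPointerData */
    int     tupleoffset;    /* offset of the shared key copy */
} scan_item;

static char workspace[WORKSPACE_SIZE];
static int  next_offset = 0;
static int  prev_offset = 0;

static void
save_posting_item(scan_item *items, int idx, int heaptid,
                  const char *key, int keylen, int i)
{
    items[idx].heaptid = heaptid;
    if (i == 0)
    {
        /* first TID of the posting tuple: copy the key once */
        items[idx].tupleoffset = next_offset;
        memcpy(workspace + next_offset, key, keylen);
        prev_offset = next_offset;
        next_offset += keylen;
    }
    else
        items[idx].tupleoffset = prev_offset;   /* reuse the shared copy */
}

int
main(void)
{
    scan_item items[MAX_ITEMS];
    int heaptids[] = {101, 102, 103};

    for (int i = 0; i < 3; i++)
        save_posting_item(items, i, heaptids[i], "key-A", 6, i);

    /* all three logical items point at the same key copy (prints "0 0 0") */
    printf("%d %d %d\n", items[0].tupleoffset,
           items[1].tupleoffset, items[2].tupleoffset);
    return 0;
}
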
/*
* _bt_steppage() -- Step to next page containing valid data for scan
*
diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c
index b30cf9e..b058599 100644
--- a/src/backend/access/nbtree/nbtsort.c
+++ b/src/backend/access/nbtree/nbtsort.c
@@ -288,6 +288,8 @@ static void _bt_sortaddtup(Page page, Size itemsize,
static void _bt_buildadd(BTWriteState *wstate, BTPageState *state,
IndexTuple itup);
static void _bt_uppershutdown(BTWriteState *wstate, BTPageState *state);
+static void _bt_buildadd_posting(BTWriteState *wstate, BTPageState *state,
+ BTCompressState *compressState);
static void _bt_load(BTWriteState *wstate,
BTSpool *btspool, BTSpool *btspool2);
static void _bt_begin_parallel(BTBuildState *buildstate, bool isconcurrent,
@@ -972,6 +974,11 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* only shift the line pointer array back and forth, and overwrite
* the tuple space previously occupied by oitup. This is fairly
* cheap.
+ *
+ * If the lastleft tuple is a posting tuple, its posting list is
+ * truncated away in _bt_truncate() as well. Note that this applies
+ * only to leaf pages, since internal pages never contain posting
+ * tuples.
*/
ii = PageGetItemId(opage, OffsetNumberPrev(last_off));
lastleft = (IndexTuple) PageGetItem(opage, ii);
@@ -1011,6 +1018,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
* the minimum key for the new page.
*/
state->btps_minkey = CopyIndexTuple(oitup);
+ Assert(BTreeTupleIsPivot(state->btps_minkey));
/*
* Set the sibling links for both pages.
@@ -1052,6 +1060,7 @@ _bt_buildadd(BTWriteState *wstate, BTPageState *state, IndexTuple itup)
Assert(state->btps_minkey == NULL);
state->btps_minkey = CopyIndexTuple(itup);
/* _bt_sortaddtup() will perform full truncation later */
+ BTreeTupleClearBtIsPosting(state->btps_minkey);
BTreeTupleSetNAtts(state->btps_minkey, 0);
}
@@ -1137,6 +1146,91 @@ _bt_uppershutdown(BTWriteState *wstate, BTPageState *state)
}
/*
+ * Add new tuple (posting or non-posting) to the page while building index.
+ */
+static void
+_bt_buildadd_posting(BTWriteState *wstate, BTPageState *state,
+ BTCompressState *compressState)
+{
+ IndexTuple to_insert;
+
+ /* Return if there is no tuple to insert */
+ if (state == NULL)
+ return;
+
+ if (compressState->ntuples == 0)
+ to_insert = compressState->itupprev;
+ else
+ {
+ IndexTuple postingtuple;
+
+ /* form a tuple with a posting list */
+ postingtuple = BTreeFormPostingTuple(compressState->itupprev,
+ compressState->ipd,
+ compressState->ntuples);
+ to_insert = postingtuple;
+ pfree(compressState->ipd);
+ }
+
+ _bt_buildadd(wstate, state, to_insert);
+
+ if (compressState->ntuples > 0)
+ pfree(to_insert);
+ compressState->ntuples = 0;
+}
+
+/*
+ * Save item pointer(s) of itup to the posting list in compressState.
+ *
+ * Helper function for _bt_load() and _bt_compress_one_page().
+ *
+ * Note: caller is responsible for size check to ensure that resulting tuple
+ * won't exceed BTMaxItemSize.
+ */
+void
+_bt_add_posting_item(BTCompressState *compressState, IndexTuple itup)
+{
+ int nposting = 0;
+
+ if (compressState->ntuples == 0)
+ {
+ compressState->ipd = palloc0(compressState->maxitemsize);
+
+ if (BTreeTupleIsPosting(compressState->itupprev))
+ {
+ /* if itupprev is posting, add all its TIDs to the posting list */
+ nposting = BTreeTupleGetNPosting(compressState->itupprev);
+ memcpy(compressState->ipd,
+ BTreeTupleGetPosting(compressState->itupprev),
+ sizeof(ItemPointerData) * nposting);
+ compressState->ntuples += nposting;
+ }
+ else
+ {
+ memcpy(compressState->ipd, compressState->itupprev,
+ sizeof(ItemPointerData));
+ compressState->ntuples++;
+ }
+ }
+
+ if (BTreeTupleIsPosting(itup))
+ {
+ /* if tuple is posting, add all its TIDs to the posting list */
+ nposting = BTreeTupleGetNPosting(itup);
+ memcpy(compressState->ipd + compressState->ntuples,
+ BTreeTupleGetPosting(itup),
+ sizeof(ItemPointerData) * nposting);
+ compressState->ntuples += nposting;
+ }
+ else
+ {
+ memcpy(compressState->ipd + compressState->ntuples, itup,
+ sizeof(ItemPointerData));
+ compressState->ntuples++;
+ }
+}
+
+/*
* Read tuples in correct sort order from tuplesort, and load them into
* btree leaves.
*/
@@ -1150,9 +1244,20 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
bool load1;
TupleDesc tupdes = RelationGetDescr(wstate->index);
int i,
- keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index);
+ keysz = IndexRelationGetNumberOfKeyAttributes(wstate->index),
+ natts = IndexRelationGetNumberOfAttributes(wstate->index);
SortSupport sortKeys;
int64 tuples_done = 0;
+ bool use_compression = false;
+ BTCompressState *compressState = NULL;
+
+ /*
+ * Don't use compression for indexes with INCLUDEd columns or for
+ * unique indexes.
+ */
+ use_compression = (IndexRelationGetNumberOfKeyAttributes(wstate->index) ==
+ IndexRelationGetNumberOfAttributes(wstate->index) &&
+ !wstate->index->rd_index->indisunique);
if (merge)
{
@@ -1266,19 +1371,89 @@ _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2)
}
else
{
- /* merge is unnecessary */
- while ((itup = tuplesort_getindextuple(btspool->sortstate,
- true)) != NULL)
+ if (!use_compression)
{
- /* When we see first tuple, create first index page */
- if (state == NULL)
- state = _bt_pagestate(wstate, 0);
+ /* merge is unnecessary */
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ state = _bt_pagestate(wstate, 0);
- _bt_buildadd(wstate, state, itup);
+ _bt_buildadd(wstate, state, itup);
- /* Report progress */
- pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
- ++tuples_done);
+ /* Report progress */
+ pgstat_progress_update_param(PROGRESS_CREATEIDX_TUPLES_DONE,
+ ++tuples_done);
+ }
+ }
+ else
+ {
+ /* init compress state needed to build posting tuples */
+ compressState = (BTCompressState *) palloc0(sizeof(BTCompressState));
+ compressState->ipd = NULL;
+ compressState->ntuples = 0;
+ compressState->itupprev = NULL;
+ compressState->maxitemsize = 0;
+ compressState->maxpostingsize = 0;
+
+ while ((itup = tuplesort_getindextuple(btspool->sortstate,
+ true)) != NULL)
+ {
+ /* When we see first tuple, create first index page */
+ if (state == NULL)
+ {
+ state = _bt_pagestate(wstate, 0);
+ compressState->maxitemsize = BTMaxItemSize(state->btps_page);
+ }
+
+ if (compressState->itupprev != NULL)
+ {
+ int n_equal_atts = _bt_keep_natts_fast(wstate->index,
+ compressState->itupprev, itup);
+
+ if (n_equal_atts > natts)
+ {
+ /*
+ * Tuples are equal. If the pending posting list still has
+ * room, add this TID to it. Otherwise the posting tuple is
+ * already as large as allowed, so insert it on the page and
+ * continue.
+ */
+ if ((compressState->ntuples + 1) * sizeof(ItemPointerData) <
+ compressState->maxpostingsize)
+ _bt_add_posting_item(compressState, itup);
+ else
+ _bt_buildadd_posting(wstate, state,
+ compressState);
+ }
+ else
+ {
+ /*
+ * Tuples are not equal. Insert itupprev into index.
+ * Save current tuple for the next iteration.
+ */
+ _bt_buildadd_posting(wstate, state, compressState);
+ }
+ }
+
+ /*
+ * Save the tuple so that we can compare it with the next one and
+ * possibly merge them into a posting tuple.
+ */
+ if (compressState->itupprev)
+ pfree(compressState->itupprev);
+ compressState->itupprev = CopyIndexTuple(itup);
+
+ /* compute max size of posting list */
+ compressState->maxpostingsize = compressState->maxitemsize -
+ IndexInfoFindDataOffset(compressState->itupprev->t_info) -
+ MAXALIGN(IndexTupleSize(compressState->itupprev));
+ }
+
+ /* Handle the last item */
+ _bt_buildadd_posting(wstate, state, compressState);
}
}
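
The _bt_load() changes above amount to a simple accumulate-and-flush loop: keep collecting TIDs while the key stays the same and the pending posting list is within its size budget, otherwise emit what has been collected so far. A condensed, self-contained sketch of that control flow with toy types (none of these names come from the patch):

/* Condensed sketch of the dedup accumulate-and-flush loop; toy types only. */
#include <stdio.h>
#include <string.h>

#define MAX_POSTING 4           /* stand-in for the maxpostingsize check */

typedef struct
{
    char    key[16];
    int     tid;
} toy_tuple;

static void
flush(const char *key, const int *tids, int ntids)
{
    printf("add tuple key=%s ntids=%d first_tid=%d\n", key, ntids, tids[0]);
}

int
main(void)
{
    toy_tuple input[] = {
        {"apple", 1}, {"apple", 2}, {"apple", 3},
        {"pear", 4}, {"pear", 5}
    };
    int     ntuples = (int) (sizeof(input) / sizeof(input[0]));
    char    prevkey[16] = "";
    int     tids[MAX_POSTING];
    int     ntids = 0;

    for (int i = 0; i < ntuples; i++)
    {
        if (ntids > 0 &&
            (strcmp(prevkey, input[i].key) != 0 || ntids == MAX_POSTING))
        {
            /* key changed or posting list full: flush what we have */
            flush(prevkey, tids, ntids);
            ntids = 0;
        }
        strcpy(prevkey, input[i].key);
        tids[ntids++] = input[i].tid;
    }
    if (ntids > 0)
        flush(prevkey, tids, ntids);    /* handle the last pending item */
    return 0;
}
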
diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index a7882fd..c492b04 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -62,6 +62,11 @@ typedef struct
int nsplits; /* current number of splits */
SplitPoint *splits; /* all candidate split points for page */
int interval; /* current range of acceptable split points */
+
+ /* fields only valid when the insert also replaces an existing posting tuple */
+ OffsetNumber replaceitemoff;
+ IndexTuple replaceitem;
+ Size replaceitemsz;
} FindSplitData;
static void _bt_recsplitloc(FindSplitData *state,
@@ -129,6 +134,9 @@ _bt_findsplitloc(Relation rel,
OffsetNumber newitemoff,
Size newitemsz,
IndexTuple newitem,
+ OffsetNumber replaceitemoff,
+ Size replaceitemsz,
+ IndexTuple replaceitem,
bool *newitemonleft)
{
BTPageOpaque opaque;
@@ -183,6 +191,10 @@ _bt_findsplitloc(Relation rel,
state.minfirstrightsz = SIZE_MAX;
state.newitemoff = newitemoff;
+ state.replaceitemoff = replaceitemoff;
+ state.replaceitemsz = replaceitemsz;
+ state.replaceitem = replaceitem;
+
/*
* maxsplits should never exceed maxoff because there will be at most as
* many candidate split points as there are points _between_ tuples, once
@@ -207,7 +219,17 @@ _bt_findsplitloc(Relation rel,
Size itemsz;
itemid = PageGetItemId(page, offnum);
- itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
+
+ /* use size of replacing item for calculations */
+ if (offnum == replaceitemoff)
+ {
+ itemsz = replaceitemsz + sizeof(ItemIdData);
+ olddataitemstotal = state.olddataitemstotal = state.olddataitemstotal
+ - MAXALIGN(ItemIdGetLength(itemid))
+ + replaceitemsz;
+ }
+ else
+ itemsz = MAXALIGN(ItemIdGetLength(itemid)) + sizeof(ItemIdData);
/*
* When item offset number is not newitemoff, neither side of the
@@ -466,9 +488,13 @@ _bt_recsplitloc(FindSplitData *state,
&& !newitemonleft);
if (newitemisfirstonright)
+ {
firstrightitemsz = state->newitemsz;
+ }
else
+ {
firstrightitemsz = firstoldonrightsz;
+ }
/* Account for all the old tuples */
leftfree = state->leftspace - olddataitemstoleft;
@@ -492,12 +518,12 @@ _bt_recsplitloc(FindSplitData *state,
* adding a heap TID to the left half's new high key when splitting at the
* leaf level. In practice the new high key will often be smaller and
* will rarely be larger, but conservatively assume the worst case.
+ * Truncation always truncates away any posting list that appears in the
+ * first right tuple, though, so it's safe to subtract that overhead
+ * (while still conservatively assuming that truncation might have to add
+ * back a single heap TID using the pivot tuple heap TID representation).
*/
- if (state->is_leaf)
- leftfree -= (int16) (firstrightitemsz +
- MAXALIGN(sizeof(ItemPointerData)));
- else
- leftfree -= (int16) firstrightitemsz;
+ leftfree -= (int16) firstrightitemsz;
/* account for the new item */
if (newitemonleft)
@@ -1066,13 +1092,20 @@ static inline IndexTuple
_bt_split_lastleft(FindSplitData *state, SplitPoint *split)
{
ItemId itemid;
+ OffsetNumber offset;
if (split->newitemonleft && split->firstoldonright == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page,
- OffsetNumberPrev(split->firstoldonright));
- return (IndexTuple) PageGetItem(state->page, itemid);
+ offset = OffsetNumberPrev(split->firstoldonright);
+ if (offset == state->replaceitemoff)
+ return state->replaceitem;
+ else
+ {
+ itemid = PageGetItemId(state->page, offset);
+ return (IndexTuple) PageGetItem(state->page, itemid);
+ }
}
/*
@@ -1086,6 +1119,11 @@ _bt_split_firstright(FindSplitData *state, SplitPoint *split)
if (!split->newitemonleft && split->firstoldonright == state->newitemoff)
return state->newitem;
- itemid = PageGetItemId(state->page, split->firstoldonright);
- return (IndexTuple) PageGetItem(state->page, itemid);
+ if (split->firstoldonright == state->replaceitemoff)
+ return state->replaceitem;
+ else
+ {
+ itemid = PageGetItemId(state->page, split->firstoldonright);
+ return (IndexTuple) PageGetItem(state->page, itemid);
+ }
}
diff --git a/src/backend/access/nbtree/nbtutils.c b/src/backend/access/nbtree/nbtutils.c
index 9b172c1..c56f5ab 100644
--- a/src/backend/access/nbtree/nbtutils.c
+++ b/src/backend/access/nbtree/nbtutils.c
@@ -111,8 +111,12 @@ _bt_mkscankey(Relation rel, IndexTuple itup)
key->nextkey = false;
key->pivotsearch = false;
key->keysz = Min(indnkeyatts, tupnatts);
- key->scantid = key->heapkeyspace && itup ?
- BTreeTupleGetHeapTID(itup) : NULL;
+
+ if (itup && key->heapkeyspace)
+ key->scantid = BTreeTupleGetHeapTID(itup);
+ else
+ key->scantid = NULL;
+
skey = key->scankeys;
for (i = 0; i < indnkeyatts; i++)
{
@@ -1787,7 +1791,9 @@ _bt_killitems(IndexScanDesc scan)
ItemId iid = PageGetItemId(page, offnum);
IndexTuple ituple = (IndexTuple) PageGetItem(page, iid);
- if (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid))
+ /* No microvacuum for posting tuples */
+ if (!BTreeTupleIsPosting(ituple) &&
+ (ItemPointerEquals(&ituple->t_tid, &kitem->heapTid)))
{
/* found the item */
ItemIdMarkDead(iid);
@@ -2112,6 +2118,7 @@ btbuildphasename(int64 phasenum)
* returning an enlarged tuple to caller when truncation + TOAST compression
* ends up enlarging the final datum.
*/
+
IndexTuple
_bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
BTScanInsert itup_key)
@@ -2124,6 +2131,17 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
ItemPointer pivotheaptid;
Size newsize;
+ elog(DEBUG4, "_bt_truncate left N %d (%u,%u) to (%u,%u), right N %d (%u,%u) to (%u,%u) ",
+ BTreeTupleIsPosting(lastleft) ? BTreeTupleGetNPosting(lastleft) : 0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(lastleft)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(lastleft)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(lastleft)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(lastleft)),
+ BTreeTupleIsPosting(firstright) ? BTreeTupleGetNPosting(firstright) : 0,
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetHeapTID(firstright)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetHeapTID(firstright)),
+ ItemPointerGetBlockNumberNoCheck(BTreeTupleGetMaxTID(firstright)),
+ ItemPointerGetOffsetNumberNoCheck(BTreeTupleGetMaxTID(firstright)));
/*
* We should only ever truncate leaf index tuples. It's never okay to
* truncate a second time.
@@ -2145,6 +2163,16 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pivot = index_truncate_tuple(itupdesc, firstright, keepnatts);
+ if (BTreeTupleIsPosting(firstright))
+ {
+ BTreeTupleClearBtIsPosting(pivot);
+ BTreeTupleSetNAtts(pivot, keepnatts);
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= BTreeTupleGetPostingOffset(firstright);
+ }
+
+ Assert(!BTreeTupleIsPosting(pivot));
+
/*
* If there is a distinguishing key attribute within new pivot tuple,
* there is no need to add an explicit heap TID attribute
@@ -2161,6 +2189,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute to the new pivot tuple.
*/
Assert(natts != nkeyatts);
+ Assert(!BTreeTupleIsPosting(lastleft));
+ Assert(!BTreeTupleIsPosting(firstright));
newsize = IndexTupleSize(pivot) + MAXALIGN(sizeof(ItemPointerData));
tidpivot = palloc0(newsize);
memcpy(tidpivot, pivot, IndexTupleSize(pivot));
@@ -2168,6 +2198,27 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
pfree(pivot);
pivot = tidpivot;
}
+ else if (BTreeTupleIsPosting(firstright))
+ {
+ /*
+ * No truncation was possible, since key attributes are all equal. But
+ * the tuple is a compressed tuple with a posting list, so we still
+ * must truncate it.
+ *
+ * It's necessary to add a heap TID attribute to the new pivot tuple.
+ */
+ newsize = BTreeTupleGetPostingOffset(firstright) +
+ MAXALIGN(sizeof(ItemPointerData));
+ pivot = palloc0(newsize);
+ memcpy(pivot, firstright, BTreeTupleGetPostingOffset(firstright));
+
+ pivot->t_info &= ~INDEX_SIZE_MASK;
+ pivot->t_info |= newsize;
+ BTreeTupleClearBtIsPosting(pivot);
+ BTreeTupleSetAltHeapTID(pivot);
+
+ Assert(!BTreeTupleIsPosting(pivot));
+ }
else
{
/*
@@ -2205,7 +2256,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
pivotheaptid = (ItemPointer) ((char *) pivot + newsize -
sizeof(ItemPointerData));
- ItemPointerCopy(&lastleft->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetMaxTID(lastleft), pivotheaptid);
/*
* Lehman and Yao require that the downlink to the right page, which is to
@@ -2216,9 +2267,12 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* tiebreaker.
*/
#ifndef DEBUG_NO_TRUNCATE
- Assert(ItemPointerCompare(&lastleft->t_tid, &firstright->t_tid) < 0);
- Assert(ItemPointerCompare(pivotheaptid, &lastleft->t_tid) >= 0);
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(BTreeTupleGetMaxTID(lastleft),
+ BTreeTupleGetHeapTID(firstright)) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(lastleft)) >= 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#else
/*
@@ -2231,7 +2285,7 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* attribute values along with lastleft's heap TID value when lastleft's
* TID happens to be greater than firstright's TID.
*/
- ItemPointerCopy(&firstright->t_tid, pivotheaptid);
+ ItemPointerCopy(BTreeTupleGetHeapTID(firstright), pivotheaptid);
/*
* Pivot heap TID should never be fully equal to firstright. Note that
@@ -2240,7 +2294,8 @@ _bt_truncate(Relation rel, IndexTuple lastleft, IndexTuple firstright,
*/
ItemPointerSetOffsetNumber(pivotheaptid,
OffsetNumberPrev(ItemPointerGetOffsetNumber(pivotheaptid)));
- Assert(ItemPointerCompare(pivotheaptid, &firstright->t_tid) < 0);
+ Assert(ItemPointerCompare(pivotheaptid,
+ BTreeTupleGetHeapTID(firstright)) < 0);
#endif
BTreeTupleSetNAtts(pivot, nkeyatts);
@@ -2330,6 +2385,25 @@ _bt_keep_natts(Relation rel, IndexTuple lastleft, IndexTuple firstright,
* leaving excessive amounts of free space on either side of page split.
* Callers can rely on the fact that attributes considered equal here are
* definitely also equal according to _bt_keep_natts.
+ *
+ * To build a posting tuple we need to ensure that all attributes
+ * of both tuples are equal. Use this function to compare them.
+ * TODO: maybe it's worth renaming the function.
+ *
+ * XXX: Obviously we need infrastructure for making sure it is okay to use
+ * this for posting list stuff. For example, non-deterministic collations
+ * cannot use compression, and will not work with what we have now.
+ *
+ * XXX: Even then, we probably also need to worry about TOAST as a special
+ * case. Don't repeat bugs like the amcheck bug that was fixed in commit
+ * eba775345d23d2c999bbb412ae658b6dab36e3e8. As the test case added in that
+ * commit shows, we need to worry about pg_attribute.attstorage changing in
+ * the underlying table due to an ALTER TABLE (and maybe a few other things
+ * like that). In general, the "TOAST input state" of a TOASTable datum isn't
+ * something that we make many guarantees about today, so even with C
+ * collation text we could in theory get different answers from
+ * _bt_keep_natts_fast() and _bt_keep_natts(). This needs to be nailed down
+ * in some way.
*/
int
_bt_keep_natts_fast(Relation rel, IndexTuple lastleft, IndexTuple firstright)
@@ -2415,7 +2489,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* Non-pivot tuples currently never use alternative heap TID
* representation -- even those within heapkeyspace indexes
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) != 0)
+ if (BTreeTupleIsPivot(itup))
return false;
/*
@@ -2470,7 +2544,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* that to decide if the tuple is a pre-v11 tuple.
*/
return tupnatts == 0 ||
- ((itup->t_info & INDEX_ALT_TID_MASK) == 0 &&
+ (!BTreeTupleIsPivot(itup) &&
ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
}
else
@@ -2497,7 +2571,7 @@ _bt_check_natts(Relation rel, bool heapkeyspace, Page page, OffsetNumber offnum)
* heapkeyspace index pivot tuples, regardless of whether or not there are
* non-key attributes.
*/
- if ((itup->t_info & INDEX_ALT_TID_MASK) == 0)
+ if (!BTreeTupleIsPivot(itup))
return false;
/*
@@ -2549,6 +2623,8 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
if (!needheaptidspace && itemsz <= BTMaxItemSizeNoHeapTid(page))
return;
+ /* TODO: correct the error messages for posting tuples */
+
/*
* Internal page insertions cannot fail here, because that would mean that
* an earlier leaf level insertion that should have failed didn't
@@ -2575,3 +2651,79 @@ _bt_check_third_page(Relation rel, Relation heap, bool needheaptidspace,
"or use full text indexing."),
errtableconstraint(heap, RelationGetRelationName(rel))));
}
+
+/*
+ * Given a base tuple that contains the key datum(s), build a posting tuple.
+ *
+ * The base tuple may itself be a posting tuple, but only its key part is
+ * used; all ItemPointers must be passed via ipd.
+ *
+ * If nipd == 1, fall back to building a regular non-posting tuple. This
+ * avoids storage overhead after a posting tuple has been vacuumed down to
+ * a single TID.
+ */
+IndexTuple
+BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd, int nipd)
+{
+ uint32 keysize,
+ newsize = 0;
+ IndexTuple itup;
+
+ /* We only need key part of the tuple */
+ if (BTreeTupleIsPosting(tuple))
+ keysize = BTreeTupleGetPostingOffset(tuple);
+ else
+ keysize = IndexTupleSize(tuple);
+
+ Assert(nipd > 0);
+
+ /* Add space needed for posting list */
+ if (nipd > 1)
+ newsize = SHORTALIGN(keysize) + sizeof(ItemPointerData) * nipd;
+ else
+ newsize = keysize;
+
+ newsize = MAXALIGN(newsize);
+ itup = palloc0(newsize);
+ memcpy(itup, tuple, keysize);
+ itup->t_info &= ~INDEX_SIZE_MASK;
+ itup->t_info |= newsize;
+
+ if (nipd > 1)
+ {
+ /* Form posting tuple, fill posting fields */
+
+ /* Set meta info about the posting list */
+ itup->t_info |= INDEX_ALT_TID_MASK;
+ BTreeSetPostingMeta(itup, nipd, SHORTALIGN(keysize));
+
+ /* sort the list to preserve TID order invariant */
+ qsort((void *) ipd, nipd, sizeof(ItemPointerData),
+ (int (*) (const void *, const void *)) ItemPointerCompare);
+
+ /* Copy posting list into the posting tuple */
+ memcpy(BTreeTupleGetPosting(itup), ipd,
+ sizeof(ItemPointerData) * nipd);
+ }
+ else
+ {
+ /* To finish building of a non-posting tuple, copy TID from ipd */
+ itup->t_info &= ~INDEX_ALT_TID_MASK;
+ ItemPointerCopy(ipd, &itup->t_tid);
+ }
+
+ return itup;
+}
+
+/*
+ * Opposite of BTreeFormPostingTuple.
+ * Returns a regular tuple that contains the key; the TID of the new tuple
+ * is the n-th TID of the original tuple's posting list. The result tuple
+ * is palloc'd in the caller's memory context.
+ */
+IndexTuple
+BTreeGetNthTupleOfPosting(IndexTuple tuple, int n)
+{
+ Assert(BTreeTupleIsPosting(tuple));
+ return BTreeFormPostingTuple(tuple, BTreeTupleGetPostingN(tuple, n), 1);
+}
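
To make the size arithmetic in BTreeFormPostingTuple() easier to check, here is a worked example with simplified SHORTALIGN/MAXALIGN definitions (assuming 8-byte MAXALIGN and a 6-byte ItemPointerData; the real macros live in c.h):

/* Sketch of the posting-tuple size calculation; alignment macros simplified. */
#include <stdio.h>
#include <stddef.h>

#define SHORTALIGN(LEN)  (((size_t) (LEN) + 1) & ~((size_t) 1))
#define MAXALIGN(LEN)    (((size_t) (LEN) + 7) & ~((size_t) 7))
#define ITEM_POINTER_SIZE 6     /* sizeof(ItemPointerData) on typical builds */

static size_t
posting_tuple_size(size_t keysize, int nipd)
{
    size_t newsize;

    if (nipd > 1)
        newsize = SHORTALIGN(keysize) + ITEM_POINTER_SIZE * (size_t) nipd;
    else
        newsize = keysize;      /* single TID: plain non-posting tuple */

    return MAXALIGN(newsize);
}

int
main(void)
{
    /* a 16-byte key part with 3 TIDs: 16 + 18 = 34, rounded up to 40 */
    printf("%zu\n", posting_tuple_size(16, 3));
    printf("%zu\n", posting_tuple_size(16, 1));  /* 16 stays 16 */
    return 0;
}
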
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index dd5315c..538a6bc 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -386,8 +386,8 @@ btree_xlog_vacuum(XLogReaderState *record)
Buffer buffer;
Page page;
BTPageOpaque opaque;
-#ifdef UNUSED
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) XLogRecGetData(record);
+#ifdef UNUSED
/*
* This section of code is thought to be no longer needed, after analysis
@@ -478,14 +478,34 @@ btree_xlog_vacuum(XLogReaderState *record)
if (len > 0)
{
- OffsetNumber *unused;
- OffsetNumber *unend;
+ if (xlrec->nremaining)
+ {
+ OffsetNumber *remainingoffset;
+ IndexTuple remaining;
+ Size itemsz;
+
+ remainingoffset = (OffsetNumber *)
+ (ptr + xlrec->ndeleted * sizeof(OffsetNumber));
+ remaining = (IndexTuple) ((char *) remainingoffset +
+ xlrec->nremaining * sizeof(OffsetNumber));
- unused = (OffsetNumber *) ptr;
- unend = (OffsetNumber *) ((char *) ptr + len);
+ /* Handle posting tuples */
+ for (int i = 0; i < xlrec->nremaining; i++)
+ {
+ PageIndexTupleDelete(page, remainingoffset[i]);
+
+ itemsz = MAXALIGN(IndexTupleSize(remaining));
+
+ if (PageAddItem(page, (Item) remaining, itemsz, remainingoffset[i],
+ false, false) == InvalidOffsetNumber)
+ elog(PANIC, "btree_xlog_vacuum: failed to add remaining item");
+
+ remaining = (IndexTuple) ((char *) remaining + itemsz);
+ }
+ }
- if ((unend - unused) > 0)
- PageIndexMultiDelete(page, unused, unend - unused);
+ if (xlrec->ndeleted)
+ PageIndexMultiDelete(page, (OffsetNumber *) ptr, xlrec->ndeleted);
}
/*
diff --git a/src/backend/access/rmgrdesc/nbtdesc.c b/src/backend/access/rmgrdesc/nbtdesc.c
index a14eb79..e4fa99a 100644
--- a/src/backend/access/rmgrdesc/nbtdesc.c
+++ b/src/backend/access/rmgrdesc/nbtdesc.c
@@ -46,8 +46,10 @@ btree_desc(StringInfo buf, XLogReaderState *record)
{
xl_btree_vacuum *xlrec = (xl_btree_vacuum *) rec;
- appendStringInfo(buf, "lastBlockVacuumed %u",
- xlrec->lastBlockVacuumed);
+ appendStringInfo(buf, "lastBlockVacuumed %u; nremaining %u; ndeleted %u",
+ xlrec->lastBlockVacuumed,
+ xlrec->nremaining,
+ xlrec->ndeleted);
break;
}
case XLOG_BTREE_DELETE:
diff --git a/src/include/access/itup.h b/src/include/access/itup.h
index 744ffb6..b10c0d5 100644
--- a/src/include/access/itup.h
+++ b/src/include/access/itup.h
@@ -141,6 +141,10 @@ typedef IndexAttributeBitMapData * IndexAttributeBitMap;
* On such a page, N tuples could take one MAXALIGN quantum less space than
* estimated here, seemingly allowing one more tuple than estimated here.
* But such a page always has at least MAXALIGN special space, so we're safe.
+ *
+ * Note: btree leaf pages may contain posting tuples, which store duplicates
+ * in a more space-efficient way, so such pages may hold more logical tuples.
+ * Use MaxPostingIndexTuplesPerPage instead.
*/
#define MaxIndexTuplesPerPage \
((int) ((BLCKSZ - SizeOfPageHeaderData) / \
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 83e0e6c..3064afb 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -234,8 +234,7 @@ typedef struct BTMetaPageData
* t_tid | t_info | key values | INCLUDE columns, if any
*
* t_tid points to the heap TID, which is a tiebreaker key column as of
- * BTREE_VERSION 4. Currently, the INDEX_ALT_TID_MASK status bit is never
- * set for non-pivot tuples.
+ * BTREE_VERSION 4.
*
* All other types of index tuples ("pivot" tuples) only have key columns,
* since pivot tuples only exist to represent how the key space is
@@ -252,6 +251,39 @@ typedef struct BTMetaPageData
* omitted rather than truncated, since its representation is different to
* the non-pivot representation.)
*
+ * Non-pivot posting tuple format:
+ * t_tid | t_info | key values | INCLUDE columns, if any | posting_list[]
+ *
+ * In order to store duplicate keys more efficiently, we use a special
+ * tuple format: posting tuples. posting_list is an array of
+ * ItemPointerData.
+ *
+ * This type of compression never applies to system indexes, unique indexes
+ * or indexes with INCLUDEd columns.
+ *
+ * To distinguish posting tuples, we use the INDEX_ALT_TID_MASK flag in
+ * t_info and the BT_IS_POSTING flag in t_tid.
+ * These flags redefine the content of the posting tuple's t_tid:
+ * - t_tid.ip_blkid contains the offset of the posting list.
+ * - the t_tid offset field contains the number of posting items in this
+ *   tuple.
+ *
+ * The 12 least significant offset bits from t_tid are used to represent
+ * the number of posting items in posting tuples, leaving 4 status
+ * bits (BT_RESERVED_OFFSET_MASK bits), 3 of which are reserved for
+ * future use.
+ * BT_N_POSTING_OFFSET_MASK is large enough to store any number of posting
+ * items, which is constrained by BTMaxItemSize.
+ *
+ * If a page contains so many duplicates that they do not fit into one
+ * posting tuple (bounded by BTMaxItemSize), the page may contain several
+ * posting tuples with the same key.
+ * The page can also contain both posting and non-posting tuples with the
+ * same key.
+ * Currently, posting tuples always contain at least two TIDs in the posting
+ * list.
+ *
+ * Posting tuples always have the same number of attributes as the index has
+ * generally.
+ *
* Pivot tuple format:
*
* t_tid | t_info | key values | [heap TID]
@@ -281,23 +313,144 @@ typedef struct BTMetaPageData
* bits (BT_RESERVED_OFFSET_MASK bits), 3 of which that are reserved for
* future use. BT_N_KEYS_OFFSET_MASK should be large enough to store any
* number of columns/attributes <= INDEX_MAX_KEYS.
+ * BT_IS_POSTING bit must be unset for pivot tuples, since we use it
+ * to distinguish posting tuples from pivot tuples.
*
* Note well: The macros that deal with the number of attributes in tuples
- * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple,
- * and that a tuple without INDEX_ALT_TID_MASK set must be a non-pivot
- * tuple (or must have the same number of attributes as the index has
- * generally in the case of !heapkeyspace indexes). They will need to be
- * updated if non-pivot tuples ever get taught to use INDEX_ALT_TID_MASK
- * for something else.
+ * assume that a tuple with INDEX_ALT_TID_MASK set must be a pivot tuple or
+ * non-pivot posting tuple, and that a tuple without INDEX_ALT_TID_MASK set
+ * must be a non-pivot tuple (or must have the same number of attributes as
+ * the index has generally in the case of !heapkeyspace indexes).
*/
#define INDEX_ALT_TID_MASK INDEX_AM_RESERVED_BIT
/* Item pointer offset bits */
#define BT_RESERVED_OFFSET_MASK 0xF000
#define BT_N_KEYS_OFFSET_MASK 0x0FFF
+#define BT_N_POSTING_OFFSET_MASK 0x0FFF
#define BT_HEAP_TID_ATTR 0x1000
+#define BT_IS_POSTING 0x2000
+
+#define BTreeTupleIsPosting(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0))\
+ )
+
+#define BTreeTupleIsPivot(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0))\
+ )
-/* Get/set downlink block number */
+/*
+ * MaxPostingIndexTuplesPerPage is an upper bound on the number of tuples
+ * that can fit on one btree leaf page.
+ *
+ * Btree leaf pages may contain posting tuples, which store duplicates
+ * in a more space-efficient way, so MaxPostingIndexTuplesPerPage is larger than
+ * MaxIndexTuplesPerPage.
+ *
+ * Each leaf page must contain at least three items, so estimate it as
+ * if we have three posting tuples with minimal size keys.
+ */
+#define MaxPostingIndexTuplesPerPage \
+ ((int) ((BLCKSZ - SizeOfPageHeaderData - \
+ 3*((MAXALIGN(sizeof(IndexTupleData) + 1) + sizeof(ItemIdData))) )) / \
+ (sizeof(ItemPointerData)))
+
+/*
+ * Btree-private state needed to build posting tuples.
+ * ipd is a posting list - an array of ItemPointerData.
+ *
+ * While iterating over tuples during index build, or while applying
+ * compression to a single page, we remember the previous tuple in itupprev
+ * and compare the next one with it. If the tuples are equal, their TIDs are
+ * accumulated in the posting list. ntuples is the current size of the
+ * posting list.
+ *
+ * Use maxitemsize and maxpostingsize to ensure that resulting posting tuple
+ * will satisfy BTMaxItemSize.
+ */
+typedef struct BTCompressState
+{
+ Size maxitemsize;
+ Size maxpostingsize;
+ IndexTuple itupprev;
+ int ntuples;
+ ItemPointerData *ipd;
+} BTCompressState;
+
+/* macros to work with posting tuples *BEGIN* */
+#define BTreeTupleSetBtIsPosting(itup) \
+ do { \
+ Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+ Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_IS_POSTING); \
+ } while(0)
+
+#define BTreeTupleClearBtIsPosting(itup) \
+ do { \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & ~BT_IS_POSTING); \
+ } while(0)
+
+#define BTreeTupleGetNPosting(itup) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_POSTING_OFFSET_MASK \
+ )
+
+#define BTreeTupleSetNPosting(itup, n) \
+ do { \
+ ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_POSTING_OFFSET_MASK); \
+ BTreeTupleSetBtIsPosting(itup); \
+ } while(0)
+
+/*
+ * If the tuple is a posting tuple, t_tid.ip_blkid contains the offset of the
+ * posting list. The caller is responsible for checking BTreeTupleIsPosting()
+ * to ensure that it gets what it expects.
+ */
+#define BTreeTupleGetPostingOffset(itup) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid)) \
+ )
+#define BTreeTupleSetPostingOffset(itup, offset) \
+ ( \
+ AssertMacro(BTreeTupleIsPosting(itup)), \
+ ItemPointerSetBlockNumber(&((itup)->t_tid), (offset)) \
+ )
+#define BTreeSetPostingMeta(itup, nposting, off) \
+ do { \
+ BTreeTupleSetNPosting(itup, nposting); \
+ BTreeTupleSetPostingOffset(itup, off); \
+ } while(0)
+
+#define BTreeTupleGetPosting(itup) \
+ (ItemPointerData*) ((char*)(itup) + BTreeTupleGetPostingOffset(itup))
+#define BTreeTupleGetPostingN(itup,n) \
+ (ItemPointerData*) (BTreeTupleGetPosting(itup) + (n))
+
+/*
+ * Posting tuples always contain more than one TID. The minimum TID can be
+ * accessed using BTreeTupleGetHeapTID(). The maximum is accessed using
+ * BTreeTupleGetMaxTID().
+ */
+#define BTreeTupleGetMaxTID(itup) \
+ ( \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING))) ? \
+ ( \
+ (ItemPointer) (BTreeTupleGetPosting(itup) + (BTreeTupleGetNPosting(itup)-1)) \
+ ) \
+ : \
+ (ItemPointer) &((itup)->t_tid) \
+ )
+/* macros to work with posting tuples *END* */
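
Since all of the posting metadata is squeezed into the 16-bit offset field of t_tid, here is a small self-contained illustration of that encoding using the mask values defined above (the helper functions are illustrative, not part of the patch):

/* Illustration of packing posting metadata into a 16-bit offset field,
 * using the mask values defined above; helper names are illustrative. */
#include <stdio.h>
#include <stdint.h>

#define BT_RESERVED_OFFSET_MASK   0xF000
#define BT_N_POSTING_OFFSET_MASK  0x0FFF
#define BT_IS_POSTING             0x2000

static uint16_t
encode_nposting(int nposting)
{
    /* low 12 bits: item count; BT_IS_POSTING marks the tuple as posting */
    return (uint16_t) ((nposting & BT_N_POSTING_OFFSET_MASK) | BT_IS_POSTING);
}

static int
decode_nposting(uint16_t offset_field)
{
    return offset_field & BT_N_POSTING_OFFSET_MASK;
}

static int
is_posting(uint16_t offset_field)
{
    return (offset_field & BT_IS_POSTING) != 0;
}

int
main(void)
{
    uint16_t field = encode_nposting(37);

    /* prints: is_posting=1 nposting=37 */
    printf("is_posting=%d nposting=%d\n", is_posting(field),
           decode_nposting(field));
    return 0;
}
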
+
+/* Get/set downlink block number */
#define BTreeInnerTupleGetDownLink(itup) \
ItemPointerGetBlockNumberNoCheck(&((itup)->t_tid))
#define BTreeInnerTupleSetDownLink(itup, blkno) \
@@ -326,7 +479,8 @@ typedef struct BTMetaPageData
*/
#define BTreeTupleGetNAtts(itup, rel) \
( \
- (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ ((itup)->t_info & INDEX_ALT_TID_MASK && \
+ ((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) == 0)) ? \
( \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_N_KEYS_OFFSET_MASK \
) \
@@ -335,6 +489,7 @@ typedef struct BTMetaPageData
)
#define BTreeTupleSetNAtts(itup, n) \
do { \
+ Assert(!BTreeTupleIsPosting(itup)); \
(itup)->t_info |= INDEX_ALT_TID_MASK; \
ItemPointerSetOffsetNumber(&(itup)->t_tid, (n) & BT_N_KEYS_OFFSET_MASK); \
} while(0)
@@ -342,6 +497,8 @@ typedef struct BTMetaPageData
/*
* Get tiebreaker heap TID attribute, if any. Macro works with both pivot
* and non-pivot tuples, despite differences in how heap TID is represented.
+ *
+ * For non-pivot posting tuples, this returns the first TID from the posting list.
*/
#define BTreeTupleGetHeapTID(itup) \
( \
@@ -351,7 +508,10 @@ typedef struct BTMetaPageData
(ItemPointer) (((char *) (itup) + IndexTupleSize(itup)) - \
sizeof(ItemPointerData)) \
) \
- : (itup)->t_info & INDEX_ALT_TID_MASK ? NULL : (ItemPointer) &((itup)->t_tid) \
+ : (itup)->t_info & INDEX_ALT_TID_MASK ? \
+ (((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0) ? \
+ (ItemPointer) BTreeTupleGetPosting(itup) : NULL) \
+ : (ItemPointer) &((itup)->t_tid) \
)
/*
* Set the heap TID attribute for a tuple that uses the INDEX_ALT_TID_MASK
@@ -360,6 +520,7 @@ typedef struct BTMetaPageData
#define BTreeTupleSetAltHeapTID(itup) \
do { \
Assert((itup)->t_info & INDEX_ALT_TID_MASK); \
+ Assert(!((ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) & BT_IS_POSTING) != 0)); \
ItemPointerSetOffsetNumber(&(itup)->t_tid, \
ItemPointerGetOffsetNumberNoCheck(&(itup)->t_tid) | BT_HEAP_TID_ATTR); \
} while(0)
@@ -501,6 +662,12 @@ typedef struct BTInsertStateData
Buffer buf;
/*
+ * If _bt_binsrch_insert() found the insertion point inside an existing
+ * posting list, the position within that list is saved here.
+ */
+ int in_posting_offset;
+
+ /*
* Cache of bounds within the current buffer. Only used for insertions
* where _bt_check_unique is called. See _bt_binsrch_insert and
* _bt_findinsertloc for details.
@@ -567,6 +734,8 @@ typedef struct BTScanPosData
* location in the associated tuple storage workspace.
*/
int nextTupleOffset;
+ /* prevTupleOffset is for posting list handling */
+ int prevTupleOffset;
/*
* The items array is always ordered in index order (ie, increasing
@@ -579,7 +748,7 @@ typedef struct BTScanPosData
int lastItem; /* last valid index in items[] */
int itemIndex; /* current index in items[] */
- BTScanPosItem items[MaxIndexTuplesPerPage]; /* MUST BE LAST */
+ BTScanPosItem items[MaxPostingIndexTuplesPerPage]; /* MUST BE LAST */
} BTScanPosData;
typedef BTScanPosData *BTScanPos;
@@ -739,6 +908,8 @@ extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
*/
extern OffsetNumber _bt_findsplitloc(Relation rel, Page page,
OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
+ OffsetNumber replaceitemoff, Size replaceitemsz,
+ IndexTuple replaceitem,
bool *newitemonleft);
/*
@@ -763,6 +934,8 @@ extern void _bt_delitems_delete(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems, Relation heapRel);
extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
OffsetNumber *itemnos, int nitems,
+ OffsetNumber *remainingoffset,
+ IndexTuple *remaining, int nremaining,
BlockNumber lastBlockVacuumed);
extern int _bt_pagedel(Relation rel, Buffer buf);
@@ -775,6 +948,8 @@ extern Buffer _bt_moveright(Relation rel, BTScanInsert key, Buffer buf,
bool forupdate, BTStack stack, int access, Snapshot snapshot);
extern OffsetNumber _bt_binsrch_insert(Relation rel, BTInsertState insertstate);
extern int32 _bt_compare(Relation rel, BTScanInsert key, Page page, OffsetNumber offnum);
+extern int32 _bt_compare_posting(Relation rel, BTScanInsert key, Page page,
+ OffsetNumber offnum, int *in_posting_offset);
extern bool _bt_first(IndexScanDesc scan, ScanDirection dir);
extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);
extern Buffer _bt_get_endpoint(Relation rel, uint32 level, bool rightmost,
@@ -813,6 +988,9 @@ extern bool _bt_check_natts(Relation rel, bool heapkeyspace, Page page,
OffsetNumber offnum);
extern void _bt_check_third_page(Relation rel, Relation heap,
bool needheaptidspace, Page page, IndexTuple newtup);
+extern IndexTuple BTreeFormPostingTuple(IndexTuple tuple, ItemPointerData *ipd,
+ int nipd);
+extern IndexTuple BTreeGetNthTupleOfPosting(IndexTuple tuple, int n);
/*
* prototypes for functions in nbtvalidate.c
@@ -825,5 +1003,7 @@ extern bool btvalidate(Oid opclassoid);
extern IndexBuildResult *btbuild(Relation heap, Relation index,
struct IndexInfo *indexInfo);
extern void _bt_parallel_build_main(dsm_segment *seg, shm_toc *toc);
+extern void _bt_add_posting_item(BTCompressState *compressState,
+ IndexTuple itup);
#endif /* NBTREE_H */
diff --git a/src/include/access/nbtxlog.h b/src/include/access/nbtxlog.h
index afa614d..4b615e0 100644
--- a/src/include/access/nbtxlog.h
+++ b/src/include/access/nbtxlog.h
@@ -173,10 +173,19 @@ typedef struct xl_btree_vacuum
{
BlockNumber lastBlockVacuumed;
- /* TARGET OFFSET NUMBERS FOLLOW */
+ /*
+ * These fields let us find the beginning of the remaining tuples (rebuilt
+ * posting tuples), which follow the arrays of offset numbers.
+ */
+ uint32 nremaining;
+ uint32 ndeleted;
+
+ /* REMAINING OFFSET NUMBERS FOLLOW (nremaining values) */
+ /* REMAINING TUPLES TO INSERT FOLLOW (if nremaining > 0) */
+ /* TARGET OFFSET NUMBERS FOLLOW (if any) */
} xl_btree_vacuum;
-#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, lastBlockVacuumed) + sizeof(BlockNumber))
+#define SizeOfBtreeVacuum (offsetof(xl_btree_vacuum, ndeleted) + sizeof(BlockNumber))
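
For completeness, a small sketch of the vacuum record payload as btree_xlog_vacuum() reads it (deleted offsets at the start of the data, then the remaining offsets, then the rebuilt posting tuples). The types below are simplified stand-ins, not the real WAL structs:

/* Sketch of the vacuum record payload as btree_xlog_vacuum() reads it;
 * simplified stand-in types, not the real WAL structs. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint16_t OffsetNumber;

int
main(void)
{
    uint32_t     ndeleted = 2,
                 nremaining = 2;
    OffsetNumber deloffs[2] = {2, 3};
    OffsetNumber remoffs[2] = {5, 9};
    uint32_t     tuples[2] = {0xAAAA, 0xBBBB};  /* dummy "index tuples" */
    _Alignas(uint32_t) char payload[64];
    char        *ptr = payload;

    /* writer side: the three arrays are packed back to back */
    memcpy(ptr, deloffs, sizeof(deloffs));
    memcpy(ptr + sizeof(deloffs), remoffs, sizeof(remoffs));
    memcpy(ptr + sizeof(deloffs) + sizeof(remoffs), tuples, sizeof(tuples));

    /* reader side, mirroring the replay code's pointer arithmetic */
    OffsetNumber *deleted = (OffsetNumber *) ptr;
    OffsetNumber *remainingoffset = (OffsetNumber *)
        (ptr + ndeleted * sizeof(OffsetNumber));
    uint32_t     *remaining = (uint32_t *) ((char *) remainingoffset +
                                            nremaining * sizeof(OffsetNumber));

    for (uint32_t i = 0; i < nremaining; i++)
        printf("replace item at offset %u with rebuilt tuple %#x\n",
               (unsigned) remainingoffset[i], (unsigned) remaining[i]);
    for (uint32_t i = 0; i < ndeleted; i++)
        printf("delete item at offset %u\n", (unsigned) deleted[i]);
    return 0;
}
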
/*
* This is what we need to know about marking an empty branch for deletion.