commit 77422f67e58947bd20bee1c4977817512c928cbb
Author: mithun <mithun@localhost.localdomain>
Date:   Fri Dec 16 11:29:18 2016 +0530

    cache meta page

diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 6806e32..9161e2e 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -541,7 +541,7 @@ loop_top:
 		bool		split_cleanup = false;
 
 		/* Get address of bucket's start page */
-		bucket_blkno = BUCKET_TO_BLKNO(&local_metapage, cur_bucket);
+		bucket_blkno = BUCKET_TO_BLKNO(local_metapage.hashm_spares, cur_bucket);
 
 		blkno = bucket_blkno;
 
diff --git a/src/backend/access/hash/hashinsert.c b/src/backend/access/hash/hashinsert.c
index 572146a..886ab2e 100644
--- a/src/backend/access/hash/hashinsert.c
+++ b/src/backend/access/hash/hashinsert.c
@@ -31,19 +31,16 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	Buffer		buf = InvalidBuffer;
 	Buffer		bucket_buf;
 	Buffer		metabuf;
+	HashMetaCache metac;
 	HashMetaPage metap;
 	BlockNumber blkno;
-	BlockNumber oldblkno;
-	bool		retry;
 	Page		page;
 	HashPageOpaque pageopaque;
 	Size		itemsz;
 	bool		do_expand;
 	uint32		hashkey;
 	Bucket		bucket;
-	uint32		maxbucket;
-	uint32		highmask;
-	uint32		lowmask;
+	uint32		bufcount;
 
 	/*
 	 * Get the hash key for the item (it's stored in the index tuple itself).
@@ -54,28 +51,34 @@ _hash_doinsert(Relation rel, IndexTuple itup)
 	itemsz = IndexTupleDSize(*itup);
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
-
 restart_insert:
-	/* Read the metapage */
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	metap = HashPageGetMeta(BufferGetPage(metabuf));
+	metabuf = InvalidBuffer;
+	metap = NULL;
 
-	/*
-	 * Check whether the item can fit on a hash page at all. (Eventually, we
-	 * ought to try to apply TOAST methods if not.)  Note that at this point,
-	 * itemsz doesn't include the ItemId.
-	 *
-	 * XXX this is useless code if we are only storing hash keys.
-	 */
-	if (itemsz > HashMaxItemSize((Page) metap))
-		ereport(ERROR,
-				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-				 errmsg("index row size %zu exceeds hash maximum %zu",
-						itemsz, HashMaxItemSize((Page) metap)),
-			errhint("Values larger than a buffer page cannot be indexed.")));
+	if (rel->rd_amcache != NULL)
+	{
+		metac = (HashMetaCache) rel->rd_amcache;
+	}
+	else
+	{
+		/* Read the metapage */
+		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+		page = BufferGetPage(metabuf);
+		metap = HashPageGetMeta(page);
+
+		/* Cache the metapage data for next time. */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(HashMetaDataCache));
+		metac = (HashMetaCache)rel->rd_amcache;
+		metac->hmc_maxbucket = metap->hashm_maxbucket;
+		metac->hmc_highmask = metap->hashm_highmask;
+		metac->hmc_lowmask = metap->hashm_lowmask;
+		memcpy(metac->hmc_spares, metap->hashm_spares,
+			   sizeof(uint32)*HASH_MAX_SPLITPOINTS);
 
-	oldblkno = InvalidBlockNumber;
-	retry = false;
+		/* Release metapage lock, but keep pin. */
+		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+	}
 
 	/*
 	 * Loop until we get a lock on the correct target bucket.
@@ -86,54 +89,73 @@ restart_insert:
 		 * Compute the target bucket number, and convert to block number.
 		 */
 		bucket = _hash_hashkey2bucket(hashkey,
-									  metap->hashm_maxbucket,
-									  metap->hashm_highmask,
-									  metap->hashm_lowmask);
+									  metac->hmc_maxbucket,
+									  metac->hmc_highmask,
+									  metac->hmc_lowmask);
 
-		blkno = BUCKET_TO_BLKNO(metap, bucket);
+		blkno = BUCKET_TO_BLKNO(metac->hmc_spares, bucket);
+
+		/* Fetch the primary bucket page for the bucket */
+		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
+		page = BufferGetPage(buf);
+		pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		Assert(pageopaque->hasho_bucket == bucket);
 
 		/*
-		 * Copy bucket mapping info now; refer the comment in
-		 * _hash_expandtable where we copy this information before calling
-		 * _hash_splitbucket to see why this is okay.
+		 * Check whether this bucket was split after we cached the hash meta
+		 * data.  To do that, check whether the cached maxbucket number is
+		 * greater than or equal to the maxbucket number stored in the bucket
+		 * page, which was set to the then-current maxbucket number when the
+		 * bucket page was last split.  In a pre-upgrade index, the
+		 * hasho_prevblkno of an old bucket page is InvalidBlockNumber, and
+		 * the maximum value hashm_maxbucket can currently take is one less
+		 * than InvalidBlockNumber (see _hash_expandtable), so an explicit
+		 * check for InvalidBlockNumber in hasho_prevblkno tells whether the
+		 * current bucket has been split after caching the hash meta data.
 		 */
-		maxbucket = metap->hashm_maxbucket;
-		highmask = metap->hashm_highmask;
-		lowmask = metap->hashm_lowmask;
+		if (pageopaque->hasho_prevblkno == InvalidBlockNumber ||
+			pageopaque->hasho_prevblkno <=  metac->hmc_maxbucket)
+		{
+			/* Ok now we have the right bucket proceed to search in it. */
+			break;
+		}
 
-		/* Release metapage lock, but keep pin. */
-		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+		/* First drop any locks held on bucket buffers. */
+		_hash_relbuf(rel, buf);
 
-		/*
-		 * If the previous iteration of this loop locked the primary page of
-		 * what is still the correct target bucket, we are done.  Otherwise,
-		 * drop any old lock before acquiring the new one.
-		 */
-		if (retry)
+		/* The cached meta data is stale; refresh it and retry. */
+		if (BufferIsInvalid(metabuf))
 		{
-			if (oldblkno == blkno)
-				break;
-			_hash_relbuf(rel, buf);
+			metabuf =
+				_hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+			metap = HashPageGetMeta(BufferGetPage(metabuf));
 		}
+		else
+			_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
 
-		/* Fetch and lock the primary bucket page for the target bucket */
-		buf = _hash_getbuf(rel, blkno, HASH_WRITE, LH_BUCKET_PAGE);
+		metac = (HashMetaCache)rel->rd_amcache;
+		metac->hmc_maxbucket = metap->hashm_maxbucket;
+		metac->hmc_highmask = metap->hashm_highmask;
+		metac->hmc_lowmask = metap->hashm_lowmask;
+		memcpy(metac->hmc_spares, metap->hashm_spares,
+			   sizeof(uint32)*HASH_MAX_SPLITPOINTS);
 
-		/*
-		 * Reacquire metapage lock and check that no bucket split has taken
-		 * place while we were awaiting the bucket lock.
-		 */
-		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
-		oldblkno = blkno;
-		retry = true;
+		/* Release Meta page buffer lock, but keep pin. */
+		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
 	/* remember the primary bucket buffer to release the pin on it at end. */
 	bucket_buf = buf;
 
-	page = BufferGetPage(buf);
-	pageopaque = (HashPageOpaque) PageGetSpecialPointer(page);
-	Assert(pageopaque->hasho_bucket == bucket);
+	/*
+	 * We need the metabuf below in all cases.  If we have not read the meta
+	 * page yet, read it once in HASH_NOLOCK mode and hold the pin.
+	 */
+	if (BufferIsInvalid(metabuf))
+	{
+		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_NOLOCK, LH_META_PAGE);
+		metap = HashPageGetMeta(BufferGetPage(metabuf));
+	}
 
 	/*
 	 * If this bucket is in the process of being split, try to finish the
@@ -150,7 +172,8 @@ restart_insert:
 		_hash_chgbufaccess(rel, buf, HASH_READ, HASH_NOLOCK);
 
 		_hash_finish_split(rel, metabuf, buf, pageopaque->hasho_bucket,
-						   maxbucket, highmask, lowmask);
+						   metac->hmc_maxbucket, metac->hmc_highmask,
+						   metac->hmc_lowmask);
 
 		/* release the pin on old and meta buffer.  retry for insert. */
 		_hash_dropbuf(rel, buf);
diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c
index 44332e7..ba35402 100644
--- a/src/backend/access/hash/hashpage.c
+++ b/src/backend/access/hash/hashpage.c
@@ -475,10 +475,17 @@ _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum)
 		/* Allow interrupts, in case N is huge */
 		CHECK_FOR_INTERRUPTS();
 
-		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum);
+		buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap->hashm_spares, i),
+							  forkNum);
 		pg = BufferGetPage(buf);
 		pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg);
-		pageopaque->hasho_prevblkno = InvalidBlockNumber;
+
+		/*
+		 * Set hasho_prevblkno of the bucket page to the latest maxbucket
+		 * number, to indicate that the bucket has been initialized and that
+		 * any older cached HashMetaCache must be rebuilt.
+		 */
+		pageopaque->hasho_prevblkno = metap->hashm_maxbucket;
 		pageopaque->hasho_nextblkno = InvalidBlockNumber;
 		pageopaque->hasho_bucket = i;
 		pageopaque->hasho_flag = LH_BUCKET_PAGE;
@@ -594,7 +601,7 @@ restart_expand:
 
 	old_bucket = (new_bucket & metap->hashm_lowmask);
 
-	start_oblkno = BUCKET_TO_BLKNO(metap, old_bucket);
+	start_oblkno = BUCKET_TO_BLKNO(metap->hashm_spares, old_bucket);
 
 	buf_oblkno = _hash_getbuf_with_condlock_cleanup(rel, start_oblkno, LH_BUCKET_PAGE);
 	if (!buf_oblkno)
@@ -682,7 +689,7 @@ restart_expand:
 	 * the current value of hashm_spares[hashm_ovflpoint] correctly shows
 	 * where we are going to put a new splitpoint's worth of buckets.
 	 */
-	start_nblkno = BUCKET_TO_BLKNO(metap, new_bucket);
+	start_nblkno = BUCKET_TO_BLKNO(metap->hashm_spares, new_bucket);
 
 	/*
 	 * If the split point is increasing (hashm_maxbucket's log base 2
@@ -886,6 +893,12 @@ _hash_splitbucket(Relation rel,
 	 */
 	oopaque->hasho_flag |= LH_BUCKET_BEING_SPLIT;
 
+	/*
+	 * Set hasho_prevblkno of the bucket page to the latest maxbucket number
+	 * to indicate the bucket has been split and any older HashMetaCache must
+	 * be rebuilt.  The same is done below for the new bucket page.
+	 */
+	oopaque->hasho_prevblkno = maxbucket;
 	npage = BufferGetPage(nbuf);
 
 	/*
@@ -893,7 +906,7 @@ _hash_splitbucket(Relation rel,
 	 * split is in progress.
 	 */
 	nopaque = (HashPageOpaque) PageGetSpecialPointer(npage);
-	nopaque->hasho_prevblkno = InvalidBlockNumber;
+	nopaque->hasho_prevblkno = maxbucket;
 	nopaque->hasho_nextblkno = InvalidBlockNumber;
 	nopaque->hasho_bucket = nbucket;
 	nopaque->hasho_flag = LH_BUCKET_PAGE | LH_BUCKET_BEING_POPULATED;
diff --git a/src/backend/access/hash/hashsearch.c b/src/backend/access/hash/hashsearch.c
index 8d43b38..2b6cf31 100644
--- a/src/backend/access/hash/hashsearch.c
+++ b/src/backend/access/hash/hashsearch.c
@@ -152,6 +152,11 @@ _hash_readprev(IndexScanDesc scan,
 		_hash_relbuf(rel, *bufp);
 
 	*bufp = InvalidBuffer;
+
+	/* If it is a bucket page, there will not be a prevblkno. */
+	if ((*opaquep)->hasho_flag & LH_BUCKET_PAGE)
+		return;
+
 	/* check for interrupts while we're not holding any buffer lock */
 	CHECK_FOR_INTERRUPTS();
 	if (BlockNumberIsValid(blkno))
@@ -216,13 +221,11 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 	uint32		hashkey;
 	Bucket		bucket;
 	BlockNumber blkno;
-	BlockNumber oldblkno = InvalidBuffer;
-	bool		retry = false;
 	Buffer		buf;
-	Buffer		metabuf;
+	Buffer		metabuf = InvalidBuffer;
 	Page		page;
 	HashPageOpaque opaque;
-	HashMetaPage metap;
+	HashMetaCache metac;
 	IndexTuple	itup;
 	ItemPointer current;
 	OffsetNumber offnum;
@@ -277,59 +280,102 @@ _hash_first(IndexScanDesc scan, ScanDirection dir)
 
 	so->hashso_sk_hash = hashkey;
 
-	/* Read the metapage */
-	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
-	page = BufferGetPage(metabuf);
-	metap = HashPageGetMeta(page);
+	if (rel->rd_amcache != NULL)
+	{
+		metac = (HashMetaCache) rel->rd_amcache;
+	}
+	else
+	{
+		HashMetaPage	metap;
+
+		/* Read the metapage */
+		metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+		page = BufferGetPage(metabuf);
+		metap = HashPageGetMeta(page);
+
+		/* Cache the metapage data for next time. */
+		rel->rd_amcache = MemoryContextAlloc(rel->rd_indexcxt,
+											 sizeof(HashMetaDataCache));
+		metac = (HashMetaCache)rel->rd_amcache;
+		metac->hmc_maxbucket = metap->hashm_maxbucket;
+		metac->hmc_highmask = metap->hashm_highmask;
+		metac->hmc_lowmask = metap->hashm_lowmask;
+		memcpy(metac->hmc_spares, metap->hashm_spares,
+			   sizeof(uint32)*HASH_MAX_SPLITPOINTS);
+
+		/* Release metapage lock, but keep pin. */
+		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+	}
 
 	/*
 	 * Loop until we get a lock on the correct target bucket.
 	 */
 	for (;;)
 	{
+		HashMetaPage	metap;
+
 		/*
 		 * Compute the target bucket number, and convert to block number.
 		 */
 		bucket = _hash_hashkey2bucket(hashkey,
-									  metap->hashm_maxbucket,
-									  metap->hashm_highmask,
-									  metap->hashm_lowmask);
+									  metac->hmc_maxbucket,
+									  metac->hmc_highmask,
+									  metac->hmc_lowmask);
 
-		blkno = BUCKET_TO_BLKNO(metap, bucket);
+		blkno = BUCKET_TO_BLKNO(metac->hmc_spares, bucket);
 
-		/* Release metapage lock, but keep pin. */
-		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
+		/* Fetch the primary bucket page for the bucket */
+		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+		page = BufferGetPage(buf);
+		opaque = (HashPageOpaque) PageGetSpecialPointer(page);
+		Assert(opaque->hasho_bucket == bucket);
 
 		/*
-		 * If the previous iteration of this loop locked what is still the
-		 * correct target bucket, we are done.  Otherwise, drop any old lock
-		 * and lock what now appears to be the correct bucket.
+		 * Check whether this bucket was split after we cached the hash meta
+		 * data.  To do that, check whether the cached maxbucket number is
+		 * greater than or equal to the maxbucket number stored in the bucket
+		 * page, which was set to the then-current maxbucket number when the
+		 * bucket page was last split.  In a pre-upgrade index, the
+		 * hasho_prevblkno of an old bucket page is InvalidBlockNumber, and
+		 * the maximum value hashm_maxbucket can currently take is one less
+		 * than InvalidBlockNumber (see _hash_expandtable), so an explicit
+		 * check for InvalidBlockNumber in hasho_prevblkno tells whether the
+		 * current bucket has been split after caching the hash meta data.
 		 */
-		if (retry)
+		if (opaque->hasho_prevblkno == InvalidBlockNumber ||
+			opaque->hasho_prevblkno <=  metac->hmc_maxbucket)
 		{
-			if (oldblkno == blkno)
-				break;
-			_hash_relbuf(rel, buf);
+			/* Ok now we have the right bucket proceed to search in it. */
+			break;
 		}
 
-		/* Fetch the primary bucket page for the bucket */
-		buf = _hash_getbuf(rel, blkno, HASH_READ, LH_BUCKET_PAGE);
+		/* First drop any locks held on bucket buffers. */
+		_hash_relbuf(rel, buf);
 
-		/*
-		 * Reacquire metapage lock and check that no bucket split has taken
-		 * place while we were awaiting the bucket lock.
-		 */
-		_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
-		oldblkno = blkno;
-		retry = true;
+		/* The cached meta data is stale; refresh it and retry. */
+		if (BufferIsInvalid(metabuf))
+		{
+			metabuf =
+				_hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
+			metap = HashPageGetMeta(BufferGetPage(metabuf));
+		}
+		else
+			_hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_READ);
+
+		metac = (HashMetaCache)rel->rd_amcache;
+		metac->hmc_maxbucket = metap->hashm_maxbucket;
+		metac->hmc_highmask = metap->hashm_highmask;
+		metac->hmc_lowmask = metap->hashm_lowmask;
+		memcpy(metac->hmc_spares, metap->hashm_spares,
+			   sizeof(uint32)*HASH_MAX_SPLITPOINTS);
+
+		/* Release Meta page buffer lock, but keep pin. */
+		_hash_chgbufaccess(rel, metabuf, HASH_READ, HASH_NOLOCK);
 	}
 
 	/* done with the metapage */
-	_hash_dropbuf(rel, metabuf);
-
-	page = BufferGetPage(buf);
-	opaque = (HashPageOpaque) PageGetSpecialPointer(page);
-	Assert(opaque->hasho_bucket == bucket);
+	if (!BufferIsInvalid(metabuf))
+		_hash_dropbuf(rel, metabuf);
 
 	so->hashso_bucket_buf = buf;
 
diff --git a/src/backend/access/hash/hashutil.c b/src/backend/access/hash/hashutil.c
index fa9cbdc..b719ffc 100644
--- a/src/backend/access/hash/hashutil.c
+++ b/src/backend/access/hash/hashutil.c
@@ -382,7 +382,7 @@ _hash_get_oldblock_from_newbucket(Relation rel, Bucket new_bucket)
 	metabuf = _hash_getbuf(rel, HASH_METAPAGE, HASH_READ, LH_META_PAGE);
 	metap = HashPageGetMeta(BufferGetPage(metabuf));
 
-	blkno = BUCKET_TO_BLKNO(metap, old_bucket);
+	blkno = BUCKET_TO_BLKNO(metap->hashm_spares, old_bucket);
 
 	_hash_relbuf(rel, metabuf);
 
@@ -412,7 +412,7 @@ _hash_get_newblock_from_oldbucket(Relation rel, Bucket old_bucket)
 	new_bucket = _hash_get_newbucket_from_oldbucket(rel, old_bucket,
 													metap->hashm_lowmask,
 													metap->hashm_maxbucket);
-	blkno = BUCKET_TO_BLKNO(metap, new_bucket);
+	blkno = BUCKET_TO_BLKNO(metap->hashm_spares, new_bucket);
 
 	_hash_relbuf(rel, metabuf);
 
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index 6dfc41f..2c2c59f 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -35,8 +35,8 @@ typedef uint32 Bucket;
 
 #define InvalidBucket	((Bucket) 0xFFFFFFFF)
 
-#define BUCKET_TO_BLKNO(metap,B) \
-		((BlockNumber) ((B) + ((B) ? (metap)->hashm_spares[_hash_log2((B)+1)-1] : 0)) + 1)
+#define BUCKET_TO_BLKNO(spares,B) \
+		((BlockNumber) ((B) + ((B) ? spares[_hash_log2((B)+1)-1] : 0)) + 1)
 
 /*
  * Special space for hash index pages.
@@ -60,7 +60,15 @@ typedef uint32 Bucket;
 
 typedef struct HashPageOpaqueData
 {
-	BlockNumber hasho_prevblkno;	/* previous ovfl (or bucket) blkno */
+	/*
+	 * If this is an overflow page, this stores the previous overflow (or
+	 * bucket) block number.  If this is a bucket page, it is used for a
+	 * special purpose instead: we store the current hashm_maxbucket value
+	 * here whenever the page is initialized or split.  This lets us detect
+	 * whether the bucket has been split after some of the meta page data was
+	 * cached.  See _hash_doinsert() and _hash_first() for how this is used.
+	 */
+	BlockNumber hasho_prevblkno;
 	BlockNumber hasho_nextblkno;	/* next ovfl blkno */
 	Bucket		hasho_bucket;	/* bucket number this pg belongs to */
 	uint16		hasho_flag;		/* page type code, see above */
@@ -183,6 +191,28 @@ typedef struct HashMetaPageData
 typedef HashMetaPageData *HashMetaPage;
 
 /*
+ * This structure caches minimal hash index metadata, which is sufficient to
+ * tell whether a bucket has been split since the metadata was cached.  By
+ * caching it, we can avoid a buffer read of HASH_METAPAGE whenever we need
+ * to map a key to a bucket.  See _hash_first() and _hash_doinsert().
+ *
+ * NOTE: Unfortunately we cannot embed this data inside HashMetaPageData to
+ * avoid the duplication, because HashMetaPageData is stored inside a disk
+ * page and changing that structure would break backward compatibility of
+ * the on-disk format.
+ */
+typedef struct HashMetaDataCache
+{
+	uint32		hmc_maxbucket;	/* ID of maximum bucket in use */
+	uint32		hmc_highmask;	/* mask to modulo into entire table */
+	uint32		hmc_lowmask;	/* mask to modulo into lower half of table */
+	uint32		hmc_spares[HASH_MAX_SPLITPOINTS];		/* spare pages before
+														 * each splitpoint */
+} HashMetaDataCache;
+
+typedef HashMetaDataCache *HashMetaCache;
+
+/*
  * Maximum size of a hash index item (it's okay to have only one per page)
  */
 #define HashMaxItemSize(page) \
