From 2fe09c749e7fbca1998f7964ab8341df466023c3 Mon Sep 17 00:00:00 2001
From: Dilip Kumar <dilip.kumar@enterprisedb.com>
Date: Wed, 11 Oct 2023 15:41:34 +0530
Subject: [PATCH v1 3/3] Introduce bank-wise LRU counter

Since the buffer pool is already divided into banks and the victim
buffer search is done at the bank level, there is no need for a
centralized LRU counter.  Keeping one counter per bank also improves
performance: avoiding updates to a single common variable reduces
frequent CPU cache invalidation.

Dilip Kumar based on design idea from Robert Haas
---
 src/backend/access/transam/slru.c | 23 +++++++++++++++--------
 src/include/access/slru.h         | 28 +++++++++++++++++-----------
 2 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index c06e4eddd1..fd44ad7d47 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -110,13 +110,13 @@ typedef struct SlruWriteAllData *SlruWriteAll;
  *
  * The reason for the if-test is that there are often many consecutive
  * accesses to the same page (particularly the latest page).  By suppressing
- * useless increments of cur_lru_count, we reduce the probability that old
+ * useless increments of bank_cur_lru_count, we reduce the probability that old
  * pages' counts will "wrap around" and make them appear recently used.
  *
  * We allow this code to be executed concurrently by multiple processes within
  * SimpleLruReadPage_ReadOnly().  As long as int reads and writes are atomic,
  * this should not cause any completely-bogus values to enter the computation.
- * However, it is possible for either cur_lru_count or individual
+ * However, it is possible for either bank_cur_lru_count or individual
  * page_lru_count entries to be "reset" to lower values than they should have,
  * in case a process is delayed while it executes this macro.  With care in
  * SlruSelectLRUPage(), this does little harm, and in any case the absolute
@@ -125,9 +125,10 @@ typedef struct SlruWriteAllData *SlruWriteAll;
  */
 #define SlruRecentlyUsed(shared, slotno)	\
 	do { \
-		int		new_lru_count = (shared)->cur_lru_count; \
+		int		bankno = (slotno) / SLRU_BANK_SIZE; /* bank containing slotno */ \
+		int		new_lru_count = (shared)->bank_cur_lru_count[bankno]; \
 		if (new_lru_count != (shared)->page_lru_count[slotno]) { \
-			(shared)->cur_lru_count = ++new_lru_count; \
+			(shared)->bank_cur_lru_count[bankno] = ++new_lru_count; \
 			(shared)->page_lru_count[slotno] = new_lru_count; \
 		} \
 	} while (0)
@@ -200,6 +201,7 @@ SimpleLruShmemSize(int nslots, int nlsns)
 	sz += MAXALIGN(nslots * sizeof(int));	/* page_lru_count[] */
 	sz += MAXALIGN(nslots * sizeof(LWLockPadded));	/* buffer_locks[] */
 	sz += MAXALIGN((bankmask + 1) * sizeof(LWLockPadded));	/* bank_locks[] */
+	sz += MAXALIGN((bankmask + 1) * sizeof(int));   /* bank_cur_lru_count[] */
 
 	if (nlsns > 0)
 		sz += MAXALIGN(nslots * nlsns * sizeof(XLogRecPtr));	/* group_lsn[] */
@@ -276,8 +278,6 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		shared->num_slots = nslots;
 		shared->lsn_groups_per_page = nlsns;
 
-		shared->cur_lru_count = 0;
-
 		/* shared->latest_page_number will be set later */
 
 		shared->slru_stats_idx = pgstat_get_slru_index(name);
@@ -300,6 +300,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		offset += MAXALIGN(nslots * sizeof(LWLockPadded));
 		shared->bank_locks = (LWLockPadded *) (ptr + offset);
 		offset += MAXALIGN(nbanks * sizeof(LWLockPadded));
+		shared->bank_cur_lru_count = (int *) (ptr + offset);
+		offset += MAXALIGN(nbanks * sizeof(int));
 
 		if (nlsns > 0)
 		{
@@ -321,8 +323,11 @@ SimpleLruInit(SlruCtl ctl, const char *name, int nslots, int nlsns,
 		}
 		/* initialize bank locks for each buffer bank */
 		for (bankno = 0; bankno < nbanks; bankno++)
+		{
 			LWLockInitialize(&shared->bank_locks[bankno].lock,
 							 slru_tranche_id);
+			shared->bank_cur_lru_count[bankno] = 0;
+		}
 
 		/* Should fit to estimated shmem size */
 		Assert(ptr - (char *) shared <= SimpleLruShmemSize(nslots, nlsns));
@@ -1112,9 +1117,11 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		int			best_invalid_page_number = 0;	/* keep compiler quiet */
 
 		/* See if page already has a buffer assigned */
-		int			bankstart = (pageno & ctl->bank_mask) * SLRU_BANK_SIZE;
+		int			bankno = pageno & ctl->bank_mask;
+		int			bankstart = bankno * SLRU_BANK_SIZE;
 		int			bankend = bankstart + SLRU_BANK_SIZE;
 
+
 		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			if (shared->page_number[slotno] == pageno &&
@@ -1149,7 +1156,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 * That gets us back on the path to having good data when there are
 		 * multiple pages with the same lru_count.
 		 */
-		cur_count = (shared->cur_lru_count)++;
+		cur_count = (shared->bank_cur_lru_count[bankno])++;
 		for (slotno = bankstart; slotno < bankend; slotno++)
 		{
 			int			this_delta;
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index eec7a568dc..fea12cdfb3 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -73,6 +73,23 @@ typedef struct SlruSharedData
 	 */
 	LWLockPadded *bank_locks;
 
+	/*----------
+	 * Instead of a single global counter we maintain one LRU counter per
+	 * bank, because a) victim buffer selection is done at the bank level, so
+	 * there is no point in having a global counter, and b) updating a global
+	 * counter causes frequent CPU cache invalidation, which hurts
+	 * performance.
+	 *
+	 * We mark a page "most recently used" by setting
+	 *		page_lru_count[slotno] = ++bank_cur_lru_count[bankno];
+	 * The oldest page is therefore the one with the highest value of
+	 *		bank_cur_lru_count[bankno] - page_lru_count[slotno]
+	 * The counts will eventually wrap around, but this calculation still
+	 * works as long as no page's age exceeds INT_MAX counts.
+	 *----------
+	 */
+	int			 *bank_cur_lru_count;
+
 	/*
 	 * Optional array of WAL flush LSNs associated with entries in the SLRU
 	 * pages.  If not zero/NULL, we must flush WAL before writing pages (true
@@ -84,17 +101,6 @@ typedef struct SlruSharedData
 	XLogRecPtr *group_lsn;
 	int			lsn_groups_per_page;
 
-	/*----------
-	 * We mark a page "most recently used" by setting
-	 *		page_lru_count[slotno] = ++cur_lru_count;
-	 * The oldest page is therefore the one with the highest value of
-	 *		cur_lru_count - page_lru_count[slotno]
-	 * The counts will eventually wrap around, but this calculation still
-	 * works as long as no page's age exceeds INT_MAX counts.
-	 *----------
-	 */
-	int			cur_lru_count;
-
 	/*
 	 * latest_page_number is the page number of the current end of the log;
 	 * this is not critical data, since we use it only to avoid swapping out
-- 
2.39.2 (Apple Git-143)

