Hi,

On 2022-11-01 20:00:43 -0700, Andres Freund wrote:
> I suspect that prefetching in heapgetpage() would provide gains as well, at
> least for pages that aren't marked all-visible, pretty common in the real
> world IME.

Attached is an experimental patch/hack for that. It ended up being more
beneficial to improve the access ordering than to prefetch the tuple
contents, but I'm not at all sure that's the be-all and end-all.


I separately benchmarked with the server's CPU and memory pinned to the same
socket, pinned to different sockets, and with memory interleaved across sockets.

I did this for HEAD, your patch, your patch combined with mine, and mine alone.

BEGIN;
DROP TABLE IF EXISTS large;
CREATE TABLE large(a int8 not null, b int8 not null default '0', c int8);
INSERT INTO large SELECT generate_series(1, 50000000);
COMMIT;


The server was started with one of:
local: numactl --membind 1 --physcpubind 10
remote: numactl --membind 0 --physcpubind 10
interleave: numactl --interleave=all --physcpubind 10

The benchmark was started with:
psql -qX -f ~/tmp/prewarm.sql && \
    pgbench -n -f ~/tmp/seqbench.sql -t 1 -r > /dev/null && \
    perf stat -e task-clock,LLC-loads,LLC-load-misses,cycles,instructions -C 10 \
    pgbench -n -f ~/tmp/seqbench.sql -t 3 -r

seqbench.sql:
SELECT count(*) FROM large WHERE c IS NOT NULL;
SELECT sum(a), sum(b), sum(c) FROM large;
SELECT sum(c) FROM large;

branch            memory        time (s)  LLC miss %
head              local         31.612   74.03
david             local         32.034   73.54
david+andres      local         31.644   42.80
andres            local         30.863   48.05

head              remote        33.350   72.12
david             remote        33.425   71.30
david+andres      remote        32.428   49.57
andres            remote        30.907   44.33

head              interleave    32.465   71.33
david             interleave    33.176   72.60
david+andres      interleave    32.590   46.23
andres            interleave    30.440   45.13

It's cool to see how optimizing heapgetpage() pretty much removes the
performance difference between local and remote memory.


It makes some sense that David's patch doesn't help in this case - without
all-visible being set, the tuple headers will have already been pulled in for
the HTSV call.

I've not yet experimented with moving the prefetch for the tuple contents from
David's location to before the HTSV call. I suspect that might benefit both
workloads.
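
For concreteness, within this patch that would roughly amount to the currently
#if 0'ed prefetch in the second loop, just issued a couple of iterations ahead
of the HTSV call instead of one. An untested sketch, reusing the variables from
that loop (the distance of 2 is an arbitrary placeholder, not something I've
measured):

	for (int i = normcount - 1; i >= 0; i--)
	{
		bool		valid;

		/*
		 * Prefetch the contents of the tuple we'll check two iterations
		 * from now, so its header is hopefully in cache by the time
		 * HeapTupleSatisfiesVisibility() dereferences it.
		 */
		if (i >= 2)
			pg_prefetch_mem(PageGetItem(dp, PageGetItemId(dp, normoffsets[i - 2])));

		lineoff = normoffsets[i];
		lpp = PageGetItemId(dp, lineoff);

		loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
		loctup.t_len = ItemIdGetLength(lpp);
		ItemPointerSet(&(loctup.t_self), page, lineoff);
		valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);

		HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
											&loctup, buffer, snapshot);

		if (valid)
		{
			scan->rs_vistuples--;
			*scan->rs_vistuples = lineoff;
			ntup++;
		}
	}

Whether any fixed distance is far enough ahead to hide the miss is something
that would need measuring.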

Greetings,

Andres Freund
diff --git i/src/include/access/heapam.h w/src/include/access/heapam.h
index 9dab35551e1..dff7616abeb 100644
--- i/src/include/access/heapam.h
+++ w/src/include/access/heapam.h
@@ -74,7 +74,8 @@ typedef struct HeapScanDescData
 	/* these fields only used in page-at-a-time mode and for bitmap scans */
 	int			rs_cindex;		/* current tuple's index in vistuples */
 	int			rs_ntuples;		/* number of visible tuples on page */
-	OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];	/* their offsets */
+	OffsetNumber *rs_vistuples;
+	OffsetNumber rs_vistuples_d[MaxHeapTuplesPerPage];	/* their offsets */
 }			HeapScanDescData;
 typedef struct HeapScanDescData *HeapScanDesc;
 
diff --git i/src/backend/access/heap/heapam.c w/src/backend/access/heap/heapam.c
index 12be87efed4..632f315f4e1 100644
--- i/src/backend/access/heap/heapam.c
+++ w/src/backend/access/heap/heapam.c
@@ -448,30 +448,95 @@ heapgetpage(TableScanDesc sscan, BlockNumber page)
 	 */
 	all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
 
-	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
-		 lineoff <= lines;
-		 lineoff++, lpp++)
+	if (all_visible)
 	{
-		if (ItemIdIsNormal(lpp))
-		{
-			HeapTupleData loctup;
-			bool		valid;
+		HeapTupleData loctup;
+
+		loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
+
+		scan->rs_vistuples = scan->rs_vistuples_d;
+
+		for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
+			 lineoff <= lines;
+			 lineoff++, lpp++)
+		{
+			if (!ItemIdIsNormal(lpp))
+				continue;
 
-			loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
 			loctup.t_len = ItemIdGetLength(lpp);
 			ItemPointerSet(&(loctup.t_self), page, lineoff);
 
-			if (all_visible)
-				valid = true;
-			else
-				valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
+			HeapCheckForSerializableConflictOut(true, scan->rs_base.rs_rd,
+												&loctup, buffer, snapshot);
+			scan->rs_vistuples[ntup++] = lineoff;
+		}
+	}
+	else
+	{
+		HeapTupleData loctup;
+		int			normcount = 0;
+		OffsetNumber normoffsets[MaxHeapTuplesPerPage];
+
+		loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
+
+		/*
+		 * Iterate forward over line items; they're laid out in increasing
+		 * order in memory. Doing this separately allows us to benefit from the
+		 * out-of-order capabilities of the CPU and simplifies the next loop.
+		 *
+		 * FIXME: Worth unrolling so that we don't fetch the same cacheline
+		 * over and over, due to line items being smaller than a cacheline?
+		 */
+		for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
+			 lineoff <= lines;
+			 lineoff++, lpp++)
+		{
+			pg_prefetch_mem(PageGetItemId(dp, lineoff+5));
+			if (!ItemIdIsNormal(lpp))
+				continue;
+			normoffsets[normcount++] = lineoff;
+		}
+
+		/*
+		 * Process tuples in reverse order. That'll most often lead to memory
+		 * accesses in increasing order, which typically is more efficient for
+		 * the CPU's prefetcher. To avoid affecting sort order, we store the
+		 * visible tuples in decreasing order in rs_vistuples_d and then point
+		 * rs_vistuples at the last entry stored.
+		 *
+		 * FIXME: We should likely compute rs_cindex in a smarter way, rather
+		 * than changing rs_vistuples.
+		 */
+		scan->rs_vistuples = scan->rs_vistuples_d + (MaxHeapTuplesPerPage);
+		for (int i = normcount - 1; i >= 0; i--)
+		{
+			bool valid;
+
+			/* doesn't appear to be beneficial */
+#if 0
+			if (i > 0)
+				pg_prefetch_mem(PageGetItem(dp, PageGetItemId(dp, normoffsets[i - 1])));
+#endif
+
+			lineoff = normoffsets[i];
+			lpp = PageGetItemId(dp, lineoff);
+
+			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
+			loctup.t_len = ItemIdGetLength(lpp);
+			ItemPointerSet(&(loctup.t_self), page, lineoff);
+			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
 
 			HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
 												&loctup, buffer, snapshot);
 
 			if (valid)
-				scan->rs_vistuples[ntup++] = lineoff;
+			{
+				scan->rs_vistuples--;
+				*scan->rs_vistuples = lineoff;
+				ntup++;
+			}
+
 		}
 	}
 
diff --git i/src/backend/access/heap/heapam_handler.c w/src/backend/access/heap/heapam_handler.c
index 41f1ca65d01..f2876ecbc60 100644
--- i/src/backend/access/heap/heapam_handler.c
+++ w/src/backend/access/heap/heapam_handler.c
@@ -2162,6 +2162,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 		 */
 		int			curslot;
 
+		hscan->rs_vistuples = hscan->rs_vistuples_d;
+
 		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
 		{
 			OffsetNumber offnum = tbmres->offsets[curslot];
@@ -2184,6 +2186,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
 		OffsetNumber offnum;
 
+		hscan->rs_vistuples = hscan->rs_vistuples_d;
+
 		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
 		{
 			ItemId		lp;
