Hi,

On 2022-11-01 20:00:43 -0700, Andres Freund wrote:
> I suspect that prefetching in heapgetpage() would provide gains as well, at
> least for pages that aren't marked all-visible, pretty common in the real
> world IME.
Attached is an experimental patch/hack for that. It ended up being more
beneficial to make the access ordering more optimal than to prefetch the
tuple contents, but I'm not at all sure that's the be-all-end-all.

I separately benchmarked pinning CPU and memory to the same socket, to a
different socket, and interleaving memory across sockets. I did this for
HEAD, your patch, your patch + mine, and just mine.

BEGIN;
DROP TABLE IF EXISTS large;
CREATE TABLE large(a int8 not null, b int8 not null default '0', c int8);
INSERT INTO large SELECT generate_series(1, 50000000);
COMMIT;

The server is started with:
local:      numactl --membind 1 --physcpubind 10
remote:     numactl --membind 0 --physcpubind 10
interleave: numactl --interleave=all --physcpubind 10

The benchmark is started with:
psql -qX -f ~/tmp/prewarm.sql && \
  pgbench -n -f ~/tmp/seqbench.sql -t 1 -r > /dev/null && \
  perf stat -e task-clock,LLC-loads,LLC-load-misses,cycles,instructions -C 10 \
  pgbench -n -f ~/tmp/seqbench.sql -t 3 -r

seqbench.sql:
SELECT count(*) FROM large WHERE c IS NOT NULL;
SELECT sum(a), sum(b), sum(c) FROM large;
SELECT sum(c) FROM large;

branch         memory       time (s)   LLC miss %
head           local          31.612        74.03
david          local          32.034        73.54
david+andres   local          31.644        42.80
andres         local          30.863        48.05
head           remote         33.350        72.12
david          remote         33.425        71.30
david+andres   remote         32.428        49.57
andres         remote         30.907        44.33
head           interleave     32.465        71.33
david          interleave     33.176        72.60
david+andres   interleave     32.590        46.23
andres         interleave     30.440        45.13

It's cool to see how optimizing heapgetpage() pretty much removes the
performance difference between local and remote memory.

It makes some sense that David's patch doesn't help in this case - without
all-visible being set, the tuple headers will already have been pulled in
for the HTSV call.

I've not yet experimented with moving the prefetch for the tuple contents
from David's location to before the HTSV call. I suspect that might benefit
both workloads.

Greetings,

Andres Freund
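A note on the attached diff: it uses pg_prefetch_mem(), which isn't defined
in this patch - I'm assuming the helper from your patch, i.e. roughly a thin
wrapper around the compiler builtin:

/*
 * Assumed definition (from David's prefetch patch, not part of the diff
 * below): hint the CPU to pull the given address into cache ahead of use.
 */
#if defined(__GNUC__) || defined(__clang__)
#define pg_prefetch_mem(addr)	__builtin_prefetch(addr)
#else
#define pg_prefetch_mem(addr)	((void) 0)
#endif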
diff --git i/src/include/access/heapam.h w/src/include/access/heapam.h
index 9dab35551e1..dff7616abeb 100644
--- i/src/include/access/heapam.h
+++ w/src/include/access/heapam.h
@@ -74,7 +74,8 @@ typedef struct HeapScanDescData
 	/* these fields only used in page-at-a-time mode and for bitmap scans */
 	int			rs_cindex;		/* current tuple's index in vistuples */
 	int			rs_ntuples;		/* number of visible tuples on page */
-	OffsetNumber rs_vistuples[MaxHeapTuplesPerPage];	/* their offsets */
+	OffsetNumber *rs_vistuples;
+	OffsetNumber rs_vistuples_d[MaxHeapTuplesPerPage];	/* their offsets */
 } HeapScanDescData;
 
 typedef struct HeapScanDescData *HeapScanDesc;
diff --git i/src/backend/access/heap/heapam.c w/src/backend/access/heap/heapam.c
index 12be87efed4..632f315f4e1 100644
--- i/src/backend/access/heap/heapam.c
+++ w/src/backend/access/heap/heapam.c
@@ -448,30 +448,95 @@ heapgetpage(TableScanDesc sscan, BlockNumber page)
 	 */
 	all_visible = PageIsAllVisible(dp) && !snapshot->takenDuringRecovery;
 
-	for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
-		 lineoff <= lines;
-		 lineoff++, lpp++)
+	if (all_visible)
 	{
-		if (ItemIdIsNormal(lpp))
-		{
-			HeapTupleData loctup;
-			bool		valid;
+		HeapTupleData loctup;
+
+		loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
+
+		scan->rs_vistuples = scan->rs_vistuples_d;
+
+		for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
+			 lineoff <= lines;
+			 lineoff++, lpp++)
+		{
+			if (!ItemIdIsNormal(lpp))
+				continue;
 
-			loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
 			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
 			loctup.t_len = ItemIdGetLength(lpp);
 			ItemPointerSet(&(loctup.t_self), page, lineoff);
 
-			if (all_visible)
-				valid = true;
-			else
-				valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
+			HeapCheckForSerializableConflictOut(true, scan->rs_base.rs_rd,
+												&loctup, buffer, snapshot);
+			scan->rs_vistuples[ntup++] = lineoff;
+		}
+	}
+	else
+	{
+		HeapTupleData loctup;
+		int			normcount = 0;
+		OffsetNumber normoffsets[MaxHeapTuplesPerPage];
+
+		loctup.t_tableOid = RelationGetRelid(scan->rs_base.rs_rd);
+
+		/*
+		 * Iterate forward over line items, they're laid out in increasing
+		 * order in memory. Doing this separately allows us to benefit from
+		 * out-of-order capabilities of the CPU and simplifies the next loop.
+		 *
+		 * FIXME: Worth unrolling so that we don't fetch the same cacheline
+		 * over and over, due to line items being smaller than a cacheline?
+		 */
+		for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff);
+			 lineoff <= lines;
+			 lineoff++, lpp++)
+		{
+			pg_prefetch_mem(PageGetItemId(dp, lineoff + 5));
+			if (!ItemIdIsNormal(lpp))
+				continue;
+			normoffsets[normcount++] = lineoff;
+		}
+
+		/*
+		 * Process tuples in reverse order. That'll most often lead to memory
+		 * accesses in increasing order, which typically is more efficient for
+		 * the CPU's prefetcher. To avoid affecting sort order, we store the
+		 * visible tuples in decreasing order in rs_vistuples_d and then set
+		 * rs_vistuples to point at the last (lowest) one found.
+		 *
+		 * FIXME: We should likely compute rs_cindex in a smarter way, rather
+		 * than changing rs_vistuples.
+		 */
+		scan->rs_vistuples = scan->rs_vistuples_d + (MaxHeapTuplesPerPage);
+		for (int i = normcount - 1; i >= 0; i--)
+		{
+			bool		valid;
+
+			/* doesn't appear to be beneficial */
+#if 0
+			if (i > 0)
+				pg_prefetch_mem(PageGetItem(dp, PageGetItemId(dp, normoffsets[i - 1])));
+#endif
+
+			lineoff = normoffsets[i];
+			lpp = PageGetItemId(dp, lineoff);
+
+			loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lpp);
+			loctup.t_len = ItemIdGetLength(lpp);
+			ItemPointerSet(&(loctup.t_self), page, lineoff);
+			valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
 
 			HeapCheckForSerializableConflictOut(valid, scan->rs_base.rs_rd,
 												&loctup, buffer, snapshot);
 
 			if (valid)
-				scan->rs_vistuples[ntup++] = lineoff;
+			{
+				scan->rs_vistuples--;
+				*scan->rs_vistuples = lineoff;
+				ntup++;
+			}
+		}
 	}
diff --git i/src/backend/access/heap/heapam_handler.c w/src/backend/access/heap/heapam_handler.c
index 41f1ca65d01..f2876ecbc60 100644
--- i/src/backend/access/heap/heapam_handler.c
+++ w/src/backend/access/heap/heapam_handler.c
@@ -2162,6 +2162,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 		 */
 		int			curslot;
 
+		hscan->rs_vistuples = hscan->rs_vistuples_d;
+
 		for (curslot = 0; curslot < tbmres->ntuples; curslot++)
 		{
 			OffsetNumber offnum = tbmres->offsets[curslot];
@@ -2184,6 +2186,8 @@ heapam_scan_bitmap_next_block(TableScanDesc scan,
 		OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
 		OffsetNumber offnum;
 
+		hscan->rs_vistuples = hscan->rs_vistuples_d;
+
 		for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
 		{
 			ItemId		lp;
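To spell out why the backwards fill doesn't change what consumers see (a
standalone sketch, not part of the patch): rs_vistuples ends up pointing at
the lowest filled slot of rs_vistuples_d, so rs_vistuples[0 .. rs_ntuples - 1]
is still in ascending offset order, which is what heapgettup_pagemode relies
on when it indexes the array with rs_cindex.

/* Standalone illustration of the backwards-fill trick; not PostgreSQL code. */
#include <stdio.h>

#define NSLOTS 8

int
main(void)
{
	int		store[NSLOTS];
	int	   *vis = store + NSLOTS;	/* start just past the end */
	int		seen[] = {7, 5, 2};		/* offsets found while iterating in reverse */
	int		ntup = 0;

	for (int i = 0; i < 3; i++)
	{
		vis--;						/* grow the filled region downwards */
		*vis = seen[i];
		ntup++;
	}

	/* consumers read vis[0 .. ntup - 1] and still see ascending order: 2 5 7 */
	for (int i = 0; i < ntup; i++)
		printf("%d ", vis[i]);
	printf("\n");

	return 0;
}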