Hello hackers, I tried to investigate a recent buildfarm test failure: https://buildfarm.postgresql.org/cgi-bin/show_log.pl?nm=adder&dt=2024-06-04%2003%3A27%3A47
29/295 postgresql:recovery / recovery/026_overwrite_contrecord ERROR 39.55s exit status 32 log/026_overwrite_contrecord_standby.log TRAP: failed Assert("ItemIdIsNormal(lpp)"), File: "../pgsql/src/backend/access/heap/heapam.c", Line: 1002, PID: 3740958 postgres: standby: bf postgres [local] startup(ExceptionalCondition+0x81)[0x56c60bf9] postgres: standby: bf postgres [local] startup(+0xf776e)[0x5667276e] postgres: standby: bf postgres [local] startup(heap_getnextslot+0x40)[0x56672ee1] postgres: standby: bf postgres [local] startup(+0x11c218)[0x56697218] postgres: standby: bf postgres [local] startup(systable_getnext+0xfa)[0x56697c1a] postgres: standby: bf postgres [local] startup(+0x6d29c7)[0x56c4d9c7] postgres: standby: bf postgres [local] startup(+0x6d372c)[0x56c4e72c] postgres: standby: bf postgres [local] startup(+0x6d8288)[0x56c53288] postgres: standby: bf postgres [local] startup(RelationCacheInitializePhase3+0x149)[0x56c52d71] (It's not the only failure of that ilk in the buildfarm.) and managed to reproduce the failure by running many 026_overwrite_contrecord tests in parallel (with fsync=on). Analyzing the core dump added some info: ... #3 0x0000000000bb43cc in ExceptionalCondition (conditionName=0xc45c77 "ItemIdIsNormal(lpp)", fileName=0xc45aa8 "heapam.c", lineNumber=1002) at assert.c:66 #4 0x00000000004f7f13 in heapgettup_pagemode (scan=0x19f5660, dir=ForwardScanDirection, nkeys=2, key=0x19f61d0) at heapam.c:1002 #5 0x00000000004f86d1 in heap_getnextslot (sscan=0x19f5660, direction=ForwardScanDirection, slot=0x19f5da0) at heapam.c:1307 #6 0x000000000051d028 in table_scan_getnextslot (sscan=0x19f5660, direction=ForwardScanDirection, slot=0x19f5da0) at ../../../../src/include/access/tableam.h:1081 #7 0x000000000051da80 in systable_getnext (sysscan=0x19f5470) at genam.c:530 #8 0x0000000000ba0937 in RelationBuildTupleDesc (relation=0x7fa004feea88) at relcache.c:572 #9 0x0000000000ba17b9 in RelationBuildDesc (targetRelId=2679, insertIt=true) at relcache.c:1184 #10 0x0000000000ba6520 in load_critical_index (indexoid=2679, heapoid=2610) at relcache.c:4353 #11 0x0000000000ba607d in RelationCacheInitializePhase3 () at relcache.c:4132 #12 0x0000000000bcb704 in InitPostgres (in_dbname=0x196ca30 "postgres", dboid=5, username=0x19a91b8 "law", useroid=0, flags=1, out_dbname=0x0) at postinit.c:1193 ... (gdb) frame 4 (gdb) p lpp->lp_flags $2 = 1 (gdb) p ItemIdIsNormal(lpp) $12 = 1 So it looks like the Assert had failed when lpp->lp_flags had some other contents... I added the following debugging code: --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -995,10 +995,14 @@ continue_page: for (; linesleft > 0; linesleft--, lineindex += dir) { ItemId lpp; + ItemIdData iid; OffsetNumber lineoff; lineoff = scan->rs_vistuples[lineindex]; lpp = PageGetItemId(page, lineoff); + iid = *((ItemIdData *)lpp); + + Assert(ItemIdIsNormal(&iid)); Assert(ItemIdIsNormal(lpp)); and got: ... #2 0x000055b68dc6998c in ExceptionalCondition (conditionName=0x55b68dcfe5f7 "ItemIdIsNormal(&iid)", fileName=0x55b68dcfe428 "heapam.c", lineNumber=1010) at assert.c:66 #3 0x000055b68d588a78 in heapgettup_pagemode (scan=0x55b68f0905e0, dir=ForwardScanDirection, nkeys=2, key=0x55b68f091150) at heapam.c:1010 #4 0x000055b68d58930e in heap_getnextslot (sscan=0x55b68f0905e0, direction=ForwardScanDirection, slot=0x55b68f090d20) at heapam.c:1322 ... (gdb) frame 3 #3 0x000055b68d588a78 in heapgettup_pagemode (...) at heapam.c:1010 1010 Assert(ItemIdIsNormal(&iid)); (gdb) info locals lpp = 0x7f615c34b0ec iid = {lp_off = 0, lp_flags = 0, lp_len = 0} lineoff = 54 tuple = 0x55b68f090638 page = 0x7f615c34b000 "" (gdb) p *lpp $1 = {lp_off = 3160, lp_flags = 1, lp_len = 136} It seemingly confirms that the underlying memory was changed while being processed in heapgettup_pagemode(). I've tried to add checks for the page buffer content as below: --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -953,11 +953,15 @@ heapgettup_pagemode(HeapScanDesc scan, Page page; int lineindex; int linesleft; +char page_copy[BLCKSZ]; if (likely(scan->rs_inited)) { /* continue from previously returned page/tuple */ page = BufferGetPage(scan->rs_cbuf); +memcpy(page_copy, page, BLCKSZ); +for (int i = 0; i < 100; i++) +Assert(memcmp(page_copy, page, BLCKSZ) == 0); lineindex = scan->rs_cindex + dir; if (ScanDirectionIsForward(dir)) @@ -986,6 +990,10 @@ heapgettup_pagemode(HeapScanDesc scan, /* prune the page and determine visible tuple offsets */ heap_prepare_pagescan((TableScanDesc) scan); page = BufferGetPage(scan->rs_cbuf); +memcpy(page_copy, page, BLCKSZ); +for (int i = 0; i < 100; i++) +Assert(memcmp(page_copy, page, BLCKSZ) == 0); + linesleft = scan->rs_ntuples; lineindex = ScanDirectionIsForward(dir) ? 0 : linesleft - 1; and got the assertion failures even during `make check`: ... #5 0x00005577f29e0bc4 in ExceptionalCondition ( conditionName=conditionName@entry=0x5577f2a4a5d0 "memcmp(page_copy, page, BLCKSZ) == 0", fileName=fileName@entry=0x5577f2a4aa38 "heapam.c", lineNumber=lineNumber@entry=966) at assert.c:66 #6 0x00005577f24faa68 in heapgettup_pagemode (scan=scan@entry=0x5577f46574e8, dir=ForwardScanDirection, nkeys=0, key=0x0) at heapam.c:966 ... (gdb) frame 6 #6 0x00005577f24faa68 in heapgettup_pagemode (...) at heapam.c:966 966 Assert(memcmp(page_copy, page, BLCKSZ) == 0); (gdb) p i $1 = 25 Am I missing something or the the page buffer indeed lacks locking there? Best regards, Alexander