Hi, I have noticed that one of my buildfarm machines - widowbird - has not reported any results since February 17, and it seems to be stuck somewhere in amcheck:
$ ps ax | grep postgres 1180067 ? Ss 0:02 /mnt/data/buildfarm/buildroot/HEAD/inst/bin/postgres -D data-C 1180069 ? Ss 0:00 postgres: checkpointer 1180070 ? Ss 0:00 postgres: background writer 1180072 ? Ss 0:00 postgres: walwriter 1180073 ? Ss 0:01 postgres: autovacuum launcher 1180074 ? Ss 0:00 postgres: logical replication launcher 1180107 ? Ss 0:05 postgres: buildfarm contrib_regression_amcheck [local] INSERT 1180111 ? Ss 0:00 postgres: autovacuum worker 1180134 ? Ss 0:00 postgres: autovacuum worker 1180135 ? Ss 0:00 postgres: autovacuum worker 1374029 pts/0 S+ 0:00 grep --color=auto postgres So there's PID 1180107, executing an insert, but not progressing. The backtrace looks like this (first couple lines, full backtrace attached): #0 0x0000007fa64b8ddc in __GI_epoll_pwait (epfd=5, events=0x55ad6285a8, maxevents=1, timeout=timeout@entry=-1, set=set@entry=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:42 #1 0x0000007fa64b8fe8 in epoll_wait (epfd=<optimized out>, events=<optimized out>, maxevents=<optimized out>, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:32 #2 0x000000558f043588 in WaitEventSetWaitBlock (nevents=1, occurred_events=0x7ff8ed4e18, cur_timeout=-1, set=0x55ad628540) at latch.c:1571 #3 WaitEventSetWait (set=0x55ad628540, timeout=timeout@entry=-1, occurred_events=occurred_events@entry=0x7ff8ed4e18, nevents=nevents@entry=1, wait_event_info=wait_event_info@entry=134217781) at latch.c:1519 #4 0x000000558f043778 in WaitLatch (latch=<optimized out>, wakeEvents=wakeEvents@entry=33, timeout=timeout@entry=-1, wait_event_info=wait_event_info@entry=134217781) at latch.c:538 #5 0x000000558f052274 in ConditionVariableTimedSleep (cv=0x7f9ac9deb0, timeout=timeout@entry=-1, wait_event_info=wait_event_info@entry=134217781) at condition_variable.c:163 #6 0x000000558f05286c in ConditionVariableTimedSleep (wait_event_info=134217781, timeout=-1, cv=<optimized out>) at condition_variable.c:135 #7 0x000000558ed2fc90 in AdvanceXLInsertBuffer 
(upto=upto@entry=608174080, tli=tli@entry=1, opportunistic=opportunistic@entry=false) at xlog.c:2224 So, it's stuck in AdvanceXLInsertBuffer ... interesting. Another interesting fact is that it's testing 75dfde13639, which is just a couple of commits after 6a2275b895: commit 6a2275b8953a4462d44daf001bdd60b3d48f0946 Author: Alexander Korotkov <akorot...@postgresql.org> Date: Mon Feb 17 04:19:01 2025 +0200 Get rid of WALBufMappingLock Allow multiple backends to initialize WAL buffers concurrently. This way `MemSet((char *) NewPage, 0, XLOG_BLCKSZ);` can run in parallel without taking a single LWLock in exclusive mode. ... which reworked AdvanceXLInsertBuffer() quite a bit, it seems. OTOH the last (successful) run on widowbird was on eaf502747b, which already includes 6a2275b895, so maybe it's unrelated. Is there something else I could collect from the stuck instance, before I restart it? regards -- Tomas Vondra
0x0000007fa64b8ddc in __GI_epoll_pwait (epfd=5, events=0x55ad6285a8, maxevents=1, timeout=timeout@entry=-1, set=set@entry=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:42 42 ../sysdeps/unix/sysv/linux/epoll_pwait.c: No such file or directory. (gdb) bt #0 0x0000007fa64b8ddc in __GI_epoll_pwait (epfd=5, events=0x55ad6285a8, maxevents=1, timeout=timeout@entry=-1, set=set@entry=0x0) at ../sysdeps/unix/sysv/linux/epoll_pwait.c:42 #1 0x0000007fa64b8fe8 in epoll_wait (epfd=<optimized out>, events=<optimized out>, maxevents=<optimized out>, timeout=timeout@entry=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:32 #2 0x000000558f043588 in WaitEventSetWaitBlock (nevents=1, occurred_events=0x7ff8ed4e18, cur_timeout=-1, set=0x55ad628540) at latch.c:1571 #3 WaitEventSetWait (set=0x55ad628540, timeout=timeout@entry=-1, occurred_events=occurred_events@entry=0x7ff8ed4e18, nevents=nevents@entry=1, wait_event_info=wait_event_info@entry=134217781) at latch.c:1519 #4 0x000000558f043778 in WaitLatch (latch=<optimized out>, wakeEvents=wakeEvents@entry=33, timeout=timeout@entry=-1, wait_event_info=wait_event_info@entry=134217781) at latch.c:538 #5 0x000000558f052274 in ConditionVariableTimedSleep (cv=0x7f9ac9deb0, timeout=timeout@entry=-1, wait_event_info=wait_event_info@entry=134217781) at condition_variable.c:163 #6 0x000000558f05286c in ConditionVariableTimedSleep (wait_event_info=134217781, timeout=-1, cv=<optimized out>) at condition_variable.c:135 #7 0x000000558ed2fc90 in AdvanceXLInsertBuffer (upto=upto@entry=608174080, tli=tli@entry=1, opportunistic=opportunistic@entry=false) at xlog.c:2224 #8 0x000000558ed301a0 in GetXLogBuffer (ptr=ptr@entry=608174080, tli=tli@entry=1) at xlog.c:1710 #9 0x000000558ed30a3c in CopyXLogRecordToWAL (tli=1, EndPos=608174184, StartPos=608174072, rdata=0x558f5c8ab0 <hdr_rdt>, isLogSwitch=false, write_len=87) at xlog.c:1282 #10 XLogInsertRecord (rdata=rdata@entry=0x558f5c8ab0 <hdr_rdt>, fpw_lsn=fpw_lsn@entry=608174072, flags=<optimized out>, 
num_fpi=num_fpi@entry=0, topxid_included=topxid_included@entry=false) at xlog.c:928 #11 0x000000558ed39b40 in XLogInsert (rmid=rmid@entry=10 '\n', info=info@entry=0 '\000') at xloginsert.c:523 #12 0x000000558ecbc94c in heap_insert (relation=relation@entry=0x55ad741ac0, tup=tup@entry=0x55ad7b6638, cid=cid@entry=0, options=options@entry=0, bistate=bistate@entry=0x0) at heapam.c:2157 #13 0x000000558ecc4c84 in heapam_tuple_insert (relation=0x55ad741ac0, slot=0x55ad7b64d8, cid=0, options=0, bistate=0x0) at heapam_handler.c:251 #14 0x000000558eeb8a98 in table_tuple_insert (bistate=0x0, options=0, cid=<optimized out>, slot=<optimized out>, rel=0x55ad741ac0) at ../../../src/include/access/tableam.h:1411 #15 ExecInsert (context=context@entry=0x7ff8ed53c8, resultRelInfo=resultRelInfo@entry=0x55ad72a370, slot=<optimized out>, slot@entry=0x55ad7b64d8, canSetTag=<optimized out>, inserted_tuple=inserted_tuple@entry=0x0, insert_destrel=insert_destrel@entry=0x0) at nodeModifyTable.c:1213 #16 0x000000558eeba900 in ExecModifyTable (pstate=0x55ad72a160) at nodeModifyTable.c:4322 #17 0x000000558ee857a4 in ExecProcNode (node=0x55ad72a160) at ../../../src/include/executor/executor.h:272 #18 ExecutePlan (dest=0x55ad7b3358, direction=<optimized out>, numberTuples=0, sendTuples=<optimized out>, operation=CMD_INSERT, queryDesc=0x55ad716dc0) at execMain.c:1675 #19 standard_ExecutorRun (queryDesc=0x55ad716dc0, direction=<optimized out>, count=0) at execMain.c:364 #20 0x000000558f0768fc in ProcessQuery (plan=0x55ad7b31d8, sourceText=0x55ad62e490 "INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;", params=0x0, queryEnv=0x0, dest=0x55ad7b3358, qc=0x7ff8ed57c0) at pquery.c:160 #21 0x000000558f077708 in PortalRunMulti (portal=portal@entry=0x55ad6ad720, isTopLevel=isTopLevel@entry=true, setHoldSnapshot=setHoldSnapshot@entry=false, dest=dest@entry=0x55ad7b3358, altdest=altdest@entry=0x55ad7b3358, qc=qc@entry=0x7ff8ed57c0) at pquery.c:1271 #22 0x000000558f077b0c in 
PortalRun (portal=portal@entry=0x55ad6ad720, count=count@entry=9223372036854775807, isTopLevel=isTopLevel@entry=true, dest=dest@entry=0x55ad7b3358, altdest=altdest@entry=0x55ad7b3358, qc=qc@entry=0x7ff8ed57c0) at pquery.c:787 #23 0x000000558f072380 in exec_simple_query (query_string=query_string@entry=0x55ad62e490 "INSERT INTO delete_test_table SELECT i, 1, 2, 3 FROM generate_series(1,80000) i;") at postgres.c:1271 #24 0x000000558f0732dc in PostgresMain (dbname=<optimized out>, username=<optimized out>) at postgres.c:4691 #25 0x000000558f06dcbc in BackendMain (startup_data=startup_data@entry=0x7ff8ed5ca8 "", startup_data_len=startup_data_len@entry=4) at backend_startup.c:107 #26 0x000000558efbe038 in postmaster_child_launch (child_type=<optimized out>, child_slot=1, startup_data=startup_data@entry=0x7ff8ed5ca8 "", startup_data_len=startup_data_len@entry=4, client_sock=client_sock@entry=0x7ff8ed5cb0) at launch_backend.c:274 #27 0x000000558efc1d2c in BackendStartup (client_sock=0x7ff8ed5cb0) at postmaster.c:3519 #28 ServerLoop () at postmaster.c:1688 #29 0x000000558efc3684 in PostmasterMain (argc=argc@entry=3, argv=argv@entry=0x55ad627e40) at postmaster.c:1386 #30 0x000000558ec69248 in main (argc=3, argv=0x55ad627e40) at main.c:230 (gdb) q