Tom Lane <t...@sss.pgh.pa.us> 于2025年4月15日周二 05:39写道:
> Alvaro Herrera <alvhe...@alvh.no-ip.org> writes: > > On 2025-Apr-14, Tom Lane wrote: > >> I would not have expected that adding pg_constraint rows implies > >> stronger locks than what ALTER ADD PRIMARY KEY was using before, > >> and I suspect that doing so will cause more problems than just > >> breaking parallel restore. > > > I wasn't aware of this side effect. I'll investigate this in more > > depth. I suspect it might be a bug in the way we run through ALTER > > TABLE for the primary key. > > After further thought it occurs to me that it might not be a case > of "we get stronger locks", but a case of "we accidentally get a > weaker lock earlier and then try to upgrade it", thus creating a > possibility of deadlock where before we'd just have blocked till > the other statement cleared. Still worthy of being fixed if that's > true, though. > I added sleep(1) in the DeadLockReport() before error report to display the status when a deadlock happened. bool continue_sleep = true; do { sleep(1); } while (continue_sleep); ereport(ERROR, (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), errmsg("deadlock detected"), errdetail_internal("%s", clientbuf.data), errdetail_log("%s", logbuf.data), errhint("See server log for query details."))); ubuntu@VM-0-17-ubuntu:/workspace/postgres$ ps -ef|grep postgres ubuntu 2911109 1 0 10:34 ? 00:00:00 /workspace/pgsql/bin/postgres -D ../data ubuntu 2911110 2911109 0 10:34 ? 00:00:00 postgres: io worker 0 ubuntu 2911111 2911109 0 10:34 ? 00:00:00 postgres: io worker 1 ubuntu 2911112 2911109 0 10:34 ? 00:00:00 postgres: io worker 2 ubuntu 2911113 2911109 0 10:34 ? 00:00:00 postgres: checkpointer ubuntu 2911114 2911109 0 10:34 ? 00:00:00 postgres: background writer ubuntu 2911116 2911109 0 10:34 ? 00:00:00 postgres: walwriter ubuntu 2911117 2911109 0 10:34 ? 00:00:00 postgres: autovacuum launcher ubuntu 2911118 2911109 0 10:34 ? 00:00:00 postgres: logical replication launcher ubuntu 2911180 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] COPY waiting ubuntu 2911184 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] idle ubuntu 2911187 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] idle ubuntu 2911188 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] ALTER TABLE ubuntu 2911189 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] SELECT waiting ubuntu 2911190 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] idle ubuntu 2911191 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] TRUNCATE TABLE waiting ubuntu 2911192 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] idle ubuntu 2911193 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] SELECT waiting ubuntu 2911194 2911109 0 10:34 ? 00:00:00 postgres: ubuntu target [local] idle gdb -p 2911188 // ALTER TABLE ONLY public.parent2 ADD CONSTRAINT parent2_pkey PRIMARY KEY (id); (gdb) bt #0 0x00007f2f6910878a in __GI___clock_nanosleep (clock_id=clock_id@entry=0, flags=flags@entry=0, req=req@entry=0x7ffc0e560e60, rem=rem@entry=0x7ffc0e560e60) at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:78 #1 0x00007f2f6910d677 in __GI___nanosleep (req=req@entry=0x7ffc0e560e60, rem=rem@entry=0x7ffc0e560e60) at ../sysdeps/unix/sysv/linux/nanosleep.c:25 #2 0x00007f2f6910d5ae in __sleep (seconds=0) at ../sysdeps/posix/sleep.c:55 #3 0x0000561cd9386100 in DeadLockReport () at deadlock.c:1136 #4 0x0000561cd9389df8 in LockAcquireExtended (locktag=0x7ffc0e5610b0, lockmode=8, sessionLock=false, dontWait=false, reportMemoryError=true, locallockp=0x7ffc0e5610a8, logLockFailure=false) at lock.c:1232 #5 0x0000561cd93864bc in LockRelationOid (relid=16473, lockmode=8) at lmgr.c:115 #6 0x0000561cd8f21b20 in find_inheritance_children_extended (parentrelId=16463, omit_detached=true, lockmode=8, detached_exist=0x0, detached_xmin=0x0) at pg_inherits.c:213 #7 0x0000561cd8f217c1 in find_inheritance_children (parentrelId=16463, lockmode=8) at pg_inherits.c:60 #8 0x0000561cd904dd73 in ATPrepAddPrimaryKey (wqueue=0x7ffc0e561348, rel=0x7f2f5d7d6240, cmd=0x561cf1d2ee38, recurse=false, lockmode=8, context=0x7ffc0e561540) at tablecmds.c:9463 #9 0x0000561cd9043906 in ATPrepCmd (wqueue=0x7ffc0e561348, rel=0x7f2f5d7d6240, cmd=0x561cf1d2ee38, recurse=false, recursing=false, lockmode=8, context=0x7ffc0e561540) at tablecmds.c:5079 #10 0x0000561cd90432aa in ATController (parsetree=0x561cf1d062e0, rel=0x7f2f5d7d6240, cmds=0x561cf1d06290, recurse=false, lockmode=8, context=0x7ffc0e561540) at tablecmds.c:4871 #11 0x0000561cd9042f3b in AlterTable (stmt=0x561cf1d062e0, lockmode=8, context=0x7ffc0e561540) at tablecmds.c:4533 #12 0x0000561cd93bb7a8 in ProcessUtilitySlow (pstate=0x561cf1d2f9e0, pstmt=0x561cf1d06390, queryString=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:1321 #13 0x0000561cd93bb04e in standard_ProcessUtility (pstmt=0x561cf1d06390, queryString=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:1070 #14 0x0000561cd93b9f4c in ProcessUtility (pstmt=0x561cf1d06390, queryString=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:523 #15 0x0000561cd93b87a9 in PortalRunUtility (portal=0x561cf1d85d90, pstmt=0x561cf1d06390, isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:1185 #16 0x0000561cd93b8a52 in PortalRunMulti (portal=0x561cf1d85d90, isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d06750, altdest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:1349 #17 0x0000561cd93b7e73 in PortalRun (portal=0x561cf1d85d90, count=9223372036854775807, isTopLevel=true, dest=0x561cf1d06750, altdest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:820 #18 0x0000561cd93b037e in exec_simple_query (query_string=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n") at postgres.c:1274 #19 0x0000561cd93b5b46 in PostgresMain (dbname=0x561cf1d3f580 "target", username=0x561cf1d3f568 "ubuntu") at postgres.c:4771 #20 0x0000561cd93abab7 in BackendMain (startup_data=0x7ffc0e561e50, startup_data_len=24) at backend_startup.c:124 #21 0x0000561cd92abf09 in postmaster_child_launch (child_type=B_BACKEND, child_slot=2, startup_data=0x7ffc0e561e50, startup_data_len=24, client_sock=0x7ffc0e561eb0) at launch_backend.c:290 #22 0x0000561cd92b2946 in BackendStartup (client_sock=0x7ffc0e561eb0) at postmaster.c:3580 #23 0x0000561cd92afeb1 in ServerLoop () at postmaster.c:1702 #24 0x0000561cd92af7a2 in PostmasterMain (argc=3, argv=0x561cf1cffb00) at postmaster.c:1400 #25 0x0000561cd914cf06 in main (argc=3, argv=0x561cf1cffb00) at main.c:227 gdb -p 2911180 // COPY public.c22 (id, ref, b) FROM stdin; (gdb) bt #0 0x00007f2f69148dea in epoll_wait (epfd=5, events=0x561cf1d003a8, maxevents=1, timeout=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30 #1 0x0000561cd938106f in WaitEventSetWaitBlock (set=0x561cf1d00340, cur_timeout=-1, occurred_events=0x7ffc0e561000, nevents=1) at waiteventset.c:1190 #2 0x0000561cd9380f74 in WaitEventSetWait (set=0x561cf1d00340, timeout=-1, occurred_events=0x7ffc0e561000, nevents=1, wait_event_info=50331648) at waiteventset.c:1138 #3 0x0000561cd936f6a5 in WaitLatch (latch=0x7f2f665629e4, wakeEvents=33, timeout=0, wait_event_info=50331648) at latch.c:194 #4 0x0000561cd939fdbc in ProcSleep (locallock=0x561cf1d63560) at proc.c:1454 #5 0x0000561cd938b2c6 in WaitOnLock (locallock=0x561cf1d63560, owner=0x561cf1d44040) at lock.c:1968 #6 0x0000561cd9389dbd in LockAcquireExtended (locktag=0x7ffc0e5613f0, lockmode=1, sessionLock=false, dontWait=false, reportMemoryError=true, locallockp=0x7ffc0e5613e8, logLockFailure=false) at lock.c:1217 #7 0x0000561cd93864bc in LockRelationOid (relid=16463, lockmode=1) at lmgr.c:115 #8 0x0000561cd8daf922 in relation_open (relationId=16463, lockmode=1) at relation.c:55 #9 0x0000561cd95a29f3 in generate_partition_qual (rel=0x7f2f5d7d6640) at partcache.c:362 #10 0x0000561cd95a28b8 in RelationGetPartitionQual (rel=0x7f2f5d7d6640) at partcache.c:283 #11 0x0000561cd90bd59f in ExecPartitionCheck (resultRelInfo=0x561cf1d2ead8, slot=0x561cf1d33af8, estate=0x561cf1dde450, emitError=true) at execMain.c:1952 #12 0x0000561cd8fd1c9f in CopyFrom (cstate=0x561cf1de23a8) at copyfrom.c:1368 #13 0x0000561cd8fccd30 in DoCopy (pstate=0x561cf1d2f9e0, stmt=0x561cf1d06140, stmt_location=0, stmt_len=39, processed=0x7ffc0e561830) at copy.c:306 #14 0x0000561cd93ba623 in standard_ProcessUtility (pstmt=0x561cf1d06210, queryString=0x561cf1d05570 "COPY public.c22 (id, ref, b) FROM stdin;\n", readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at utility.c:738 #15 0x0000561cd93b9f4c in ProcessUtility (pstmt=0x561cf1d06210, queryString=0x561cf1d05570 "COPY public.c22 (id, ref, b) FROM stdin;\n", readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at utility.c:523 #16 0x0000561cd93b87a9 in PortalRunUtility (portal=0x561cf1d85d90, pstmt=0x561cf1d06210, isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:1185 #17 0x0000561cd93b8a52 in PortalRunMulti (portal=0x561cf1d85d90, isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d065d0, altdest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:1349 #18 0x0000561cd93b7e73 in PortalRun (portal=0x561cf1d85d90, count=9223372036854775807, isTopLevel=true, dest=0x561cf1d065d0, altdest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:820 #19 0x0000561cd93b037e in exec_simple_query (query_string=0x561cf1d05570 "COPY public.c22 (id, ref, b) FROM stdin;\n") at postgres.c:1274 #20 0x0000561cd93b5b46 in PostgresMain (dbname=0x561cf1d3f580 "target", username=0x561cf1d3f568 "ubuntu") at postgres.c:4771 #21 0x0000561cd93abab7 in BackendMain (startup_data=0x7ffc0e561e50, startup_data_len=24) at backend_startup.c:124 #22 0x0000561cd92abf09 in postmaster_child_launch (child_type=B_BACKEND, child_slot=1, startup_data=0x7ffc0e561e50, startup_data_len=24, client_sock=0x7ffc0e561eb0) at launch_backend.c:290 #23 0x0000561cd92b2946 in BackendStartup (client_sock=0x7ffc0e561eb0) at postmaster.c:3580 #24 0x0000561cd92afeb1 in ServerLoop () at postmaster.c:1702 #25 0x0000561cd92af7a2 in PostmasterMain (argc=3, argv=0x561cf1cffb00) at postmaster.c:1400 #26 0x0000561cd914cf06 in main (argc=3, argv=0x561cf1cffb00) at main.c:227 The alter table session do check its children not-null constraint using lockmod=8, and the copy session do get partition_qual to lock parent using lockmode =1. I wonder if we have to use the same lockmode for checking children's not-null constraint. -- Thanks, Tender Wang