Tender Wang <tndrw...@gmail.com> 于2025年4月15日周二 11:20写道:
> > > Tom Lane <t...@sss.pgh.pa.us> 于2025年4月15日周二 05:39写道: > >> Alvaro Herrera <alvhe...@alvh.no-ip.org> writes: >> > On 2025-Apr-14, Tom Lane wrote: >> >> I would not have expected that adding pg_constraint rows implies >> >> stronger locks than what ALTER ADD PRIMARY KEY was using before, >> >> and I suspect that doing so will cause more problems than just >> >> breaking parallel restore. >> >> > I wasn't aware of this side effect. I'll investigate this in more >> > depth. I suspect it might be a bug in the way we run through ALTER >> > TABLE for the primary key. >> >> After further thought it occurs to me that it might not be a case >> of "we get stronger locks", but a case of "we accidentally get a >> weaker lock earlier and then try to upgrade it", thus creating a >> possibility of deadlock where before we'd just have blocked till >> the other statement cleared. Still worthy of being fixed if that's >> true, though. >> > > I added sleep(1) in the DeadLockReport() before error report to display > the status when a deadlock happened. > bool continue_sleep = true; > do > { > sleep(1); > } while (continue_sleep); > ereport(ERROR, > (errcode(ERRCODE_T_R_DEADLOCK_DETECTED), > errmsg("deadlock detected"), > errdetail_internal("%s", clientbuf.data), > errdetail_log("%s", logbuf.data), > errhint("See server log for query details."))); > > > > ubuntu@VM-0-17-ubuntu:/workspace/postgres$ ps -ef|grep postgres > ubuntu 2911109 1 0 10:34 ? 00:00:00 > /workspace/pgsql/bin/postgres -D ../data > ubuntu 2911110 2911109 0 10:34 ? 00:00:00 postgres: io worker 0 > ubuntu 2911111 2911109 0 10:34 ? 00:00:00 postgres: io worker 1 > ubuntu 2911112 2911109 0 10:34 ? 00:00:00 postgres: io worker 2 > ubuntu 2911113 2911109 0 10:34 ? 00:00:00 postgres: checkpointer > ubuntu 2911114 2911109 0 10:34 ? 00:00:00 postgres: background > writer > ubuntu 2911116 2911109 0 10:34 ? 00:00:00 postgres: walwriter > ubuntu 2911117 2911109 0 10:34 ? 00:00:00 postgres: autovacuum > launcher > ubuntu 2911118 2911109 0 10:34 ? 00:00:00 postgres: logical > replication launcher > ubuntu 2911180 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] COPY waiting > ubuntu 2911184 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] idle > ubuntu 2911187 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] idle > ubuntu 2911188 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] ALTER TABLE > ubuntu 2911189 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] SELECT waiting > ubuntu 2911190 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] idle > ubuntu 2911191 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] TRUNCATE TABLE waiting > ubuntu 2911192 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] idle > ubuntu 2911193 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] SELECT waiting > ubuntu 2911194 2911109 0 10:34 ? 00:00:00 postgres: ubuntu > target [local] idle > > gdb -p 2911188 // ALTER TABLE ONLY public.parent2 ADD CONSTRAINT > parent2_pkey PRIMARY KEY (id); > (gdb) bt > #0 0x00007f2f6910878a in __GI___clock_nanosleep (clock_id=clock_id@entry=0, > flags=flags@entry=0, req=req@entry=0x7ffc0e560e60, > rem=rem@entry=0x7ffc0e560e60) > at ../sysdeps/unix/sysv/linux/clock_nanosleep.c:78 > #1 0x00007f2f6910d677 in __GI___nanosleep (req=req@entry=0x7ffc0e560e60, > rem=rem@entry=0x7ffc0e560e60) at ../sysdeps/unix/sysv/linux/nanosleep.c:25 > #2 0x00007f2f6910d5ae in __sleep (seconds=0) at > ../sysdeps/posix/sleep.c:55 > #3 0x0000561cd9386100 in DeadLockReport () at deadlock.c:1136 > #4 0x0000561cd9389df8 in LockAcquireExtended (locktag=0x7ffc0e5610b0, > lockmode=8, sessionLock=false, dontWait=false, reportMemoryError=true, > locallockp=0x7ffc0e5610a8, logLockFailure=false) at lock.c:1232 > #5 0x0000561cd93864bc in LockRelationOid (relid=16473, lockmode=8) at > lmgr.c:115 > #6 0x0000561cd8f21b20 in find_inheritance_children_extended > (parentrelId=16463, omit_detached=true, lockmode=8, detached_exist=0x0, > detached_xmin=0x0) at pg_inherits.c:213 > #7 0x0000561cd8f217c1 in find_inheritance_children (parentrelId=16463, > lockmode=8) at pg_inherits.c:60 > #8 0x0000561cd904dd73 in ATPrepAddPrimaryKey (wqueue=0x7ffc0e561348, > rel=0x7f2f5d7d6240, cmd=0x561cf1d2ee38, recurse=false, lockmode=8, > context=0x7ffc0e561540) at tablecmds.c:9463 > #9 0x0000561cd9043906 in ATPrepCmd (wqueue=0x7ffc0e561348, > rel=0x7f2f5d7d6240, cmd=0x561cf1d2ee38, recurse=false, recursing=false, > lockmode=8, context=0x7ffc0e561540) at tablecmds.c:5079 > #10 0x0000561cd90432aa in ATController (parsetree=0x561cf1d062e0, > rel=0x7f2f5d7d6240, cmds=0x561cf1d06290, recurse=false, lockmode=8, > context=0x7ffc0e561540) at tablecmds.c:4871 > #11 0x0000561cd9042f3b in AlterTable (stmt=0x561cf1d062e0, lockmode=8, > context=0x7ffc0e561540) at tablecmds.c:4533 > #12 0x0000561cd93bb7a8 in ProcessUtilitySlow (pstate=0x561cf1d2f9e0, > pstmt=0x561cf1d06390, queryString=0x561cf1d05570 "ALTER TABLE ONLY > public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", > context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, > dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:1321 > #13 0x0000561cd93bb04e in standard_ProcessUtility (pstmt=0x561cf1d06390, > queryString=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD > CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", readOnlyTree=false, > context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, > dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:1070 > #14 0x0000561cd93b9f4c in ProcessUtility (pstmt=0x561cf1d06390, > queryString=0x561cf1d05570 "ALTER TABLE ONLY public.parent2\n ADD > CONSTRAINT parent2_pkey PRIMARY KEY (id);\n\n\n", readOnlyTree=false, > context=PROCESS_UTILITY_TOPLEVEL, params=0x0, queryEnv=0x0, > dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at utility.c:523 > #15 0x0000561cd93b87a9 in PortalRunUtility (portal=0x561cf1d85d90, > pstmt=0x561cf1d06390, isTopLevel=true, setHoldSnapshot=false, > dest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:1185 > #16 0x0000561cd93b8a52 in PortalRunMulti (portal=0x561cf1d85d90, > isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d06750, > altdest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:1349 > #17 0x0000561cd93b7e73 in PortalRun (portal=0x561cf1d85d90, > count=9223372036854775807, isTopLevel=true, dest=0x561cf1d06750, > altdest=0x561cf1d06750, qc=0x7ffc0e561ba0) at pquery.c:820 > #18 0x0000561cd93b037e in exec_simple_query (query_string=0x561cf1d05570 > "ALTER TABLE ONLY public.parent2\n ADD CONSTRAINT parent2_pkey PRIMARY > KEY (id);\n\n\n") at postgres.c:1274 > #19 0x0000561cd93b5b46 in PostgresMain (dbname=0x561cf1d3f580 "target", > username=0x561cf1d3f568 "ubuntu") at postgres.c:4771 > #20 0x0000561cd93abab7 in BackendMain (startup_data=0x7ffc0e561e50, > startup_data_len=24) at backend_startup.c:124 > #21 0x0000561cd92abf09 in postmaster_child_launch (child_type=B_BACKEND, > child_slot=2, startup_data=0x7ffc0e561e50, startup_data_len=24, > client_sock=0x7ffc0e561eb0) at launch_backend.c:290 > #22 0x0000561cd92b2946 in BackendStartup (client_sock=0x7ffc0e561eb0) at > postmaster.c:3580 > #23 0x0000561cd92afeb1 in ServerLoop () at postmaster.c:1702 > #24 0x0000561cd92af7a2 in PostmasterMain (argc=3, argv=0x561cf1cffb00) at > postmaster.c:1400 > #25 0x0000561cd914cf06 in main (argc=3, argv=0x561cf1cffb00) at main.c:227 > > gdb -p 2911180 // COPY public.c22 (id, ref, b) FROM stdin; > (gdb) bt > #0 0x00007f2f69148dea in epoll_wait (epfd=5, events=0x561cf1d003a8, > maxevents=1, timeout=-1) at ../sysdeps/unix/sysv/linux/epoll_wait.c:30 > #1 0x0000561cd938106f in WaitEventSetWaitBlock (set=0x561cf1d00340, > cur_timeout=-1, occurred_events=0x7ffc0e561000, nevents=1) at > waiteventset.c:1190 > #2 0x0000561cd9380f74 in WaitEventSetWait (set=0x561cf1d00340, > timeout=-1, occurred_events=0x7ffc0e561000, nevents=1, > wait_event_info=50331648) at waiteventset.c:1138 > #3 0x0000561cd936f6a5 in WaitLatch (latch=0x7f2f665629e4, wakeEvents=33, > timeout=0, wait_event_info=50331648) at latch.c:194 > #4 0x0000561cd939fdbc in ProcSleep (locallock=0x561cf1d63560) at > proc.c:1454 > #5 0x0000561cd938b2c6 in WaitOnLock (locallock=0x561cf1d63560, > owner=0x561cf1d44040) at lock.c:1968 > #6 0x0000561cd9389dbd in LockAcquireExtended (locktag=0x7ffc0e5613f0, > lockmode=1, sessionLock=false, dontWait=false, reportMemoryError=true, > locallockp=0x7ffc0e5613e8, logLockFailure=false) at lock.c:1217 > #7 0x0000561cd93864bc in LockRelationOid (relid=16463, lockmode=1) at > lmgr.c:115 > #8 0x0000561cd8daf922 in relation_open (relationId=16463, lockmode=1) at > relation.c:55 > #9 0x0000561cd95a29f3 in generate_partition_qual (rel=0x7f2f5d7d6640) at > partcache.c:362 > #10 0x0000561cd95a28b8 in RelationGetPartitionQual (rel=0x7f2f5d7d6640) at > partcache.c:283 > #11 0x0000561cd90bd59f in ExecPartitionCheck > (resultRelInfo=0x561cf1d2ead8, slot=0x561cf1d33af8, estate=0x561cf1dde450, > emitError=true) at execMain.c:1952 > #12 0x0000561cd8fd1c9f in CopyFrom (cstate=0x561cf1de23a8) at > copyfrom.c:1368 > #13 0x0000561cd8fccd30 in DoCopy (pstate=0x561cf1d2f9e0, > stmt=0x561cf1d06140, stmt_location=0, stmt_len=39, > processed=0x7ffc0e561830) at copy.c:306 > #14 0x0000561cd93ba623 in standard_ProcessUtility (pstmt=0x561cf1d06210, > queryString=0x561cf1d05570 "COPY public.c22 (id, ref, b) FROM stdin;\n", > readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, > queryEnv=0x0, dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at utility.c:738 > #15 0x0000561cd93b9f4c in ProcessUtility (pstmt=0x561cf1d06210, > queryString=0x561cf1d05570 "COPY public.c22 (id, ref, b) FROM stdin;\n", > readOnlyTree=false, context=PROCESS_UTILITY_TOPLEVEL, params=0x0, > queryEnv=0x0, > dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at utility.c:523 > #16 0x0000561cd93b87a9 in PortalRunUtility (portal=0x561cf1d85d90, > pstmt=0x561cf1d06210, isTopLevel=true, setHoldSnapshot=false, > dest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:1185 > #17 0x0000561cd93b8a52 in PortalRunMulti (portal=0x561cf1d85d90, > isTopLevel=true, setHoldSnapshot=false, dest=0x561cf1d065d0, > altdest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:1349 > #18 0x0000561cd93b7e73 in PortalRun (portal=0x561cf1d85d90, > count=9223372036854775807, isTopLevel=true, dest=0x561cf1d065d0, > altdest=0x561cf1d065d0, qc=0x7ffc0e561ba0) at pquery.c:820 > #19 0x0000561cd93b037e in exec_simple_query (query_string=0x561cf1d05570 > "COPY public.c22 (id, ref, b) FROM stdin;\n") at postgres.c:1274 > #20 0x0000561cd93b5b46 in PostgresMain (dbname=0x561cf1d3f580 "target", > username=0x561cf1d3f568 "ubuntu") at postgres.c:4771 > #21 0x0000561cd93abab7 in BackendMain (startup_data=0x7ffc0e561e50, > startup_data_len=24) at backend_startup.c:124 > #22 0x0000561cd92abf09 in postmaster_child_launch (child_type=B_BACKEND, > child_slot=1, startup_data=0x7ffc0e561e50, startup_data_len=24, > client_sock=0x7ffc0e561eb0) at launch_backend.c:290 > #23 0x0000561cd92b2946 in BackendStartup (client_sock=0x7ffc0e561eb0) at > postmaster.c:3580 > #24 0x0000561cd92afeb1 in ServerLoop () at postmaster.c:1702 > #25 0x0000561cd92af7a2 in PostmasterMain (argc=3, argv=0x561cf1cffb00) at > postmaster.c:1400 > #26 0x0000561cd914cf06 in main (argc=3, argv=0x561cf1cffb00) at main.c:227 > > The alter table session do check its children not-null constraint using > lockmod=8, and the copy session do get partition_qual to lock parent using > lockmode =1. > I wonder if we have to use the same lockmode for checking children's > not-null constraint. > I thought further about the lockmode calling find_inheritance_children in ATPrepAddPrimaryKey. What we do here? We first get oids of children, then check the if the column of children has marked not-null, if not, report an error. No operation here on children. I check other places that call find_inheritance_children, if we have operation on children, we usually pass Lockmode to find_inheritance_children, otherwise pass NoLock. I try NoLock, then restore-deadlock.sh will have no error. -- Thanks, Tender Wang