Hi, I am using Pacemaker 1.1 with the following component revisions: - glue (2012 Jul 16) 2719:18489f275f75 - libqb (2012 Jul 19) 11b20e19beff7f1b6003be0b4c73da8ecf936442 - corosync (2012 Jul 19) b9eb19e623d2b69c86cf78f1fa50a004c804ac20 - pacemaker (2012 Jul 29) 33119da31c235710195c783e5c9a32c6e95b3efc
While the monitor operation of a STONITH resource is running, the monitor operations of other resources are not performed. Steps to reproduce: 1. service corosync start ; service pacemaker start 2. cibadmin -U -x test.xml (the STONITH resource and the Dummy resource are started on the same node) 3. After libvirt (the STONITH resource) had started, I added a sleep to the status action of the libvirt plugin. [root@dev1 external]# diff -u libvirt.ORG libvirt --- libvirt.ORG 2012-07-17 13:10:01.000000000 +0900 +++ libvirt 2012-07-30 13:36:19.661431208 +0900 @@ -221,6 +221,7 @@ ;; status) + sleep 3600 libvirt_check_config libvirt_status exit $? [root@dev1 ~]# ps -ef|egrep "UID|corosync|pacemaker|stonith|fence|sleep" UID PID PPID C STIME TTY TIME CMD root 18567 1 0 18:47 ? 00:00:02 corosync root 18585 1 0 18:47 ? 00:00:00 pacemakerd 496 18587 18585 0 18:47 ? 00:00:00 /usr/libexec/pacemaker/cib root 18588 18585 0 18:47 ? 00:00:00 /usr/libexec/pacemaker/stonithd root 18589 18585 76 18:47 ? 00:05:27 /usr/libexec/pacemaker/lrmd 496 18590 18585 0 18:47 ? 00:00:00 /usr/libexec/pacemaker/attrd 496 18591 18585 0 18:47 ? 00:00:00 /usr/libexec/pacemaker/pengine 496 18592 18585 0 18:47 ? 00:00:00 /usr/libexec/pacemaker/crmd root 18767 18588 0 18:48 ? 00:00:00 /usr/bin/perl /usr/sbin/fence_legacy root 18768 18767 0 18:48 ? 00:00:00 stonith -t external/libvirt -E -S root 18778 18768 0 18:48 ? 00:00:00 /bin/sh /usr/lib64/stonith/plugins/external/libvirt status root 18792 18778 0 18:48 ? 00:00:00 sleep 3600 4. After that, the monitor of the Dummy resource is no longer performed. The following shows the behavior of lrmd at that time. 
# gdb /usr/libexec/pacemaker/lrmd `pgrep lrmd` (gdb) bt #0 0x0000003f808e83e2 in recv () from /lib64/libc.so.6 #1 0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=0x1118ee8, msg=0x111f390, len=20480, timeout=500) at ipc_us.c:299 #2 0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=0x111f390, msg_len= 20480, ms_timeout=500) at ipcc.c:249 #3 0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580, message=0x111d860, reply= 0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517 #4 0x00007f0de3c97e29 in stonith_send_command (stonith=0x111a6c0, op= 0x7f0de3c998dd "st_execute", data=0x1119170, output_data=0x0, call_options=4096, timeout=61000) at st_client.c:1676 #5 0x00007f0de3c94bd1 in stonith_api_call (stonith=0x111a6c0, call_options=4096, id=0x1129860 "f-2", action=0x7f0de3c998f7 "monitor", victim=0x0, timeout=61000, output=0x0) at st_client.c:951 #6 0x00007f0de3c94d31 in stonith_api_monitor (stonith=0x111a6c0, call_options=4096, id=0x1129860 "f-2", timeout=61000) at st_client.c:985 #7 0x00000000004044e2 in lrmd_rsc_execute_stonith (rsc=0x111f130, cmd=0x1129660) at lrmd.c:522 #8 0x0000000000404cd6 in lrmd_rsc_execute (rsc=0x111f130) at lrmd.c:667 #9 0x0000000000404d2d in lrmd_rsc_dispatch (user_data=0x111f130) at lrmd.c:678 #10 0x00007f0de42dcd00 in crm_trigger_dispatch (source=0x111f300, callback= 0x404d06 <lrmd_rsc_dispatch>, userdata=0x111f300) at mainloop.c:105 #11 0x0000003642638f0e in g_main_context_dispatch () from /lib64/libglib-2.0.so.0 #12 0x000000364263c938 in ?? 
() from /lib64/libglib-2.0.so.0 #13 0x000000364263cd55 in g_main_loop_run () from /lib64/libglib-2.0.so.0 #14 0x0000000000402d3f in main (argc=1, argv=0x7fffc8ec5188) at main.c:285 (gdb) fin Run till exit from #0 0x0000003f808e83e2 in recv () from /lib64/libc.so.6 0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=0x1118ee8, msg=0x111f390, len=20480, timeout=500) at ipc_us.c:299 299 result = recv(one_way->u.us.sock, &data[processed], to_recv, (gdb) fin Run till exit from #0 0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way= 0x1118ee8, msg=0x111f390, len=20480, timeout=500) at ipc_us.c:299 0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=0x111f390, msg_len=20480, ms_timeout=500) at ipcc.c:249 249 res = c->funcs.recv(&c->response, msg_ptr, msg_len, ms_timeout); Value returned is $1 = -11 (gdb) fin Run till exit from #0 0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr= 0x111f390, msg_len=20480, ms_timeout=500) at ipcc.c:249 0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580, message=0x111d860, reply= 0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517 517 rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 500); Value returned is $2 = -11 (gdb) n 518 if(rc > 0 || crm_ipc_connected(client) == FALSE) { (gdb) p rc $3 = -11 (gdb) n 522 } while(time(NULL) < timeout); (gdb) n 517 rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 500); (gdb) n 518 if(rc > 0 || crm_ipc_connected(client) == FALSE) { (gdb) p rc $4 = -11 (gdb) n 522 } while(time(NULL) < timeout); (gdb) n 517 rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 500); (gdb) n 518 if(rc > 0 || crm_ipc_connected(client) == FALSE) { (gdb) p rc $5 = -11 (gdb) n 522 } while(time(NULL) < timeout); (gdb) n 517 rc = qb_ipcc_recv(client->ipc, client->buffer, client->buf_size, 500); (gdb) n 518 if(rc > 0 || crm_ipc_connected(client) == FALSE) { (gdb) p rc $6 = -11 (gdb) It seems that lrmd keeps retrying to receive the reply from stonithd in a loop inside crm_ipc_send() (qb_ipcc_recv() returns -11 each time), without returning to the g_main_loop. 
Therefore, the monitor of the Dummy resource is not performed. [root@dev1 ~]# top -bn1 top - 18:53:31 up 5 days, 8:56, 4 users, load average: 0.98, 0.63, 0.44 Tasks: 198 total, 2 running, 196 sleeping, 0 stopped, 0 zombie Cpu(s): 0.9%us, 1.2%sy, 0.0%ni, 97.9%id, 0.0%wa, 0.0%hi, 0.0%si, 0.0%st Mem: 5089052k total, 2417444k used, 2671608k free, 266660k buffers Swap: 1048568k total, 0k used, 1048568k free, 1724616k cached PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 18589 root 20 0 83932 3448 2548 R 98.2 0.1 4:15.89 lrmd 1 root 20 0 19348 1520 1212 S 0.0 0.0 0:00.78 init 2 root 20 0 0 0 0 S 0.0 0.0 0:00.00 kthreadd 3 root RT 0 0 0 0 S 0.0 0.0 0:06.93 migration/0 4 root 20 0 0 0 0 S 0.0 0.0 15:23.59 ksoftirqd/0 5 root RT 0 0 0 0 S 0.0 0.0 0:00.10 migration/0 Best Regards, Kazunori INOUE
<cib admin_epoch="0" epoch="100" num_updates="0"> <configuration> <crm_config> <cluster_property_set id="cib-bootstrap-options"> <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/> <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/> <nvpair id="cib-bootstrap-options-startup-fencing" name="startup-fencing" value="false"/> <nvpair id="cib-bootstrap-options-stonith-timeout" name="stonith-timeout" value="60s"/> <nvpair id="cib-bootstrap-options-cluster-delay" name="cluster-delay" value="10s"/> </cluster_property_set> </crm_config> <resources> <primitive class="stonith" id="f-2" type="external/libvirt"> <instance_attributes id="f-2-instance_attributes"> <nvpair id="f-2-instance_attributes-hostlist" name="hostlist" value="dev2"/> <nvpair id="f-2-instance_attributes-hypervisor_uri" name="hypervisor_uri" value="qemu+ssh://n8/system"/> </instance_attributes> <operations> <op id="f-2-start-0" interval="0s" name="start" timeout="60s"/> <op id="f-2-monitor-10" interval="10s" name="monitor" timeout="61s"/> <op id="f-2-stop-0" interval="0s" name="stop" timeout="60s"/> </operations> </primitive> <primitive class="ocf" id="prmDummy" provider="pacemaker" type="Dummy"> <operations> <op id="prmDummy-start-0" interval="0s" name="start" on-fail="restart" timeout="60s"/> <op id="prmDummy-monitor-10" interval="10s" name="monitor" on-fail="fence" timeout="60s"/> <op id="prmDummy-stop-0" interval="0s" name="stop" on-fail="stop" timeout="60s"/> </operations> </primitive> </resources> <constraints> <rsc_location id="rl-f-2" rsc="f-2"> <rule id="rl-f-2-rule" score="100"><expression attribute="#uname" id="rl-f-2-exp" operation="eq" value="dev1"/></rule> <rule id="rl-f-2-rule-0" score="-INFINITY"><expression attribute="#uname" id="rl-f-2-exp-0" operation="eq" value="dev2"/></rule> </rsc_location> <rsc_location id="rl-prmDummy" rsc="prmDummy"> <rule id="rl-prmDummy-rule" score="100"><expression attribute="#uname" 
id="rl-prmDummy-exp" operation="eq" value="dev1"/></rule> <rule id="rl-prmDummy-rule-0" score="-INFINITY"><expression attribute="#uname" id="rl-prmDummy-exp-0" operation="eq" value="dev2"/></rule> </rsc_location> </constraints> <rsc_defaults> <meta_attributes id="rsc-options"> <nvpair id="rsc-options-resource-stickiness" name="resource-stickiness" value="INFINITY"/> <nvpair id="rsc-options-migration-threshold" name="migration-threshold" value="1"/> </meta_attributes> </rsc_defaults> </configuration> </cib>
_______________________________________________ Pacemaker mailing list: Pacemaker@oss.clusterlabs.org http://oss.clusterlabs.org/mailman/listinfo/pacemaker Project Home: http://www.clusterlabs.org Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf Bugs: http://bugs.clusterlabs.org