1. service corosync start ; service pacemaker start
2. cibadmin -U -x test.xml
(STONITH resource + Dummy resource are started on the same node)
3. After the libvirt STONITH resource had started, a "sleep" was added to
the status action of the libvirt plugin (external/libvirt):
[root@dev1 external]# diff -u libvirt.ORG libvirt
--- libvirt.ORG 2012-07-17 13:10:01.000000000 +0900
+++ libvirt 2012-07-30 13:36:19.661431208 +0900
@@ -221,6 +221,7 @@
;;
status)
+ sleep 3600
libvirt_check_config
libvirt_status
exit $?
[root@dev1 ~]# ps -ef|egrep "UID|corosync|pacemaker|stonith|fence|sleep"
UID PID PPID C STIME TTY TIME CMD
root 18567 1 0 18:47 ? 00:00:02 corosync
root 18585 1 0 18:47 ? 00:00:00 pacemakerd
496 18587 18585 0 18:47 ? 00:00:00
/usr/libexec/pacemaker/cib
root 18588 18585 0 18:47 ? 00:00:00
/usr/libexec/pacemaker/stonithd
root 18589 18585 76 18:47 ? 00:05:27
/usr/libexec/pacemaker/lrmd
496 18590 18585 0 18:47 ? 00:00:00
/usr/libexec/pacemaker/attrd
496 18591 18585 0 18:47 ? 00:00:00
/usr/libexec/pacemaker/pengine
496 18592 18585 0 18:47 ? 00:00:00
/usr/libexec/pacemaker/crmd
root 18767 18588 0 18:48 ? 00:00:00 /usr/bin/perl
/usr/sbin/fence_legacy
root 18768 18767 0 18:48 ? 00:00:00 stonith -t
external/libvirt
-E -S
root 18778 18768 0 18:48 ? 00:00:00 /bin/sh
/usr/lib64/stonith/plugins/external/libvirt status
root 18792 18778 0 18:48 ? 00:00:00 sleep 3600
4. After that, the monitor operation of the Dummy resource is no longer
performed. The following shows the behavior of lrmd at that time.
# gdb /usr/libexec/pacemaker/lrmd `pgrep lrmd`
(gdb) bt
#0 0x0000003f808e83e2 in recv () from /lib64/libc.so.6
#1 0x00007f0de3820062 in qb_ipc_us_recv_at_most
(one_way=0x1118ee8,
msg=0x111f390,
len=20480, timeout=500) at ipc_us.c:299
#2 0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0,
msg_ptr=0x111f390, msg_len=
20480, ms_timeout=500) at ipcc.c:249
#3 0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580,
message=0x111d860, reply=
0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517
#4 0x00007f0de3c97e29 in stonith_send_command (stonith=0x111a6c0,
op=
0x7f0de3c998dd "st_execute", data=0x1119170, output_data=0x0,
call_options=4096,
timeout=61000) at st_client.c:1676
#5 0x00007f0de3c94bd1 in stonith_api_call (stonith=0x111a6c0,
call_options=4096,
id=0x1129860 "f-2", action=0x7f0de3c998f7 "monitor",
victim=0x0,
timeout=61000,
output=0x0) at st_client.c:951
#6 0x00007f0de3c94d31 in stonith_api_monitor (stonith=0x111a6c0,
call_options=4096,
id=0x1129860 "f-2", timeout=61000) at st_client.c:985
#7 0x00000000004044e2 in lrmd_rsc_execute_stonith (rsc=0x111f130,
cmd=0x1129660) at
lrmd.c:522
#8 0x0000000000404cd6 in lrmd_rsc_execute (rsc=0x111f130) at
lrmd.c:667
#9 0x0000000000404d2d in lrmd_rsc_dispatch (user_data=0x111f130)
at
lrmd.c:678
#10 0x00007f0de42dcd00 in crm_trigger_dispatch (source=0x111f300,
callback=
0x404d06 <lrmd_rsc_dispatch>, userdata=0x111f300) at
mainloop.c:105
#11 0x0000003642638f0e in g_main_context_dispatch () from
/lib64/libglib-2.0.so.0
#12 0x000000364263c938 in ?? () from /lib64/libglib-2.0.so.0
#13 0x000000364263cd55 in g_main_loop_run () from
/lib64/libglib-2.0.so.0
#14 0x0000000000402d3f in main (argc=1, argv=0x7fffc8ec5188) at
main.c:285
(gdb) fin
Run till exit from #0 0x0000003f808e83e2 in recv () from
/lib64/libc.so.6
0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=0x1118ee8,
msg=0x111f390,
len=20480, timeout=500) at ipc_us.c:299
299 result = recv(one_way->u.us.sock, &data[processed],
to_recv,
(gdb) fin
Run till exit from #0 0x00007f0de3820062 in qb_ipc_us_recv_at_most
(one_way=
0x1118ee8, msg=0x111f390, len=20480, timeout=500) at
ipc_us.c:299
0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=0x111f390,
msg_len=20480,
ms_timeout=500) at ipcc.c:249
249 res = c->funcs.recv(&c->response, msg_ptr, msg_len,
ms_timeout);
Value returned is $1 = -11
(gdb) fin
Run till exit from #0 0x00007f0de381a28e in qb_ipcc_recv
(c=0x1118ba0, msg_ptr=
0x111f390, msg_len=20480, ms_timeout=500) at ipcc.c:249
0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580,
message=0x111d860, reply=
0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517
517 rc = qb_ipcc_recv(client->ipc, client->buffer,
client->buf_size, 500);
Value returned is $2 = -11
(gdb) n
518 if(rc > 0 || crm_ipc_connected(client) ==
FALSE)
{
(gdb) p rc
$3 = -11
(gdb) n
522 } while(time(NULL) < timeout);
(gdb) n
517 rc = qb_ipcc_recv(client->ipc, client->buffer,
client->buf_size, 500);
(gdb) n
518 if(rc > 0 || crm_ipc_connected(client) ==
FALSE)
{
(gdb) p rc
$4 = -11
(gdb) n
522 } while(time(NULL) < timeout);
(gdb) n
517 rc = qb_ipcc_recv(client->ipc, client->buffer,
client->buf_size, 500);
(gdb) n
518 if(rc > 0 || crm_ipc_connected(client) ==
FALSE)
{
(gdb) p rc
$5 = -11
(gdb) n
522 } while(time(NULL) < timeout);
(gdb) n
517 rc = qb_ipcc_recv(client->ipc, client->buffer,
client->buf_size, 500);
(gdb) n
518 if(rc > 0 || crm_ipc_connected(client) ==
FALSE)
{
(gdb) p rc
$6 = -11
(gdb)
It seems that lrmd keeps polling for the reply from stonithd inside
crm_ipc_send() (qb_ipcc_recv() repeatedly returns -11) without returning
to the g_main_loop. Therefore, the monitor of the Dummy resource is not
performed.
[root@dev1 ~]# top -bn1
top - 18:53:31 up 5 days, 8:56, 4 users, load average: 0.98,
0.63,
0.44
Tasks: 198 total, 2 running, 196 sleeping, 0 stopped, 0
zombie
Cpu(s): 0.9%us, 1.2%sy, 0.0%ni, 97.9%id, 0.0%wa, 0.0%hi,
0.0%si, 0.0%st
Mem: 5089052k total, 2417444k used, 2671608k free, 266660k
buffers
Swap: 1048568k total, 0k used, 1048568k free, 1724616k
cached
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+
COMMAND
18589 root 20 0 83932 3448 2548 R 98.2 0.1 4:15.89 lrmd
1 root 20 0 19348 1520 1212 S 0.0 0.0 0:00.78 init
2 root 20 0 0 0 0 S 0.0 0.0 0:00.00
kthreadd
3 root RT 0 0 0 0 S 0.0 0.0 0:06.93
migration/0
4 root 20 0 0 0 0 S 0.0 0.0 15:23.59
ksoftirqd/0
5 root RT 0 0 0 0 S 0.0 0.0 0:00.10
migration/0
Best Regards,
Kazunori INOUE
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker
Project Home: http://www.clusterlabs.org
Getting started:
http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org