Hi,

I am using Pacemaker-1.1.
- glue       (2012 Jul 16) 2719:18489f275f75
- libqb      (2012 Jul 19) 11b20e19beff7f1b6003be0b4c73da8ecf936442
- corosync   (2012 Jul 19) b9eb19e623d2b69c86cf78f1fa50a004c804ac20
- pacemaker  (2012 Jul 29) 33119da31c235710195c783e5c9a32c6e95b3efc

Monitor operations of other resources are not performed while the
monitor operation of a STONITH resource is running.

1. service corosync start ; service pacemaker start
2. cibadmin -U -x test.xml
   (STONITH resource + Dummy resource are started on the same node)
3. After libvirt (the STONITH resource) had started, I added a sleep to
   the status action of the libvirt plugin.

   [root@dev1 external]# diff -u libvirt.ORG libvirt
   --- libvirt.ORG 2012-07-17 13:10:01.000000000 +0900
   +++ libvirt     2012-07-30 13:36:19.661431208 +0900
   @@ -221,6 +221,7 @@
        ;;

        status)
   +    sleep 3600
        libvirt_check_config
        libvirt_status
        exit $?

   [root@dev1 ~]# ps -ef|egrep "UID|corosync|pacemaker|stonith|fence|sleep"
   UID    PID  PPID  C STIME TTY     TIME CMD
   root 18567     1  0 18:47 ?   00:00:02 corosync
   root 18585     1  0 18:47 ?   00:00:00 pacemakerd
   496  18587 18585  0 18:47 ?   00:00:00 /usr/libexec/pacemaker/cib
   root 18588 18585  0 18:47 ?   00:00:00 /usr/libexec/pacemaker/stonithd
   root 18589 18585 76 18:47 ?   00:05:27 /usr/libexec/pacemaker/lrmd
   496  18590 18585  0 18:47 ?   00:00:00 /usr/libexec/pacemaker/attrd
   496  18591 18585  0 18:47 ?   00:00:00 /usr/libexec/pacemaker/pengine
   496  18592 18585  0 18:47 ?   00:00:00 /usr/libexec/pacemaker/crmd
   root 18767 18588  0 18:48 ?   00:00:00 /usr/bin/perl /usr/sbin/fence_legacy
   root 18768 18767  0 18:48 ?   00:00:00 stonith -t external/libvirt -E -S
   root 18778 18768  0 18:48 ?   00:00:00 /bin/sh /usr/lib64/stonith/plugins/external/libvirt status
   root 18792 18778  0 18:48 ?   00:00:00 sleep 3600

4. From then on, the monitor of the Dummy resource is no longer performed.

The following shows the behavior of lrmd at that time.

# gdb /usr/libexec/pacemaker/lrmd `pgrep lrmd`
(gdb) bt
#0  0x0000003f808e83e2 in recv () from /lib64/libc.so.6
#1  0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=0x1118ee8, 
msg=0x111f390,
    len=20480, timeout=500) at ipc_us.c:299
#2  0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=0x111f390, msg_len=
    20480, ms_timeout=500) at ipcc.c:249
#3  0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580, message=0x111d860, 
reply=
    0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517
#4  0x00007f0de3c97e29 in stonith_send_command (stonith=0x111a6c0, op=
    0x7f0de3c998dd "st_execute", data=0x1119170, output_data=0x0, 
call_options=4096,
    timeout=61000) at st_client.c:1676
#5  0x00007f0de3c94bd1 in stonith_api_call (stonith=0x111a6c0, 
call_options=4096,
    id=0x1129860 "f-2", action=0x7f0de3c998f7 "monitor", victim=0x0, 
timeout=61000,
    output=0x0) at st_client.c:951
#6  0x00007f0de3c94d31 in stonith_api_monitor (stonith=0x111a6c0, 
call_options=4096,
    id=0x1129860 "f-2", timeout=61000) at st_client.c:985
#7  0x00000000004044e2 in lrmd_rsc_execute_stonith (rsc=0x111f130, 
cmd=0x1129660) at
    lrmd.c:522
#8  0x0000000000404cd6 in lrmd_rsc_execute (rsc=0x111f130) at lrmd.c:667
#9  0x0000000000404d2d in lrmd_rsc_dispatch (user_data=0x111f130) at lrmd.c:678
#10 0x00007f0de42dcd00 in crm_trigger_dispatch (source=0x111f300, callback=
    0x404d06 <lrmd_rsc_dispatch>, userdata=0x111f300) at mainloop.c:105
#11 0x0000003642638f0e in g_main_context_dispatch () from 
/lib64/libglib-2.0.so.0
#12 0x000000364263c938 in ?? () from /lib64/libglib-2.0.so.0
#13 0x000000364263cd55 in g_main_loop_run () from /lib64/libglib-2.0.so.0
#14 0x0000000000402d3f in main (argc=1, argv=0x7fffc8ec5188) at main.c:285
(gdb) fin
Run till exit from #0  0x0000003f808e83e2 in recv () from /lib64/libc.so.6
0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=0x1118ee8, msg=0x111f390,
    len=20480, timeout=500) at ipc_us.c:299
299         result = recv(one_way->u.us.sock, &data[processed], to_recv,
(gdb) fin
Run till exit from #0  0x00007f0de3820062 in qb_ipc_us_recv_at_most (one_way=
    0x1118ee8, msg=0x111f390, len=20480, timeout=500) at ipc_us.c:299
0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=0x111f390, 
msg_len=20480,
    ms_timeout=500) at ipcc.c:249
249         res = c->funcs.recv(&c->response, msg_ptr, msg_len, ms_timeout);
Value returned is $1 = -11
(gdb) fin
Run till exit from #0  0x00007f0de381a28e in qb_ipcc_recv (c=0x1118ba0, msg_ptr=
    0x111f390, msg_len=20480, ms_timeout=500) at ipcc.c:249
0x00007f0de42bc5fb in crm_ipc_send (client=0x111b580, message=0x111d860, reply=
    0x7fffc8ec4c60, ms_timeout=61060000) at ipc.c:517
517                 rc = qb_ipcc_recv(client->ipc, client->buffer, 
client->buf_size, 500);
Value returned is $2 = -11
(gdb) n
518                 if(rc > 0 || crm_ipc_connected(client) == FALSE) {
(gdb) p rc
$3 = -11
(gdb) n
522             } while(time(NULL) < timeout);
(gdb) n
517                 rc = qb_ipcc_recv(client->ipc, client->buffer, 
client->buf_size, 500);
(gdb) n
518                 if(rc > 0 || crm_ipc_connected(client) == FALSE) {
(gdb) p rc
$4 = -11
(gdb) n
522             } while(time(NULL) < timeout);
(gdb) n
517                 rc = qb_ipcc_recv(client->ipc, client->buffer, 
client->buf_size, 500);
(gdb) n
518                 if(rc > 0 || crm_ipc_connected(client) == FALSE) {
(gdb) p rc
$5 = -11
(gdb) n
522             } while(time(NULL) < timeout);
(gdb) n
517                 rc = qb_ipcc_recv(client->ipc, client->buffer, 
client->buf_size, 500);
(gdb) n
518                 if(rc > 0 || crm_ipc_connected(client) == FALSE) {
(gdb) p rc
$6 = -11
(gdb)

It seems that lrmd keeps retrying the reception of the reply from stonithd
in the loop inside crm_ipc_send(), without returning to the g_main_loop.
Therefore, the monitor of Dummy is not performed.

[root@dev1 ~]# top -bn1
top - 18:53:31 up 5 days,  8:56,  4 users,  load average: 0.98, 0.63, 0.44
Tasks: 198 total,   2 running, 196 sleeping,   0 stopped,   0 zombie
Cpu(s):  0.9%us,  1.2%sy,  0.0%ni, 97.9%id,  0.0%wa,  0.0%hi,  0.0%si,  0.0%st
Mem:   5089052k total,  2417444k used,  2671608k free,   266660k buffers
Swap:  1048568k total,        0k used,  1048568k free,  1724616k cached

  PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
18589 root      20   0 83932 3448 2548 R 98.2  0.1   4:15.89 lrmd
    1 root      20   0 19348 1520 1212 S  0.0  0.0   0:00.78 init
    2 root      20   0     0    0    0 S  0.0  0.0   0:00.00 kthreadd
    3 root      RT   0     0    0    0 S  0.0  0.0   0:06.93 migration/0
    4 root      20   0     0    0    0 S  0.0  0.0  15:23.59 ksoftirqd/0
    5 root      RT   0     0    0    0 S  0.0  0.0   0:00.10 migration/0

Best Regards,
Kazunori INOUE
<cib admin_epoch="0" epoch="100" num_updates="0">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
        <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/>
        <nvpair id="cib-bootstrap-options-startup-fencing" name="startup-fencing" value="false"/>
        <nvpair id="cib-bootstrap-options-stonith-timeout" name="stonith-timeout" value="60s"/>
        <nvpair id="cib-bootstrap-options-cluster-delay" name="cluster-delay" value="10s"/>
      </cluster_property_set>
    </crm_config>

    <resources>
      <primitive class="stonith" id="f-2" type="external/libvirt">
        <instance_attributes id="f-2-instance_attributes">
          <nvpair id="f-2-instance_attributes-hostlist" name="hostlist" value="dev2"/>
          <nvpair id="f-2-instance_attributes-hypervisor_uri" name="hypervisor_uri" value="qemu+ssh://n8/system"/>
        </instance_attributes>
        <operations>
          <op id="f-2-start-0" interval="0s" name="start" timeout="60s"/>
          <op id="f-2-monitor-10" interval="10s" name="monitor" timeout="61s"/>
          <op id="f-2-stop-0" interval="0s" name="stop" timeout="60s"/>
        </operations>
      </primitive>

      <primitive class="ocf" id="prmDummy" provider="pacemaker" type="Dummy">
        <operations>
          <op id="prmDummy-start-0" interval="0s" name="start" on-fail="restart" timeout="60s"/>
          <op id="prmDummy-monitor-10" interval="10s" name="monitor" on-fail="fence" timeout="60s"/>
          <op id="prmDummy-stop-0" interval="0s" name="stop" on-fail="stop" timeout="60s"/>
        </operations>
      </primitive>
    </resources>

    <constraints>
      <rsc_location id="rl-f-2" rsc="f-2">
        <rule id="rl-f-2-rule" score="100"><expression attribute="#uname" id="rl-f-2-exp" operation="eq" value="dev1"/></rule>
        <rule id="rl-f-2-rule-0" score="-INFINITY"><expression attribute="#uname" id="rl-f-2-exp-0" operation="eq" value="dev2"/></rule>
      </rsc_location>
      <rsc_location id="rl-prmDummy" rsc="prmDummy">
        <rule id="rl-prmDummy-rule" score="100"><expression attribute="#uname" id="rl-prmDummy-exp" operation="eq" value="dev1"/></rule>
        <rule id="rl-prmDummy-rule-0" score="-INFINITY"><expression attribute="#uname" id="rl-prmDummy-exp-0" operation="eq" value="dev2"/></rule>
      </rsc_location>
    </constraints>

    <rsc_defaults>
      <meta_attributes id="rsc-options">
        <nvpair id="rsc-options-resource-stickiness" name="resource-stickiness" value="INFINITY"/>
        <nvpair id="rsc-options-migration-threshold" name="migration-threshold" value="1"/>
      </meta_attributes>
    </rsc_defaults>
  </configuration>
</cib>





_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org

Reply via email to