Attached is my cib.xml file.
I have a two node DRBD cluster setup in Active/Active. For whatever reason, it
seems all my resources are attached to Node2. What I mean by that is that
although the resources show that they are collocated, whenever I turn Node2
off or unplug a cable from Node2, then the cluster goes down. I wait to see if
they come back up on the other node (although they should already be running as
it is an Active/Active cluster) but they never do, even after 10 minutes. With
Node2 off, I can't even ping the collocated IP address. However, if I turn off
Node1 while Node2 is running, nothing goes down.
I am using the LCMC to give me a graphical overview of the setup and the screen
seems to indicate that everything is okay. I believe it has to do with my
fencing agent, fence_pcmk. I know that even though it is set to turn a
node off if there is an issue, the node never seems to shut down. It complains
that devices are busy and it can't reboot.
I am just hoping someone can take a look at my configuration and see if there
is anything that stands out. If it is the fencing agent, is there a better
fencing agent?
William
<cib epoch="279" num_updates="0" admin_epoch="14" validate-with="pacemaker-1.2" crm_feature_set="3.0.6" update-origin=" NODE 2" update-client="crmd" cib-last-written="Wed Aug 1 03:56:44 2012" have-quorum="1">
<configuration>
<crm_config>
<!-- Cluster-wide options. -->
<cluster_property_set id="cib-bootstrap-options">
<!-- Fencing is enabled cluster-wide, but see the "Fencing" clone below:
     the only configured stonith resource is fence_pcmk, which is not a
     real fencing device. -->
<nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="true"/>
<nvpair id="cib-bootstrap-options-stonith-action" name="stonith-action" value="poweroff"/>
<nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.7-6.el6-148fccfd5985c5590cc601123c6c16e966b85d14"/>
<!-- no-quorum-policy=ignore is the usual setting for a two-node cluster
     (a lone survivor can never have quorum), but it makes WORKING fencing
     essential: without it a node failure can leave resources stuck. -->
<nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
<nvpair id="cib-bootstrap-options-cluster-recheck-interval" name="cluster-recheck-interval" value="5min"/>
<!-- Membership/messaging layer is CMAN (RHEL6-era stack). -->
<nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="cman"/>
<nvpair id="cib-bootstrap-options-last-lrm-refresh" name="last-lrm-refresh" value="1343364281"/>
</cluster_property_set>
</crm_config>
<nodes>
<!-- NOTE(review): node ids/unames are inconsistent — id="NODE1" has no
     space while uname=" NODE 1" carries a leading space, and the second
     node uses id=" NODE 2". If these are not just anonymization artifacts
     of this post, whitespace mismatches between id/uname and the real
     cluster node names will break node matching and fencing target lookup
     — verify against the actual `uname -n` / `crm_node -l` output. -->
<node id="NODE1" type="normal" uname=" NODE 1">
<instance_attributes id="nodes- NODE 1">
<nvpair id="nodes- NODE 1-standby" name="standby" value="off"/>
</instance_attributes>
</node>
<node id=" NODE 2" type="normal" uname=" NODE 2">
<instance_attributes id="nodes- NODE 2">
<nvpair id="nodes- NODE 2-standby" name="standby" value="off"/>
</instance_attributes>
</node>
</nodes>
<resources>
<!-- Load-shared cluster IP: globally-unique=true + clusterip_hash makes
     IPaddr2 use the iptables CLUSTERIP target, with both clone instances
     normally split across the two nodes. clone-node-max=2 allows both
     instances to collapse onto one node when the other fails — which is
     exactly the failover the poster reports NOT happening; if the IP is
     unreachable with one node down, check whether both instances actually
     restarted on the survivor (crm_mon) rather than staying stopped. -->
<clone id="ClusterIPClone">
<meta_attributes id="ClusterIPClone-meta_attributes">
<nvpair id="ClusterIPClone-meta_attributes-globally-unique" name="globally-unique" value="true"/>
<nvpair id="ClusterIPClone-meta_attributes-clone-max" name="clone-max" value="2"/>
<nvpair id="ClusterIPClone-meta_attributes-clone-node-max" name="clone-node-max" value="2"/>
</meta_attributes>
<primitive class="ocf" id="ClusterIP" provider="heartbeat" type="IPaddr2">
<instance_attributes id="ClusterIP-instance_attributes">
<nvpair id="ClusterIP-instance_attributes-ip" name="ip" value="10.89.99.30"/>
<nvpair id="ClusterIP-instance_attributes-cidr_netmask" name="cidr_netmask" value="22"/>
<nvpair id="ClusterIP-instance_attributes-clusterip_hash" name="clusterip_hash" value="sourceip"/>
</instance_attributes>
<operations>
<op id="ClusterIP-monitor-30s" interval="30s" name="monitor"/>
</operations>
<meta_attributes id="ClusterIP-meta_attributes">
<nvpair id="ClusterIP-meta_attributes-is-managed" name="is-managed" value="true"/>
</meta_attributes>
</primitive>
</clone>
<!-- DLM (distributed lock manager) control daemon, one instance per node.
     GFS2 cannot mount without a running DLM, yet no constraint in this CIB
     orders ClusterFSClone after dlm_clone or colocates them — consider
     adding an order (dlm_clone then ClusterFSClone) and a colocation so a
     recovering node starts things in a workable sequence. -->
<clone id="dlm_clone">
<meta_attributes id="dlm_clone-meta_attributes">
<nvpair id="dlm_clone-meta_attributes-clone-max" name="clone-max" value="2"/>
<nvpair id="dlm_clone-meta_attributes-clone-node-max" name="clone-node-max" value="1"/>
</meta_attributes>
<primitive class="ocf" id="dlm" provider="pacemaker" type="controld">
<operations>
<op id="dlm-monitor-60s" interval="60s" name="monitor"/>
</operations>
<meta_attributes id="dlm-meta_attributes">
<nvpair id="dlm-meta_attributes-is-managed" name="is-managed" value="true"/>
</meta_attributes>
</primitive>
</clone>
<!-- DRBD master/slave set in dual-primary mode (master-max=2), as required
     for an Active/Active GFS2 setup; notify=true is mandatory for the
     linbit drbd agent. -->
<master id="ClusterDataClone">
<meta_attributes id="ClusterDataClone-meta_attributes">
<nvpair id="ClusterDataClone-meta_attributes-master-max" name="master-max" value="2"/>
<nvpair id="ClusterDataClone-meta_attributes-master-node-max" name="master-node-max" value="1"/>
<nvpair id="ClusterDataClone-meta_attributes-clone-max" name="clone-max" value="2"/>
<nvpair id="ClusterDataClone-meta_attributes-clone-node-max" name="clone-node-max" value="1"/>
<nvpair id="ClusterDataClone-meta_attributes-notify" name="notify" value="true"/>
</meta_attributes>
<primitive class="ocf" id="ClusterData" provider="linbit" type="drbd">
<instance_attributes id="ClusterData-instance_attributes">
<nvpair id="ClusterData-instance_attributes-drbd_resource" name="drbd_resource" value="nfs"/>
</instance_attributes>
<!-- FIX: the original contained a duplicated, unclosed <operations> tag
     here, which made the document ill-formed XML (the CIB as posted could
     not have parsed as-is). -->
<operations>
<op id="ClusterData-monitor-60s" interval="60s" name="monitor" role="Master"/>
<!-- Added Slave-role monitor: the drbd agent expects a monitor op per
     role, with distinct intervals; without it the Slave side is never
     health-checked by Pacemaker. -->
<op id="ClusterData-monitor-59s" interval="59s" name="monitor" role="Slave"/>
</operations>
<meta_attributes id="ClusterData-meta_attributes">
<nvpair id="ClusterData-meta_attributes-is-managed" name="is-managed" value="true"/>
</meta_attributes>
</primitive>
</master>
<!-- GFS2 filesystem mounted on both nodes on top of the dual-primary DRBD
     device. Constrained below to run only where DRBD is Master and only
     after promotion. NOTE(review): there is no monitor operation on this
     primitive, so Pacemaker will not detect a lost mount — consider adding
     one; also no ordering against dlm_clone, which GFS2 requires. -->
<clone id="ClusterFSClone">
<primitive class="ocf" id="ClusterFS" provider="heartbeat" type="Filesystem">
<instance_attributes id="ClusterFS-instance_attributes">
<nvpair id="ClusterFS-instance_attributes-device" name="device" value="/dev/drbd/by-res/nfs"/>
<nvpair id="ClusterFS-instance_attributes-directory" name="directory" value="/Storage"/>
<nvpair id="ClusterFS-instance_attributes-fstype" name="fstype" value="gfs2"/>
</instance_attributes>
<meta_attributes id="ClusterFS-meta_attributes">
<nvpair id="ClusterFS-meta_attributes-is-managed" name="is-managed" value="true"/>
</meta_attributes>
</primitive>
</clone>
<!-- NOTE(review): fence_pcmk is NOT a real fencing agent. It is a
     pass-through shim intended to be listed in CMAN's cluster.conf so that
     CMAN redirects its fencing requests INTO Pacemaker. Configuring it as
     the stonith device inside Pacemaker itself leaves the cluster with no
     agent that can actually power a node off — consistent with the
     reported symptom ("the node never seems to shut down / devices busy"),
     and it explains why resources never fail over: Pacemaker waits for a
     fence confirmation that can never arrive. Replace this with an agent
     matching the hardware (fence_ipmilan, fence_ilo, fence_drac,
     fence_apc, fence_virsh for VMs, ...), keeping fence_pcmk only in
     cluster.conf. -->
<clone id="Fencing">
<primitive class="stonith" id="pcmk-fencing" type="fence_pcmk">
<instance_attributes id="pcmk-fencing-instance_attributes">
<!-- NOTE(review): pcmk_host_list is a space-separated list; host names
     that themselves contain spaces (" NODE 1 NODE 2") cannot match —
     verify the real (unredacted) values have no embedded whitespace. -->
<nvpair id="pcmk-fencing-instance_attributes-pcmk_host_list" name="pcmk_host_list" value=" NODE 1 NODE 2"/>
</instance_attributes>
<operations>
<op id="pcmk-fencing-monitor-60s" interval="60s" name="monitor"/>
</operations>
</primitive>
</clone>
</resources>
<constraints>
<!-- Filesystem may only run where DRBD holds the Master role... -->
<rsc_colocation id="fs_on_drbd" rsc="ClusterFSClone" score="INFINITY" with-rsc="ClusterDataClone" with-rsc-role="Master"/>
<!-- ...and only after DRBD has been promoted there. -->
<rsc_order first="ClusterDataClone" first-action="promote" id="ClusterFS-after-ClusterData" score="INFINITY" then="ClusterFSClone" then-action="start"/>
<!-- NOTE(review): missing constraints — nothing orders ClusterFSClone
     after dlm_clone (GFS2 needs DLM running first), and ClusterIPClone
     is not colocated with or ordered after the filesystem. Consider:
       order dlm_clone -> ClusterFSClone
       order ClusterFSClone -> ClusterIPClone (+ colocation)
     so a surviving node can bring everything up in a valid sequence. -->
</constraints>
<!-- Defaults applied to every resource. -->
<rsc_defaults>
<meta_attributes id="rsc-options">
<!-- NOTE(review): target-role values are conventionally capitalized
     ("Started"); confirm this lowercase value is accepted by this
     Pacemaker version. -->
<nvpair id="rsc-options-target-role" name="target-role" value="started"/>
<!-- NOTE(review): allow-migrate=true as a cluster-wide default applies
     live migration semantics to agents that may not implement
     migrate_to/migrate_from — safer to set it per-resource. -->
<nvpair id="rsc-options-allow-migrate" name="allow-migrate" value="true"/>
<nvpair id="rsc-options-resource-stickiness" name="resource-stickiness" value="100"/>
</meta_attributes>
</rsc_defaults>
<!-- Default timeout for all resource operations that do not set their own. -->
<op_defaults>
<meta_attributes id="op-options">
<nvpair id="op-options-timeout" name="timeout" value="240s"/>
</meta_attributes>
</op_defaults>
</configuration>
</cib>
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems