On 2013-10-04 03:03, David Parker wrote:
> Sure.  Here's the full config:

You definitely must not use the deprecated ocf:heartbeat:drbd resource agent, but the one that comes with DRBD: ocf:linbit:drbd. You should see a big fat warning in your logs telling you not to use it.

And this colocation is wrong:

  <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/>

rsc and with-rsc need to be the other way round:

  <rsc_colocation id="drbd-nfs-ha" rsc="nfs_resources" score="INFINITY" with-rsc="ms-drbd_r0" with-rsc-role="Master"/>

Give this a try; a crm shell sketch of both changes follows below the quoted config.

Regards,
Andreas

> <cib epoch="28" num_updates="34" admin_epoch="0" validate-with="pacemaker-1.2" cib-last-written="Thu Oct 3 16:26:39 2013" crm_feature_set="3.0.6" update-origin="test-vm-2" update-client="cibadmin" have-quorum="1" dc-uuid="test-vm-1">
>   <configuration>
>     <crm_config>
>       <cluster_property_set id="cib-bootstrap-options">
>         <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
>         <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
>         <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
>         <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
>         <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
>       </cluster_property_set>
>     </crm_config>
>     <nodes>
>       <node id="test-vm-1" type="normal" uname="test-vm-1"/>
>       <node id="test-vm-2" type="normal" uname="test-vm-2"/>
>     </nodes>
>     <resources>
>       <group id="nfs_resources">
>         <meta_attributes id="nfs_resources-meta_attributes">
>           <nvpair id="nfs_resources-meta_attributes-target-role" name="target-role" value="Started"/>
>         </meta_attributes>
>         <primitive class="ocf" id="nfs_fs" provider="heartbeat" type="Filesystem">
>           <instance_attributes id="nfs_fs-instance_attributes">
>             <nvpair id="nfs_fs-instance_attributes-device" name="device" value="/dev/drbd1"/>
>             <nvpair id="nfs_fs-instance_attributes-directory" name="directory" value="/export/data/"/>
>             <nvpair id="nfs_fs-instance_attributes-fstype" name="fstype" value="ext3"/>
>             <nvpair id="nfs_fs-instance_attributes-options" name="options" value="noatime,nodiratime"/>
>           </instance_attributes>
>           <operations>
>             <op id="nfs_fs-start-0" interval="0" name="start" timeout="60"/>
>             <op id="nfs_fs-stop-0" interval="0" name="stop" timeout="120"/>
>           </operations>
>         </primitive>
>         <primitive class="ocf" id="nfs_ip" provider="heartbeat" type="IPaddr2">
>           <instance_attributes id="nfs_ip-instance_attributes">
>             <nvpair id="nfs_ip-instance_attributes-ip" name="ip" value="192.168.25.205"/>
>             <nvpair id="nfs_ip-instance_attributes-cidr_netmask" name="cidr_netmask" value="32"/>
>           </instance_attributes>
>           <operations>
>             <op id="nfs_ip-monitor-10s" interval="10s" name="monitor"/>
>           </operations>
>           <meta_attributes id="nfs_ip-meta_attributes">
>             <nvpair id="nfs_ip-meta_attributes-is-managed" name="is-managed" value="true"/>
>           </meta_attributes>
>         </primitive>
>         <primitive class="lsb" id="nfs" type="nfs-kernel-server">
>           <operations>
>             <op id="nfs-monitor-5s" interval="5s" name="monitor"/>
>             <op id="nfs-start-0" interval="0" name="start" timeout="120"/>
>             <op id="nfs-stop-0" interval="0" name="stop" timeout="120"/>
>           </operations>
>         </primitive>
>       </group>
>       <master id="ms-drbd_r0">
>         <meta_attributes id="ms-drbd_r0-meta_attributes">
>           <nvpair id="ms-drbd_r0-meta_attributes-clone-max" name="clone-max" value="2"/>
>           <nvpair
id="ms-drbd_r0-meta_attributes-notify" name="notify" > value="true"/> > <nvpair id="ms-drbd_r0-meta_attributes-globally-unique" > name="globally-unique" value="false"/> > <nvpair id="ms-drbd_r0-meta_attributes-target-role" > name="target-role" value="Master"/> > </meta_attributes> > <primitive class="ocf" id="drbd_r0" provider="heartbeat" > type="drbd"> > <instance_attributes id="drbd_r0-instance_attributes"> > <nvpair id="drbd_r0-instance_attributes-drbd_resource" > name="drbd_resource" value="r0"/> > </instance_attributes> > <operations> > <op id="drbd_r0-monitor-59s" interval="59s" name="monitor" > role="Master" timeout="30s"/> > <op id="drbd_r0-monitor-60s" interval="60s" name="monitor" > role="Slave" timeout="30s"/> > </operations> > </primitive> > </master> > </resources> > <constraints> > <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" > rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/> > <rsc_order id="drbd-before-nfs" first="ms-drbd_r0" > first-action="promote" score="INFINITY" then="nfs_resources" > then-action="start"/> > </constraints> > <rsc_defaults> > <meta_attributes id="rsc-options"> > <nvpair id="rsc-options-resource-stickiness" > name="resource-stickiness" value="100"/> > </meta_attributes> > </rsc_defaults> > </configuration> > <status> > <node_state id="test-vm-1" uname="test-vm-1" ha="active" > in_ccm="true" crmd="online" join="member" expected="member" > crm-debug-origin="do_state_transition" shutdown="0"> > <transient_attributes id="test-vm-1"> > <instance_attributes id="status-test-vm-1"> > <nvpair id="status-test-vm-1-fail-count-drbd_r0.1" > name="fail-count-drbd_r0:1" value="1"/> > <nvpair id="status-test-vm-1-last-failure-drbd_r0.1" > name="last-failure-drbd_r0:1" value="1380831442"/> > <nvpair id="status-test-vm-1-master-drbd_r0.0" > name="master-drbd_r0:0" value="100"/> > <nvpair id="status-test-vm-1-probe_complete" > name="probe_complete" value="true"/> > </instance_attributes> > </transient_attributes> > <lrm id="test-vm-1"> > <lrm_resources> > <lrm_resource id="drbd_r0:0" type="drbd" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="drbd_r0:0_last_failure_0" > operation_key="drbd_r0:0_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:8;7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="32" rc-code="8" op-status="0" interval="0" > op-digest="c0e018b73fdf522b6cdd355e125af15e"/> > <lrm_rsc_op id="drbd_r0:0_monitor_59000" > operation_key="drbd_r0:0_monitor_59000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:8;20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="35" rc-code="8" op-status="0" interval="59000" > op-digest="6f5adcd7f1211cdfc17850827b8582c5"/> > </lrm_resource> > <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb"> > <lrm_rsc_op id="nfs_last_0" operation_key="nfs_start_0" > operation="start" crm-debug-origin="build_active_RAs" > crm_feature_set="3.0.6" > transition-key="14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="39" rc-code="0" op-status="0" interval="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > <lrm_rsc_op id="nfs_last_failure_0" > operation_key="nfs_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > 
transition-key="6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="31" rc-code="0" op-status="0" interval="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > <lrm_rsc_op id="nfs_monitor_5000" > operation_key="nfs_monitor_5000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="40" rc-code="0" op-status="0" interval="5000" > op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/> > </lrm_resource> > <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_ip_last_failure_0" > operation_key="nfs_ip_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="30" rc-code="0" op-status="0" interval="0" > op-digest="570cd25774b1ead32cb1840813adbe21"/> > <lrm_rsc_op id="nfs_ip_monitor_10000" > operation_key="nfs_ip_monitor_10000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="33" rc-code="0" op-status="0" interval="10000" > op-digest="bc929bfa78c3086ebd199cf0110b87bf"/> > </lrm_resource> > <lrm_resource id="nfs_fs" type="Filesystem" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_fs_last_failure_0" > operation_key="nfs_fs_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="29" rc-code="0" op-status="0" interval="0" > op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/> > </lrm_resource> > </lrm_resources> > </lrm> > </node_state> > <node_state id="test-vm-2" uname="test-vm-2" ha="active" > in_ccm="true" crmd="online" join="member" > crm-debug-origin="do_update_resource" expected="member" shutdown="0"> > <lrm id="test-vm-2"> > <lrm_resources> > <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb"> > <lrm_rsc_op id="nfs_last_0" operation_key="nfs_monitor_0" > operation="monitor" crm-debug-origin="do_update_resource" > crm_feature_set="3.0.6" > transition-key="10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="4" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="210" queue-time="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > </lrm_resource> > <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_ip_last_0" > operation_key="nfs_ip_monitor_0" operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="3" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="490" queue-time="0" > op-digest="570cd25774b1ead32cb1840813adbe21"/> > </lrm_resource> > <lrm_resource id="nfs_fs" type="Filesystem" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_fs_last_0" > operation_key="nfs_fs_monitor_0" 
operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="2" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="690" queue-time="0" > op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/> > </lrm_resource> > <lrm_resource id="drbd_r0:1" type="drbd" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="drbd_r0:1_last_0" > operation_key="drbd_r0:1_start_0" operation="start" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="6" rc-code="0" op-status="0" interval="0" last-run="1380832564" > last-rc-change="1380832564" exec-time="840" queue-time="0" > op-digest="c0e018b73fdf522b6cdd355e125af15e"/> > <lrm_rsc_op id="drbd_r0:1_monitor_60000" > operation_key="drbd_r0:1_monitor_60000" operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="8" rc-code="0" op-status="0" interval="60000" > last-rc-change="1380832565" exec-time="310" queue-time="10" > op-digest="6f5adcd7f1211cdfc17850827b8582c5"/> > </lrm_resource> > </lrm_resources> > </lrm> > <transient_attributes id="test-vm-2"> > <instance_attributes id="status-test-vm-2"> > <nvpair id="status-test-vm-2-probe_complete" > name="probe_complete" value="true"/> > <nvpair id="status-test-vm-2-master-drbd_r0.1" > name="master-drbd_r0:1" value="75"/> > </instance_attributes> > </transient_attributes> > </node_state> > </status> > </cib> > > > On Thu, Oct 3, 2013 at 5:06 PM, Andreas Kurz <andr...@hastexo.com > <mailto:andr...@hastexo.com>> wrote: > > On 2013-10-03 22:12, David Parker wrote: > > Thanks, Andrew. The goal was to use either Pacemaker and Corosync 1.x > > from the Debain packages, or use both compiled from source. So, with > > the compiled version, I was hoping to avoid CMAN. However, it > seems the > > packaged version of Pacemaker doesn't support CMAN anyway, so it's > moot. > > > > I rebuilt my VMs from scratch, re-installed Pacemaker and Corosync > from > > the Debian packages, but I'm still having an odd problem. Here is the > > config portion of my CIB: > > > > <crm_config> > > <cluster_property_set id="cib-bootstrap-options"> > > <nvpair id="cib-bootstrap-options-dc-version" > name="dc-version" > > value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/> > > <nvpair id="cib-bootstrap-options-cluster-infrastructure" > > name="cluster-infrastructure" value="openais"/> > > <nvpair id="cib-bootstrap-options-expected-quorum-votes" > > name="expected-quorum-votes" value="2"/> > > <nvpair id="cib-bootstrap-options-stonith-enabled" > > name="stonith-enabled" value="false"/> > > <nvpair id="cib-bootstrap-options-no-quorum-policy" > > name="no-quorum-policy" value="ignore"/> > > </cluster_property_set> > > </crm_config> > > > > I set no-quorum-policy=ignore based on the documentation example for a > > 2-node cluster. 
But when Pacemaker starts up on the first node, the > > DRBD resource is in slave mode and none of the other resources are > > started (they depend on DRBD being master), and I see these lines > in the > > log: > > > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: unpack_config: On > > loss of CCM Quorum: Ignore > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs_fs (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs_ip (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > drbd_r0:0 (test-vm-1) > > > > I'm assuming the NFS resources show "blocked" because the resource > they > > depend on is not in the correct state. > > > > Even when the second node (test-vm-2) comes online, the state of these > > resources does not change. I can shutdown and re-start Pacemaker over > > and over again on test-vm-2, but nothihg changes. However... and this > > is where it gets weird... if I shut down Pacemaker on test-vm-1, then > > all of the resources immediately fail over to test-vm-2 and start > > correctly. And I see these lines in the log: > > > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: unpack_config: On > > loss of CCM Quorum: Ignore > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: stage6: Scheduling > > Node test-vm-1 for shutdown > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs_fs (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs_ip (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Stop > > drbd_r0:0 (test-vm-1) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Promote > > drbd_r0:1 (Slave -> Master test-vm-2) > > > > After that, I can generally move the resources back and forth, and > even > > fail them over by hard-failing a node, without any problems. The real > > problem is that this isn't consistent, though. Every once in a while, > > I'll hard-fail a node and the other one will go into this "stuck" > state > > where Pacemaker knows it lost a node, but DRBD will stay in slave mode > > and the other resources will never start. It seems to happen quite > > randomly. Then, even if I restart Pacemaker on both nodes, or reboot > > them altogether, I run into the startup issue mentioned previously. > > > > Any ideas? > > Yes, share your complete resource configuration ;-) > > Regards, > Andreas > > > > > Thanks, > > Dave > > > > > > > > On Wed, Oct 2, 2013 at 1:01 AM, Andrew Beekhof <and...@beekhof.net > <mailto:and...@beekhof.net> > > <mailto:and...@beekhof.net <mailto:and...@beekhof.net>>> wrote: > > > > > > On 02/10/2013, at 5:24 AM, David Parker <dpar...@utica.edu > <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > > > Thanks, I did a little Googling and found the git repository > for pcs. > > > > pcs won't help you rebuild pacemaker with cman support (or > corosync > > 2.x support) turned on though. > > > > > > > Is there any way to make a two-node cluster work with the stock > > Debian packages, though? It seems odd that this would be > impossible. > > > > it really depends how the debian maintainers built pacemaker. 
> > by the sounds of it, it only supports the pacemaker plugin > mode for > > corosync 1.x > > > > > > > > > > > On Tue, Oct 1, 2013 at 3:16 PM, Larry Brigman > > <larry.brig...@gmail.com <mailto:larry.brig...@gmail.com> > <mailto:larry.brig...@gmail.com <mailto:larry.brig...@gmail.com>>> > wrote: > > > pcs is another package you will need to install. > > > > > > On Oct 1, 2013 9:04 AM, "David Parker" <dpar...@utica.edu > <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > Hello, > > > > > > Sorry for the delay in my reply. I've been doing a lot of > > experimentation, but so far I've had no luck. > > > > > > Thanks for the suggestion, but it seems I'm not able to use > CMAN. > > I'm running Debian Wheezy with Corosync and Pacemaker > installed via > > apt-get. When I installed CMAN and set up a cluster.conf file, > > Pacemaker refused to start and said that CMAN was not supported. > > When CMAN is not installed, Pacemaker starts up fine, but I see > > these lines in the log: > > > > > > Sep 30 23:36:29 test-vm-1 crmd: [6941]: ERROR: > > init_quorum_connection: The Corosync quorum API is not > supported in > > this build > > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: ERROR: > > pcmk_child_exit: Child process crmd exited (pid=6941, rc=100) > > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: WARN: > > pcmk_child_exit: Pacemaker child process crmd no longer wishes > to be > > respawned. Shutting ourselves down. > > > > > > So, then I checked to see which plugins are supported: > > > > > > # pacemakerd -F > > > Pacemaker 1.1.7 (Build: > ee0730e13d124c3d58f00016c3376a1de5323cff) > > > Supporting: generated-manpages agent-manpages ncurses > heartbeat > > corosync-plugin snmp libesmtp > > > > > > Am I correct in believing that this Pacemaker package has been > > compiled without support for any quorum API? If so, does anyone > > know if there is a Debian package which has the correct support? > > > > > > I also tried compiling LibQB, Corosync and Pacemaker from source > > via git, following the instructions documented here: > > > > > > http://clusterlabs.org/wiki/SourceInstall > > > > > > I was hopeful that this would work, because as I understand it, > > Corosync 2.x no longer uses CMAN. Everything compiled and started > > fine, but the compiled version of Pacemaker did not include either > > the 'crm' or 'pcs' commands. Do I need to install something > else in > > order to get one of these? > > > > > > Any and all help is greatly appreciated! > > > > > > Thanks, > > > Dave > > > > > > > > > On Wed, Sep 25, 2013 at 6:08 AM, David Lang <da...@lang.hm > <mailto:da...@lang.hm> > > <mailto:da...@lang.hm <mailto:da...@lang.hm>>> wrote: > > > the cluster is trying to reach a quarum (the majority of the > nodes > > talking to each other) and that is never going to happen with only > > one node. so you have to disable this. 
> > > > > > try putting > > > <cman two_node="1" expected_votes="1" transport="udpu"/> > > > in your cluster.conf > > > > > > David Lang > > > > > > On Tue, 24 Sep 2013, David Parker wrote: > > > > > > Date: Tue, 24 Sep 2013 11:48:59 -0400 > > > From: David Parker <dpar...@utica.edu > <mailto:dpar...@utica.edu> <mailto:dpar...@utica.edu > <mailto:dpar...@utica.edu>>> > > > Reply-To: The Pacemaker cluster resource manager > > > <pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org> > > <mailto:pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org>>> > > > To: The Pacemaker cluster resource manager > > <pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org> > <mailto:pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org>>> > > > Subject: Re: [Pacemaker] Corosync won't recover when a node > fails > > > > > > > > > I forgot to mention, OS is Debian Wheezy 64-bit, Corosync and > > Pacemaker > > > installed from packages via apt-get, and there are no local > > firewall rules > > > in place: > > > > > > # iptables -L > > > Chain INPUT (policy ACCEPT) > > > target prot opt source destination > > > > > > Chain FORWARD (policy ACCEPT) > > > target prot opt source destination > > > > > > Chain OUTPUT (policy ACCEPT) > > > target prot opt source destination > > > > > > > > > On Tue, Sep 24, 2013 at 11:41 AM, David Parker > <dpar...@utica.edu <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > > > > Hello, > > > > > > I have a 2-node cluster using Corosync and Pacemaker, where the > > nodes are > > > actually to VirtualBox VMs on the same physical machine. I > have some > > > resources set up in Pacemaker, and everything works fine if > I move > > them in > > > a controlled way with the "crm_resource -r <resource> --move > > --node <node>" > > > command. > > > > > > However, when I hard-fail one of the nodes via the "poweroff" > > command in > > > Virtual Box, which "pulls the plug" on the VM, the resources do > > not move, > > > and I see the following output in the log on the remaining node: > > > > > > Sep 24 11:20:30 corosync [TOTEM ] The token was lost in the > > OPERATIONAL > > > state. > > > Sep 24 11:20:30 corosync [TOTEM ] A processor failed, > forming new > > > configuration. > > > Sep 24 11:20:30 corosync [TOTEM ] entering GATHER state from 2. > > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: debug: rsc:drbd_r0:0 > > monitor[31] > > > (pid 8495) > > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource > agent is > > > deprecated and may be removed in a future release. See the man > > page for > > > details. To suppress this warning, set the "ignore_deprecation" > > resource > > > parameter to true. > > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource > agent is > > > deprecated and may be removed in a future release. See the man > > page for > > > details. To suppress this warning, set the "ignore_deprecation" > > resource > > > parameter to true. 
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling > drbdadm -c > > > /etc/drbd.conf role r0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output: > > > Secondary/Primary > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling > drbdadm -c > > > /etc/drbd.conf cstate r0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output: > > Connected > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0 status: > > Secondary/Primary > > > Secondary Primary Connected > > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: info: operation > monitor[31] on > > > drbd_r0:0 for client 2506: pid 8495 exited with return code 0 > > > Sep 24 11:20:32 corosync [TOTEM ] entering GATHER state from 0. > > > Sep 24 11:20:34 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:34 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:36 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:36 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:38 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:38 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:40 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:40 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:40 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:43 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:43 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:43 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:45 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:45 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:45 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:47 corosync [TOTEM ] The consensus timeout expired. > > > > > > Those last 3 messages just repeat over and over, the cluster > never > > > recovers, and the resources never move. "crm_mon" reports > that the > > > resources are still running on the dead node, and shows no > > indication that > > > anything has gone wrong. > > > > > > Does anyone know what the issue could be? My expectation > was that the > > > remaining node would become the sole member of the cluster, take > > over the > > > resources, and everything would keep running. 
> > > For reference, my corosync.conf file is below:
> > >
> > > compatibility: whitetank
> > >
> > > totem {
> > >         version: 2
> > >         secauth: off
> > >         interface {
> > >                 member {
> > >                         memberaddr: 192.168.25.201
> > >                 }
> > >                 member {
> > >                         memberaddr: 192.168.25.202
> > >                 }
> > >                 ringnumber: 0
> > >                 bindnetaddr: 192.168.25.0
> > >                 mcastport: 5405
> > >         }
> > >         transport: udpu
> > > }
> > >
> > > logging {
> > >         fileline: off
> > >         to_logfile: yes
> > >         to_syslog: yes
> > >         debug: on
> > >         logfile: /var/log/cluster/corosync.log
> > >         timestamp: on
> > >         logger_subsys {
> > >                 subsys: AMF
> > >                 debug: on
> > >         }
> > > }
> > >
> > > Thanks!
> > > Dave
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
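To put both changes together, here is a rough crm shell sketch. It is only a sketch, not tested here: it assumes the DRBD resource is still named "r0" and reuses the resource, group and constraint IDs plus the monitor intervals from the CIB above. The existing drbd_r0 primitive has to be edited or deleted first, since its class/provider changes.

        # replace the deprecated ocf:heartbeat:drbd primitive with ocf:linbit:drbd
        primitive drbd_r0 ocf:linbit:drbd \
                params drbd_resource="r0" \
                op monitor interval="59s" role="Master" timeout="30s" \
                op monitor interval="60s" role="Slave" timeout="30s"
        ms ms-drbd_r0 drbd_r0 \
                meta master-max="1" master-node-max="1" clone-max="2" \
                clone-node-max="1" notify="true" globally-unique="false"
        # the nfs_resources group follows the DRBD master, not the other way round
        colocation drbd-nfs-ha inf: nfs_resources ms-drbd_r0:Master
        # promote DRBD before starting the NFS group
        order drbd-before-nfs inf: ms-drbd_r0:promote nfs_resources:start

After loading this via crm configure, crm_verify -LV and crm_mon -1 are a quick way to check that ms-drbd_r0 gets promoted on one node and that the nfs_resources group starts there.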
--
Need help with Pacemaker?
http://www.hastexo.com/now
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org