On 2013-10-04 03:03, David Parker wrote:
> Sure.  Here's the full config:

You definitely must not use the deprecated ocf:heartbeat:drbd resource agent, but the one that comes with DRBD: ocf:linbit:drbd. You should see a big fat warning in your logs telling you not to use it.

And this colocation is wrong:

  <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/>

rsc and with-rsc need to be the other way round:

  <rsc_colocation id="drbd-nfs-ha" rsc="nfs_resources" score="INFINITY" with-rsc="ms-drbd_r0" with-rsc-role="Master"/>

Give this a try; a crm shell sketch of both changes follows below the quoted config.

Regards,
Andreas

> <cib epoch="28" num_updates="34" admin_epoch="0" validate-with="pacemaker-1.2" cib-last-written="Thu Oct 3 16:26:39 2013" crm_feature_set="3.0.6" update-origin="test-vm-2" update-client="cibadmin" have-quorum="1" dc-uuid="test-vm-1">
>   <configuration>
>     <crm_config>
>       <cluster_property_set id="cib-bootstrap-options">
>         <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
>         <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
>         <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
>         <nvpair id="cib-bootstrap-options-stonith-enabled" name="stonith-enabled" value="false"/>
>         <nvpair id="cib-bootstrap-options-no-quorum-policy" name="no-quorum-policy" value="ignore"/>
>       </cluster_property_set>
>     </crm_config>
>     <nodes>
>       <node id="test-vm-1" type="normal" uname="test-vm-1"/>
>       <node id="test-vm-2" type="normal" uname="test-vm-2"/>
>     </nodes>
>     <resources>
>       <group id="nfs_resources">
>         <meta_attributes id="nfs_resources-meta_attributes">
>           <nvpair id="nfs_resources-meta_attributes-target-role" name="target-role" value="Started"/>
>         </meta_attributes>
>         <primitive class="ocf" id="nfs_fs" provider="heartbeat" type="Filesystem">
>           <instance_attributes id="nfs_fs-instance_attributes">
>             <nvpair id="nfs_fs-instance_attributes-device" name="device" value="/dev/drbd1"/>
>             <nvpair id="nfs_fs-instance_attributes-directory" name="directory" value="/export/data/"/>
>             <nvpair id="nfs_fs-instance_attributes-fstype" name="fstype" value="ext3"/>
>             <nvpair id="nfs_fs-instance_attributes-options" name="options" value="noatime,nodiratime"/>
>           </instance_attributes>
>           <operations>
>             <op id="nfs_fs-start-0" interval="0" name="start" timeout="60"/>
>             <op id="nfs_fs-stop-0" interval="0" name="stop" timeout="120"/>
>           </operations>
>         </primitive>
>         <primitive class="ocf" id="nfs_ip" provider="heartbeat" type="IPaddr2">
>           <instance_attributes id="nfs_ip-instance_attributes">
>             <nvpair id="nfs_ip-instance_attributes-ip" name="ip" value="192.168.25.205"/>
>             <nvpair id="nfs_ip-instance_attributes-cidr_netmask" name="cidr_netmask" value="32"/>
>           </instance_attributes>
>           <operations>
>             <op id="nfs_ip-monitor-10s" interval="10s" name="monitor"/>
>           </operations>
>           <meta_attributes id="nfs_ip-meta_attributes">
>             <nvpair id="nfs_ip-meta_attributes-is-managed" name="is-managed" value="true"/>
>           </meta_attributes>
>         </primitive>
>         <primitive class="lsb" id="nfs" type="nfs-kernel-server">
>           <operations>
>             <op id="nfs-monitor-5s" interval="5s" name="monitor"/>
>             <op id="nfs-start-0" interval="0" name="start" timeout="120"/>
>             <op id="nfs-stop-0" interval="0" name="stop" timeout="120"/>
>           </operations>
>         </primitive>
>       </group>
>       <master id="ms-drbd_r0">
>         <meta_attributes id="ms-drbd_r0-meta_attributes">
>           <nvpair id="ms-drbd_r0-meta_attributes-clone-max" name="clone-max" value="2"/>
>           <nvpair
id="ms-drbd_r0-meta_attributes-notify" name="notify" > value="true"/> > <nvpair id="ms-drbd_r0-meta_attributes-globally-unique" > name="globally-unique" value="false"/> > <nvpair id="ms-drbd_r0-meta_attributes-target-role" > name="target-role" value="Master"/> > </meta_attributes> > <primitive class="ocf" id="drbd_r0" provider="heartbeat" > type="drbd"> > <instance_attributes id="drbd_r0-instance_attributes"> > <nvpair id="drbd_r0-instance_attributes-drbd_resource" > name="drbd_resource" value="r0"/> > </instance_attributes> > <operations> > <op id="drbd_r0-monitor-59s" interval="59s" name="monitor" > role="Master" timeout="30s"/> > <op id="drbd_r0-monitor-60s" interval="60s" name="monitor" > role="Slave" timeout="30s"/> > </operations> > </primitive> > </master> > </resources> > <constraints> > <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" > rsc-role="Master" score="INFINITY" with-rsc="nfs_resources"/> > <rsc_order id="drbd-before-nfs" first="ms-drbd_r0" > first-action="promote" score="INFINITY" then="nfs_resources" > then-action="start"/> > </constraints> > <rsc_defaults> > <meta_attributes id="rsc-options"> > <nvpair id="rsc-options-resource-stickiness" > name="resource-stickiness" value="100"/> > </meta_attributes> > </rsc_defaults> > </configuration> > <status> > <node_state id="test-vm-1" uname="test-vm-1" ha="active" > in_ccm="true" crmd="online" join="member" expected="member" > crm-debug-origin="do_state_transition" shutdown="0"> > <transient_attributes id="test-vm-1"> > <instance_attributes id="status-test-vm-1"> > <nvpair id="status-test-vm-1-fail-count-drbd_r0.1" > name="fail-count-drbd_r0:1" value="1"/> > <nvpair id="status-test-vm-1-last-failure-drbd_r0.1" > name="last-failure-drbd_r0:1" value="1380831442"/> > <nvpair id="status-test-vm-1-master-drbd_r0.0" > name="master-drbd_r0:0" value="100"/> > <nvpair id="status-test-vm-1-probe_complete" > name="probe_complete" value="true"/> > </instance_attributes> > </transient_attributes> > <lrm id="test-vm-1"> > <lrm_resources> > <lrm_resource id="drbd_r0:0" type="drbd" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="drbd_r0:0_last_failure_0" > operation_key="drbd_r0:0_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:8;7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="32" rc-code="8" op-status="0" interval="0" > op-digest="c0e018b73fdf522b6cdd355e125af15e"/> > <lrm_rsc_op id="drbd_r0:0_monitor_59000" > operation_key="drbd_r0:0_monitor_59000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:8;20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="35" rc-code="8" op-status="0" interval="59000" > op-digest="6f5adcd7f1211cdfc17850827b8582c5"/> > </lrm_resource> > <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb"> > <lrm_rsc_op id="nfs_last_0" operation_key="nfs_start_0" > operation="start" crm-debug-origin="build_active_RAs" > crm_feature_set="3.0.6" > transition-key="14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="39" rc-code="0" op-status="0" interval="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > <lrm_rsc_op id="nfs_last_failure_0" > operation_key="nfs_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > 
transition-key="6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="31" rc-code="0" op-status="0" interval="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > <lrm_rsc_op id="nfs_monitor_5000" > operation_key="nfs_monitor_5000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="40" rc-code="0" op-status="0" interval="5000" > op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/> > </lrm_resource> > <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_ip_last_failure_0" > operation_key="nfs_ip_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="30" rc-code="0" op-status="0" interval="0" > op-digest="570cd25774b1ead32cb1840813adbe21"/> > <lrm_rsc_op id="nfs_ip_monitor_10000" > operation_key="nfs_ip_monitor_10000" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="33" rc-code="0" op-status="0" interval="10000" > op-digest="bc929bfa78c3086ebd199cf0110b87bf"/> > </lrm_resource> > <lrm_resource id="nfs_fs" type="Filesystem" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_fs_last_failure_0" > operation_key="nfs_fs_monitor_0" operation="monitor" > crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6" > transition-key="4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="29" rc-code="0" op-status="0" interval="0" > op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/> > </lrm_resource> > </lrm_resources> > </lrm> > </node_state> > <node_state id="test-vm-2" uname="test-vm-2" ha="active" > in_ccm="true" crmd="online" join="member" > crm-debug-origin="do_update_resource" expected="member" shutdown="0"> > <lrm id="test-vm-2"> > <lrm_resources> > <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb"> > <lrm_rsc_op id="nfs_last_0" operation_key="nfs_monitor_0" > operation="monitor" crm-debug-origin="do_update_resource" > crm_feature_set="3.0.6" > transition-key="10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="4" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="210" queue-time="0" > op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/> > </lrm_resource> > <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_ip_last_0" > operation_key="nfs_ip_monitor_0" operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="3" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="490" queue-time="0" > op-digest="570cd25774b1ead32cb1840813adbe21"/> > </lrm_resource> > <lrm_resource id="nfs_fs" type="Filesystem" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="nfs_fs_last_0" > operation_key="nfs_fs_monitor_0" 
operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:7;8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="2" rc-code="7" op-status="0" interval="0" last-run="1380832563" > last-rc-change="1380832563" exec-time="690" queue-time="0" > op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/> > </lrm_resource> > <lrm_resource id="drbd_r0:1" type="drbd" class="ocf" > provider="heartbeat"> > <lrm_rsc_op id="drbd_r0:1_last_0" > operation_key="drbd_r0:1_start_0" operation="start" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="6" rc-code="0" op-status="0" interval="0" last-run="1380832564" > last-rc-change="1380832564" exec-time="840" queue-time="0" > op-digest="c0e018b73fdf522b6cdd355e125af15e"/> > <lrm_rsc_op id="drbd_r0:1_monitor_60000" > operation_key="drbd_r0:1_monitor_60000" operation="monitor" > crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" > transition-key="25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > transition-magic="0:0;25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f" > call-id="8" rc-code="0" op-status="0" interval="60000" > last-rc-change="1380832565" exec-time="310" queue-time="10" > op-digest="6f5adcd7f1211cdfc17850827b8582c5"/> > </lrm_resource> > </lrm_resources> > </lrm> > <transient_attributes id="test-vm-2"> > <instance_attributes id="status-test-vm-2"> > <nvpair id="status-test-vm-2-probe_complete" > name="probe_complete" value="true"/> > <nvpair id="status-test-vm-2-master-drbd_r0.1" > name="master-drbd_r0:1" value="75"/> > </instance_attributes> > </transient_attributes> > </node_state> > </status> > </cib> > > > On Thu, Oct 3, 2013 at 5:06 PM, Andreas Kurz <andr...@hastexo.com > <mailto:andr...@hastexo.com>> wrote: > > On 2013-10-03 22:12, David Parker wrote: > > Thanks, Andrew. The goal was to use either Pacemaker and Corosync 1.x > > from the Debain packages, or use both compiled from source. So, with > > the compiled version, I was hoping to avoid CMAN. However, it > seems the > > packaged version of Pacemaker doesn't support CMAN anyway, so it's > moot. > > > > I rebuilt my VMs from scratch, re-installed Pacemaker and Corosync > from > > the Debian packages, but I'm still having an odd problem. Here is the > > config portion of my CIB: > > > > <crm_config> > > <cluster_property_set id="cib-bootstrap-options"> > > <nvpair id="cib-bootstrap-options-dc-version" > name="dc-version" > > value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/> > > <nvpair id="cib-bootstrap-options-cluster-infrastructure" > > name="cluster-infrastructure" value="openais"/> > > <nvpair id="cib-bootstrap-options-expected-quorum-votes" > > name="expected-quorum-votes" value="2"/> > > <nvpair id="cib-bootstrap-options-stonith-enabled" > > name="stonith-enabled" value="false"/> > > <nvpair id="cib-bootstrap-options-no-quorum-policy" > > name="no-quorum-policy" value="ignore"/> > > </cluster_property_set> > > </crm_config> > > > > I set no-quorum-policy=ignore based on the documentation example for a > > 2-node cluster. 
But when Pacemaker starts up on the first node, the > > DRBD resource is in slave mode and none of the other resources are > > started (they depend on DRBD being master), and I see these lines > in the > > log: > > > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: unpack_config: On > > loss of CCM Quorum: Ignore > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs_fs (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs_ip (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > nfs (test-vm-1 - blocked) > > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start > > drbd_r0:0 (test-vm-1) > > > > I'm assuming the NFS resources show "blocked" because the resource > they > > depend on is not in the correct state. > > > > Even when the second node (test-vm-2) comes online, the state of these > > resources does not change. I can shutdown and re-start Pacemaker over > > and over again on test-vm-2, but nothihg changes. However... and this > > is where it gets weird... if I shut down Pacemaker on test-vm-1, then > > all of the resources immediately fail over to test-vm-2 and start > > correctly. And I see these lines in the log: > > > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: unpack_config: On > > loss of CCM Quorum: Ignore > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: stage6: Scheduling > > Node test-vm-1 for shutdown > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs_fs (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs_ip (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start > > nfs (test-vm-2) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Stop > > drbd_r0:0 (test-vm-1) > > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Promote > > drbd_r0:1 (Slave -> Master test-vm-2) > > > > After that, I can generally move the resources back and forth, and > even > > fail them over by hard-failing a node, without any problems. The real > > problem is that this isn't consistent, though. Every once in a while, > > I'll hard-fail a node and the other one will go into this "stuck" > state > > where Pacemaker knows it lost a node, but DRBD will stay in slave mode > > and the other resources will never start. It seems to happen quite > > randomly. Then, even if I restart Pacemaker on both nodes, or reboot > > them altogether, I run into the startup issue mentioned previously. > > > > Any ideas? > > Yes, share your complete resource configuration ;-) > > Regards, > Andreas > > > > > Thanks, > > Dave > > > > > > > > On Wed, Oct 2, 2013 at 1:01 AM, Andrew Beekhof <and...@beekhof.net > <mailto:and...@beekhof.net> > > <mailto:and...@beekhof.net <mailto:and...@beekhof.net>>> wrote: > > > > > > On 02/10/2013, at 5:24 AM, David Parker <dpar...@utica.edu > <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > > > Thanks, I did a little Googling and found the git repository > for pcs. > > > > pcs won't help you rebuild pacemaker with cman support (or > corosync > > 2.x support) turned on though. > > > > > > > Is there any way to make a two-node cluster work with the stock > > Debian packages, though? It seems odd that this would be > impossible. > > > > it really depends how the debian maintainers built pacemaker. 
> > by the sounds of it, it only supports the pacemaker plugin > mode for > > corosync 1.x > > > > > > > > > > > On Tue, Oct 1, 2013 at 3:16 PM, Larry Brigman > > <larry.brig...@gmail.com <mailto:larry.brig...@gmail.com> > <mailto:larry.brig...@gmail.com <mailto:larry.brig...@gmail.com>>> > wrote: > > > pcs is another package you will need to install. > > > > > > On Oct 1, 2013 9:04 AM, "David Parker" <dpar...@utica.edu > <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > Hello, > > > > > > Sorry for the delay in my reply. I've been doing a lot of > > experimentation, but so far I've had no luck. > > > > > > Thanks for the suggestion, but it seems I'm not able to use > CMAN. > > I'm running Debian Wheezy with Corosync and Pacemaker > installed via > > apt-get. When I installed CMAN and set up a cluster.conf file, > > Pacemaker refused to start and said that CMAN was not supported. > > When CMAN is not installed, Pacemaker starts up fine, but I see > > these lines in the log: > > > > > > Sep 30 23:36:29 test-vm-1 crmd: [6941]: ERROR: > > init_quorum_connection: The Corosync quorum API is not > supported in > > this build > > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: ERROR: > > pcmk_child_exit: Child process crmd exited (pid=6941, rc=100) > > > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: WARN: > > pcmk_child_exit: Pacemaker child process crmd no longer wishes > to be > > respawned. Shutting ourselves down. > > > > > > So, then I checked to see which plugins are supported: > > > > > > # pacemakerd -F > > > Pacemaker 1.1.7 (Build: > ee0730e13d124c3d58f00016c3376a1de5323cff) > > > Supporting: generated-manpages agent-manpages ncurses > heartbeat > > corosync-plugin snmp libesmtp > > > > > > Am I correct in believing that this Pacemaker package has been > > compiled without support for any quorum API? If so, does anyone > > know if there is a Debian package which has the correct support? > > > > > > I also tried compiling LibQB, Corosync and Pacemaker from source > > via git, following the instructions documented here: > > > > > > http://clusterlabs.org/wiki/SourceInstall > > > > > > I was hopeful that this would work, because as I understand it, > > Corosync 2.x no longer uses CMAN. Everything compiled and started > > fine, but the compiled version of Pacemaker did not include either > > the 'crm' or 'pcs' commands. Do I need to install something > else in > > order to get one of these? > > > > > > Any and all help is greatly appreciated! > > > > > > Thanks, > > > Dave > > > > > > > > > On Wed, Sep 25, 2013 at 6:08 AM, David Lang <da...@lang.hm > <mailto:da...@lang.hm> > > <mailto:da...@lang.hm <mailto:da...@lang.hm>>> wrote: > > > the cluster is trying to reach a quarum (the majority of the > nodes > > talking to each other) and that is never going to happen with only > > one node. so you have to disable this. 
> > > > > > try putting > > > <cman two_node="1" expected_votes="1" transport="udpu"/> > > > in your cluster.conf > > > > > > David Lang > > > > > > On Tue, 24 Sep 2013, David Parker wrote: > > > > > > Date: Tue, 24 Sep 2013 11:48:59 -0400 > > > From: David Parker <dpar...@utica.edu > <mailto:dpar...@utica.edu> <mailto:dpar...@utica.edu > <mailto:dpar...@utica.edu>>> > > > Reply-To: The Pacemaker cluster resource manager > > > <pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org> > > <mailto:pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org>>> > > > To: The Pacemaker cluster resource manager > > <pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org> > <mailto:pacemaker@oss.clusterlabs.org > <mailto:pacemaker@oss.clusterlabs.org>>> > > > Subject: Re: [Pacemaker] Corosync won't recover when a node > fails > > > > > > > > > I forgot to mention, OS is Debian Wheezy 64-bit, Corosync and > > Pacemaker > > > installed from packages via apt-get, and there are no local > > firewall rules > > > in place: > > > > > > # iptables -L > > > Chain INPUT (policy ACCEPT) > > > target prot opt source destination > > > > > > Chain FORWARD (policy ACCEPT) > > > target prot opt source destination > > > > > > Chain OUTPUT (policy ACCEPT) > > > target prot opt source destination > > > > > > > > > On Tue, Sep 24, 2013 at 11:41 AM, David Parker > <dpar...@utica.edu <mailto:dpar...@utica.edu> > > <mailto:dpar...@utica.edu <mailto:dpar...@utica.edu>>> wrote: > > > > > > Hello, > > > > > > I have a 2-node cluster using Corosync and Pacemaker, where the > > nodes are > > > actually to VirtualBox VMs on the same physical machine. I > have some > > > resources set up in Pacemaker, and everything works fine if > I move > > them in > > > a controlled way with the "crm_resource -r <resource> --move > > --node <node>" > > > command. > > > > > > However, when I hard-fail one of the nodes via the "poweroff" > > command in > > > Virtual Box, which "pulls the plug" on the VM, the resources do > > not move, > > > and I see the following output in the log on the remaining node: > > > > > > Sep 24 11:20:30 corosync [TOTEM ] The token was lost in the > > OPERATIONAL > > > state. > > > Sep 24 11:20:30 corosync [TOTEM ] A processor failed, > forming new > > > configuration. > > > Sep 24 11:20:30 corosync [TOTEM ] entering GATHER state from 2. > > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: debug: rsc:drbd_r0:0 > > monitor[31] > > > (pid 8495) > > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource > agent is > > > deprecated and may be removed in a future release. See the man > > page for > > > details. To suppress this warning, set the "ignore_deprecation" > > resource > > > parameter to true. > > > drbd[8495]: 2013/09/24_11:20:31 WARNING: This resource > agent is > > > deprecated and may be removed in a future release. See the man > > page for > > > details. To suppress this warning, set the "ignore_deprecation" > > resource > > > parameter to true. 
> > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling > drbdadm -c > > > /etc/drbd.conf role r0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output: > > > Secondary/Primary > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Calling > drbdadm -c > > > /etc/drbd.conf cstate r0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Exit code 0 > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0: Command output: > > Connected > > > drbd[8495]: 2013/09/24_11:20:31 DEBUG: r0 status: > > Secondary/Primary > > > Secondary Primary Connected > > > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: info: operation > monitor[31] on > > > drbd_r0:0 for client 2506: pid 8495 exited with return code 0 > > > Sep 24 11:20:32 corosync [TOTEM ] entering GATHER state from 0. > > > Sep 24 11:20:34 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:34 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:36 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:36 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:38 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:38 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:40 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:40 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:40 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:43 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:43 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:43 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:45 corosync [TOTEM ] The consensus timeout expired. > > > Sep 24 11:20:45 corosync [TOTEM ] entering GATHER state from 3. > > > Sep 24 11:20:45 corosync [TOTEM ] Totem is unable to form a > cluster > > > because of an operating system or network fault. The most common > > cause of > > > this message is that the local firewall is configured > improperly. > > > Sep 24 11:20:47 corosync [TOTEM ] The consensus timeout expired. > > > > > > Those last 3 messages just repeat over and over, the cluster > never > > > recovers, and the resources never move. "crm_mon" reports > that the > > > resources are still running on the dead node, and shows no > > indication that > > > anything has gone wrong. > > > > > > Does anyone know what the issue could be? My expectation > was that the > > > remaining node would become the sole member of the cluster, take > > over the > > > resources, and everything would keep running. 
> > > For reference, my corosync.conf file is below:
> > >
> > > compatibility: whitetank
> > >
> > > totem {
> > >         version: 2
> > >         secauth: off
> > >         interface {
> > >                 member {
> > >                         memberaddr: 192.168.25.201
> > >                 }
> > >                 member {
> > >                         memberaddr: 192.168.25.202
> > >                 }
> > >                 ringnumber: 0
> > >                 bindnetaddr: 192.168.25.0
> > >                 mcastport: 5405
> > >         }
> > >         transport: udpu
> > > }
> > >
> > > logging {
> > >         fileline: off
> > >         to_logfile: yes
> > >         to_syslog: yes
> > >         debug: on
> > >         logfile: /var/log/cluster/corosync.log
> > >         timestamp: on
> > >         logger_subsys {
> > >                 subsys: AMF
> > >                 debug: on
> > >         }
> > > }
> > >
> > > Thanks!
> > > Dave
> > >
> > > --
> > > Dave Parker
> > > Systems Administrator
> > > Utica College
> > > Integrated Information Technology Services
> > > (315) 792-3229
> > > Registered Linux User #408177
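To put both changes together, here is a rough crm shell sketch. It is only a sketch, not tested here: it assumes the DRBD resource is still named "r0" and reuses the resource, group and constraint IDs plus the monitor intervals from the CIB above. The existing drbd_r0 primitive has to be edited or deleted first, since its class/provider changes.

        # replace the deprecated ocf:heartbeat:drbd primitive with ocf:linbit:drbd
        primitive drbd_r0 ocf:linbit:drbd \
                params drbd_resource="r0" \
                op monitor interval="59s" role="Master" timeout="30s" \
                op monitor interval="60s" role="Slave" timeout="30s"
        ms ms-drbd_r0 drbd_r0 \
                meta master-max="1" master-node-max="1" clone-max="2" \
                clone-node-max="1" notify="true" globally-unique="false"
        # the nfs_resources group follows the DRBD master, not the other way round
        colocation drbd-nfs-ha inf: nfs_resources ms-drbd_r0:Master
        # promote DRBD before starting the NFS group
        order drbd-before-nfs inf: ms-drbd_r0:promote nfs_resources:start

After loading this via crm configure, crm_verify -LV and crm_mon -1 are a quick way to check that ms-drbd_r0 gets promoted on one node and that the nfs_resources group starts there.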
--
Need help with Pacemaker?
http://www.hastexo.com/now
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org