Hi,
> I am experiencing an error situation which is not detected by the cluster.
>
> I created a 2-node cluster using drbd and want to use ocfs2 on both
> nodes simultaneously. (stripped off some monitor/meta stuff)
> baaaaad idea ... pretty useless without the full configuration, especially
the meta attributes in this case. Also share your drbd and corosync
> configuration please. BTW: what is your use case to start with a "simple"
> dual-primary OCFS2 setup?
Thanks for this hint. I wanted to be brief. I am more than happy to share this
information.
Here is the cluster configuration:
node rt-lxcl9a \
attributes standby="on"
node rt-lxcl9b \
attributes standby="off"
primitive dlm ocf:pacemaker:controld \
op monitor interval="60" timeout="60"
primitive ip-rt-lxlr9a ocf:heartbeat:IPaddr \
params ip="10.13.132.94" cidr_netmask="255.255.252.0" \
op monitor interval="5s" timeout="20s" depth="0" \
meta target-role="Started"
primitive ip-rt-lxlr9b ocf:heartbeat:IPaddr \
params ip="10.13.132.95" cidr_netmask="255.255.252.0" \
op monitor interval="5s" timeout="20s" depth="0" \
meta target-role="Started"
primitive o2cb ocf:ocfs2:o2cb \
op monitor interval="60" timeout="60" \
meta target-role="Stopped"
primitive resDRBD ocf:linbit:drbd \
params drbd_resource="r0" \
operations $id="resDRBD-operations" \
op monitor interval="20" role="Master" timeout="20" \
op monitor interval="30" role="Slave" timeout="20" \
meta target-role="Stopped"
primitive resource-fs ocf:heartbeat:Filesystem \
params device="/dev/drbd_r0" directory="/SHARED" fstype="ocfs2" \
op monitor interval="120s" \
meta target-role="Stopped"
primitive stonith-ilo-rt-lxcl9ar stonith:external/ipmi \
params hostname="rt-lxcl9a" ipaddr="10.13.172.85" userid="stonith"
passwd="stonithstonith" passwd_method="param" interface="lanplus"
pcmk_host_check="static-list" pcmk_host_list="rt-lxcl9a" \
meta target-role="Started"
primitive stonith-ilo-rt-lxcl9br stonith:external/ipmi \
params hostname="rt-lxcl9b" ipaddr="10.13.172.93" userid="stonith"
passwd="stonithstonith" passwd_method="param" interface="lanplus"
pcmk_host_check="static-list" pcmk_host_list="rt-lxcl9b"
ms msDRBD resDRBD \
meta resource-stickiness="100" notify="true" master-max="2"
interleave="true" target-role="Stopped"
clone clone-dlm dlm \
meta globally-unique="false" interleave="true" target-role="Stopped"
clone clone-fs resource-fs \
meta interleave="true" ordered="true" target-role="Started"
clone clone-ocb o2cb \
meta globally-unique="false" interleave="true" target-role="Stopped"
location location-stonith-ilo-rt-lxcl9ar stonith-ilo-rt-lxcl9ar -inf: rt-lxcl9a
location location-stonith-ilo-rt-lxcl9br stonith-ilo-rt-lxcl9br -inf: rt-lxcl9b
colocation colocation-dlm-drbd inf: clone-dlm msDRBD:Master
colocation colocation-fs-o2cb inf: clone-fs clone-ocb
colocation colocation-ocation-dlm inf: clone-ocb clone-dlm
order order-dlm-o2cb 0: clone-dlm clone-ocb
order order-drbd-dlm 0: msDRBD:promote clone-dlm:start
order order-o2cb-fs 0: clone-ocb clone-fs
property $id="cib-bootstrap-options" \
stonith-enabled="true" \
no-quorum-policy="ignore" \
placement-strategy="balanced" \
dc-version="1.1.6-b988976485d15cb702c9307df55512d323831a5e" \
cluster-infrastructure="openais" \
expected-quorum-votes="2" \
last-lrm-refresh="1372504787" \
stonith-timeout="30s" \
maintenance-mode="false"
rsc_defaults $id="rsc-options" \
resource-stickiness="200" \
migration-threshold="3"
op_defaults $id="op-options" \
timeout="600" \
record-pending="true"
# cat /etc/drbd.d/ro.res
resource r0 {
startup {
# become-primary-on both;
}
handlers {
split-brain "/usr/lib/drbd/notify-split-brain.sh root";
}
net {
protocol C;
allow-two-primaries;
after-sb-0pri discard-least-changes;
after-sb-1pri discard-secondary;
after-sb-2pri disconnect;
}
disk {
disk-flushes no;
}
on rt-lxcl9a {
device /dev/drbd_r0 minor 0;
disk /dev/cciss/c0d1;
meta-disk internal;
address 192.168.4.1:7788;
}
on rt-lxcl9b {
device /dev/drbd_r0 minor 0;
disk /dev/cciss/c0d1;
meta-disk internal;
address 192.168.4.2:7788;
}
syncer {
rate 100M;
}
}
cat /etc/corosync/corosync.conf | grep -v ^$ | grep -v \#
aisexec {
group: root
user: root
}
service {
use_mgmtd: yes
use_logd: yes
ver: 0
name: pacemaker
}
totem {
rrp_mode: passive
join: 60
max_messages: 20
vsftype: none
token: 5000
consensus: 6000
secauth: on
token_retransmits_before_loss_const: 10
threads: 24
transport: udp
version: 2
interface {
bindnetaddr: 192.168.4.0
mcastaddr: 239.129.234.245
mcastport: 5405
ringnumber: 0
}
interface {
mcastaddr: 239.129.234.245
mcastport: 5405
bindnetaddr: 10.13.132.0
ringnumber: 1
}
clear_node_high_bit: yes
}
logging {
to_logfile: yes
to_syslog: yes
debug: off
timestamp: off
to_stderr: no
fileline: off
syslog_facility: daemon
}
amf {
mode: disabled
}
> > 1. How to avoid split brain situations (I am confident that the cross
> > link using a 10GB cable was never interrupted)?
logs should reveal what happened
> at least you also need to stop the filesystem if it is running and you want
> to demote one Primary ... and then follow that link
Thanks for this info!
> > 3. How to make the cluster aware of the split brain situation? (It
> > thinks everything is fine)
> set up the fencing method "resource-and-stonith" in the drbd configuration, preferably
> use the "crm-fence-peer.sh" stonith script ... Pacemaker itself or better the
> DRBD resource agent will not react on such a situation.
This is important information for me.
Regards,
-- martin
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems