Hi,
> I am experiencing an error situation which is not detected by the cluster.
>
> I created a 2-node cluster using drbd and want to use ocfs2 on both
> nodes simultaneously. (stripped off some monitor/meta stuff)
> baaaaad idea ... pretty useless without the full configuration, especially
the meta attributes in this case. Also share your drbd and corosync
> configuration please. BTW: what is your use case to start with a "simple"
> dual-primary OCFS2 setup?
Thanks for this hint. I wanted to be brief. I am more than happy to share this
information.
Here is the cluster configuration:
node rt-lxcl9a \
attributes standby="on"
node rt-lxcl9b \
attributes standby="off"
primitive dlm ocf:pacemaker:controld \
op monitor interval="60" timeout="60"
primitive ip-rt-lxlr9a ocf:heartbeat:IPaddr \
params ip="10.13.132.94" cidr_netmask="255.255.252.0" \
op monitor interval="5s" timeout="20s" depth="0" \
meta target-role="Started"
primitive ip-rt-lxlr9b ocf:heartbeat:IPaddr \
params ip="10.13.132.95" cidr_netmask="255.255.252.0" \
op monitor interval="5s" timeout="20s" depth="0" \
meta target-role="Started"
primitive o2cb ocf:ocfs2:o2cb \
op monitor interval="60" timeout="60" \
meta target-role="Stopped"
primitive resDRBD ocf:linbit:drbd \
params drbd_resource="r0" \
operations $id="resDRBD-operations" \
op monitor interval="20" role="Master" timeout="20" \
op monitor interval="30" role="Slave" timeout="20" \
meta target-role="Stopped"
primitive resource-fs ocf:heartbeat:Filesystem \
params device="/dev/drbd_r0" directory="/SHARED" fstype="ocfs2" \
op monitor interval="120s" \
meta target-role="Stopped"
primitive stonith-ilo-rt-lxcl9ar stonith:external/ipmi \
params hostname="rt-lxcl9a" ipaddr="10.13.172.85" userid="stonith"
passwd="stonithstonith" passwd_method="param" interface="lanplus"
pcmk_host_check="static-list" pcmk_host_list="rt-lxcl9a" \
meta target-role="Started"
primitive stonith-ilo-rt-lxcl9br stonith:external/ipmi \
params hostname="rt-lxcl9b" ipaddr="10.13.172.93" userid="stonith"
passwd="stonithstonith" passwd_method="param" interface="lanplus"
pcmk_host_check="static-list" pcmk_host_list="rt-lxcl9b"
ms msDRBD resDRBD \
meta resource-stickiness="100" notify="true" master-max="2"
interleave="true" target-role="Stopped"
clone clone-dlm dlm \
meta globally-unique="false" interleave="true" target-role="Stopped"
clone clone-fs resource-fs \
meta interleave="true" ordered="true" target-role="Started"
clone clone-ocb o2cb \
meta globally-unique="false" interleave="true" target-role="Stopped"
location location-stonith-ilo-rt-lxcl9ar stonith-ilo-rt-lxcl9ar -inf: rt-lxcl9a
location location-stonith-ilo-rt-lxcl9br stonith-ilo-rt-lxcl9br -inf: rt-lxcl9b
colocation colocation-dlm-drbd inf: clone-dlm msDRBD:Master
colocation colocation-fs-o2cb inf: clone-fs clone-ocb
colocation colocation-ocation-dlm inf: clone-ocb clone-dlm
order order-dlm-o2cb 0: clone-dlm clone-ocb
order order-drbd-dlm 0: msDRBD:promote clone-dlm:start
order order-o2cb-fs 0: clone-ocb clone-fs
property $id="cib-bootstrap-options" \
stonith-enabled="true" \
no-quorum-policy="ignore" \
placement-strategy="balanced" \
dc-version="1.1.6-b988976485d15cb702c9307df55512d323831a5e" \
cluster-infrastructure="openais" \
expected-quorum-votes="2" \
last-lrm-refresh="1372504787" \
stonith-timeout="30s" \
maintenance-mode="false"
rsc_defaults $id="rsc-options" \
resource-stickiness="200" \
migration-threshold="3"
op_defaults $id="op-options" \
timeout="600" \
record-pending="true"
# cat /etc/drbd.d/ro.res
resource r0 {
startup {
# become-primary-on both;
}
handlers {
split-brain "/usr/lib/drbd/notify-split-brain.sh root";
}
net {
protocol C;
allow-two-primaries;
after-sb-0pri discard-least-changes;
after-sb-1pri discard-secondary;
after-sb-2pri disconnect;
}
disk {
disk-flushes no;
}
on rt-lxcl9a {
device /dev/drbd_r0 minor 0;
disk /dev/cciss/c0d1;
meta-disk internal;
address 192.168.4.1:7788;
}
on rt-lxcl9b {
device /dev/drbd_r0 minor 0;
disk /dev/cciss/c0d1;
meta-disk internal;
address 192.168.4.2:7788;
}
syncer {
rate 100M;
}
}
cat /etc/corosync/corosync.conf | grep -v ^$ | grep -v \#
aisexec {
group: root
user: root
}
service {
use_mgmtd: yes
use_logd: yes
ver: 0
name: pacemaker
}
totem {
rrp_mode: passive
join: 60
max_messages: 20
vsftype: none
token: 5000
consensus: 6000
secauth: on
token_retransmits_before_loss_const: 10
threads: 24
transport: udp
version: 2
interface {
bindnetaddr: 192.168.4.0
mcastaddr: 239.129.234.245
mcastport: 5405
ringnumber: 0
}
interface {
mcastaddr: 239.129.234.245
mcastport: 5405
bindnetaddr: 10.13.132.0
ringnumber: 1
}
clear_node_high_bit: yes
}
logging {
to_logfile: yes
to_syslog: yes
debug: off
timestamp: off
to_stderr: no
fileline: off
syslog_facility: daemon
}
amf {
mode: disabled
}
> > 1. How to avoid split brain situations (I am confident that the cross
> > link using a 10GB cable was never interrupted)?
logs should reveal what happened
> at least you also need to stop the filesystem if it is running and you want
> to demote one Primary ... and then follow that link
Thanks for this info!
> > 3. How to make the cluster aware of the split brain situation? (It
> > thinks everything is fine)
> set up the fencing method "resource-and-stonith" in the drbd configuration, preferably
> use the "crm-fence-peer.sh" stonith script ... Pacemaker itself or better the
> DRBD resource agent will not react on such a situation.
This is important information for me.
Regards,
-- martin
_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems