[Linux-HA] Centos 6.4 KVM+ DRBD 8.4.2 + Pacemaker 1.1.10 - DRBD Monitor Timeout? Drbd promotion/demotion failing!

Jimmy Magee Thu, 04 Jul 2013 02:40:55 -0700

Hi All,

I am currently setting up a DRBD/Pacemaker cluster on two CentOS 6.4 KVM guests.
The VMs run on a CentOS 6.4 host operating system, each VM installed on a
separate logical volume, with 1 GB RAM allocated to each VM. DRBD 8.4.2 +
Pacemaker 1.1.10 + Corosync 1.4.1-15 + Cman 3.0.12.1-49 cluster software is
installed and configured on both VMs.
DRBD starts manually, and promoting/demoting the device via drbdadm while
testing works perfectly. Under HA control, however, the primary node repeatedly
restarts DRBD and its dependent resources, and DRBD fails to promote on the
remaining active node when the primary is put in standby mode.
When both nodes are online, everything looks healthy in "service drbd status"
and /proc/drbd; however, the DRBD monitor operation is timing out, causing the
resources to be restarted on the primary node.


Jul  3 21:47:19 webtext-2 lrmd[2511]:  warning: child_timeout_callback: 
mysql_drbd_monitor_0 process (PID 18391) timed out
Jul  3 21:47:21 webtext-2 crmd[2514]:    error: process_lrm_event: LRM 
operation mysql_drbd_monitor_0 (666) Timed Out (timeout=20000ms)
Jul  3 22:51:46 webtext-2 lrmd[19204]:  warning: child_timeout_callback: 
mysql_drbd_promote_0 process (PID 21046) timed out
Jul  3 22:51:47 webtext-2 crmd[19207]:    error: process_lrm_event: LRM 
operation mysql_drbd_promote_0 (189) Timed Out (timeout=20000ms)


I have adjusted a number of timeout settings to allow for the limited hardware;
however, I am still not getting consistent failover. The current cluster and
DRBD configuration settings are below, and the full corosync/pacemaker logs,
with more detailed info, are available here:
https://dl.dropboxusercontent.com/u/89694994/corosync-logs.zip
I would appreciate some guidance on resolving this issue.

Many thanks,
Jimmy.



Cluster Setup and Configs.


# cat /proc/drbd
version: 8.4.2 (api:1/proto:86-101)
GIT-hash: 7ad5f850d711223713d6dcadc3dd48860321070c build by dag@Build64R6, 
2012-09-06 08:16:10
 0: cs:Connected ro:Primary/Secondary ds:UpToDate/UpToDate C r-----
    ns:376 nr:0 dw:376 dr:6961 al:7 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0

 After the primary node (webtext-1) is put in standby mode:


# cat /proc/drbd  (webtext-2)
version: 8.4.2 (api:1/proto:86-101)
GIT-hash: 7ad5f850d711223713d6dcadc3dd48860321070c build by dag@Build64R6, 
2012-09-06 08:16:10
 0: cs:WFConnection ro:Secondary/Unknown ds:UpToDate/Outdated C r-----
    ns:0 nr:0 dw:0 dr:0 al:0 bm:0 lo:0 pe:0 ua:0 ap:0 ep:1 wo:f oos:0

# cat /proc/drbd (webtext-1)
version: 8.4.2 (api:1/proto:86-101)
GIT-hash: 7ad5f850d711223713d6dcadc3dd48860321070c build by dag@Build64R6, 
2012-09-06 08:16:10






# crm_mon --inactive --group-by-node -1
Last updated: Wed Jul  3 23:17:32 2013
Last change: Wed Jul  3 22:51:38 2013 via cibadmin on webtext-2
Stack: cman
Current DC: webtext-1 - partition with quorum
Version: 1.1.10-1.el6-2718638
2 Nodes configured, unknown expected votes
5 Resources configured.


Node webtext-1: standby
Node webtext-2: online
        mysql_drbd      (ocf::linbit:drbd):     Started 

Inactive resources:

 Master/Slave Set: mysql_ms [mysql_drbd]
     Slaves: [ webtext-2 ]
     Stopped: [ webtext-1 ]
 Resource Group: mysql
     mysql_fs   (ocf::heartbeat:Filesystem):    Stopped 
     mysql_init (lsb:mysql):    Stopped 
 jboss_init     (lsb:jboss):    Stopped 

Failed actions:
    mysql_drbd_monitor_130000 (node=webtext-1, call=349, rc=1, status=Timed 
Out, last-rc-change=Wed Jul  3 22:44:49 2013
, queued=0ms, exec=0ms
): unknown error
    mysql_drbd_promote_0 (node=webtext-2, call=189, rc=1, status=Timed Out, 
last-rc-change=Wed Jul  3 22:51:25 2013
, queued=20980ms, exec=13ms
): unknown error



# crm configure show
node webtext-1 \
        attributes standby="on"
node webtext-2 \
        attributes standby="off"
primitive jboss_init lsb:jboss \
        op monitor interval="40" timeout="120" start-delay="320" \
        op start interval="0" timeout="320" \
        op stop interval="0" timeout="320" \
        meta target-role="Started"
primitive mysql_drbd ocf:linbit:drbd \
        params drbd_resource="r0" \
        op monitor interval="130" role="Master" \
        op monitor interval="140" role="Slave" \
        op stop interval="0" timeout="240" \
        op start interval="0" timeout="320" \
        meta target-role="Started"
primitive mysql_fs ocf:heartbeat:Filesystem \
        params device="/dev/drbd/by-res/r0" directory="/drbd0/" fstype="ext4" \
        op stop interval="0" timeout="120" \
        op start interval="0" timeout="120" \
        op monitor interval="40" timeout="120" \
        meta is-managed="true"
primitive mysql_init lsb:mysql \
        op stop interval="0" timeout="320" \
        op start interval="0" timeout="320" \
        meta is-managed="true"
group mysql mysql_fs mysql_init \
        meta target-role="Started"
ms mysql_ms mysql_drbd \
        meta master-max="1" master-node-max="1" clone-max="2" 
clone-node-max="1" notify="true" is-managed="true"
location drbd-fence-by-handler-r0-mysql_ms mysql_ms \
        rule $id="drbd-fence-by-handler-r0-rule-mysql_ms" $role="Master" -inf: 
#uname ne webtext-2.vennetics.com
colocation jboss_with_mysql inf: jboss_init mysql
colocation mysql_on_drbd inf: mysql mysql_ms:Master
order jboss_after_mysql inf: mysql_init jboss_init
order mysql_after_drbd inf: mysql_ms:promote mysql:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.10-1.el6-2718638" \
        cluster-infrastructure="cman" \
        stonith-enabled="false" \
        last-lrm-refresh="1372884412" \
        no-quorum-policy="ignore"




#vi /etc/drbd.conf


global {
    usage-count yes;
}
 
resource r0 {
 
    # write IO is reported as completed if it has reached both local
    # and remote disk
    protocol C;
 
    net {
        # set up peer authentication
        cram-hmac-alg sha1;
        shared-secret "test";
    }
 
    startup {
        # wait for connection timeout - boot process blocked
        # until DRBD resources are connected
        # -----  wfc-timeout 30;
        # WFC timeout if peer was outdated
        # -----  outdated-wfc-timeout 20;
        # WFC timeout if this node was in a degraded cluster (i.e. only had one
        # node left)
        # -----   degr-wfc-timeout 30;
    }
 
    disk {
         fencing resource-only;  
    }

    handlers {
        fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
    }


    # first node
    on webtext-1.vennetics.com {
        # DRBD device
        device /dev/drbd0;
        # backing store device
        disk /dev/vg_webtext1_02/lv_drbd0;
        # IP address of node, and port to listen on
        address 10.87.79.218:7788;
        # use internal meta data (don't create a filesystem before 
        # you create metadata!)
        meta-disk internal;
    }
    # second node
    on webtext-2.vennetics.com {
        # DRBD device
        device /dev/drbd0;
        # backing store device
        disk /dev/vg_webtext2_02/lv_drbd0;
        # IP address of node, and port to listen on
        address 10.87.79.219:7788;
        # use internal meta data (don't create a filesystem before
        # you create metadata!)
        meta-disk internal;
    }
}


vi /etc/cluster/cluster.conf

<?xml version="1.0"?>
<cluster config_version="1" name="webtext_cluster">
        <clusternodes>
                <clusternode name="webtext-1" nodeid="1">
                        <fence>
                                <method name="pcmk-redirect">
                                        <device name="pcmk" port="webtext-1"/>
                                </method>
                        </fence>
                </clusternode>
                <clusternode name="webtext-2" nodeid="2">
                        <fence>
                                <method name="pcmk-redirect">
                                        <device name="pcmk" port="webtext-2"/>
                                </method>
                        </fence>
                </clusternode>
        </clusternodes>
        <fencedevices>
                <fencedevice agent="fence_pcmk" name="pcmk"/>
        </fencedevices>
        <cman expected_votes="1" two_node="1"/>
        <logging to_syslog="yes" to_logfile="yes" syslog_facility="daemon"
                 syslog_priority="info" logfile_priority="info">
        <logging_daemon name="qdiskd"
             logfile="/var/log/cluster/qdiskd.log"  logfile_priority="debug"/>
        <logging_daemon name="fenced"
             logfile="/var/log/cluster/fenced.log"  logfile_priority="debug"/>
        <logging_daemon name="dlm_controld"
             logfile="/var/log/cluster/dlm_controld.log"  
logfile_priority="debug"/>
        <logging_daemon name="gfs_controld"
             logfile="/var/log/cluster/gfs_controld.log"  
logfile_priority="debug"/>
        <logging_daemon name="corosync" 
             logfile="/var/log/cluster/corosync.log" logfile_priority="debug"/>
        </logging>
</cluster>


# vi /etc/sysconfig/pacemaker 

# For non-systemd based systems, prefix export to each enabled line

# Turn on special handling for CMAN clusters in the init script
# Without this, fenced (and by inference, cman) cannot reliably be made to
# shut down
PCMK_STACK=cman

#==#==# Variables that control logging

# Enable debug logging globally or per-subsystem
# Multiple subsystems may be listed separated by commas
PCMK_debug=crmd,pengine,cib,stonith-ng,attrd,pacemakerd


# rpm -qa | grep pacemaker
pacemaker-cli-1.1.10-1.el6.x86_64
pacemaker-libs-1.1.10-1.el6.x86_64
pacemaker-cluster-libs-1.1.10-1.el6.x86_64
pacemaker-libs-devel-1.1.10-1.el6.x86_64
pacemaker-remote-1.1.10-1.el6.x86_64
pacemaker-cts-1.1.10-1.el6.x86_64
pacemaker-debuginfo-1.1.10-1.el6.x86_64
pacemaker-1.1.10-1.el6.x86_64

# rpm -qa | grep cman
cman-3.0.12.1-49.el6.x86_64

# rpm -qa | grep coro
corosync-1.4.1-15.el6_4.1.x86_64
corosynclib-1.4.1-15.el6_4.1.x86_64
corosynclib-devel-1.4.1-15.el6_4.1.x86_64

# rpm -qa | grep resource-agents
resource-agents-3.9.2-21.el6.x86_64


# rpm -qa | grep libqb
libqb-devel-0.14.2-3.el6.x86_64
libqb-0.14.2-3.el6.x86_64

# rpm -qa | grep drbd
drbd84-utils-8.4.2-1.el6.elrepo.x86_64
kmod-drbd84-8.4.2-1.el6_3.elrepo.x86_64


# ifconfig (webtext-2)
eth0      Link encap:Ethernet  HWaddr 52:54:00:65:EC:27  
          inet addr:10.87.79.217  Bcast:10.87.79.255  Mask:255.255.255.0
          inet6 addr: fe80::5054:ff:fe65:ec27/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:116526 errors:0 dropped:0 overruns:0 frame:0
          TX packets:104213 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:19340444 (18.4 MiB)  TX bytes:53027494 (50.5 MiB)
          Interrupt:10 Base address:0xc000 

eth1      Link encap:Ethernet  HWaddr 52:54:00:95:68:C1  
          inet addr:10.87.79.219  Bcast:10.87.79.255  Mask:255.255.255.0
          inet6 addr: fe80::5054:ff:fe95:68c1/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:436 errors:0 dropped:0 overruns:0 frame:0
          TX packets:14 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:25724 (25.1 KiB)  TX bytes:900 (900.0 b)
          Interrupt:10 Base address:0xe000 

# ifconfig (webtext-1)
eth0      Link encap:Ethernet  HWaddr 52:54:00:CB:9A:F4  
          inet addr:10.87.79.216  Bcast:10.87.79.255  Mask:255.255.255.0
          inet6 addr: fe80::5054:ff:fecb:9af4/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:121593 errors:0 dropped:0 overruns:0 frame:0
          TX packets:107920 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:54007733 (51.5 MiB)  TX bytes:22464750 (21.4 MiB)
          Interrupt:10 Base address:0xc000 

eth1      Link encap:Ethernet  HWaddr 52:54:00:30:07:C9  
          inet addr:10.87.79.218  Bcast:10.87.79.255  Mask:255.255.255.0
          inet6 addr: fe80::5054:ff:fe30:7c9/64 Scope:Link
          UP BROADCAST RUNNING MULTICAST  MTU:1500  Metric:1
          RX packets:510 errors:0 dropped:0 overruns:0 frame:0
          TX packets:12 errors:0 dropped:0 overruns:0 carrier:0
          collisions:0 txqueuelen:1000 
          RX bytes:29934 (29.2 KiB)  TX bytes:720 (720.0 b)
          Interrupt:10 Base address:0xe000 


# lvdisplay
  --- Logical volume ---
  LV Path                /dev/vg_webtext2_02/lv_drbd0
  LV Name                lv_drbd0
  VG Name                vg_webtext2_02
  LV UUID                d8ATq9-XPqT-mTAZ-By3H-dEoL-SoDV-ebCJL3
  LV Write Access        read/write
  LV Creation host, time webtext-2.vennetics.com, 2013-06-30 10:35:10 +0100
  LV Status              available
  # open                 2
  LV Size                4.00 GiB
  Current LE             1023
  Segments               1
  Allocation             inherit
  Read ahead sectors     auto
  - currently set to     256
  Block device           253:2

# lvdisplay
  --- Logical volume ---
  LV Path                /dev/vg_webtext1_02/lv_drbd0
  LV Name                lv_drbd0
  VG Name                vg_webtext1_02
  LV UUID                3qB7lS-zH0O-WIKC-F6nl-0cuE-2Zu9-95RkF9
  LV Write Access        read/write
  LV Creation host, time webtext-1.vennetics.com, 2013-06-30 12:00:59 +0100
  LV Status              available
  # open                 0
  LV Size                4.00 GiB
  Current LE             1023
  Segments               1
  Allocation             inherit
  Read ahead sectors     auto
  - currently set to     256
  Block device           253:2










_______________________________________________
Linux-HA mailing list
[email protected]
http://lists.linux-ha.org/mailman/listinfo/linux-ha
See also: http://linux-ha.org/ReportingProblems

[Linux-HA] Centos 6.4 KVM+ DRBD 8.4.2 + Pacemaker 1.1.10 - DRBD Monitor Timeout? Drbd promotion/demotion failing!

Reply via email to