I have a pretty basic setup by most people's standards, but there must be something that is not quite right about it. Sometimes when I force a resource failover from one server to the other, the clients with the NFS mounts don't cleanly migrate to the new server. I configured this using a few different "Pacemaker-DRBD-NFS" guides out there for reference (I believe they were the Linbit guides).

Sorry in advance for the long email.

Here is the config:
------------------------------
------------------------------
* Two identical servers
* Four exported NFS shares total (so I can independently fail over individual shares and run half on one server and half on the other)
* Bonded interface using LACP for "outgoing" client access
* Direct ethernet connection between the two servers (for Pacemaker/Corosync and DRBD)

Package versions (installed from either Debian Squeeze or Backports)
* lvm 2.02.66-5
* drbd 8.3.7-2.1
* nfs-kernel-server 1.2.2-4squeeze2
* pacemaker 1.1.7-1~bpo60+1
* corosync 1.4.2-1~bpo60+1

Each NFS share is built from the same stack of components and has its own virtual IP:

Hardware RAID -> /dev/sdb -> LVM -> DRBD single master (one resource for each share)
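
To make that concrete, here is roughly what one share (vni-storage, which becomes /dev/drbd2 in the config below) looks like. The VG/LV names, sizes, replication addresses and port here are illustrative/from memory, not copied from the real files:

    # LV carved out of the hardware RAID volume (names/sizes illustrative)
    lvcreate -L 500G -n vni-storage vg_data

    # /etc/drbd.d/vni-storage.res (DRBD 8.3 syntax, addresses illustrative)
    resource vni-storage {
      on storage1 {
        device    /dev/drbd2;
        disk      /dev/vg_data/vni-storage;
        address   192.168.1.1:7790;
        meta-disk internal;
      }
      on storage2 {
        device    /dev/drbd2;
        disk      /dev/vg_data/vni-storage;
        address   192.168.1.2:7790;
        meta-disk internal;
      }
    }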


Here is the pacemaker config (I really hope it doesn't get mangled):
====================
node storage1 \
    attributes standby="off"
node storage2 \
    attributes standby="off"
primitive p_drbd_distribion_storage ocf:linbit:drbd \
    params drbd_resource="distribion-storage" \
    op monitor interval="15" role="Master" \
    op monitor interval="30" role="Slave"
primitive p_drbd_vni_storage ocf:linbit:drbd \
    params drbd_resource="vni-storage" \
    op monitor interval="15" role="Master" \
    op monitor interval="30" role="Slave"
primitive p_drbd_xen_data1 ocf:linbit:drbd \
    params drbd_resource="xen-data1" \
    op monitor interval="15" role="Master" \
    op monitor interval="30" role="Slave"
primitive p_drbd_xen_data2 ocf:linbit:drbd \
    params drbd_resource="xen-data2" \
    op monitor interval="15" role="Master" \
    op monitor interval="30" role="Slave"
primitive p_exportfs_distribion_storage ocf:heartbeat:exportfs \
    params fsid="1" directory="/data/distribion-storage" options="rw,async,no_root_squash,subtree_check" clientspec="10.205.152.0/21" wait_for_leasetime_on_stop="false" \
    op monitor interval="30s"
primitive p_exportfs_vni_storage ocf:heartbeat:exportfs \
    params fsid="2" directory="/data/vni-storage" options="rw,async,no_root_squash,subtree_check" clientspec="10.205.152.0/21" wait_for_leasetime_on_stop="false" \
    op monitor interval="30s"
primitive p_exportfs_xen_data1 ocf:heartbeat:exportfs \
    params fsid="3" directory="/data/xen-data1" options="rw,async,no_root_squash,subtree_check" clientspec="10.205.152.0/21" wait_for_leasetime_on_stop="false" \
    op monitor interval="30s"
primitive p_exportfs_xen_data2 ocf:heartbeat:exportfs \
    params fsid="4" directory="/data/xen-data2" options="rw,async,no_root_squash,subtree_check" clientspec="10.205.152.0/21" wait_for_leasetime_on_stop="false" \
    op monitor interval="30s"
primitive p_fs_distribion_storage ocf:heartbeat:Filesystem \
params fstype="xfs" directory="/data/distribion-storage" device="/dev/drbd1" \
    meta target-role="Started"
primitive p_fs_vni_storage ocf:heartbeat:Filesystem \
    params fstype="xfs" directory="/data/vni-storage" device="/dev/drbd2"
primitive p_fs_xen_data1 ocf:heartbeat:Filesystem \
    params fstype="xfs" directory="/data/xen-data1" device="/dev/drbd3" \
    meta target-role="Started"
primitive p_fs_xen_data2 ocf:heartbeat:Filesystem \
    params fstype="xfs" directory="/data/xen-data2" device="/dev/drbd4" \
    meta target-role="Started"
primitive p_ip_distribion_storage ocf:heartbeat:IPaddr2 \
    params ip="10.205.154.137" cidr_netmask="21" \
    op monitor interval="20s"
primitive p_ip_vni_storage ocf:heartbeat:IPaddr2 \
    params ip="10.205.154.138" cidr_netmask="21" \
    op monitor interval="20s"
primitive p_ip_xen_data1 ocf:heartbeat:IPaddr2 \
    params ip="10.205.154.139" cidr_netmask="21" \
    op monitor interval="20s"
primitive p_ip_xen_data2 ocf:heartbeat:IPaddr2 \
    params ip="10.205.154.140" cidr_netmask="21" \
    op monitor interval="20s"
primitive p_lsb_nfsserver lsb:nfs-kernel-server \
    op monitor interval="30s"
primitive p_ping ocf:pacemaker:ping \
    params host_list="10.205.154.66" multiplier="100" \
    op monitor interval="15s" timeout="5s"
group g_nfs_distribion_storage p_ip_distribion_storage p_fs_distribion_storage p_exportfs_distribion_storage
group g_nfs_vni_storage p_ip_vni_storage p_fs_vni_storage p_exportfs_vni_storage \
    meta is-managed="true" target-role="Started"
group g_nfs_xen_data1 p_ip_xen_data1 p_fs_xen_data1 p_exportfs_xen_data1
group g_nfs_xen_data2 p_ip_xen_data2 p_fs_xen_data2 p_exportfs_xen_data2
ms ms_drbd_distribion_storage p_drbd_distribion_storage \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
ms ms_drbd_vni_storage p_drbd_vni_storage \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" is-managed="true" target-role="Started"
ms ms_drbd_xen_data1 p_drbd_xen_data1 \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
ms ms_drbd_xen_data2 p_drbd_xen_data2 \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true"
clone cl_lsb_nfsserver p_lsb_nfsserver \
    meta target-role="Started"
clone cl_ping p_ping \
    meta globally-unique="false"
location l_live_distribion_storage g_nfs_distribion_storage \
    rule $id="l_live_distribion_storage-rule" -inf: not_defined pingd or pingd lte 0
location l_live_vni_storage g_nfs_vni_storage \
    rule $id="l_live_vni_storage-rule" -inf: not_defined pingd or pingd lte 0
location l_live_xen_data1 g_nfs_xen_data1 \
    rule $id="l_live_xen_data1-rule" -inf: not_defined pingd or pingd lte 0
location l_live_xen_data2 g_nfs_xen_data2 \
    rule $id="l_live_xen_data2-rule" -inf: not_defined pingd or pingd lte 0
colocation c_p_fs_distribion_storage_on_ms_drbd_distribion_storage inf: g_nfs_distribion_storage ms_drbd_distribion_storage:Master
colocation c_p_fs_vni_storage_on_ms_drbd_vni_storage inf: g_nfs_vni_storage ms_drbd_vni_storage:Master
colocation c_p_fs_xen_data1_on_ms_drbd_xen_data1 inf: g_nfs_xen_data1 ms_drbd_xen_data1:Master
colocation c_p_fs_xen_data2_on_ms_drbd_xen_data2 inf: g_nfs_xen_data2 ms_drbd_xen_data2:Master
order o_ms_drbd_distribion_storage_before_p_fs_distribion_storage inf: ms_drbd_distribion_storage:promote g_nfs_distribion_storage:start
order o_ms_drbd_vni_storage_before_p_fs_vni_storage inf: ms_drbd_vni_storage:promote g_nfs_vni_storage:start
order o_ms_drbd_xen_data1_before_p_fs_xen_data1 inf: ms_drbd_xen_data1:promote p_exportfs_vni_storage:start
order o_ms_drbd_xen_data2_before_p_fs_xen_data2 inf: ms_drbd_xen_data2:promote g_nfs_xen_data2:start
property $id="cib-bootstrap-options" \
    dc-version="1.1.6-9971ebba4494012a93c03b40a2c58ec0eb60f50c" \
    cluster-infrastructure="openais" \
    expected-quorum-votes="2" \
    stonith-enabled="false" \
    no-quorum-policy="ignore" \
    last-lrm-refresh="1350405150"
rsc_defaults $id="rsc-options" \
    resource-stickiness="200"
====================

------------------------------
------------------------------

Now on to the issue I'm experiencing. I have a particular client machine that mounts /data/vni-storage via NFS as its Apache document root (/var/www). If I log into that client and do an "ls /var/www", I'll see the files. I then manually force a resource migration ("crm resource migrate g_nfs_vni_storage storage2"). The resource migrates successfully on the back end (as shown by crm status), and everything seems fine. However, if I issue an "ls /var/www" again on the client, it basically hangs and never properly "sees" the share at its new location. If I wait long enough (usually a matter of minutes), it will sometimes eventually spit out an I/O error. I've even had instances (without any intervention on my part) where the ocf:heartbeat:exportfs resource would time out (according to the logs) and "re-export" itself. The server will still show everything running fine, but the client will now be reporting a "stale NFS file handle" error.
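
In other words, the sequence is roughly this (paraphrasing my shell history; the resource and node names are the ones from the config above):

    # on the client
    ls /var/www        # works, files are listed

    # on either cluster node
    crm resource migrate g_nfs_vni_storage storage2
    crm status         # g_nfs_vni_storage now running on storage2, all green

    # back on the client
    ls /var/www        # hangs; after several minutes it sometimes returns an
                       # I/O error, or later a "stale NFS file handle"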

I've done some research to try to understand the issue, and several posts point out that the fsid parameter needs to be identical on both cluster servers so that the NFS file handles handed out to clients stay valid after a failover. In fact, I've had that parameter set on p_exportfs_vni_storage since the initial deployment about six months ago.
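
If I'm reading the exportfs RA right, the export it sets up on whichever node currently holds the group should boil down to something like the following (my reconstruction, not copied from any logs), and since the fsid comes from the one cluster resource it is by definition the same on both nodes:

    # roughly what p_exportfs_vni_storage does on the active node
    exportfs -o rw,async,no_root_squash,subtree_check,fsid=2 \
        10.205.152.0/21:/data/vni-storage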

I then had an issue the other day where I had to manually migrate the share to the other server, which ultimately led to issues with some of the OTHER NFS shares (namely g_nfs_xen_data1). This was a bad share to have trouble with, as it is an NFS storage repository for our XenServer VM guests, which led to all sorts of "disk failure" issues on the guests.

After looking around some more today, my first thought was that multiple NFS shares might not be well supported (even though I really need them to be this way). I took a look at the resource script for exportfs (/usr/lib/ocf/resource.d/heartbeat/exportfs), and I noticed that when the script makes a copy of /var/lib/nfs/rmtab in the backup_rmtab function, it filters out any shares that don't match the exported directory of the active resource. It looks like this may become a problem when the restore_rmtab function is later called after a resource migration, because now /var/lib/nfs/rmtab will only contain the directory for the active resource and not the other three NFS mounts. Maybe this leads to the failover issue?
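
Paraphrased from memory (this is the gist of what I think the agent does, not a verbatim copy), the part that worries me is essentially:

    # backup_rmtab: only rmtab entries matching THIS resource's directory are
    # saved; the backup file lives inside the exported directory itself, so it
    # travels with the DRBD device
    grep ":${OCF_RESKEY_directory}:" /var/lib/nfs/rmtab > "$rmtab_backup"

    # restore_rmtab: on start after a failover, that filtered backup is fed
    # back into /var/lib/nfs/rmtab on the new node -- entries for my other
    # three exports were never part of this resource's backup in the first place
    sort -u "$rmtab_backup" /var/lib/nfs/rmtab > "$tmp" && mv "$tmp" /var/lib/nfs/rmtab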


So to sum it up:

Was ocf:heartbeat:exportfs intended to work with multiple, separate NFS shares? Given the way the rmtab file is backed up, it doesn't look like it. If it wasn't, what would be the recommended course of action? Even if I manage the exports outside of Pacemaker, I still have to worry about keeping /var/lib/nfs/rmtab copied over to the other server.
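
(By "copied over" I mean something crude like the following, e.g. a cron entry pushing rmtab across the direct link to the peer; purely illustrative, not something I actually have in place:)

    # /etc/crontab on whichever node is active (illustrative only)
    * * * * *  root  rsync -a /var/lib/nfs/rmtab storage2:/var/lib/nfs/rmtab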

Regarding the client getting the "stale NFS file handle" error and having trouble failing over: could this be related to Apache keeping a lot of files open on that share (primarily log files)? Would that affect the NFS client's ability to "reconnect" to the new server?
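
For what it's worth, these are the kinds of checks I can run on the client if it would help diagnose this (standard tools, nothing exotic):

    # processes holding files open under the NFS mount
    fuser -vm /var/www
    lsof /var/www

    # mount options actually in effect (hard/soft, NFS version, etc.)
    grep /var/www /proc/mounts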

Are there any other obvious mistakes in my Pacemaker config, or improvements I could make?

Thanks.


--
Justin Pasher
