On Sun, Oct 2, 2011 at 12:31 AM, Gerald Vogt <[email protected]> wrote:
> On 02.10.11 03:18, Serge Dubrouski wrote:
> > 1. You expect rndc and host to be in $PATH. At the same time the path
> > to named can be configured. To be consistent, I think the same should
> > apply to rndc and host, as they are bind utils.
> >
> > On our CentOS servers we run the latest version of bind, compiled
> from
> > source and installed in a custom path which is added in /etc/profile.
> > For some reason /etc/profile doesn't seem to apply to the ocf scripts
> > thus the script doesn't find rndc or host unless I extend PATH
> manually
> > at the beginning of the script.
> >
> >
> > We had some discussion around this and finally decided to leave it up
> > to the sysadmin to make sure that both tools are available in PATH. One
> > can always create a couple of symlinks to cover it.
>
> But isn't it inconsistent that you can set the named path as a parameter
> but not rndc or host? named, rndc, and host all come out of a bind
> installation and they all run on the same host...
>
> > 2. In the stop function you call "rndc stop" to stop the daemon.
> > However, if the daemon hangs, rndc will hang. Thus pacemaker runs
> > into a timeout and kills the ocf script, leading to a failed stop.
> >
> >
> > You didn't read the code carefully again. Yes it does exactly what you
> > want or at least it's supposed to:
> >
> > if ! $RNDC stop >/dev/null; then
>
> The problem is your script never gets beyond this line. rndc tries to
> contact named, which is hanging. I don't know what timeout rndc has
> exactly, but at least on our CentOS installation it doesn't time out
> within 60s.
>
> 60s is currently the timeout we have set in the "primitive" declaration.
> Thus after 60s pacemaker assumes your script is hanging and kills your
> script with TERM.
>
> As I wrote before: you should be able to test this easily by sending a
> STOP signal to the named process. At least in this situation I see that
> the "rndc stop" doesn't return before those 60s.
>
Indeed, you are right. Thanks for catching that. Attached is the patch that
fixes this issue. It also makes the rndc and host commands configurable.
Please take a look at the patch, and if it's all right I'll ask the pacemaker
team to push it into git.
Thanks again.
>
> > kill `cat ${OCF_RESKEY_named_pidfile}`
> > fi
> >
> > if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
> > # Allow 2/3 of the action timeout for the orderly shutdown
> > # (The origin unit is ms, hence the conversion)
> > timeout=$((OCF_RESKEY_CRM_meta_timeout/1500))
> > else
> > timeout=20
> > fi
> >
> > while named_status ; do
> > if [ $timeout -ge ${OCF_RESKEY_named_stop_timeout} ]; then
> > break
> > else
> > sleep 1
> > timeout=$((timeout++))
> > fi
> > done
> >
> > *#If still up*
> > * if named_status 2>&1; then*
> > * ocf_log err "named is still up! Killing";*
> > * kill -9 `cat ${OCF_RESKEY_named_pidfile}`*
> > * fi*
> >
> >
> > I think the ocf script should have its own timeout and abort the rndc
> > call if it takes too long and then try to kill the server.
> >
> >
> > See above.
> >
> >
> >
> > To test send a STOP signal to named and wait...
>
> Gerald
>
> _______________________________________________
> Pacemaker mailing list: [email protected]
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs:
> http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker
>
--
Serge Dubrouski.
diff --git a/heartbeat/named b/heartbeat/named
index 8d15db6..e115eaf 100755
--- a/heartbeat/named
+++ b/heartbeat/named
@@ -15,23 +15,23 @@
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
-# Used binaries
-RNDC="rndc"
-HOST="host"
-
#Defaults
OCF_RESKEY_named_default="/usr/sbin/named"
+OCF_RESKEY_rndc_default="/usr/sbin/rndc"
+OCF_RESKEY_host_default="/usr/bin/host"
OCF_RESKEY_named_user_default=named
OCF_RESKEY_named_config_default=""
OCF_RESKEY_named_pidfile_default="/var/run/named/named.pid"
OCF_RESKEY_named_rootdir_default=""
OCF_RESKEY_named_options_default=""
OCF_RESKEY_named_keytab_file_default=""
-OCF_RESKEY_monitor_request_default="localhost"
+OCF_RESKEY_monitor_request_default="localhost.localdomain"
OCF_RESKEY_monitor_response_default="127.0.0.1"
OCF_RESKEY_monitor_ip_default="127.0.0.1"
: ${OCF_RESKEY_named=${OCF_RESKEY_named_default}}
+: ${OCF_RESKEY_rndc=${OCF_RESKEY_rndc_default}}
+: ${OCF_RESKEY_host=${OCF_RESKEY_host_default}}
: ${OCF_RESKEY_named_user=${OCF_RESKEY_named_user_default}}
: ${OCF_RESKEY_named_config=${OCF_RESKEY_named_config_default}}
: ${OCF_RESKEY_named_pidfile=${OCF_RESKEY_named_pidfile_default}}
@@ -80,6 +80,24 @@ Path to the named command.
<content type="string" default="${OCF_RESKEY_named_default}" />
</parameter>
+<parameters>
+<parameter name="rndc" unique="0" required="0">
+<longdesc lang="en">
+Path to the rndc command.
+</longdesc>
+<shortdesc lang="en">rndc</shortdesc>
+<content type="string" default="${OCF_RESKEY_rndc_default}" />
+</parameter>
+
+<parameters>
+<parameter name="host" unique="0" required="0">
+<longdesc lang="en">
+Path to the host command.
+</longdesc>
+<shortdesc lang="en">host</shortdesc>
+<content type="string" default="${OCF_RESKEY_host_default}" />
+</parameter>
+
<parameter name="named_user" unique="0" required="0">
<longdesc lang="en">
User that should own named process.
@@ -187,8 +205,8 @@ EOF
# Validate most critical parameters
named_validate_all() {
check_binary $OCF_RESKEY_named
- check_binary $RNDC
- check_binary $HOST
+ check_binary $OCF_RESKEY_rndc
+ check_binary $OCF_RESKEY_host
if [ -n "$OCF_RESKEY_named_config" -a \
! -r "${OCF_RESKEY_named_rootdir}/${OCF_RESKEY_named_config}" ]; then
@@ -256,7 +274,7 @@ named_monitor() {
return $OCF_NOT_RUNNING
fi
- output=`$HOST $OCF_RESKEY_monitor_request $OCF_RESKEY_monitor_ip`
+ output=`$OCF_RESKEY_host $OCF_RESKEY_monitor_request $OCF_RESKEY_monitor_ip`
if [ $? -ne 0 ] || ! echo $output | grep -q '.* has address '"$OCF_RESKEY_monitor_response"
then
@@ -274,7 +292,7 @@ named_monitor() {
#
named_reload() {
- $RNDC reload >/dev/null || return $OCF_ERR_GENERIC
+ $OCF_RESKEY_rndc reload >/dev/null || return $OCF_ERR_GENERIC
return $OCF_SUCCESS
}
@@ -338,33 +356,38 @@ named_start() {
named_stop () {
local timeout
+ local timewait
named_status || return $OCF_SUCCESS
- if ! $RNDC stop >/dev/null; then
+ $OCF_RESKEY_rndc stop >/dev/null &
+ if [ $? -ne 0 ]; then
+ ocf_log info "rndc stop failed. Killing named."
kill `cat ${OCF_RESKEY_named_pidfile}`
fi
if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then
# Allow 2/3 of the action timeout for the orderly shutdown
# (The origin unit is ms, hence the conversion)
- timeout=$((OCF_RESKEY_CRM_meta_timeout/1500))
+ timewait=$((OCF_RESKEY_CRM_meta_timeout/1500))
else
- timeout=20
+ timewait=20
fi
-
+
+ sleep 1; timeout=0 # Sleep here for 1 sec to let rndc finish.
while named_status ; do
- if [ $timeout -ge ${OCF_RESKEY_named_stop_timeout} ]; then
+ if [ $timeout -ge $timewait ]; then
break
else
sleep 1
- timeout=$((timeout++))
+ timeout=`expr $timeout + 1`
+ ocf_log debug "named appears to hung, waiting ..."
fi
done
#If still up
if named_status 2>&1; then
- ocf_log err "named is still up! Killing";
+ ocf_log err "named is still up! Killing"
kill -9 `cat ${OCF_RESKEY_named_pidfile}`
fi
_______________________________________________
Pacemaker mailing list: [email protected]
http://oss.clusterlabs.org/mailman/listinfo/pacemaker
Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker