Hi Yuichi
 
I create two patches trying to fix this issue. 

In these patches, expand lockfile() to let it not only record the daemon pid, 
but also record daemon starting status(include "starting" and "started") .
At the same time, modify the logic of the controld RA, so that it can read 
that status and return more precise result.

Would you mind testing it to see if it works for you?

Regards,
 Xia Li

>>> On 1/23/2013 at 12:43 PM, in message
<CAMb0o5J2JS3earuk5Z++O+z7p5f+5z=mmwzrna7h+5nzect...@mail.gmail.com>, Yuichi
SEINO <seino.clust...@gmail.com> wrote: 
> Hi Jiaju, 
>  
> I understood about the complete solution. 
> However because this issue causes the critical problem that multiple 
> resources start, Could you apply this request or simply revert a 
> commit to tentatively handle this issue until you are resolved at the 
> summer? I think that we are difficult to avoid  this issue by the 
> operation unlike booth deadlock etc. If booth does not start at the 
> same time, then booth can avoid deadlock. 
>  
> This issue caused following things. 
> * Multiple resources start. 
> * When booth causes deadlock, the resource timeout dose not happen. 
> Previous, we could watch timeout on crm_mon. Currently, timeout 
> happens after booth was daemon. 
>  
> Sincerely, 
> Yuichi 
>  
> 2013/1/21 Jiaju Zhang <jjzh...@suse.de>: 
> > Hi Yuichi, 
> > 
> > On Fri, 2013-01-18 at 17:02 +0900, Yuichi SEINO wrote: 
> >> Hi Jiaju, 
> >> 
> >> I try fixing this issue by reverting a commit. What do you think about it? 
> >> https://github.com/jjzhang/booth/pull/48 
> > 
> > Moving the while setup stage before daemonizing seems not to be a sane 
> > solution. setup_ticket() needs to get the latest ticket information by 
> > communicating with other nodes. Currently it was there and using TCP, 
> > but long term and sane solution would be to move it to the main poll(), 
> > asynchronously waiting for catch-up result. Before catching-up was 
> > ready, booth can still response, it can participate in Paxos as a 
> > non-voting member. 
> > 
> > To fix this issue, how do you think if we remove the stale ticket 
> > information in the CIB once booth was starting? We already have the APIs 
> > in pacemaker.c which can clear the ticket information in the CIB. This 
> > step is reasonable because the tickets at that moment is really stale 
> > data. 
> > 
> > About the implementation, I have not thought it in very detail but one 
> > idea that came into my mind is that maybe we can expand lockfile() (or 
> > some wrapper to lockfile()) to let it do more things, not only record 
> > the daemon pid, but also record daemon starting status, like "starting", 
> > "started", thus, the controld RA can read that status and return more 
> > precise result. 
> > 
> > I'll have Xia to look into this problem in more detail. 
> > 
> > Thanks, 
> > Jiaju 
> > 
> > 
>  
>  
>  
> -- 
> Yuichi SEINO 
> METROSYSTEMS CORPORATION 
> E-mail:seino.clust...@gmail.com 
>  
>  


Index: booth/src/main.c
===================================================================
--- booth.orig/src/main.c
+++ booth/src/main.c
@@ -63,6 +63,12 @@ static int client_size = 0;
 struct client *client = NULL;
 struct pollfd *pollfd = NULL;
 
+typedef enum 
+{
+       BOOTHD_STARTED=0,
+       BOOTHD_STARTING
+} BOOTH_DAEMON_STATE;
+
 int poll_timeout = -1;
 
 typedef enum {
@@ -450,7 +456,30 @@ static int setup_timer(void)
        return timerlist_init();
 }
 
-static int loop(void)
+static int write_daemon_state(int fd, int state)
+{
+       char buf[16];
+       int rv=1;
+       
+       memset(buf, 0, sizeof(buf));
+       snprintf(buf, sizeof(buf), "%d %d", getpid(), state);
+
+       rv = lseek(fd, 0, SEEK_SET);
+       if (rv <= 0) {
+               log_error("lseek set offset to 0 error: %d: %s",
+                       fd, strerror(errno));
+       } 
+       
+       rv = write(fd, buf, strlen(buf));
+       if (rv <= 0) {
+               log_error("lockfile write error %d: %s",
+                      fd, strerror(errno));
+               return rv;    
+       }
+
+       return rv;
+}
+static int loop(int fd)
 {
        void (*workfn) (int ci);
        void (*deadfn) (int ci);
@@ -473,6 +502,18 @@ static int loop(void)
                goto fail;
        client_add(rv, process_listener, NULL);
 
+       rv = write_daemon_state(fd, BOOTHD_STARTED);
+       if (rv <= 0) {
+               log_error("lockfile write state %d error %s: %s",
+                      BOOTHD_STARTED, cl.lockfile, strerror(errno));
+               goto fail;
+       }
+
+       if (cl.type == ARBITRATOR)
+               log_info("BOOTH arbitrator daemon started");
+       else if (cl.type == SITE)
+               log_info("BOOTH cluster site daemon started");
+
         while (1) {
                 rv = poll(pollfd, client_maxi + 1, poll_timeout);
                 if (rv == -1 && errno == EINTR)
@@ -681,9 +722,10 @@ static int do_revoke(void)
        return do_command(BOOTHC_CMD_REVOKE);
 }
 
+
+
 static int lockfile(void)
 {
-       char buf[16];
        struct flock lock;
        int fd, rv;
 
@@ -691,39 +733,41 @@ static int lockfile(void)
        if (fd < 0) {
                 log_error("lockfile open error %s: %s",
                           cl.lockfile, strerror(errno));
-                return -1;
-        }       
-
-        lock.l_type = F_WRLCK;
-        lock.l_start = 0;
-        lock.l_whence = SEEK_SET;
-        lock.l_len = 0;
-        
-        rv = fcntl(fd, F_SETLK, &lock);
-        if (rv < 0) {
-                log_error("lockfile setlk error %s: %s",
-                          cl.lockfile, strerror(errno));
-                goto fail;
-        }
-
-        rv = ftruncate(fd, 0);
-        if (rv < 0) {
-                log_error("lockfile truncate error %s: %s",
-                          cl.lockfile, strerror(errno));
-                goto fail;
-        }
+            return -1;
+    }       
 
-        memset(buf, 0, sizeof(buf));
-        snprintf(buf, sizeof(buf), "%d\n", getpid());
-
-        rv = write(fd, buf, strlen(buf));
-        if (rv <= 0) {
-                log_error("lockfile write error %s: %s",
-                          cl.lockfile, strerror(errno));
-                goto fail;
-        }
+    lock.l_type = F_WRLCK;
+    lock.l_start = 0;
+    lock.l_whence = SEEK_SET;
+    lock.l_len = 0;
+    
+    rv = fcntl(fd, F_SETLK, &lock);
+    if (rv < 0) {
+            log_error("lockfile setlk error %s: %s",
+                      cl.lockfile, strerror(errno));
+            goto fail;
+    }
+
+    rv = ftruncate(fd, 0);
+    if (rv < 0) {
+            log_error("lockfile truncate error %s: %s",
+                      cl.lockfile, strerror(errno));
+            goto fail;
+    }
+
+    /*memset(buf, 0, sizeof(buf));
+    snprintf(buf, sizeof(buf), "%d %d\n", getpid(), BOOTHD_STARTING);
+
+    rv = write(fd, buf, strlen(buf));
+    */
+    rv = write_daemon_state(fd, BOOTHD_STARTING);
+    if (rv <= 0) {
+               log_error("lockfile write error %s: %s",
+                      cl.lockfile, strerror(errno));
+               goto fail;
+    }
 
-        return fd;
+    return fd;
  fail:
         close(fd);
         return -1;
@@ -986,14 +1030,14 @@ static int do_server(int type)
                return fd;
 
        if (type == ARBITRATOR)
-               log_info("BOOTH arbitrator daemon started");
+               log_info("BOOTH arbitrator daemon is starting.");
        else if (type == SITE)
-               log_info("BOOTH cluster site daemon started");
+               log_info("BOOTH cluster site daemon is starting.");
 
        set_scheduler();
        set_oom_adj(-16);
 
-       rv = loop();
+       rv = loop(fd);
 
 out:
        if (fd >= 0)
Index: booth/script/lsb/booth-arbitrator
===================================================================
--- booth.orig/script/lsb/booth-arbitrator
+++ booth/script/lsb/booth-arbitrator
@@ -24,6 +24,12 @@ exec="/usr/sbin/$prog"
 type="arbitrator"       
 lockfile="/var/run/booth.pid"
 
+BOOTH_DAEMON_STARTED=0
+BOOTH_DAEMON_STARTING=1
+BOOTH_DAEMON_EXIST=2
+BOOTH_DAEMON_NOT_RUNNING=3
+BOOTH_ERROR_GENERIC=4
+
 . /etc/rc.status
 
 internal_status() {
@@ -32,16 +38,22 @@ internal_status() {
 }
 
 check_status() {
-       internal_status
-       if [ "$?" -eq 0 ];then
-               pidnum=$(cat $lockfile)
+       internal_status ; rc=$?
+
+       if [ $rc == 0 ];then
+               pidnum=$(cat $lockfile|awk '{print $1}')
+               daemonstate=$(cat $lockfile |awk '{print $2}')
                if cat /proc/$pidnum/cmdline |grep $type > /dev/null 2>&1 ; then
-                       return 0  #arbitrator daemon is Running
+                       case $daemonstate in
+                       $BOOTH_DAEMON_STARTED) return $BOOTH_DAEMON_STARTED;;
+                       $BOOTH_DAEMON_STARTING) return $BOOTH_DAEMON_STARTING;; 
#arbitrator daemon is Running
+                       *) return $BOOTH_ERROR_GENERIC;;
+                       esac
                else
-                       return 1  #site daemon is Running
+                       return $BOOTH_DAEMON_EXIST  #site daemon is Running
                fi
        else
-               return 2 #BOOTH daemon is not running    
+               return $BOOTH_DAEMON_NOT_RUNNING #BOOTH daemon is not running   
 
        fi    
 }
 
@@ -58,18 +70,21 @@ status() {
 
 start() {
        [ -x $exec ] || exit 5
-       check_status
-  rc=$?
+       check_status; rc=$?
        case "$rc" in
-       0)
+       $BOOTH_DAEMON_STARTED)
                echo "BOOTH arbitrator daemon is Running"
-               return 1
+               return 0
+               ;;
+       $BOOTH_DAEMON_STARTING)
+               echo "BOOTH arbitrator daemon is Running"
+               return 0
                ;;
-       1)
+       $BOOTH_DAEMON_EXIST)
                echo "BOOTH site daemon is Running"
                return 1
                ;;
-       2)
+       $BOOTH_DAEMON_NOT_RUNNING)
                echo -n $"Starting BOOTH arbitrator daemon: "           
                startproc $exec $type
                rc_status -v
@@ -78,14 +93,23 @@ start() {
 }
 
 stop() {
-       if check_status; then
-               echo -n $"Stopping BOOTH arbitrator daemon: "
-               killproc -p $lockfile $prog -TERM
-               rc_status -v
-       else
+       check_status; rc=$?
+       case $rc in
+       $BOOTH_DAEMON_STARTED);;
+       $BOOTH_DAEMON_STARTING);;
+       $BOOTH_DAEMON_EXIST) 
+               echo "BOOTH site daemon is running."
+               return $OCF_ERR_GENERIC;;
+       $BOOTH_DAEMON_NOT_RUNNING)
                echo "BOOTH arbitrator daemon is not running."
-               return 1
-       fi
+               return $OCF_SUCCESS 
+       ;;
+       *) return $OCF_ERR_GENERIC;;
+       esac
+       
+       echo -n $"Stopping BOOTH arbitrator daemon: "
+       killproc -p $lockfile $prog -TERM
+       rc_status -v
 }
 
 wait_for_stop() {
Index: booth/script/ocf/booth-site
===================================================================
--- booth.orig/script/ocf/booth-site
+++ booth/script/ocf/booth-site
@@ -25,6 +25,14 @@
 #######################################################################
 # Initialization:
 
+lockfile="/var/run/booth.pid"
+
+BOOTH_DAEMON_STARTED=0
+BOOTH_DAEMON_STARTING=1
+BOOTH_DAEMON_EXIST=2
+BOOTH_DAEMON_NOT_RUNNING=3
+BOOTH_ERROR_GENERIC=4
+
 . ${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs
 
 #######################################################################
@@ -36,6 +44,7 @@ meta_data() {
 <resource-agent name="booth-site" version="0.9">
 <version>1.0</version>
 
+
 <longdesc lang="en">
 This Resource Agent can control the BOOTH site daemon.
 It assumes that the binary boothd is in your default PATH.
@@ -92,55 +101,109 @@ Expects to have a fully populated OCF RA
 END
 }
 
-booth_start() {
-    booth_monitor; rc=$?
-
-    case $rc in
-      $OCF_SUCCESS)     return $OCF_SUCCESS;;
-      $OCF_NOT_RUNNING) ;;
-      *) return $OCF_ERR_GENERIC;;
-    esac
-
-    ${OCF_RESKEY_daemon} $OCF_RESKEY_type $OCF_RESKEY_args
-
-    sleep 1
-    booth_monitor
+booth_check_daemon_exist(){
+       killall -0 ${OCF_RESKEY_daemon} >/dev/null 2>&1; rc=$?
+       
+       case $rc in
+       0) return $OCF_SUCCESS;;
+       1) return $OCF_NOT_RUNNING;;
+       *) return $OCF_ERR_GENERIC;; 
+       esac
 }
 
-booth_stop() {
-    booth_monitor; rc=$?
+booth_check_daemon_state(){
+       booth_check_daemon_exist ; rc=$?
+       
+       case $rc in 
+       $OCF_SUCCESS)
+               pidnum=$(cat $lockfile |awk '{print $1}')
+               daemonstate=$(cat $lockfile |awk '{print $2}')
+               if cat /proc/$pidnum/cmdline |grep $OCF_RESKEY_type >/dev/null 
2>&1; then
+                       case  $daemonstate in 
+                       $BOOTH_DAEMON_STARTED) return $BOOTH_DAEMON_STARTED;;
+                       $BOOTH_DAEMON_STARTING) return $BOOTH_DAEMON_STARTING;;
+                       *) return $BOOTH_ERROR_GENERIC;;        
+                       esac
+               else
+                       return $BOOTH_DAEMON_EXIST;
+               fi
+       ;;
+       $OCF_NOT_RUNNING) return $BOOTH_DAEMON_NOT_RUNNING;;
+       $OCF_ERR_GENERIC) return $BOOTH_ERROR_GENERIC;;
+       *) return $BOOTH_ERROR_GENERIC;;
+       esac
+}
 
-    if [ $rc = $OCF_NOT_RUNNING ]; then
-       return $OCF_SUCCESS
-    fi
+booth_start() {
+       booth_check_daemon_state; rc=$?
 
-    killall -TERM ${OCF_RESKEY_daemon}; rc=$?
+       case $rc in
+       $BOOTH_DAEMON_STARTED)     return $OCF_SUCCESS;;
+       $BOOTH_DAEMON_STARTING) 
+               while [ $rc != $BOOTH_DAEMON_STARTED ]; do
+                       sleep 1
+                       booth_check_daemon_state ; rc=$?
+               done
+               return $OCF_SUCCESS;;
+       $BOOTH_DAEMON_EXIST) return $OCF_ERR_GENERIC;;
+       $BOOTH_DAEMON_NOT_RUNNING) ;;
+       *) return $OCF_ERR_GENERIC;;
+       esac
 
-    if [ $rc != 0 ]; then
-       return $OCF_ERR_GENERIC
-    fi
+       ${OCF_RESKEY_daemon} $OCF_RESKEY_type $OCF_RESKEY_args
 
-    rc=$OCF_SUCCESS
-    while [ $rc = $OCF_SUCCESS ]; do
-       booth_monitor; rc=$?
        sleep 1
-    done
+       rc=$BOOTH_DAEMON_STARTING
+       while [ $rc != $BOOTH_DAEMON_STARTED ]; do
+               booth_check_daemon_state; rc=$?
+               sleep 1
+       done
+       
+       return $OCF_SUCCESS
+}
+
+booth_stop() {
+       booth_check_daemon_state; rc=$?
 
-    if [ $rc = $OCF_NOT_RUNNING ]; then
-       rc=$OCF_SUCCESS
-    fi
+       case $rc in
+       $BOOTH_DAEMON_STARTED) ;;
+       $BOOTH_DAEMON_STARTING) ;;
+       $BOOTH_DAEMON_EXIST) return $OCF_ERR_GENERIC;;
+       $BOOTH_DAEMON_NOT_RUNNING) return $OCF_SUCCESS;;
+       *) return $OCF_ERR_GENERIC;;
+       esac
+
+       killall -TERM ${OCF_RESKEY_daemon}; rc=$?
+
+       if [ $rc != 0 ]; then
+               return $OCF_ERR_GENERIC
+       fi
+
+       rc=$BOOTH_DAEMON_STARTED
+       while [ $rc != $BOOTH_DAEMON_NOT_RUNNING ]; do
+               booth_check_daemon_state; rc=$?
+               sleep 1
+       done
+
+       if [ $rc = $BOOTH_DAEMON_NOT_RUNNING ]; then
+               rc=$OCF_SUCCESS
+       else
+               rc=$OCF_ERR_GENERIC
+       fi
 
-    return $rc
+       return $rc
 }
 
 booth_monitor() {
-    killall -0 ${OCF_RESKEY_daemon}; rc=$?
+       booth_check_daemon_state; rc=$?
 
-    case $rc in
-      0) return $OCF_SUCCESS;;
-      1) return $OCF_NOT_RUNNING;;
-      *) return $OCF_ERR_GENERIC;;
-    esac
+       case $rc in
+       $BOOTH_DAEMON_STARTED) return $OCF_SUCCESS;;
+       $BOOTH_DAEMON_STARTING) return $OCF_NOT_RUNNING;;
+       $BOOTH_DAEMON_EXIST) return $OCF_NOT_RUNNING;;
+       $BOOTH_DAEMON_NOT_RUNNING) return $OCF_NOT_RUNNING;;
+       *) return $OCF_ERR_GENERIC;;
+       esac
 }
 
 booth_validate() {
_______________________________________________
Pacemaker mailing list: Pacemaker@oss.clusterlabs.org
http://oss.clusterlabs.org/mailman/listinfo/pacemaker

Project Home: http://www.clusterlabs.org
Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
Bugs: http://bugs.clusterlabs.org

Reply via email to