costin      02/04/15 16:47:57

  Modified:    jk/native2/common jk_worker_lb.c
  Log:
  Patch from Eugene Gluzberg <[EMAIL PROTECTED]>
  
  > If a tomcat goes down, it gets taken out of the "worker list" for 60
  > seconds before it's retried. If all the tomcats that are being load balanced
  > are being restarted (one at a time, to upgrade the system for instance), you
  > need to wait until the tomcat that you just brought up gets back in the
  > list before you shut down the next one, or you may have website downtime.
  > The same problem arises if any intermittent network problem occurs between
  > Netscape and tomcat. The downtime would be at least 60 seconds.
  
  > To avoid that, I made a fix to the load balancer worker that is
  > activated only when all the tomcat workers have been taken out of the list
  > due to failure.
  
  > If all the load balanced tomcat workers are out of the list, the patched
  > load balancer would go through all of the workers again once in reverse
  > order of their "last error time" until one is found. They would be retried
  > even though 60 seconds have not passed yet. If and only if all the workers
  > have been tried once in this particular request and all failed, an error
  > would be returned to the user. The retries would continue for every request
  > until at least one tomcat worker recovers. After the first tomcat worker
  > recovers, the rest would be retried once every 60 seconds as before.
  
  Revision  Changes    Path
  1.3       +52 -1     jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c
  
  Index: jk_worker_lb.c
  ===================================================================
  RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- jk_worker_lb.c    12 Apr 2002 00:23:07 -0000      1.2
  +++ jk_worker_lb.c    15 Apr 2002 23:47:57 -0000      1.3
  @@ -60,7 +60,7 @@
    *              several workers.                                           *
    * Author:      Gal Shachor <[EMAIL PROTECTED]>                           *
    * Based on:                                                               *
  - * Version:     $Revision: 1.2 $                                           *
  + * Version:     $Revision: 1.3 $                                           *
    ***************************************************************************/
   
   #include "jk_pool.h"
  @@ -128,6 +128,7 @@
           }
       }
   
  +    /** Get one worker that is ready */
       for(i = 0 ; i < p->num_of_workers ; i++) {
           if(p->lb_workers[i]->in_error_state) {
               if(!p->lb_workers[i]->in_recovering) {
  @@ -137,6 +138,7 @@
                       
                       p->lb_workers[i]->in_recovering  = JK_TRUE;
                       p->lb_workers[i]->error_time     = now;
  +                    p->lb_workers[i]->retry_count++;
                       rc = p->lb_workers[i];
   
                       break;
  @@ -150,6 +152,45 @@
           }            
       }
   
  +    if ( !rc ) {
  +        /* no workers found (rc is null), now try as hard as possible to get a
  +           worker anyway, pick one with largest error time.. */
  +        for(i = 0 ; i < p->num_of_workers ; i++) {
  +            if(p->lb_workers[i]->in_error_state) {
  +                if(!p->lb_workers[i]->in_recovering) {
  +                    /* if the retry count is zero, that means the worker only
  +                       failed once, this is to e that the failed worker will
  +                       not continue to be retried over and over again.
  +                    */
  +                    if ( p->lb_workers[i]->retry_count == 0 ) {
  +                        if ( rc ) {
  +                            /* pick the oldest failed worker */
  +                            if ( p->lb_workers[i]->error_time < rc->error_time ) {
  +                                rc = p->lb_workers[i];
  +                            }
  +                        } else {
  +                            rc = p->lb_workers[i];
  +                        }
  +                    }
  +                }
  +            } else {
  +                /* This is a good worker - it may have come to life */
  +                if(p->lb_workers[i]->lb_value < lb_min || rc != NULL) {
  +                    lb_min = p->lb_workers[i]->lb_value;
  +                    rc = p->lb_workers[i];
  +                    break;
  +                }
  +            }
  +        }
  +        
  +        if ( rc  && rc->in_error_state ) {
  +            time_t now = time(0);
  +            rc->in_recovering  = JK_TRUE;
  +            rc->error_time     = now;
  +            rc->retry_count++;
  +        }
  +    }
  +    
       if(rc) {
           rc->lb_value += rc->lb_factor;                
       }
  @@ -167,6 +208,7 @@
                                       jk_ws_service_t *s)
   {
       int attempt=0;
  +    int i;
   
       if( s==NULL ) {
           env->l->jkLog(env, env->l, JK_LOG_ERROR,
  @@ -177,6 +219,13 @@
       /* you can not recover on another load balancer */
       s->realWorker=NULL;
   
  +       /* reset all the retry counts to 0 */
  +       for(i = 0 ; i < w->num_of_workers ; i++) {
  +           w->lb_workers[i]->retry_count = 0;
  +       }
  +
  +
  +    
       while(1) {
           jk_worker_t *rec;
           int rc;
  @@ -210,6 +259,7 @@
               }
               rec->in_error_state = JK_FALSE;
               rec->in_recovering  = JK_FALSE;
  +            rec->retry_count    = 0;
               rec->error_time     = 0;
               /* the endpoint that succeeded is saved for done() */
               s->realWorker = rec;
  @@ -295,6 +345,7 @@
               _this->lb_workers[currentWorker]->lb_factor;
           _this->lb_workers[currentWorker]->in_error_state = JK_FALSE;
           _this->lb_workers[currentWorker]->in_recovering  = JK_FALSE;
  +        _this->lb_workers[currentWorker]->retry_count  = 0;
   
           currentWorker++;
       }
  
  
  

--
To unsubscribe, e-mail:   <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>

Reply via email to