costin 02/04/15 16:47:57 Modified: jk/native2/common jk_worker_lb.c Log: Patch from Eugene Gluzberg <[EMAIL PROTECTED]> > If a tomcat goes down, it gets taken out of the "worker list" for 60 > seconds before it is retried. If all the tomcats that are being load balanced > are being restarted (one at a time to upgrade the system for instance), you > need to wait until the tomcat that you just brought up gets back in the > list before you shut down the next one, or you may have website downtime. > The same problem arises if any intermittent network problem occurs between > Netscape and tomcat. The downtime would be at least 60 seconds. > To avoid that I made a fix to the load balancer worker that would be > activated only when all the tomcat workers were taken out of the list due to > failure. > If all the load balanced tomcat workers are out of the list, the patched > load balancer would go through all of the workers again once in reverse > order of their "last error time" until one is found. They would be retried > even though 60 seconds have not passed yet. If and only if all the workers have > been tried once in this particular request and all failed, an error would be > returned to the user. The retries would continue for every request until at > least one tomcat worker recovers. After the first tomcat worker recovers, > the rest would be retried once every 60 seconds as before. Revision Changes Path 1.3 +52 -1 jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c Index: jk_worker_lb.c =================================================================== RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c,v retrieving revision 1.2 retrieving revision 1.3 diff -u -r1.2 -r1.3 --- jk_worker_lb.c 12 Apr 2002 00:23:07 -0000 1.2 +++ jk_worker_lb.c 15 Apr 2002 23:47:57 -0000 1.3 @@ -60,7 +60,7 @@ * several workers. 
* * Author: Gal Shachor <[EMAIL PROTECTED]> * * Based on: * - * Version: $Revision: 1.2 $ * + * Version: $Revision: 1.3 $ * ***************************************************************************/ #include "jk_pool.h" @@ -128,6 +128,7 @@ } } + /** Get one worker that is ready */ for(i = 0 ; i < p->num_of_workers ; i++) { if(p->lb_workers[i]->in_error_state) { if(!p->lb_workers[i]->in_recovering) { @@ -137,6 +138,7 @@ p->lb_workers[i]->in_recovering = JK_TRUE; p->lb_workers[i]->error_time = now; + p->lb_workers[i]->retry_count++; rc = p->lb_workers[i]; break; @@ -150,6 +152,45 @@ } } + if ( !rc ) { + /* no workers found (rc is null), now try as hard as possible to get a + worker anyway, pick one with largest error time.. */ + for(i = 0 ; i < p->num_of_workers ; i++) { + if(p->lb_workers[i]->in_error_state) { + if(!p->lb_workers[i]->in_recovering) { + /* if the retry count is zero, that means the worker only + failed once, this is to e that the failed worker will + not continue to be retried over and over again. 
+ */ + if ( p->lb_workers[i]->retry_count == 0 ) { + if ( rc ) { + /* pick the oldest failed worker */ + if ( p->lb_workers[i]->error_time < rc->error_time ) { + rc = p->lb_workers[i]; + } + } else { + rc = p->lb_workers[i]; + } + } + } + } else { + /* This is a good worker - it may have come to life */ + if(p->lb_workers[i]->lb_value < lb_min || rc != NULL) { + lb_min = p->lb_workers[i]->lb_value; + rc = p->lb_workers[i]; + break; + } + } + } + + if ( rc && rc->in_error_state ) { + time_t now = time(0); + rc->in_recovering = JK_TRUE; + rc->error_time = now; + rc->retry_count++; + } + } + if(rc) { rc->lb_value += rc->lb_factor; } @@ -167,6 +208,7 @@ jk_ws_service_t *s) { int attempt=0; + int i; if( s==NULL ) { env->l->jkLog(env, env->l, JK_LOG_ERROR, @@ -177,6 +219,13 @@ /* you can not recover on another load balancer */ s->realWorker=NULL; + /* reset all the retry counts to 0 */ + for(i = 0 ; i < w->num_of_workers ; i++) { + w->lb_workers[i]->retry_count = 0; + } + + + while(1) { jk_worker_t *rec; int rc; @@ -210,6 +259,7 @@ } rec->in_error_state = JK_FALSE; rec->in_recovering = JK_FALSE; + rec->retry_count = 0; rec->error_time = 0; /* the endpoint that succeeded is saved for done() */ s->realWorker = rec; @@ -295,6 +345,7 @@ _this->lb_workers[currentWorker]->lb_factor; _this->lb_workers[currentWorker]->in_error_state = JK_FALSE; _this->lb_workers[currentWorker]->in_recovering = JK_FALSE; + _this->lb_workers[currentWorker]->retry_count = 0; currentWorker++; }
-- To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]> For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>