costin 02/04/15 16:47:57
Modified: jk/native2/common jk_worker_lb.c
Log:
Patch from Eugene Gluzberg <[EMAIL PROTECTED]>
> If a tomcat goes down, it gets taken out of the "worker list" for 60
> seconds before it is retried. If all the tomcats that are being load balanced
> are being restarted (one at a time, to upgrade the system for instance), you
> need to wait until the tomcat that you just brought up gets back in the
> list before you shut down the next one, or you may have website downtime.
> The same problem arises if an intermittent network problem occurs between
> Netscape and tomcat. The downtime would be at least 60 seconds.
> To avoid that, I made a fix to the load balancer worker that is
> activated only when all the tomcat workers have been taken out of the list due to
> failure.
> If all the load-balanced tomcat workers are out of the list, the patched
> load balancer goes through all of the workers once more, in order of
> their "last error time" (oldest failure first), until a working one is found.
> They are retried even though 60 seconds have not yet passed. An error is
> returned to the user if and only if all the workers have been tried once in
> this particular request and all of them failed. The retries continue for every
> request until at least one tomcat worker recovers. After the first tomcat
> worker recovers, the rest are retried once every 60 seconds, as before.
Revision Changes Path
1.3 +52 -1 jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c
Index: jk_worker_lb.c
===================================================================
RCS file: /home/cvs/jakarta-tomcat-connectors/jk/native2/common/jk_worker_lb.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -u -r1.2 -r1.3
--- jk_worker_lb.c 12 Apr 2002 00:23:07 -0000 1.2
+++ jk_worker_lb.c 15 Apr 2002 23:47:57 -0000 1.3
@@ -60,7 +60,7 @@
* several workers. *
* Author: Gal Shachor <[EMAIL PROTECTED]> *
* Based on: *
- * Version: $Revision: 1.2 $ *
+ * Version: $Revision: 1.3 $ *
***************************************************************************/
#include "jk_pool.h"
@@ -128,6 +128,7 @@
}
}
+ /** Get one worker that is ready */
for(i = 0 ; i < p->num_of_workers ; i++) {
if(p->lb_workers[i]->in_error_state) {
if(!p->lb_workers[i]->in_recovering) {
@@ -137,6 +138,7 @@
p->lb_workers[i]->in_recovering = JK_TRUE;
p->lb_workers[i]->error_time = now;
+ p->lb_workers[i]->retry_count++;
rc = p->lb_workers[i];
break;
@@ -150,6 +152,45 @@
}
}
+ if ( !rc ) {
+ /* no workers found (rc is null), now try as hard as possible to get a
+ worker anyway: pick the one whose last error is the oldest. */
+ for(i = 0 ; i < p->num_of_workers ; i++) {
+ if(p->lb_workers[i]->in_error_state) {
+ if(!p->lb_workers[i]->in_recovering) {
+ /* if the retry count is zero, that means the worker only
+ failed once, this is to ensure that the failed worker will
+ not continue to be retried over and over again.
+ */
+ if ( p->lb_workers[i]->retry_count == 0 ) {
+ if ( rc ) {
+ /* pick the oldest failed worker */
+ if ( p->lb_workers[i]->error_time < rc->error_time ) {
+ rc = p->lb_workers[i];
+ }
+ } else {
+ rc = p->lb_workers[i];
+ }
+ }
+ }
+ } else {
+ /* This is a good worker - it may have come to life */
+ if(p->lb_workers[i]->lb_value < lb_min || rc != NULL) {
+ lb_min = p->lb_workers[i]->lb_value;
+ rc = p->lb_workers[i];
+ break;
+ }
+ }
+ }
+
+ if ( rc && rc->in_error_state ) {
+ time_t now = time(0);
+ rc->in_recovering = JK_TRUE;
+ rc->error_time = now;
+ rc->retry_count++;
+ }
+ }
+
if(rc) {
rc->lb_value += rc->lb_factor;
}
@@ -167,6 +208,7 @@
jk_ws_service_t *s)
{
int attempt=0;
+ int i;
if( s==NULL ) {
env->l->jkLog(env, env->l, JK_LOG_ERROR,
@@ -177,6 +219,13 @@
/* you can not recover on another load balancer */
s->realWorker=NULL;
+ /* reset all the retry counts to 0 */
+ for(i = 0 ; i < w->num_of_workers ; i++) {
+ w->lb_workers[i]->retry_count = 0;
+ }
+
+
+
while(1) {
jk_worker_t *rec;
int rc;
@@ -210,6 +259,7 @@
}
rec->in_error_state = JK_FALSE;
rec->in_recovering = JK_FALSE;
+ rec->retry_count = 0;
rec->error_time = 0;
/* the endpoint that succeeded is saved for done() */
s->realWorker = rec;
@@ -295,6 +345,7 @@
_this->lb_workers[currentWorker]->lb_factor;
_this->lb_workers[currentWorker]->in_error_state = JK_FALSE;
_this->lb_workers[currentWorker]->in_recovering = JK_FALSE;
+ _this->lb_workers[currentWorker]->retry_count = 0;
currentWorker++;
}
--
To unsubscribe, e-mail: <mailto:[EMAIL PROTECTED]>
For additional commands, e-mail: <mailto:[EMAIL PROTECTED]>