Changeset: 3551b30d82fe for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3551b30d82fe Modified Files: tools/merovingian/daemon/forkmserver.c tools/merovingian/daemon/merovingian.c Branch: Jun2016 Log Message:
Move terminateProcess to same file as forkMserver, and use fork_lock. fork_lock is held while a new server is being started so that multiple starts of the same server don't interfere with each other. We should then also hold the same lock while stopping a server. Part of the ongoing quest to quash bug 4066. diffs (285 lines): diff --git a/tools/merovingian/daemon/forkmserver.c b/tools/merovingian/daemon/forkmserver.c --- a/tools/merovingian/daemon/forkmserver.c +++ b/tools/merovingian/daemon/forkmserver.c @@ -31,6 +31,145 @@ static pthread_mutex_t fork_lock = PTHREAD_MUTEX_INITIALIZER; /** + * The terminateProcess function tries to let the given mserver process + * shut down gracefully within a given time-out. If that fails, it + * sends the deadly SIGKILL signal to the mserver process and returns. + */ +void * +terminateProcess(void *p) +{ + dpair d = (dpair)p; + sabdb *stats; + char *er; + int i; + confkeyval *kv; + /* make local copies since d will disappear when killed */ + pid_t pid = d->pid; + char *dbname = strdup(d->dbname); + + pthread_mutex_lock(&fork_lock); + + er = msab_getStatus(&stats, dbname); + if (er != NULL) { + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "cannot terminate process " LLFMT ": %s\n", + (long long int)pid, er); + free(er); + free(dbname); + return NULL; + } + + if (stats == NULL) { + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "strange, process " LLFMT " serves database '%s' " + "which does not exist\n", (long long int)pid, dbname); + free(dbname); + return NULL; + } + + switch (stats->state) { + case SABdbRunning: + /* ok, what we expect */ + break; + case SABdbCrashed: + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "cannot shut down database '%s', mserver " + "(pid " LLFMT ") has crashed\n", + dbname, (long long int)pid); + msab_freeStatus(&stats); + free(dbname); + return NULL; + case SABdbInactive: + pthread_mutex_unlock(&fork_lock); + Mfprintf(stdout, "database '%s' appears to have shut down already\n", 
+ dbname); + fflush(stdout); + msab_freeStatus(&stats); + free(dbname); + return NULL; + case SABdbStarting: + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "database '%s' appears to be starting up\n", + dbname); + /* starting up, so we'll go to the shut down phase */ + break; + default: + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "unknown state: %d\n", (int)stats->state); + msab_freeStatus(&stats); + free(dbname); + return NULL; + } + + if (d->type == MEROFUN) { + pthread_mutex_unlock(&fork_lock); + multiplexDestroy(dbname); + msab_freeStatus(&stats); + free(dbname); + return NULL; + } else if (d->type != MERODB) { + /* barf */ + pthread_mutex_unlock(&fork_lock); + Mfprintf(stderr, "cannot stop merovingian process role: %s\n", dbname); + msab_freeStatus(&stats); + free(dbname); + return NULL; + } + + /* ok, once we get here, we'll be shutting down the server */ + Mfprintf(stdout, "sending process " LLFMT " (database '%s') the " + "TERM signal\n", (long long int)pid, dbname); + kill(pid, SIGTERM); + kv = findConfKey(_mero_props, "exittimeout"); + for (i = 0; i < atoi(kv->val) * 2; i++) { + if (stats != NULL) + msab_freeStatus(&stats); + sleep_ms(500); + er = msab_getStatus(&stats, dbname); + if (er != NULL) { + Mfprintf(stderr, "unexpected problem: %s\n", er); + free(er); + /* don't die, just continue, so we KILL in the end */ + } else if (stats == NULL) { + Mfprintf(stderr, "hmmmm, database '%s' suddenly doesn't exist " + "any more\n", dbname); + } else { + switch (stats->state) { + case SABdbRunning: + case SABdbStarting: + /* ok, try again */ + break; + case SABdbCrashed: + pthread_mutex_unlock(&fork_lock); + Mfprintf (stderr, "database '%s' crashed after SIGTERM\n", + dbname); + msab_freeStatus(&stats); + free(dbname); + return NULL; + case SABdbInactive: + pthread_mutex_unlock(&fork_lock); + Mfprintf(stdout, "database '%s' has shut down\n", dbname); + fflush(stdout); + msab_freeStatus(&stats); + free(dbname); + return NULL; + default: + 
Mfprintf(stderr, "unknown state: %d\n", (int)stats->state); + break; + } + } + } + Mfprintf(stderr, "timeout of %s seconds expired, sending process " LLFMT + " (database '%s') the KILL signal\n", + kv->val, (long long int)pid, dbname); + kill(pid, SIGKILL); + msab_freeStatus(&stats); + free(dbname); + pthread_mutex_unlock(&fork_lock); + return NULL; +} + +/** * Fork an mserver and detach. Before forking off, Sabaoth is consulted * to see if forking makes sense, or whether it is necessary at all, or * forbidden by restart policy, e.g. when in maintenance. diff --git a/tools/merovingian/daemon/merovingian.c b/tools/merovingian/daemon/merovingian.c --- a/tools/merovingian/daemon/merovingian.c +++ b/tools/merovingian/daemon/merovingian.c @@ -233,132 +233,6 @@ logListener(void *x) } /** - * The terminateProcess function tries to let the given mserver process - * shut down gracefully within a given time-out. If that fails, it - * sends the deadly SIGKILL signal to the mserver process and returns. 
- */ -void * -terminateProcess(void *p) -{ - dpair d = (dpair)p; - sabdb *stats; - char *er; - int i; - confkeyval *kv; - /* make local copies since d will disappear when killed */ - pid_t pid = d->pid; - char *dbname = strdup(d->dbname); - - er = msab_getStatus(&stats, dbname); - if (er != NULL) { - Mfprintf(stderr, "cannot terminate process " LLFMT ": %s\n", - (long long int)pid, er); - free(er); - free(dbname); - return NULL; - } - - if (stats == NULL) { - Mfprintf(stderr, "strange, process " LLFMT " serves database '%s' " - "which does not exist\n", (long long int)pid, dbname); - free(dbname); - return NULL; - } - - switch (stats->state) { - case SABdbRunning: - /* ok, what we expect */ - break; - case SABdbCrashed: - Mfprintf(stderr, "cannot shut down database '%s', mserver " - "(pid " LLFMT ") has crashed\n", - dbname, (long long int)pid); - msab_freeStatus(&stats); - free(dbname); - return NULL; - case SABdbInactive: - Mfprintf(stdout, "database '%s' appears to have shut down already\n", - dbname); - fflush(stdout); - msab_freeStatus(&stats); - free(dbname); - return NULL; - case SABdbStarting: - Mfprintf(stderr, "database '%s' appears to be starting up\n", - dbname); - /* starting up, so we'll go to the shut down phase */ - break; - default: - Mfprintf(stderr, "unknown state: %d\n", (int)stats->state); - msab_freeStatus(&stats); - free(dbname); - return NULL; - } - - if (d->type == MEROFUN) { - multiplexDestroy(dbname); - msab_freeStatus(&stats); - free(dbname); - return NULL; - } else if (d->type != MERODB) { - /* barf */ - Mfprintf(stderr, "cannot stop merovingian process role: %s\n", dbname); - msab_freeStatus(&stats); - free(dbname); - return NULL; - } - - /* ok, once we get here, we'll be shutting down the server */ - Mfprintf(stdout, "sending process " LLFMT " (database '%s') the " - "TERM signal\n", (long long int)pid, dbname); - kill(pid, SIGTERM); - kv = findConfKey(_mero_props, "exittimeout"); - for (i = 0; i < atoi(kv->val) * 2; i++) { - if 
(stats != NULL) - msab_freeStatus(&stats); - sleep_ms(500); - er = msab_getStatus(&stats, dbname); - if (er != NULL) { - Mfprintf(stderr, "unexpected problem: %s\n", er); - free(er); - /* don't die, just continue, so we KILL in the end */ - } else if (stats == NULL) { - Mfprintf(stderr, "hmmmm, database '%s' suddenly doesn't exist " - "any more\n", dbname); - } else { - switch (stats->state) { - case SABdbRunning: - case SABdbStarting: - /* ok, try again */ - break; - case SABdbCrashed: - Mfprintf (stderr, "database '%s' crashed after SIGTERM\n", - dbname); - msab_freeStatus(&stats); - free(dbname); - return NULL; - case SABdbInactive: - Mfprintf(stdout, "database '%s' has shut down\n", dbname); - fflush(stdout); - msab_freeStatus(&stats); - free(dbname); - return NULL; - default: - Mfprintf(stderr, "unknown state: %d\n", (int)stats->state); - break; - } - } - } - Mfprintf(stderr, "timeout of %s seconds expired, sending process " LLFMT - " (database '%s') the KILL signal\n", - kv->val, (long long int)pid, dbname); - kill(pid, SIGKILL); - msab_freeStatus(&stats); - free(dbname); - return NULL; -} - -/** * Creates a new error, allocated with malloc. The error should be * freed using freeErr(). */ _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list