[OpenWrt-Devel] [PATCH] procd: stop service using SIGKILL if SIGTERM failed to do so

2017-02-09 Thread Alin Nastac
SIGKILL is sent if instance process is still running after
 seconds after SIGTERM has been sent. To prevent
another daemon process being launched before old process dies,
the instance is kept until SIGCHLD confirms that service has
been stopped.

Signed-off-by: Alin Nastac 
---
 service/instance.c | 44 +---
 service/instance.h |  1 +
 service/service.c  | 26 --
 service/service.h  |  3 +++
 4 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/service/instance.c b/service/instance.c
index 018db3c..4d340fd 100644
--- a/service/instance.c
+++ b/service/instance.c
@@ -55,6 +55,7 @@ enum {
INSTANCE_ATTR_SECCOMP,
INSTANCE_ATTR_PIDFILE,
INSTANCE_ATTR_RELOADSIG,
+   INSTANCE_ATTR_TERMTIMEOUT,
__INSTANCE_ATTR_MAX
 };
 
@@ -79,6 +80,7 @@ static const struct blobmsg_policy 
instance_attr[__INSTANCE_ATTR_MAX] = {
[INSTANCE_ATTR_SECCOMP] = { "seccomp", BLOBMSG_TYPE_STRING },
[INSTANCE_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
[INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 },
+   [INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 },
 };
 
 enum {
@@ -389,8 +391,16 @@ instance_start(struct service_instance *in)
return;
}
 
-   if (in->proc.pending || !in->command)
+   if (!in->command) {
+   LOG("Not starting instance %s::%s, command not set\n", 
in->srv->name, in->name);
return;
+   }
+
+   if (in->proc.pending) {
+   if (in->halt)
+   in->restart = true;
+   return;
+   }
 
instance_free_stdio(in);
if (in->_stdout.fd.fd > -2) {
@@ -408,7 +418,7 @@ instance_start(struct service_instance *in)
}
 
in->restart = false;
-   in->halt = !in->respawn;
+   in->halt = false;
 
if (!in->valid)
return;
@@ -494,7 +504,11 @@ instance_timeout(struct uloop_timeout *t)
 
in = container_of(t, struct service_instance, timeout);
 
-   if (!in->halt && (in->restart || in->respawn))
+   if (in->halt) {
+   LOG("Instance %s::%s pid %d not stopped on SIGTERM, sending 
SIGKILL instead\n",
+   in->srv->name, in->name, in->proc.pid);
+   kill(in->proc.pid, SIGKILL);
+   } else if (in->restart || in->respawn)
instance_start(in);
 }
 
@@ -515,8 +529,19 @@ instance_exit(struct uloop_process *p, int ret)
return;
 
uloop_timeout_cancel(&in->timeout);
+   service_event("instance.stop", in->srv->name, in->name);
+
if (in->halt) {
instance_removepid(in);
+   if (in->restart)
+   instance_start(in);
+   else {
+   struct service *s = in->srv;
+
+   avl_delete(&s->instances.avl, &in->node.avl);
+   instance_free(in);
+   service_stopped(s);
+   }
} else if (in->restart) {
instance_start(in);
} else if (in->respawn) {
@@ -535,7 +560,6 @@ instance_exit(struct uloop_process *p, int ret)
uloop_timeout_set(&in->timeout, in->respawn_timeout * 
1000);
}
}
-   service_event("instance.stop", in->srv->name, in->name);
 }
 
 void
@@ -546,6 +570,7 @@ instance_stop(struct service_instance *in)
in->halt = true;
in->restart = in->respawn = false;
kill(in->proc.pid, SIGTERM);
+   uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
 static void
@@ -559,10 +584,10 @@ instance_restart(struct service_instance *in)
return;
}
 
-   in->halt = false;
+   in->halt = true;
in->restart = true;
kill(in->proc.pid, SIGTERM);
-   instance_removepid(in);
+   uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
 }
 
 static bool
@@ -796,6 +821,8 @@ instance_config_parse(struct service_instance *in)
if (!instance_config_parse_command(in, tb))
return false;
 
+   if (tb[INSTANCE_ATTR_TERMTIMEOUT])
+   in->term_timeout = 
blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
if (tb[INSTANCE_ATTR_RESPAWN]) {
int i = 0;
uint32_t vals[3] = { 3600, 5, 5};
@@ -933,8 +960,9 @@ instance_update(struct service_instance *in, struct 
service_instance *in_new)
 {
bool changed = instance_config_changed(in, in_new);
bool running = in->proc.pending;
+   bool stopping = in->halt;
 
-   if (!running) {
+   if (!running || stopping) {
instance_config_move(in, in_new);
instance_start(in);
} else {
@@ -967,6 +995,7 @@ instance_init(struct service_instance *in, struct service 
*s, struct blob_attr *
in->config = config;
in->timeout.cb = instance_

Re: [OpenWrt-Devel] [LEDE-DEV] [PATCH] procd: stop service using SIGKILL if SIGTERM failed to do so

2017-02-09 Thread John Crispin
Hi,

i know that someone else is about to send a fix for the same issue but
with a different approach of fixing it. i'd like to wait for this 2nd
patch to arrive before we decide which to merge

John

On 09/02/2017 11:02, Alin Nastac wrote:
> SIGKILL is sent if instance process is still running after
>  seconds after SIGTERM has been sent. To prevent
> another daemon process being launched before old process dies,
> the instance is kept until SIGCHLD confirms that service has
> been stopped.
> 
> Signed-off-by: Alin Nastac 
> ---
>  service/instance.c | 44 +---
>  service/instance.h |  1 +
>  service/service.c  | 26 --
>  service/service.h  |  3 +++
>  4 files changed, 57 insertions(+), 17 deletions(-)
> 
> diff --git a/service/instance.c b/service/instance.c
> index 018db3c..4d340fd 100644
> --- a/service/instance.c
> +++ b/service/instance.c
> @@ -55,6 +55,7 @@ enum {
>   INSTANCE_ATTR_SECCOMP,
>   INSTANCE_ATTR_PIDFILE,
>   INSTANCE_ATTR_RELOADSIG,
> + INSTANCE_ATTR_TERMTIMEOUT,
>   __INSTANCE_ATTR_MAX
>  };
>  
> @@ -79,6 +80,7 @@ static const struct blobmsg_policy 
> instance_attr[__INSTANCE_ATTR_MAX] = {
>   [INSTANCE_ATTR_SECCOMP] = { "seccomp", BLOBMSG_TYPE_STRING },
>   [INSTANCE_ATTR_PIDFILE] = { "pidfile", BLOBMSG_TYPE_STRING },
>   [INSTANCE_ATTR_RELOADSIG] = { "reload_signal", BLOBMSG_TYPE_INT32 },
> + [INSTANCE_ATTR_TERMTIMEOUT] = { "term_timeout", BLOBMSG_TYPE_INT32 },
>  };
>  
>  enum {
> @@ -389,8 +391,16 @@ instance_start(struct service_instance *in)
>   return;
>   }
>  
> - if (in->proc.pending || !in->command)
> + if (!in->command) {
> + LOG("Not starting instance %s::%s, command not set\n", 
> in->srv->name, in->name);
>   return;
> + }
> +
> + if (in->proc.pending) {
> + if (in->halt)
> + in->restart = true;
> + return;
> + }
>  
>   instance_free_stdio(in);
>   if (in->_stdout.fd.fd > -2) {
> @@ -408,7 +418,7 @@ instance_start(struct service_instance *in)
>   }
>  
>   in->restart = false;
> - in->halt = !in->respawn;
> + in->halt = false;
>  
>   if (!in->valid)
>   return;
> @@ -494,7 +504,11 @@ instance_timeout(struct uloop_timeout *t)
>  
>   in = container_of(t, struct service_instance, timeout);
>  
> - if (!in->halt && (in->restart || in->respawn))
> + if (in->halt) {
> + LOG("Instance %s::%s pid %d not stopped on SIGTERM, sending 
> SIGKILL instead\n",
> + in->srv->name, in->name, in->proc.pid);
> + kill(in->proc.pid, SIGKILL);
> + } else if (in->restart || in->respawn)
>   instance_start(in);
>  }
>  
> @@ -515,8 +529,19 @@ instance_exit(struct uloop_process *p, int ret)
>   return;
>  
>   uloop_timeout_cancel(&in->timeout);
> + service_event("instance.stop", in->srv->name, in->name);
> +
>   if (in->halt) {
>   instance_removepid(in);
> + if (in->restart)
> + instance_start(in);
> + else {
> + struct service *s = in->srv;
> +
> + avl_delete(&s->instances.avl, &in->node.avl);
> + instance_free(in);
> + service_stopped(s);
> + }
>   } else if (in->restart) {
>   instance_start(in);
>   } else if (in->respawn) {
> @@ -535,7 +560,6 @@ instance_exit(struct uloop_process *p, int ret)
>   uloop_timeout_set(&in->timeout, in->respawn_timeout * 
> 1000);
>   }
>   }
> - service_event("instance.stop", in->srv->name, in->name);
>  }
>  
>  void
> @@ -546,6 +570,7 @@ instance_stop(struct service_instance *in)
>   in->halt = true;
>   in->restart = in->respawn = false;
>   kill(in->proc.pid, SIGTERM);
> + uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
>  }
>  
>  static void
> @@ -559,10 +584,10 @@ instance_restart(struct service_instance *in)
>   return;
>   }
>  
> - in->halt = false;
> + in->halt = true;
>   in->restart = true;
>   kill(in->proc.pid, SIGTERM);
> - instance_removepid(in);
> + uloop_timeout_set(&in->timeout, in->term_timeout * 1000);
>  }
>  
>  static bool
> @@ -796,6 +821,8 @@ instance_config_parse(struct service_instance *in)
>   if (!instance_config_parse_command(in, tb))
>   return false;
>  
> + if (tb[INSTANCE_ATTR_TERMTIMEOUT])
> + in->term_timeout = 
> blobmsg_get_u32(tb[INSTANCE_ATTR_TERMTIMEOUT]);
>   if (tb[INSTANCE_ATTR_RESPAWN]) {
>   int i = 0;
>   uint32_t vals[3] = { 3600, 5, 5};
> @@ -933,8 +960,9 @@ instance_update(struct service_instance *in, struct 
> service_instance *in_new)
>  {
>   bool changed = instance_config_changed(in, in_new);
>   bool running = in->p

Re: [OpenWrt-Devel] [LEDE-DEV] [PATCH] procd: stop service using SIGKILL if SIGTERM failed to do so

2017-02-09 Thread Alin Năstac
On Thu, Feb 9, 2017 at 11:54 AM, John Crispin  wrote:
> Hi,
>
> i know that someone else is about to send a fix for the same issue but
> with a different approach of fixing it. i'd like to wait for this 2nd
> patch to arrive before we decide which to merge
Are you sure it wasn't me? :)
You said yesterday that I should send you a patch for it.

The only other approach I could think of would involve a
instance_stop() that waits for the service instance to exit.
I thought initially to do it like this, but decided that waiting
asynchronously for stop event would be a fer better technical solution
to the given issue, don't you agree?

>
> John
>

Alin
___
openwrt-devel mailing list
openwrt-devel@lists.openwrt.org
https://lists.openwrt.org/cgi-bin/mailman/listinfo/openwrt-devel