[PATCH] [RFC] proc connector: add namespace events

Alban Crequy Thu, 08 Sep 2016 08:40:03 -0700

From: Alban Crequy <al...@kinvolk.io>

The act of a process creating or joining a namespace via clone(),
unshare() or setns() is a useful signal for monitoring applications.


I am working on a monitoring application that keeps track of all the
containers and all processes inside each container. The current way of
doing it is by polling regularly in /proc for the list of processes and
in /proc/*/ns/* to know which namespaces they belong to. This is
inefficient on systems with a large number of containers and a large
number of processes.

Instead, I would inspect /proc only one time and get the updates with
the proc connector. Unfortunately, the proc connector gives me the list
of processes but does not notify me when a process changes namespaces.
So I would still need to inspect /proc/*/ns/*.

This patch adds namespace events for processes. It generates a namespace
event each time a process changes namespace via clone(), unshare() or
setns().

For example, the following command:
| # unshare -n -f ls -l /proc/self/ns/net
| lrwxrwxrwx 1 root root 0 Sep  6 05:35 /proc/self/ns/net -> 'net:[4026532142]'

causes the proc connector to generate the following events:
| fork: ppid=696 pid=858
| exec: pid=858
| ns: pid=858 type=net reason=set old_inum=4026531957 inum=4026532142
| fork: ppid=858 pid=859
| exec: pid=859
| exit: pid=859
| exit: pid=858

Note: this patch is just an RFC; we are exploring other ways to achieve
      the same feature.

The current implementation has the following limitations:

- Ideally, I want to know whether the event is caused by clone(),
  unshare() or setns(). At the moment, the reason field only
  distinguishes between clone() and non-clone.

- The event for pid namespaces is generated when pid_ns_for_children
  changes. I think that's ok, and it just needs to be documented for
  userspace in the same way it is already documented in
  pid_namespaces(7). Userspace really needs to know whether the event is
  caused by clone() or non-clone to interpret the event correctly.

- Events for userns are not implemented yet. I skipped them for now
  because user namespaces are not managed through nsproxy like the other
  namespaces.

- The mnt namespace struct is more private than the others, so the code
  is a bit different for it. I don't know if there is a better way to do
  this.

- Userspace needs a way to know whether namespace events are implemented
  in the proc connector. If they are not implemented, userspace needs to
  fall back to polling for changes in /proc/*/ns/*. I am not sure whether
  to add a Netlink message to query the kernel for the feature, or to
  use some other mechanism.

- There is no granularity when subscribing to proc connector events. I
  figured it might not be a problem since namespace events are rarer
  than fork/exec events, so they will probably not flood existing users
  of the proc connector.

Signed-off-by: Alban Crequy <al...@kinvolk.io>
---
 drivers/connector/cn_proc.c  | 28 +++++++++++++++++
 include/linux/cn_proc.h      |  4 +++
 include/uapi/linux/cn_proc.h | 16 +++++++++-
 kernel/nsproxy.c             | 71 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 118 insertions(+), 1 deletion(-)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index a782ce8..69e6815 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -246,6 +246,34 @@ void proc_comm_connector(struct task_struct *task)
        send_msg(msg);
 }
 
+void proc_ns_connector(struct task_struct *task, int type, int reason, u64 
old_inum, u64 inum)
+{
+       struct cn_msg *msg;
+       struct proc_event *ev;
+       __u8 buffer[CN_PROC_MSG_SIZE] __aligned(8);
+
+       if (atomic_read(&proc_event_num_listeners) < 1)
+               return;
+
+       msg = buffer_to_cn_msg(buffer);
+       ev = (struct proc_event *)msg->data;
+       memset(&ev->event_data, 0, sizeof(ev->event_data));
+       ev->timestamp_ns = ktime_get_ns();
+       ev->what = PROC_EVENT_NM;
+       ev->event_data.nm.process_pid  = task->pid;
+       ev->event_data.nm.process_tgid = task->tgid;
+       ev->event_data.nm.type = type;
+       ev->event_data.nm.reason = reason;
+       ev->event_data.nm.old_inum = old_inum;
+       ev->event_data.nm.inum = inum;
+
+       memcpy(&msg->id, &cn_proc_event_id, sizeof(msg->id));
+       msg->ack = 0; /* not used */
+       msg->len = sizeof(*ev);
+       msg->flags = 0; /* not used */
+       send_msg(msg);
+}
+
 void proc_coredump_connector(struct task_struct *task)
 {
        struct cn_msg *msg;
diff --git a/include/linux/cn_proc.h b/include/linux/cn_proc.h
index 1d5b02a..2e6915e 100644
--- a/include/linux/cn_proc.h
+++ b/include/linux/cn_proc.h
@@ -26,6 +26,7 @@ void proc_id_connector(struct task_struct *task, int 
which_id);
 void proc_sid_connector(struct task_struct *task);
 void proc_ptrace_connector(struct task_struct *task, int which_id);
 void proc_comm_connector(struct task_struct *task);
+void proc_ns_connector(struct task_struct *task, int type, int change, u64 
old_inum, u64 inum);
 void proc_coredump_connector(struct task_struct *task);
 void proc_exit_connector(struct task_struct *task);
 #else
@@ -45,6 +46,9 @@ static inline void proc_sid_connector(struct task_struct 
*task)
 static inline void proc_comm_connector(struct task_struct *task)
 {}
 
+static inline void proc_ns_connector(struct task_struct *task, int type, int 
change, u64 old_inum, u64 inum)
+{}
+
 static inline void proc_ptrace_connector(struct task_struct *task,
                                         int ptrace_id)
 {}
diff --git a/include/uapi/linux/cn_proc.h b/include/uapi/linux/cn_proc.h
index f6c2710..95607304 100644
--- a/include/uapi/linux/cn_proc.h
+++ b/include/uapi/linux/cn_proc.h
@@ -55,7 +55,8 @@ struct proc_event {
                PROC_EVENT_SID  = 0x00000080,
                PROC_EVENT_PTRACE = 0x00000100,
                PROC_EVENT_COMM = 0x00000200,
-               /* "next" should be 0x00000400 */
+               PROC_EVENT_NM   = 0x00000400,
+               /* "next" should be 0x00000800 */
                /* "last" is the last process event: exit,
                 * while "next to last" is coredumping event */
                PROC_EVENT_COREDUMP = 0x40000000,
@@ -112,6 +113,19 @@ struct proc_event {
                        char           comm[16];
                } comm;
 
+               struct nm_proc_event {
+                       __kernel_pid_t process_pid;
+                       __kernel_pid_t process_tgid;
+                       __u32 type;   /* CLONE_NEWNS, CLONE_NEWPID, ... */
+                       enum reason {
+                               PROC_NM_REASON_CLONE = 0x00000001,
+                               PROC_NM_REASON_SET   = 0x00000002, /* setns or 
unshare */
+                               PROC_NM_REASON_LAST  = 0x80000000,
+                       } reason;
+                       __u64 old_inum;
+                       __u64 inum;
+               } nm;
+
                struct coredump_proc_event {
                        __kernel_pid_t process_pid;
                        __kernel_pid_t process_tgid;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e..34306f7 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/syscalls.h>
 #include <linux/cgroup.h>
+#include <linux/cn_proc.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -139,6 +140,8 @@ int copy_namespaces(unsigned long flags, struct task_struct 
*tsk)
        struct nsproxy *old_ns = tsk->nsproxy;
        struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns);
        struct nsproxy *new_ns;
+       struct ns_common *mntns;
+       u64 old_mntns_inum = 0;
 
        if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
                              CLONE_NEWPID | CLONE_NEWNET |
@@ -165,7 +168,41 @@ int copy_namespaces(unsigned long flags, struct 
task_struct *tsk)
        if (IS_ERR(new_ns))
                return  PTR_ERR(new_ns);
 
+       mntns = mntns_operations.get(tsk);
+       if (mntns) {
+               old_mntns_inum = mntns->inum;
+               mntns_operations.put(mntns);
+       }
+
        tsk->nsproxy = new_ns;
+
+       if (old_ns && new_ns) {
+               struct ns_common *mntns;
+               u64 new_mntns_inum = 0;
+               mntns = mntns_operations.get(tsk);
+               if (mntns) {
+                       new_mntns_inum = mntns->inum;
+                       mntns_operations.put(mntns);
+               }
+               if (old_ns->mnt_ns != new_ns->mnt_ns)
+                       proc_ns_connector(tsk, CLONE_NEWNS, 
PROC_NM_REASON_CLONE, old_mntns_inum, new_mntns_inum);
+
+               if (old_ns->uts_ns != new_ns->uts_ns)
+                       proc_ns_connector(tsk, CLONE_NEWUTS, 
PROC_NM_REASON_CLONE, old_ns->uts_ns->ns.inum, new_ns->uts_ns->ns.inum);
+
+               if (old_ns->ipc_ns != new_ns->ipc_ns)
+                       proc_ns_connector(tsk, CLONE_NEWIPC, 
PROC_NM_REASON_CLONE, old_ns->ipc_ns->ns.inum, new_ns->ipc_ns->ns.inum);
+
+               if (old_ns->net_ns != new_ns->net_ns)
+                       proc_ns_connector(tsk, CLONE_NEWNET, 
PROC_NM_REASON_CLONE, old_ns->net_ns->ns.inum, new_ns->net_ns->ns.inum);
+
+               if (old_ns->cgroup_ns != new_ns->cgroup_ns)
+                       proc_ns_connector(tsk, CLONE_NEWCGROUP, 
PROC_NM_REASON_CLONE, old_ns->cgroup_ns->ns.inum, new_ns->cgroup_ns->ns.inum);
+
+               if (old_ns->pid_ns_for_children != new_ns->pid_ns_for_children)
+                       proc_ns_connector(tsk, CLONE_NEWPID, 
PROC_NM_REASON_CLONE, old_ns->pid_ns_for_children->ns.inum, 
new_ns->pid_ns_for_children->ns.inum);
+       }
+
        return 0;
 }
 
@@ -216,14 +253,48 @@ out:
 void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 {
        struct nsproxy *ns;
+       struct ns_common *mntns;
+       u64 old_mntns_inum = 0;
 
        might_sleep();
 
+       mntns = mntns_operations.get(p);
+       if (mntns) {
+               old_mntns_inum = mntns->inum;
+               mntns_operations.put(mntns);
+       }
+
        task_lock(p);
        ns = p->nsproxy;
        p->nsproxy = new;
        task_unlock(p);
 
+       if (ns && new) {
+               u64 new_mntns_inum = 0;
+               mntns = mntns_operations.get(p);
+               if (mntns) {
+                       new_mntns_inum = mntns->inum;
+                       mntns_operations.put(mntns);
+               }
+               if (ns->mnt_ns != new->mnt_ns)
+                       proc_ns_connector(p, CLONE_NEWNS, PROC_NM_REASON_SET, 
old_mntns_inum, new_mntns_inum);
+
+               if (ns->uts_ns != new->uts_ns)
+                       proc_ns_connector(p, CLONE_NEWUTS, PROC_NM_REASON_SET, 
ns->uts_ns->ns.inum, new->uts_ns->ns.inum);
+
+               if (ns->ipc_ns != new->ipc_ns)
+                       proc_ns_connector(p, CLONE_NEWIPC, PROC_NM_REASON_SET, 
ns->ipc_ns->ns.inum, new->ipc_ns->ns.inum);
+
+               if (ns->net_ns != new->net_ns)
+                       proc_ns_connector(p, CLONE_NEWNET, PROC_NM_REASON_SET, 
ns->net_ns->ns.inum, new->net_ns->ns.inum);
+
+               if (ns->cgroup_ns != new->cgroup_ns)
+                       proc_ns_connector(p, CLONE_NEWCGROUP, 
PROC_NM_REASON_SET, ns->cgroup_ns->ns.inum, new->cgroup_ns->ns.inum);
+
+               if (ns->pid_ns_for_children != new->pid_ns_for_children)
+                       proc_ns_connector(p, CLONE_NEWPID, PROC_NM_REASON_SET, 
ns->pid_ns_for_children->ns.inum, new->pid_ns_for_children->ns.inum);
+       }
+
        if (ns && atomic_dec_and_test(&ns->count))
                free_nsproxy(ns);
 }
-- 
2.7.4

[PATCH] [RFC] proc connector: add namespace events

Reply via email to