This patch adds the following files to the container filesystem:

notify_on_release - configures/reports whether the container subsystem
should attempt to run a release script when this container becomes
unused

release_agent - configures/reports the release agent to be used for
this hierarchy (top level in each hierarchy only)

releasable - reports whether this container would have been
auto-released if notify_on_release was true and a release agent was
configured (mainly useful for debugging)

To avoid locking issues, the userspace release agent is invoked from
a workqueue task; containers whose release agent needs to be run by
that task are linked onto a global release list.

When the "cpuset" filesystem is mounted, it automatically sets the
hierarchy's release agent to be /sbin/cpuset_release_agent for
backward-compatibility with existing cpusets users.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---
 include/linux/container.h |   15 +
 kernel/container.c        |  364 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/cpuset.c           |    5 
 3 files changed, 348 insertions(+), 36 deletions(-)

Index: container-2.6.22-rc2-mm1/include/linux/container.h
===================================================================
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -64,11 +64,7 @@ static inline void css_get(struct contai
  * css_put() should be called to release a reference taken by
  * css_get()
  */
-
-static inline void css_put(struct container_subsys_state *css)
-{
-       atomic_dec(&css->refcnt);
-}
+void css_put(struct container_subsys_state *css);
 
 struct container {
        unsigned long flags;            /* "unsigned long" so bitops work */
@@ -99,6 +95,13 @@ struct container {
         * tasks in this container. Protected by css_group_lock
         */
        struct list_head css_groups;
+
+       /*
+        * Linked list running through all containers that can
+        * potentially be reaped by the release agent. Protected by
+        * container_mutex
+        */
+       struct list_head release_list;
 };
 
 /* A css_group is a structure holding pointers to a set of
@@ -271,6 +274,8 @@ struct task_struct *container_iter_next(
                                        struct container_iter *it);
 void container_iter_end(struct container *cont, struct container_iter *it);
 
+void container_set_release_agent_path(struct container_subsys *ss,
+                                     const char *path);
 
 #else /* !CONFIG_CONTAINERS */
 
Index: container-2.6.22-rc2-mm1/kernel/container.c
===================================================================
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -62,6 +62,8 @@
 
 #define CONTAINER_SUPER_MAGIC          0x27e0eb
 
+static DEFINE_MUTEX(container_mutex);
+
 /* Generate an array of container subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -89,6 +91,13 @@ struct containerfs_root {
 
        /* A list running through the mounted hierarchies */
        struct list_head root_list;
+
+       /* The path to use for release notifications. No locking
+        * between setting and use - so if userspace updates this
+        * while subcontainers exist, you could miss a
+        * notification. We ensure that it's always a valid
+        * NUL-terminated string */
+       char release_agent_path[PATH_MAX];
 };
 
 
@@ -115,7 +124,13 @@ static int need_forkexit_callback = 0;
 
 /* bits in struct container flags field */
 typedef enum {
+       /* Container is dead */
        CONT_REMOVED,
+       /* Container has previously had a child container or a task,
+        * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+       CONT_RELEASABLE,
+       /* Container requires release notifications to userspace */
+       CONT_NOTIFY_ON_RELEASE,
 } container_flagbits_t;
 
 /* convenient tests for these bits */
@@ -124,6 +139,19 @@ inline int container_is_removed(const st
        return test_bit(CONT_REMOVED, &cont->flags);
 }
 
+inline int container_is_releasable(const struct container *cont)
+{
+       const int bits =
+               (1 << CONT_RELEASABLE) |
+               (1 << CONT_NOTIFY_ON_RELEASE);
+       return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct container *cont)
+{
+       return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /* for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy */
 #define for_each_subsys(_root, _ss) \
@@ -133,6 +161,12 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of containers eligible for automatic release */
+static LIST_HEAD(release_list);
+static void container_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, container_release_agent);
+static void check_for_release(struct container *cont);
+
 /* Link structure for associating css_group objects with containers */
 struct cg_container_link {
        /*
@@ -181,11 +215,8 @@ static int css_group_count;
 /*
  * unlink a css_group from the list and free it
  */
-static void release_css_group(struct kref *k)
+static void unlink_css_group(struct css_group *cg)
 {
-       struct css_group *cg =
-               container_of(k, struct css_group, ref);
-       int i;
        write_lock(&css_group_lock);
        list_del(&cg->list);
        css_group_count--;
@@ -198,8 +229,47 @@ static void release_css_group(struct kre
                kfree(link);
        }
        write_unlock(&css_group_lock);
+}
+
+static void release_css_group(struct kref *k)
+{
+       int i;
+       struct css_group *cg = container_of(k, struct css_group, ref);
+       BUG_ON(!mutex_is_locked(&container_mutex));
+
+       unlink_css_group(cg);
        for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
-               atomic_dec(&cg->subsys[i]->container->count);
+               struct container *cont = cg->subsys[i]->container;
+               if (atomic_dec_and_test(&cont->count) &&
+                   container_is_releasable(cont)) {
+                       check_for_release(cont);
+               }
+       }
+       kfree(cg);
+}
+
+/*
+ * In the task exit path we want to avoid taking container_mutex
+ * unless absolutely necessary, so the release process is slightly
+ * different.
+ */
+static void release_css_group_taskexit(struct kref *k)
+{
+       int i;
+       struct css_group *cg = container_of(k, struct css_group, ref);
+       unlink_css_group(cg);
+       for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+               struct container *cont = cg->subsys[i]->container;
+               if (notify_on_release(cont)) {
+                       mutex_lock(&container_mutex);
+                       set_bit(CONT_RELEASABLE, &cont->flags);
+                       if (atomic_dec_and_test(&cont->count)) {
+                               check_for_release(cont);
+                       }
+                       mutex_unlock(&container_mutex);
+               } else {
+                       atomic_dec(&cont->count);
+               }
        }
        kfree(cg);
 }
@@ -217,6 +287,11 @@ static inline void put_css_group(struct 
        kref_put(&cg->ref, release_css_group);
 }
 
+static inline void put_css_group_taskexit(struct css_group *cg)
+{
+       kref_put(&cg->ref, release_css_group_taskexit);
+}
+
 /*
  * find_existing_css_group() is a helper for
  * find_css_group(), and checks to see whether an existing
@@ -446,8 +521,6 @@ static struct css_group *find_css_group(
  * update of a tasks container pointer by attach_task()
  */
 
-static DEFINE_MUTEX(container_mutex);
-
 /**
  * container_lock - lock out any changes to container structures
  *
@@ -795,6 +868,7 @@ static int container_fill_super(struct s
        root->d_fsdata = &hroot->top_container;
        hroot->top_container.dentry = root;
 
+       strcpy(hroot->release_agent_path, "");
        sb->s_fs_info = hroot;
        hroot->sb = sb;
 
@@ -811,6 +885,7 @@ static void init_container_root(struct c
        INIT_LIST_HEAD(&cont->sibling);
        INIT_LIST_HEAD(&cont->children);
        INIT_LIST_HEAD(&cont->css_groups);
+       INIT_LIST_HEAD(&cont->release_list);
        list_add(&root->root_list, &roots);
        root_count++;
 }
@@ -1057,7 +1132,7 @@ static int attach_task(struct container 
                        ss->attach(ss, cont, oldcont, tsk);
                }
        }
-
+       set_bit(CONT_RELEASABLE, &oldcont->flags);
        synchronize_rcu();
        put_css_group(cg);
        return 0;
@@ -1109,6 +1184,9 @@ typedef enum {
        FILE_ROOT,
        FILE_DIR,
        FILE_TASKLIST,
+       FILE_NOTIFY_ON_RELEASE,
+       FILE_RELEASABLE,
+       FILE_RELEASE_AGENT,
 } container_filetype_t;
 
 static ssize_t container_common_file_write(struct container *cont,
@@ -1145,6 +1223,28 @@ static ssize_t container_common_file_wri
        case FILE_TASKLIST:
                retval = attach_task_by_pid(cont, buffer);
                break;
+       case FILE_NOTIFY_ON_RELEASE:
+               clear_bit(CONT_RELEASABLE, &cont->flags);
+               if (simple_strtoul(buffer, NULL, 10) != 0)
+                       set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+               else
+                       clear_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+               break;
+       case FILE_RELEASE_AGENT:
+       {
+               struct containerfs_root *root = cont->root;
+               if (nbytes < sizeof(root->release_agent_path)) {
+                       /* We never write anything other than '\0'
+                        * into the last char of release_agent_path,
+                        * so it always remains a NUL-terminated
+                        * string */
+                       strncpy(root->release_agent_path, buffer, nbytes);
+                       root->release_agent_path[nbytes] = 0;
+               } else {
+                       retval = -ENOSPC;
+               }
+               break;
+       }
        default:
                retval = -EINVAL;
                goto out2;
@@ -1183,6 +1283,49 @@ static ssize_t container_read_uint(struc
        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
+static ssize_t container_common_file_read(struct container *cont,
+                                         struct cftype *cft,
+                                         struct file *file,
+                                         char __user *buf,
+                                         size_t nbytes, loff_t *ppos)
+{
+       container_filetype_t type = cft->private;
+       char *page;
+       ssize_t retval = 0;
+       char *s;
+
+       if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+               return -ENOMEM;
+
+       s = page;
+       /* Each case leaves s just past its output; '\n' is appended below */
+       switch (type) {
+       case FILE_RELEASE_AGENT:
+       {
+               struct containerfs_root *root;
+               size_t n;
+               mutex_lock(&container_mutex);
+               root = cont->root;
+               n = strnlen(root->release_agent_path,
+                           sizeof(root->release_agent_path));
+               n = min(n, (size_t) PAGE_SIZE - 1); /* room for '\n' */
+               strncpy(s, root->release_agent_path, n);
+               mutex_unlock(&container_mutex);
+               s += n;
+               break;
+       }
+       default:
+               retval = -EINVAL;
+               goto out;
+       }
+       *s++ = '\n';
+
+       retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
+out:
+       free_page((unsigned long)page);
+       return retval;
+}
+
 static ssize_t container_file_read(struct file *file, char __user *buf,
                                   size_t nbytes, loff_t *ppos)
 {
@@ -1560,17 +1703,51 @@ static int container_tasks_release(struc
        return 0;
 }
 
+static u64 container_read_notify_on_release(struct container *cont,
+                                           struct cftype *cft)
+{
+       return notify_on_release(cont);
+}
+
+static u64 container_read_releasable(struct container *cont,
+                                    struct cftype *cft)
+{
+       return test_bit(CONT_RELEASABLE, &cont->flags);
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
 
-static struct cftype cft_tasks = {
-       .name = "tasks",
-       .open = container_tasks_open,
-       .read = container_tasks_read,
+static struct cftype files[] = {
+       {
+               .name = "tasks",
+               .open = container_tasks_open,
+               .read = container_tasks_read,
+               .write = container_common_file_write,
+               .release = container_tasks_release,
+               .private = FILE_TASKLIST,
+       },
+
+       {
+               .name = "notify_on_release",
+               .read_uint = container_read_notify_on_release,
+               .write = container_common_file_write,
+               .private = FILE_NOTIFY_ON_RELEASE,
+       },
+
+       {
+               .name = "releasable",
+               .read_uint = container_read_releasable,
+               .private = FILE_RELEASABLE,
+       }
+};
+
+static struct cftype cft_release_agent = {
+       .name = "release_agent",
+       .read = container_common_file_read,
        .write = container_common_file_write,
-       .release = container_tasks_release,
-       .private = FILE_TASKLIST,
+       .private = FILE_RELEASE_AGENT,
 };
 
 static int container_populate_dir(struct container *cont)
@@ -1581,9 +1758,14 @@ static int container_populate_dir(struct
        /* First clear out any existing files */
        container_clear_directory(cont->dentry);
 
-       if ((err = container_add_file(cont, &cft_tasks)) < 0)
+       if ((err = container_add_files(cont, files, ARRAY_SIZE(files))) < 0)
                return err;
 
+       if (cont == cont->top_container) {
+               if ((err = container_add_file(cont, &cft_release_agent)) < 0)
+                       return err;
+       }
+
        for_each_subsys(cont->root, ss) {
                if (ss->populate && (err = ss->populate(ss, cont)) < 0)
                        return err;
@@ -1635,6 +1817,7 @@ static long container_create(struct cont
        INIT_LIST_HEAD(&cont->sibling);
        INIT_LIST_HEAD(&cont->children);
        INIT_LIST_HEAD(&cont->css_groups);
+       INIT_LIST_HEAD(&cont->release_list);
 
        cont->parent = parent;
        cont->root = parent->root;
@@ -1693,6 +1876,24 @@ static int container_mkdir(struct inode 
        return container_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static inline int container_has_css_refs(struct container *cont)
+{
+       /* Check the reference count on each subsystem. Since we
+        * already established that there are no tasks in the
+        * container, if the css refcount is also 0, then there should
+        * be no outstanding references, so the subsystem is safe to
+        * destroy */
+       struct container_subsys *ss;
+       for_each_subsys(cont->root, ss) {
+               struct container_subsys_state *css;
+               css = cont->subsys[ss->subsys_id];
+               if (atomic_read(&css->refcnt)) {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
 static int container_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
        struct container *cont = dentry->d_fsdata;
@@ -1701,7 +1902,6 @@ static int container_rmdir(struct inode 
        struct container_subsys *ss;
        struct super_block *sb;
        struct containerfs_root *root;
-       int css_busy = 0;
 
        /* the vfs holds both inode->i_mutex already */
 
@@ -1719,20 +1919,7 @@ static int container_rmdir(struct inode 
        root = cont->root;
        sb = root->sb;
 
-       /* Check the reference count on each subsystem. Since we
-        * already established that there are no tasks in the
-        * container, if the css refcount is also 0, then there should
-        * be no outstanding references, so the subsystem is safe to
-        * destroy */
-       for_each_subsys(root, ss) {
-               struct container_subsys_state *css;
-               css = cont->subsys[ss->subsys_id];
-               if (atomic_read(&css->refcnt)) {
-                       css_busy = 1;
-                       break;
-               }
-       }
-       if (css_busy) {
+       if (container_has_css_refs(cont)) {
                mutex_unlock(&container_mutex);
                return -EBUSY;
        }
@@ -1754,6 +1941,11 @@ static int container_rmdir(struct inode 
        dput(d);
        root->number_of_containers--;
 
+       if (!list_empty(&cont->release_list))
+               list_del(&cont->release_list);
+       set_bit(CONT_RELEASABLE, &parent->flags);
+       check_for_release(parent);
+
        mutex_unlock(&container_mutex);
        /* Drop the active superblock reference that we took when we
         * created the container */
@@ -2107,7 +2299,7 @@ void container_exit(struct task_struct *
        tsk->containers = &init_css_group;
        task_unlock(tsk);
        if (cg)
-               put_css_group(cg);
+               put_css_group_taskexit(cg);
 }
 
 static atomic_t namecnt;
@@ -2215,7 +2407,10 @@ int container_clone(struct task_struct *
 
  out_release:
        mutex_unlock(&inode->i_mutex);
+
+       mutex_lock(&container_mutex);
        put_css_group(cg);
+       mutex_unlock(&container_mutex);
        deactivate_super(parent->root->sb);
        return ret;
 }
@@ -2236,3 +2431,110 @@ int container_is_descendant(const struct
        ret = (cont == target);
        return ret;
 }
+
+static void check_for_release(struct container *cont)
+{
+       BUG_ON(!mutex_is_locked(&container_mutex));
+       if (container_is_releasable(cont) && !atomic_read(&cont->count)
+           && list_empty(&cont->children) && !container_has_css_refs(cont)) {
+               /* Container is currently removable. If it's not
+                * already queued for a userspace notification, queue
+                * it now and kick the release-agent work item */
+               if (list_empty(&cont->release_list)) {
+                       list_add(&cont->release_list, &release_list);
+                       schedule_work(&release_agent_work);
+               }
+       }
+}
+
+void css_put(struct container_subsys_state *css)
+{
+       struct container *cont = css->container;
+       if (notify_on_release(cont)) {
+               mutex_lock(&container_mutex);
+               set_bit(CONT_RELEASABLE, &cont->flags);
+               if (atomic_dec_and_test(&css->refcnt)) {
+                       check_for_release(cont);
+               }
+               mutex_unlock(&container_mutex);
+       } else {
+               atomic_dec(&css->refcnt);
+       }
+}
+
+void container_set_release_agent_path(struct container_subsys *ss,
+                                     const char *path)
+{
+       mutex_lock(&container_mutex);
+       strlcpy(ss->root->release_agent_path, path, PATH_MAX); /* bounded */
+       mutex_unlock(&container_mutex);
+}
+
+/*
+ * Notify userspace when a container is released, by running the
+ * configured release agent with the name of the container (path
+ * relative to the root of container file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this container.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this container before it is removed, or that some other
+ * user task will 'mkdir' a child container of this container.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this container is no longer
+ * unused, and this container will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ *
+ */
+
+static void container_release_agent(struct work_struct *work)
+{
+       BUG_ON(work != &release_agent_work);
+       mutex_lock(&container_mutex);
+       while (!list_empty(&release_list)) {
+               char *argv[3], *envp[3];
+               int i;
+               char *pathbuf;
+               struct container *cont = list_entry(release_list.next,
+                                                   struct container,
+                                                   release_list);
+               list_del_init(&cont->release_list);
+
+               pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+               if (!pathbuf)
+                       continue;
+
+               if (container_path(cont, pathbuf, PAGE_SIZE) < 0) {
+                       kfree(pathbuf);
+                       continue;
+               }
+
+               i = 0;
+               argv[i++] = cont->root->release_agent_path;
+               argv[i++] = (char *)pathbuf;
+               argv[i] = NULL;
+
+               i = 0;
+               /* minimal command environment */
+               envp[i++] = "HOME=/";
+               envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+               envp[i] = NULL;
+
+               /* Drop the lock while we invoke the usermode helper,
+                * since the exec could involve hitting disk and hence
+                * be a slow process */
+               mutex_unlock(&container_mutex);
+               call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+               kfree(pathbuf);
+               mutex_lock(&container_mutex);
+       }
+       mutex_unlock(&container_mutex);
+}
Index: container-2.6.22-rc2-mm1/kernel/cpuset.c
===================================================================
--- container-2.6.22-rc2-mm1.orig/kernel/cpuset.c
+++ container-2.6.22-rc2-mm1/kernel/cpuset.c
@@ -275,6 +275,11 @@ static int cpuset_get_sb(struct file_sys
                                           unused_dev_name,
                                           "cpuset", mnt);
                put_filesystem(container_fs);
+               if (!ret) {
+                       container_set_release_agent_path(
+                               &cpuset_subsys,
+                               "/sbin/cpuset_release_agent");
+               }
        }
        return ret;
 }

--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to