from:"menage"

[PATCH 2/2] cgroup map files: Use cgroup map for memcontrol stats file

2008-02-19 Thread menage

Remove the seq_file boilerplate used to construct the memcontrol stats
map, and instead use the new map representation for cgroup control
files.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   30 ++
 1 file changed, 6 insertions(+), 24 deletions(-)

Index: cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- cgroupmap-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
 };
 
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb)
 {
-   struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -986,8 +986,7 @@ static int mem_control_stat_show(struct 
 
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
-   seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
-   (long long)val);
+   cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -997,29 +996,12 @@ static int mem_control_stat_show(struct 
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
-   seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
-   seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+   cb->fill(cb, "active", (active) * PAGE_SIZE);
+   cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
 }
 
-static const struct file_operations mem_control_stat_file_operations = {
-   .read = seq_read,
-   .llseek = seq_lseek,
-   .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
-   /* XXX __d_cont */
-   struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
-   file->f_op = &mem_control_stat_file_operations;
-   return single_open(file, mem_control_stat_show, cont);
-}
-
-
-
 static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] 
},
{
.name = "stat",
-   .open = mem_control_stat_open,
+   .read_map = mem_control_stat_show,
},
 };
 

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] cgroup map files: Add cgroup map data type

2008-02-19 Thread menage

Adds a new type of supported control file representation, a map from
strings to u64 values.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   19 +++
 kernel/cgroup.c|   59 -
 2 files changed, 77 insertions(+), 1 deletion(-)

Index: cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h
===
--- cgroupmap-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -166,6 +166,16 @@ struct css_set {
 
 };
 
+/*
+ * cgroup_map_cb is an abstract callback API for reporting map-valued
+ * control files
+ */
+
+struct cgroup_map_cb {
+   int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
+   void *state;
+};
+
 /* struct cftype:
  *
  * The files in the cgroup filesystem mostly have a very simple read/write
@@ -194,6 +204,15 @@ struct cftype {
 * single integer. Use it in place of read()
 */
u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
+   /*
+* read_map() is used for defining a map of key/value
+* pairs. It should call cb->fill(cb, key, value) for each
+* entry. The key/value pairs (and their ordering) should not
+* change between reboots.
+*/
+   int (*read_map) (struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb);
+
ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
  struct file *file,
  const char __user *buf, size_t nbytes, loff_t *ppos);
Index: cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroupmap-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1487,6 +1487,46 @@ static ssize_t cgroup_file_read(struct f
return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+   struct cftype *cft;
+   struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+   struct seq_file *sf = cb->state;
+   return seq_printf(sf, "%s: %llu\n", key, value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+   struct cgroup_seqfile_state *state = m->private;
+   struct cftype *cft = state->cft;
+   struct cgroup_map_cb cb = {
+   .fill = cgroup_map_add,
+   .state = m,
+   };
+   if (cft->read_map) {
+   return cft->read_map(state->cgroup, cft, &cb);
+   } else {
+   BUG();
+   }
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+   struct seq_file *seq = file->private_data;
+   kfree(seq->private);
+   return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations;
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
int err;
@@ -1499,7 +1539,18 @@ static int cgroup_file_open(struct inode
cft = __d_cft(file->f_dentry);
if (!cft)
return -ENODEV;
-   if (cft->open)
+   if (cft->read_map) {
+   struct cgroup_seqfile_state *state =
+   kzalloc(sizeof(*state), GFP_USER);
+   if (!state)
+   return -ENOMEM;
+   state->cft = cft;
+   state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+   file->f_op = &cgroup_seqfile_operations;
+   err = single_open(file, cgroup_seqfile_show, state);
+   if (err < 0)
+   kfree(state);
+   } else if (cft->open)
err = cft->open(inode, file);
else
err = 0;
@@ -1538,6 +1589,12 @@ static struct file_operations cgroup_fil
.release = cgroup_file_release,
 };
 
+static struct file_operations cgroup_seqfile_operations = {
+   .read = seq_read,
+   .llseek = seq_lseek,
+   .release = cgroup_seqfile_release,
+};
+
 static struct inode_operations cgroup_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = cgroup_mkdir,

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups

2008-02-19 Thread menage

These patches add a new cgroup control file output type - a map from
strings to u64 values - and make use of it for the memory controller
"stat" file.

It is intended for use when the subsystem wants to return a collection
of values that are related in some way, for which a separate control
file for each value would make the reporting unwieldy.

The advantages of this are:

- more standardized output from control files that report
similarly-structured data

- less boilerplate required in cgroup subsystems

- simplifies transition to a future efficient cgroups binary API

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] ResCounter: Use read_uint in memory controller

2008-02-21 Thread menage

Update the memory controller to use read_uint for its
limit/usage/failcnt control files, calling the new
res_counter_read_uint() function.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

Index: rescounter-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- rescounter-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ rescounter-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf,
return 0;
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont,
-   struct cftype *cft, struct file *file,
-   char __user *userbuf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-   return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-   cft->private, userbuf, nbytes, ppos,
-   NULL);
+   return res_counter_read_uint(&mem_cgroup_from_cont(cont)->res,
+cft->private);
 }
 
 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -1024,18 +1021,18 @@ static struct cftype mem_cgroup_files[] 
{
.name = "usage_in_bytes",
.private = RES_USAGE,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "failcnt",
.private = RES_FAILCNT,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "force_empty",

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] ResCounter: Add res_counter_read_uint and use it in memory cgroup

2008-02-21 Thread menage

These patches simplify the code required to report values from a
res_counter object in a cgroups control file.

The first patch adds res_counter_read_uint, which simply reports the
current value for a res_counter member.

The second replaces the existing mem_cgroup_read() with a simpler
version that calls res_counter_read_uint().

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] ResCounter: Add res_counter_read_uint()

2008-02-21 Thread menage

Adds a function for returning the value of a resource counter member,
in a form suitable for use in a cgroup read_uint control file method.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/res_counter.h |1 +
 kernel/res_counter.c|5 +
 2 files changed, 6 insertions(+)

Index: rescounter-2.6.25-rc2-mm1/include/linux/res_counter.h
===
--- rescounter-2.6.25-rc2-mm1.orig/include/linux/res_counter.h
+++ rescounter-2.6.25-rc2-mm1/include/linux/res_counter.h
@@ -54,6 +54,7 @@ struct res_counter {
 ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *s));
+u64 res_counter_read_uint(struct res_counter *counter, int member);
 ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *buf, unsigned long long *val));
Index: rescounter-2.6.25-rc2-mm1/kernel/res_counter.c
===
--- rescounter-2.6.25-rc2-mm1.orig/kernel/res_counter.c
+++ rescounter-2.6.25-rc2-mm1/kernel/res_counter.c
@@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun
pos, buf, s - buf);
 }
 
+u64 res_counter_read_uint(struct res_counter *counter, int member)
+{
+   return *res_counter_member(counter, member);
+}
+
 ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *st_buf, unsigned long long *val))

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 1/2] cgroup map files: Add cgroup map data type

2008-02-21 Thread menage

Adds a new type of supported control file representation, a map from
strings to u64 values.

The map type is printed in a similar format to /proc/meminfo or
/proc/<pid>/status, i.e. "$key: $value\n"

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   19 +++
 kernel/cgroup.c|   59 -
 2 files changed, 77 insertions(+), 1 deletion(-)

Index: cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h
===
--- cgroupmap-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -166,6 +166,16 @@ struct css_set {
 
 };
 
+/*
+ * cgroup_map_cb is an abstract callback API for reporting map-valued
+ * control files
+ */
+
+struct cgroup_map_cb {
+   int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
+   void *state;
+};
+
 /* struct cftype:
  *
  * The files in the cgroup filesystem mostly have a very simple read/write
@@ -194,6 +204,15 @@ struct cftype {
 * single integer. Use it in place of read()
 */
u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
+   /*
+* read_map() is used for defining a map of key/value
+* pairs. It should call cb->fill(cb, key, value) for each
+* entry. The key/value pairs (and their ordering) should not
+* change between reboots.
+*/
+   int (*read_map) (struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb);
+
ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
  struct file *file,
  const char __user *buf, size_t nbytes, loff_t *ppos);
Index: cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroupmap-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1487,6 +1487,46 @@ static ssize_t cgroup_file_read(struct f
return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+   struct cftype *cft;
+   struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+   struct seq_file *sf = cb->state;
+   return seq_printf(sf, "%s %llu\n", key, value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+   struct cgroup_seqfile_state *state = m->private;
+   struct cftype *cft = state->cft;
+   if (cft->read_map) {
+   struct cgroup_map_cb cb = {
+   .fill = cgroup_map_add,
+   .state = m,
+   };
+   return cft->read_map(state->cgroup, cft, &cb);
+   } else {
+   BUG();
+   }
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+   struct seq_file *seq = file->private_data;
+   kfree(seq->private);
+   return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations;
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
int err;
@@ -1499,7 +1539,18 @@ static int cgroup_file_open(struct inode
cft = __d_cft(file->f_dentry);
if (!cft)
return -ENODEV;
-   if (cft->open)
+   if (cft->read_map) {
+   struct cgroup_seqfile_state *state =
+   kzalloc(sizeof(*state), GFP_USER);
+   if (!state)
+   return -ENOMEM;
+   state->cft = cft;
+   state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+   file->f_op = &cgroup_seqfile_operations;
+   err = single_open(file, cgroup_seqfile_show, state);
+   if (err < 0)
+   kfree(state);
+   } else if (cft->open)
err = cft->open(inode, file);
else
err = 0;
@@ -1538,6 +1589,12 @@ static struct file_operations cgroup_fil
.release = cgroup_file_release,
 };
 
+static struct file_operations cgroup_seqfile_operations = {
+   .read = seq_read,
+   .llseek = seq_lseek,
+   .release = cgroup_seqfile_release,
+};
+
 static struct inode_operations cgroup_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = cgroup_mkdir,

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups

2008-02-21 Thread menage

[ Updated from the previous version to remove the colon from the map output ]

These patches add a new cgroup control file output type - a map from
strings to u64 values - and make use of it for the memory controller
"stat" file.

It is intended for use when the subsystem wants to return a collection
of values that are related in some way, for which a separate control
file for each value would make the reporting unwieldy.

The advantages of this are:

- more standardized output from control files that report
similarly-structured data that needs to be parsed programmatically

- less boilerplate required in cgroup subsystems

- simplifies transition to a future efficient cgroups binary API

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>


--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] cgroup map files: Use cgroup map for memcontrol stats file

2008-02-21 Thread menage

Remove the seq_file boilerplate used to construct the memcontrol stats
map, and instead use the new map representation for cgroup control
files.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   30 ++
 1 file changed, 6 insertions(+), 24 deletions(-)

Index: cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- cgroupmap-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
 };
 
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb)
 {
-   struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -986,8 +986,7 @@ static int mem_control_stat_show(struct 
 
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
-   seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
-   (long long)val);
+   cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -997,29 +996,12 @@ static int mem_control_stat_show(struct 
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
-   seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
-   seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+   cb->fill(cb, "active", (active) * PAGE_SIZE);
+   cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
 }
 
-static const struct file_operations mem_control_stat_file_operations = {
-   .read = seq_read,
-   .llseek = seq_lseek,
-   .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
-   /* XXX __d_cont */
-   struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
-   file->f_op = &mem_control_stat_file_operations;
-   return single_open(file, mem_control_stat_show, cont);
-}
-
-
-
 static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] 
},
{
.name = "stat",
-   .open = mem_control_stat_open,
+   .read_map = mem_control_stat_show,
},
 };
 

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 06/10] CGroup API files: Add cgroup map data type

2008-02-23 Thread menage

Adds a new type of supported control file representation, a map from
strings to u64 values.

Each map entry is printed as a line in a similar format to
/proc/vmstat, i.e. "$key $value\n"

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   19 +
 kernel/cgroup.c|   53 -
 2 files changed, 71 insertions(+), 1 deletion(-)

Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
===
--- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -166,6 +166,16 @@ struct css_set {
 
 };
 
+/*
+ * cgroup_map_cb is an abstract callback API for reporting map-valued
+ * control files
+ */
+
+struct cgroup_map_cb {
+   int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
+   void *state;
+};
+
 /* struct cftype:
  *
  * The files in the cgroup filesystem mostly have a very simple read/write
@@ -194,6 +204,15 @@ struct cftype {
 * single integer. Use it in place of read()
 */
u64 (*read_u64) (struct cgroup *cont, struct cftype *cft);
+   /*
+* read_map() is used for defining a map of key/value
+* pairs. It should call cb->fill(cb, key, value) for each
+* entry. The key/value pairs (and their ordering) should not
+* change between reboots.
+*/
+   int (*read_map) (struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb);
+
ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
  struct file *file,
  const char __user *buf, size_t nbytes, loff_t *ppos);
Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1484,6 +1484,46 @@ static ssize_t cgroup_file_read(struct f
return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+   struct cftype *cft;
+   struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+   struct seq_file *sf = cb->state;
+   return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+   struct cgroup_seqfile_state *state = m->private;
+   struct cftype *cft = state->cft;
+   struct cgroup_map_cb cb = {
+   .fill = cgroup_map_add,
+   .state = m,
+   };
+   return cft->read_map(state->cgroup, cft, &cb);
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+   struct seq_file *seq = file->private_data;
+   kfree(seq->private);
+   return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations = {
+   .read = seq_read,
+   .llseek = seq_lseek,
+   .release = cgroup_seqfile_release,
+};
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
int err;
@@ -1496,7 +1536,18 @@ static int cgroup_file_open(struct inode
cft = __d_cft(file->f_dentry);
if (!cft)
return -ENODEV;
-   if (cft->open)
+   if (cft->read_map) {
+   struct cgroup_seqfile_state *state =
+   kzalloc(sizeof(*state), GFP_USER);
+   if (!state)
+   return -ENOMEM;
+   state->cft = cft;
+   state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+   file->f_op = &cgroup_seqfile_operations;
+   err = single_open(file, cgroup_seqfile_show, state);
+   if (err < 0)
+   kfree(state);
+   } else if (cft->open)
err = cft->open(inode, file);
else
err = 0;

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 00/10] CGroup API files: Various cleanup to CGroup control files

2008-02-23 Thread menage

This patchset is a roll-up of the non-controversial items of the
various patches that I've sent out recently, fixed according to the
feedback received.

In summary they are:

- general rename of read_uint/write_uint to read_u64/write_u64

- use these methods for cpusets and memory controller files

- add a read_map cgroup file method, and use it in the memory
  controller

- move the "releasable" control file to the debug subsystem

- make the debug subsystem config option default to "n"

The only user-visible changes are the movement of the "releasable"
file and the fact that some write_u64()-based control files are now
more forgiving of additional whitespace at the end of their input.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 02/10] CGroup API files: Add res_counter_read_u64()

2008-02-23 Thread menage

Adds a function for returning the value of a resource counter member,
in a form suitable for use in a cgroup read_u64 control file method.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/res_counter.h |5 -
 kernel/res_counter.c|5 +
 2 files changed, 9 insertions(+), 1 deletion(-)

Index: cgroup-2.6.25-rc2-mm1/include/linux/res_counter.h
===
--- cgroup-2.6.25-rc2-mm1.orig/include/linux/res_counter.h
+++ cgroup-2.6.25-rc2-mm1/include/linux/res_counter.h
@@ -39,8 +39,9 @@ struct res_counter {
spinlock_t lock;
 };
 
-/*
+/**
  * Helpers to interact with userspace
+ * res_counter_read_u64() - returns the value of the specified member.
  * res_counter_read/_write - put/get the specified fields from the
  * res_counter struct to/from the user
  *
@@ -51,6 +52,8 @@ struct res_counter {
  * @pos: and the offset.
  */
 
+u64 res_counter_read_u64(struct res_counter *counter, int member);
+
 ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *s));
Index: cgroup-2.6.25-rc2-mm1/kernel/res_counter.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/res_counter.c
+++ cgroup-2.6.25-rc2-mm1/kernel/res_counter.c
@@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun
pos, buf, s - buf);
 }
 
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+   return *res_counter_member(counter, member);
+}
+
 ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *st_buf, unsigned long long *val))

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 04/10] CGroup API files: Strip all trailing whitespace in cgroup_write_u64

2008-02-23 Thread menage

This removes the need for people to remember to pass the -n flag to
echo when writing values to cgroup control files.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 kernel/cgroup.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_u64(struct c
return -EFAULT;
 
buffer[nbytes] = 0; /* nul-terminate */
-
-   /* strip newline if necessary */
-   if (nbytes && (buffer[nbytes-1] == '\n'))
-   buffer[nbytes-1] = 0;
+   strstrip(buffer);
val = simple_strtoull(buffer, &end, 0);
if (*end)
return -EINVAL;

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 08/10] CGroup API files: Drop mem_cgroup_force_empty()

2008-02-23 Thread menage

This function isn't needed - a NULL pointer in the cftype read
function will result in the same EINVAL response to userspace.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   14 --
 1 file changed, 14 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -950,19 +950,6 @@ static ssize_t mem_force_empty_write(str
return ret;
 }
 
-/*
- * Note: This should be removed if cgroup supports write-only file.
- */
-
-static ssize_t mem_force_empty_read(struct cgroup *cont,
-   struct cftype *cft,
-   struct file *file, char __user *userbuf,
-   size_t nbytes, loff_t *ppos)
-{
-   return -EINVAL;
-}
-
-
 static const struct mem_cgroup_stat_desc {
const char *msg;
u64 unit;
@@ -1019,7 +1006,6 @@ static struct cftype mem_cgroup_files[] 
{
.name = "force_empty",
.write = mem_force_empty_write,
-   .read = mem_force_empty_read,
},
{
.name = "stat",

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 07/10] CGroup API files: Use cgroup map for memcontrol stats file

2008-02-23 Thread menage

Remove the seq_file boilerplate used to construct the memcontrol stats
map, and instead use the new map representation for cgroup control
files.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   30 ++
 1 file changed, 6 insertions(+), 24 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -971,9 +971,9 @@ static const struct mem_cgroup_stat_desc
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
 };
 
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb)
 {
-   struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -983,8 +983,7 @@ static int mem_control_stat_show(struct 
 
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
-   seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
-   (long long)val);
+   cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -994,29 +993,12 @@ static int mem_control_stat_show(struct 
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
-   seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
-   seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+   cb->fill(cb, "active", (active) * PAGE_SIZE);
+   cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
 }
 
-static const struct file_operations mem_control_stat_file_operations = {
-   .read = seq_read,
-   .llseek = seq_lseek,
-   .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
-   /* XXX __d_cont */
-   struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
-   file->f_op = &mem_control_stat_file_operations;
-   return single_open(file, mem_control_stat_show, cont);
-}
-
-
-
 static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -1041,7 +1023,7 @@ static struct cftype mem_cgroup_files[] 
},
{
.name = "stat",
-   .open = mem_control_stat_open,
+   .read_map = mem_control_stat_show,
},
 };
 

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 05/10] CGroup API files: Update cpusets to use cgroup structured file API

2008-02-23 Thread menage

Many of the cpusets control files are simple integer values, which
don't require the overhead of memory allocations for reads and writes.

Move the handlers for these control files into cpuset_read_u64() and
cpuset_write_u64().

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  155 +---
 1 file changed, 81 insertions(+), 74 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/kernel/cpuset.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cpuset.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cpuset.c
@@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void
 }
 
 /*
- * Call with cgroup_mutex held.
- */
-
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-   if (simple_strtoul(buf, NULL, 10) != 0)
-   cpuset_memory_pressure_enabled = 1;
-   else
-   cpuset_memory_pressure_enabled = 0;
-   return 0;
-}
-
-/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  * CS_SCHED_LOAD_BALANCE,
@@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable
  * Call with cgroup_mutex held.
  */
 
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+  int turning_on)
 {
-   int turning_on;
struct cpuset trialcs;
int err;
int cpus_nonempty, balance_flag_changed;
 
-   turning_on = (simple_strtoul(buf, NULL, 10) != 0);
-
trialcs = *cs;
if (turning_on)
set_bit(bit, &trialcs.flags);
@@ -1247,43 +1232,65 @@ static ssize_t cpuset_common_file_write(
case FILE_MEMLIST:
retval = update_nodemask(cs, buffer);
break;
+   default:
+   retval = -EINVAL;
+   goto out2;
+   }
+
+   if (retval == 0)
+   retval = nbytes;
+out2:
+   cgroup_unlock();
+out1:
+   kfree(buffer);
+   return retval;
+}
+
+static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+   int retval = 0;
+   struct cpuset *cs = cgroup_cs(cgrp);
+   cpuset_filetype_t type = cft->private;
+
+   cgroup_lock();
+
+   if (cgroup_is_removed(cgrp)) {
+   cgroup_unlock();
+   return -ENODEV;
+   }
+
+   switch (type) {
case FILE_CPU_EXCLUSIVE:
-   retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
-   retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
-   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
-   retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+   retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
-   retval = update_memory_pressure_enabled(cs, buffer);
+   cpuset_memory_pressure_enabled = !!val;
break;
case FILE_MEMORY_PRESSURE:
retval = -EACCES;
break;
case FILE_SPREAD_PAGE:
-   retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+   retval = update_flag(CS_SPREAD_PAGE, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
-   retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+   retval = update_flag(CS_SPREAD_SLAB, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
-   goto out2;
+   break;
}
-
-   if (retval == 0)
-   retval = nbytes;
-out2:
cgroup_unlock();
-out1:
-   kfree(buffer);
return retval;
 }
 
@@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
break;
-   case FILE_CPU_EXCLUSIVE:
-   *s++ = is_cpu_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_MEM_EXCLUSIVE:
-   *s++ = is_mem_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_SCHED_LOAD_BALANCE:
-   *s++ = is_sched_load_balance(cs) ? '1' : '0';
-   break;
-   case FILE_MEMORY_MIGRATE:
-

[PATCH 09/10] CGroup API files: Move "releasable" to cgroup_debug subsystem

2008-02-23 Thread menage

The "releasable" control file provided by the cgroup framework exports
the state of a per-cgroup flag that's related to the notify-on-release
feature. This isn't really generally useful, unless you're trying to
debug this particular feature of cgroups.

This patch moves the "releasable" file to the cgroup_debug subsystem.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   11 +++
 kernel/cgroup.c|   23 ---
 kernel/cgroup_debug.c  |   12 +++-
 3 files changed, 22 insertions(+), 24 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
===
--- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -88,6 +88,17 @@ static inline void css_put(struct cgroup
__css_put(css);
 }
 
+/* bits in struct cgroup flags field */
+enum {
+   /* Control Group is dead */
+   CGRP_REMOVED,
+   /* Control Group has previously had a child cgroup or a task,
+* but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
+   CGRP_RELEASABLE,
+   /* Control Group requires release notifications to userspace */
+   CGRP_NOTIFY_ON_RELEASE,
+};
+
 struct cgroup {
unsigned long flags;/* "unsigned long" so bitops work */
 
Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -119,17 +119,6 @@ static int root_count;
  */
 static int need_forkexit_callback;
 
-/* bits in struct cgroup flags field */
-enum {
-   /* Control Group is dead */
-   CGRP_REMOVED,
-   /* Control Group has previously had a child cgroup or a task,
-* but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */
-   CGRP_RELEASABLE,
-   /* Control Group requires release notifications to userspace */
-   CGRP_NOTIFY_ON_RELEASE,
-};
-
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -1299,7 +1288,6 @@ enum cgroup_filetype {
FILE_DIR,
FILE_TASKLIST,
FILE_NOTIFY_ON_RELEASE,
-   FILE_RELEASABLE,
FILE_RELEASE_AGENT,
 };
 
@@ -2169,11 +2157,6 @@ static u64 cgroup_read_notify_on_release
return notify_on_release(cgrp);
 }
 
-static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft)
-{
-   return test_bit(CGRP_RELEASABLE, &cgrp->flags);
-}
-
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -2193,12 +2176,6 @@ static struct cftype files[] = {
.write = cgroup_common_file_write,
.private = FILE_NOTIFY_ON_RELEASE,
},
-
-   {
-   .name = "releasable",
-   .read_u64 = cgroup_read_releasable,
-   .private = FILE_RELEASABLE,
-   }
 };
 
 static struct cftype cft_release_agent = {
Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup_debug.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup_debug.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cgroup_debug.c
@@ -1,5 +1,5 @@
 /*
- * kernel/ccontainer_debug.c - Example cgroup subsystem that
+ * kernel/cgroup_debug.c - Example cgroup subsystem that
  * exposes debug info
  *
  * Copyright (C) Google Inc, 2007
@@ -62,6 +62,11 @@ static u64 current_css_set_refcount_read
return count;
 }
 
+static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft)
+{
+   return test_bit(CGRP_RELEASABLE, &cgrp->flags);
+}
+
 static struct cftype files[] =  {
{
.name = "cgroup_refcount",
@@ -81,6 +86,11 @@ static struct cftype files[] =  {
.name = "current_css_set_refcount",
.read_u64 = current_css_set_refcount_read,
},
+
+   {
+   .name = "releasable",
+   .read_u64 = releasable_read,
+   }
 };
 
 static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 10/10] CGroup API files: Make CGROUP_DEBUG default to off

2008-02-23 Thread menage

The cgroup debug subsystem isn't generally useful for users. It should default 
to "n".

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 init/Kconfig |1 +
 1 file changed, 1 insertion(+)

Index: cgroup-2.6.25-rc2-mm1/init/Kconfig
===
--- cgroup-2.6.25-rc2-mm1.orig/init/Kconfig
+++ cgroup-2.6.25-rc2-mm1/init/Kconfig
@@ -284,6 +284,7 @@ config CGROUPS
 config CGROUP_DEBUG
bool "Example debug cgroup subsystem"
depends on CGROUPS
+   default n
help
  This option enables a simple cgroup subsystem that
  exports useful debugging information about the cgroups

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 01/10] CGroup API files: Rename read/write_uint methods to read_write_u64

2008-02-23 Thread menage

Several people have justifiably complained that the "_uint" suffix is
inappropriate for functions that handle u64 values, so this patch just
renames all these functions and their users to have the suffix _u64.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |8 
 kernel/cgroup.c|   32 
 kernel/cgroup_debug.c  |8 
 kernel/sched.c |   18 +-
 4 files changed, 33 insertions(+), 33 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
===
--- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h
+++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h
@@ -190,20 +190,20 @@ struct cftype {
 struct file *file,
 char __user *buf, size_t nbytes, loff_t *ppos);
/*
-* read_uint() is a shortcut for the common case of returning a
+* read_u64() is a shortcut for the common case of returning a
 * single integer. Use it in place of read()
 */
-   u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
+   u64 (*read_u64) (struct cgroup *cont, struct cftype *cft);
ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
  struct file *file,
  const char __user *buf, size_t nbytes, loff_t *ppos);
 
/*
-* write_uint() is a shortcut for the common case of accepting
+* write_u64() is a shortcut for the common case of accepting
 * a single integer (as parsed by simple_strtoull) from
 * userspace. Use in place of write(); return 0 or error.
 */
-   int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val);
+   int (*write_u64) (struct cgroup *cont, struct cftype *cft, u64 val);
 
int (*release) (struct inode *inode, struct file *file);
 };
Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1303,10 +1303,10 @@ enum cgroup_filetype {
FILE_RELEASE_AGENT,
 };
 
-static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
-struct file *file,
-const char __user *userbuf,
-size_t nbytes, loff_t *unused_ppos)
+static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft,
+   struct file *file,
+   const char __user *userbuf,
+   size_t nbytes, loff_t *unused_ppos)
 {
char buffer[64];
int retval = 0;
@@ -1330,7 +1330,7 @@ static ssize_t cgroup_write_uint(struct 
return -EINVAL;
 
/* Pass to subsystem */
-   retval = cft->write_uint(cgrp, cft, val);
+   retval = cft->write_u64(cgrp, cft, val);
if (!retval)
retval = nbytes;
return retval;
@@ -1411,18 +1411,18 @@ static ssize_t cgroup_file_write(struct 
return -ENODEV;
if (cft->write)
return cft->write(cgrp, cft, file, buf, nbytes, ppos);
-   if (cft->write_uint)
-   return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos);
+   if (cft->write_u64)
+   return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos);
return -EINVAL;
 }
 
-static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft,
-  struct file *file,
-  char __user *buf, size_t nbytes,
-  loff_t *ppos)
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
+  struct file *file,
+  char __user *buf, size_t nbytes,
+  loff_t *ppos)
 {
char tmp[64];
-   u64 val = cft->read_uint(cgrp, cft);
+   u64 val = cft->read_u64(cgrp, cft);
int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
 
return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
@@ -1482,8 +1482,8 @@ static ssize_t cgroup_file_read(struct f
 
if (cft->read)
return cft->read(cgrp, cft, file, buf, nbytes, ppos);
-   if (cft->read_uint)
-   return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos);
+   if (cft->read_u64)
+   return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
return -EINVAL;
 }
 
@@ -2141,14 +2141,14 @@ static struct cftype files[] = {
 
{
.name = "notify_on_release",
-   .read_uint = cgroup_read_notify_on_release,
+   .read_u64 = cgroup_read_notify_on_release,
.write = cgr

[PATCH 03/10] CGroup API files: Use read_u64 in memory controller

2008-02-23 Thread menage

Update the memory controller to use read_u64 for its
limit/usage/failcnt control files, calling the new
res_counter_read_u64() function.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
===
--- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c
+++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c
@@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf,
return 0;
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont,
-   struct cftype *cft, struct file *file,
-   char __user *userbuf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-   return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-   cft->private, userbuf, nbytes, ppos,
-   NULL);
+   return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
+   cft->private);
 }
 
 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -1024,18 +1021,18 @@ static struct cftype mem_cgroup_files[] 
{
.name = "usage_in_bytes",
.private = RES_USAGE,
-   .read = mem_cgroup_read,
+   .read_u64 = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
-   .read = mem_cgroup_read,
+   .read_u64 = mem_cgroup_read,
},
{
.name = "failcnt",
.private = RES_FAILCNT,
-   .read = mem_cgroup_read,
+   .read_u64 = mem_cgroup_read,
},
{
.name = "force_empty",

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 02/10] Task Containers(V11): Add tasks file interface

2007-07-20 Thread menage

This patch adds the per-directory "tasks" file for containerfs mounts; this
allows the user to determine which tasks are members of a container by reading
a container's "tasks", and to move a task into a container by writing its pid
to its "tasks".

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 include/linux/container.h |   10 +
 kernel/container.c|  359 +-
 2 files changed, 368 insertions(+), 1 deletion(-)

Index: container-2.6.22-rc6-mm1/include/linux/container.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc6-mm1/include/linux/container.h
@@ -144,6 +144,16 @@ int container_is_removed(const struct co
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
+int __container_task_count(const struct container *cont);
+static inline int container_task_count(const struct container *cont)
+{
+   int task_count;
+   rcu_read_lock();
+   task_count = __container_task_count(cont);
+   rcu_read_unlock();
+   return task_count;
+}
+
 /* Return true if the container is a descendant of the current container */
 int container_is_descendant(const struct container *cont);
 
Index: container-2.6.22-rc6-mm1/kernel/container.c
===
--- container-2.6.22-rc6-mm1.orig/kernel/container.c
+++ container-2.6.22-rc6-mm1/kernel/container.c
@@ -40,7 +40,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include 
 
 /* Generate an array of container subsystem pointers */
@@ -704,6 +704,127 @@ int container_path(const struct containe
return 0;
 }
 
+/*
+ * Return the first subsystem attached to a container's hierarchy, and
+ * its subsystem id.
+ */
+
+static void get_first_subsys(const struct container *cont,
+   struct container_subsys_state **css, int *subsys_id)
+{
+   const struct containerfs_root *root = cont->root;
+   const struct container_subsys *test_ss;
+   BUG_ON(list_empty(&root->subsys_list));
+   test_ss = list_entry(root->subsys_list.next,
+struct container_subsys, sibling);
+   if (css) {
+   *css = cont->subsys[test_ss->subsys_id];
+   BUG_ON(!*css);
+   }
+   if (subsys_id)
+   *subsys_id = test_ss->subsys_id;
+}
+
+/*
+ * Attach task 'tsk' to container 'cont'
+ *
+ * Call holding container_mutex.  May take task_lock of
+ * the task 'pid' during call.
+ */
+static int attach_task(struct container *cont, struct task_struct *tsk)
+{
+   int retval = 0;
+   struct container_subsys *ss;
+   struct container *oldcont;
+   struct css_group *cg = &tsk->containers;
+   struct containerfs_root *root = cont->root;
+   int i;
+   int subsys_id;
+
+   get_first_subsys(cont, NULL, &subsys_id);
+
+   /* Nothing to do if the task is already in that container */
+   oldcont = task_container(tsk, subsys_id);
+   if (cont == oldcont)
+   return 0;
+
+   for_each_subsys(root, ss) {
+   if (ss->can_attach) {
+   retval = ss->can_attach(ss, cont, tsk);
+   if (retval) {
+   return retval;
+   }
+   }
+   }
+
+   task_lock(tsk);
+   if (tsk->flags & PF_EXITING) {
+   task_unlock(tsk);
+   return -ESRCH;
+   }
+   /* Update the css_group pointers for the subsystems in this
+* hierarchy */
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+   if (root->subsys_bits & (1ull << i)) {
+   /* Subsystem is in this hierarchy. So we want
+* the subsystem state from the new
+* container. Transfer the refcount from the
+* old to the new */
+   atomic_inc(&cont->count);
+   atomic_dec(&cg->subsys[i]->container->count);
+   rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+   }
+   }
+   task_unlock(tsk);
+
+   for_each_subsys(root, ss) {
+   if (ss->attach) {
+   ss->attach(ss, cont, oldcont, tsk);
+   }
+   }
+
+   synchronize_rcu();
+   return 0;
+}
+
+/*
+ * Attach task with pid 'pid' to container 'cont'. Call with
+ * container_mutex, may take task_lock of task
+ */
+static int attach_task_by_pid(struct container *cont, char *pidbuf)
+{
+   pid_t pid;
+   struct task_struct *tsk;
+   int ret;
+
+   if (sscanf(pidbuf, "%d", &pid) != 1)
+

[PATCH 05/10] Task Containers(V11): Add procfs interface

2007-07-20 Thread menage

This patch adds:

/proc/containers - general system info

/proc/*/container - per-task container membership info

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 fs/proc/base.c|7 ++
 include/linux/container.h |2 
 kernel/container.c|  132 ++
 3 files changed, 141 insertions(+)

Index: container-2.6.22-rc6-mm1/fs/proc/base.c
===
--- container-2.6.22-rc6-mm1.orig/fs/proc/base.c
+++ container-2.6.22-rc6-mm1/fs/proc/base.c
@@ -67,6 +67,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2050,6 +2051,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score",  S_IRUGO, oom_score),
REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2341,6 +2345,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
Index: container-2.6.22-rc6-mm1/kernel/container.c
===
--- container-2.6.22-rc6-mm1.orig/kernel/container.c
+++ container-2.6.22-rc6-mm1/kernel/container.c
@@ -33,6 +33,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -247,6 +248,7 @@ static int container_mkdir(struct inode 
 static int container_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int container_populate_dir(struct container *cont);
 static struct inode_operations container_dir_inode_operations;
+static struct file_operations proc_containerstats_operations;
 
 static struct inode *container_new_inode(mode_t mode, struct super_block *sb)
 {
@@ -1567,6 +1569,7 @@ int __init container_init(void)
 {
int err;
int i;
+   struct proc_dir_entry *entry;
 
for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
struct container_subsys *ss = subsys[i];
@@ -1578,10 +1581,139 @@ int __init container_init(void)
if (err < 0)
goto out;
 
+   entry = create_proc_entry("containers", 0, NULL);
+   if (entry)
+   entry->proc_fops = &proc_containerstats_operations;
+
 out:
return err;
 }
 
+/*
+ * proc_container_show()
+ *  - Print task's container paths into seq_file, one line for each hierarchy
+ *  - Used for /proc/<pid>/container.
+ *  - No need to task_lock(tsk) on this tsk->container reference, as it
+ *doesn't really matter if tsk->container changes after we read it,
+ *and we take container_mutex, keeping attach_task() from changing it
+ *anyway.  No need to check that tsk->container != NULL, thanks to
+ *the_top_container_hack in container_exit(), which sets an exiting tasks
+ *container to top_container.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_container_show(struct seq_file *m, void *v)
+{
+   struct pid *pid;
+   struct task_struct *tsk;
+   char *buf;
+   int retval;
+   struct containerfs_root *root;
+
+   retval = -ENOMEM;
+   buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   retval = -ESRCH;
+   pid = m->private;
+   tsk = get_pid_task(pid, PIDTYPE_PID);
+   if (!tsk)
+   goto out_free;
+
+   retval = 0;
+
+   mutex_lock(&container_mutex);
+
+   for_each_root(root) {
+   struct container_subsys *ss;
+   struct container *cont;
+   int subsys_id;
+   int count = 0;
+
+   /* Skip this hierarchy if it has no active subsystems */
+   if (!root->actual_subsys_bits)
+   continue;
+   for_each_subsys(root, ss)
+   seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+   seq_putc(m, ':');
+   get_first_subsys(&root->top_container, NULL, &subsys_id);
+   cont = task_container(tsk, subsys_id);
+   retval = container_path(cont, buf, PAGE_SIZE);
+   if (retval < 0)
+   goto out_unlock;
+   seq_puts(m, buf);
+   seq_putc(m, '\n');
+   }
+
+out_unlock:
+   mutex_unlock(&container_mutex);
+   put_task_struct(tsk);
+out_free:
+   kfree(buf);
+out:
+   return retval;
+}
+
+static int container_open(struct inode *inode, struct file *file)

[PATCH 06/10] Task Containers(V11): Shared container subsystem group arrays

2007-07-20 Thread menage

This patch replaces the struct css_group embedded in task_struct with a
pointer; all tasks that have the same set of memberships across all
hierarchies will share a css_group object, and will be linked via their
css_groups field to the "tasks" list_head in the css_group.

Assuming that many tasks share the same container assignments, this reduces
overall space usage and keeps the size of the task_struct down (three pointers
added to task_struct compared to a non-containers kernel, no matter how many
subsystems are registered).

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 Documentation/containers.txt |   14 
 include/linux/container.h|   89 +-
 include/linux/sched.h|   33 --
 kernel/container.c   |  606 +--
 kernel/fork.c|1 
 5 files changed, 620 insertions(+), 123 deletions(-)

Index: container-2.6.22-rc6-mm1/Documentation/containers.txt
===
--- container-2.6.22-rc6-mm1.orig/Documentation/containers.txt
+++ container-2.6.22-rc6-mm1/Documentation/containers.txt
@@ -176,7 +176,9 @@ Containers extends the kernel as follows
subsystem state is something that's expected to happen frequently
and in performance-critical code, whereas operations that require a
task's actual container assignments (in particular, moving between
-   containers) are less common.
+   containers) are less common. A linked list runs through the cg_list
+   field of each task_struct using the css_group, anchored at
+   css_group->tasks.
 
  - A container hierarchy filesystem can be mounted  for browsing and
manipulation from user space.
@@ -252,6 +254,16 @@ linear search to locate an appropriate e
 very efficient. A future version will use a hash table for better
 performance.
 
+To allow access from a container to the css_groups (and hence tasks)
+that comprise it, a set of cg_container_link objects form a lattice;
+each cg_container_link is linked into a list of cg_container_links for
+a single container on its cont_link_list field, and a list of
+cg_container_links for a single css_group on its cg_link_list.
+
+Thus the set of tasks in a container can be listed by iterating over
+each css_group that references the container, and sub-iterating over
+each css_group's task set.
+
 The use of a Linux virtual file system (vfs) to represent the
 container hierarchy provides for a familiar permission and name space
 for containers, with a minimum of additional kernel code.
Index: container-2.6.22-rc6-mm1/include/linux/container.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc6-mm1/include/linux/container.h
@@ -27,10 +27,19 @@ extern void container_lock(void);
 extern void container_unlock(void);
 extern void container_fork(struct task_struct *p);
 extern void container_fork_callbacks(struct task_struct *p);
+extern void container_post_fork(struct task_struct *p);
 extern void container_exit(struct task_struct *p, int run_callbacks);
 
 extern struct file_operations proc_container_operations;
 
+/* Define the enumeration of all container subsystems */
+#define SUBSYS(_x) _x ## _subsys_id,
+enum container_subsys_id {
+#include 
+   CONTAINER_SUBSYS_COUNT
+};
+#undef SUBSYS
+
 /* Per-subsystem/per-container state maintained by the system. */
 struct container_subsys_state {
/* The container that this subsystem is attached to. Useful
@@ -97,6 +106,52 @@ struct container {
 
struct containerfs_root *root;
struct container *top_container;
+
+   /*
+* List of cg_container_links pointing at css_groups with
+* tasks in this container. Protected by css_group_lock
+*/
+   struct list_head css_groups;
+};
+
+/* A css_group is a structure holding pointers to a set of
+ * container_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire
+ * container set for a task.
+ */
+
+struct css_group {
+
+   /* Reference count */
+   struct kref ref;
+
+   /*
+* List running through all container groups. Protected by
+* css_group_lock
+*/
+   struct list_head list;
+
+   /*
+* List running through all tasks using this container
+* group. Protected by css_group_lock
+*/
+   struct list_head tasks;
+
+   /*
+* List of cg_container_link objects on link chains from
+* containers referenced from this css_group. Protected by
+* css_group_lock
+*/
+   struct list_head cg_links;
+
+   /*
+* Set of subsystem states, one for each subsystem. This array
+* is immutable after creation apart from the init_css_group
+* during su

[PATCH 10/10] Task Containers(V11): Simple task container debug info subsystem

2007-07-20 Thread menage

This example subsystem exports debugging information as an aid to diagnosing
refcount leaks, etc, in the container framework.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 include/linux/container_subsys.h |4 +
 init/Kconfig |   10 
 kernel/Makefile  |1 
 kernel/container_debug.c |   97 +++
 4 files changed, 112 insertions(+)

Index: container-2.6.22-rc6-mm1/include/linux/container_subsys.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.22-rc6-mm1/include/linux/container_subsys.h
@@ -19,4 +19,8 @@ SUBSYS(cpuacct)
 
 /* */
 
+#ifdef CONFIG_CONTAINER_DEBUG
+SUBSYS(debug)
+#endif
+
 /* */
Index: container-2.6.22-rc6-mm1/init/Kconfig
===
--- container-2.6.22-rc6-mm1.orig/init/Kconfig
+++ container-2.6.22-rc6-mm1/init/Kconfig
@@ -303,6 +303,16 @@ config CONTAINERS
 
  Say N if unsure.
 
+config CONTAINER_DEBUG
+   bool "Example debug container subsystem"
+   depends on CONTAINERS
+   help
+ This option enables a simple container subsystem that
+ exports useful debugging information about the containers
+ framework
+
+ Say N if unsure
+
 config CPUSETS
bool "Cpuset support"
depends on SMP && CONTAINERS
Index: container-2.6.22-rc6-mm1/kernel/Makefile
===
--- container-2.6.22-rc6-mm1.orig/kernel/Makefile
+++ container-2.6.22-rc6-mm1/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
+obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o
Index: container-2.6.22-rc6-mm1/kernel/container_debug.c
===
--- /dev/null
+++ container-2.6.22-rc6-mm1/kernel/container_debug.c
@@ -0,0 +1,97 @@
+/*
+ * kernel/ccontainer_debug.c - Example container subsystem that
+ * exposes debug info
+ *
+ * Copyright (C) Google Inc, 2007
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED])
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+static struct container_subsys_state *debug_create(struct container_subsys *ss,
+  struct container *cont)
+{
+   struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+   if (!css)
+   return ERR_PTR(-ENOMEM);
+
+   return css;
+}
+
+static void debug_destroy(struct container_subsys *ss, struct container *cont)
+{
+   kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 container_refcount_read(struct container *cont, struct cftype *cft)
+{
+   return atomic_read(&cont->count);
+}
+
+static u64 taskcount_read(struct container *cont, struct cftype *cft)
+{
+   u64 count;
+
+   container_lock();
+   count = container_task_count(cont);
+   container_unlock();
+   return count;
+}
+
+static u64 current_css_group_read(struct container *cont, struct cftype *cft)
+{
+   return (u64)(long)current->containers;
+}
+
+static u64 current_css_group_refcount_read(struct container *cont,
+  struct cftype *cft)
+{
+   u64 count;
+
+   rcu_read_lock();
+   count = atomic_read(&current->containers->ref.refcount);
+   rcu_read_unlock();
+   return count;
+}
+
+static struct cftype files[] =  {
+   {
+   .name = "container_refcount",
+   .read_uint = container_refcount_read,
+   },
+   {
+   .name = "taskcount",
+   .read_uint = taskcount_read,
+   },
+
+   {
+   .name = "current_css_group",
+   .read_uint = current_css_group_read,
+   },
+
+   {
+   .name = "current_css_group_refcount",
+   .read_uint = current_css_group_refcount_read,
+   },
+};
+
+static int debug_populate(struct container_subsys *ss, struct container *cont)
+{
+   return container_add_files(cont, ss, files, ARRAY_SIZE(files));
+}
+
+struct container_subsys debug_subsys = {
+   .name = "debug",
+   .create = debug_create,
+   .destroy = debug_destroy,
+   .populate = debug_populate,
+   .subsys_id = debug_subsys_id,
+};

--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 04/10] Task Containers(V11): Add container_clone() interface.

2007-07-20 Thread menage

This patch adds support for container_clone(), a way to create new
containers intended to be used for systems such as namespace
unsharing. A new subsystem callback, post_clone(), is added to allow
subsystems to automatically configure cloned containers.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 Documentation/containers.txt |7 ++
 include/linux/container.h|3 
 kernel/container.c   |  135 +++
 3 files changed, 145 insertions(+)

Index: container-2.6.22-rc6-mm1/include/linux/container.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc6-mm1/include/linux/container.h
@@ -174,6 +174,7 @@ struct container_subsys {
void (*exit)(struct container_subsys *ss, struct task_struct *task);
int (*populate)(struct container_subsys *ss,
struct container *cont);
+   void (*post_clone)(struct container_subsys *ss, struct container *cont);
void (*bind)(struct container_subsys *ss, struct container *root);
int subsys_id;
int active;
@@ -213,6 +214,8 @@ static inline struct container* task_con
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
+int container_clone(struct task_struct *tsk, struct container_subsys *ss);
+
 #else /* !CONFIG_CONTAINERS */
 
 static inline int container_init_early(void) { return 0; }
Index: container-2.6.22-rc6-mm1/kernel/container.c
===
--- container-2.6.22-rc6-mm1.orig/kernel/container.c
+++ container-2.6.22-rc6-mm1/kernel/container.c
@@ -1675,3 +1675,138 @@ void container_exit(struct task_struct *
tsk->containers = init_task.containers;
task_unlock(tsk);
 }
+
+/**
+ * container_clone - duplicate the current container in the hierarchy
+ * that the given subsystem is attached to, and move this task into
+ * the new child
+ */
+int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
+{
+   struct dentry *dentry;
+   int ret = 0;
+   char nodename[MAX_CONTAINER_TYPE_NAMELEN];
+   struct container *parent, *child;
+   struct inode *inode;
+   struct css_group *cg;
+   struct containerfs_root *root;
+   struct container_subsys *ss;
+
+   /* We shouldn't be called by an unregistered subsystem */
+   BUG_ON(!subsys->active);
+
+   /* First figure out what hierarchy and container we're dealing
+* with, and pin them so we can drop container_mutex */
+   mutex_lock(&container_mutex);
+ again:
+   root = subsys->root;
+   if (root == &rootnode) {
+   printk(KERN_INFO
+  "Not cloning container for unused subsystem %s\n",
+  subsys->name);
+   mutex_unlock(&container_mutex);
+   return 0;
+   }
+   cg = &tsk->containers;
+   parent = task_container(tsk, subsys->subsys_id);
+
+   snprintf(nodename, MAX_CONTAINER_TYPE_NAMELEN, "node_%d", tsk->pid);
+
+   /* Pin the hierarchy */
+   atomic_inc(&parent->root->sb->s_active);
+
+   mutex_unlock(&container_mutex);
+
+   /* Now do the VFS work to create a container */
+   inode = parent->dentry->d_inode;
+
+   /* Hold the parent directory mutex across this operation to
+* stop anyone else deleting the new container */
+   mutex_lock(&inode->i_mutex);
+   dentry = container_get_dentry(parent->dentry, nodename);
+   if (IS_ERR(dentry)) {
+   printk(KERN_INFO
+  "Couldn't allocate dentry for %s: %ld\n", nodename,
+  PTR_ERR(dentry));
+   ret = PTR_ERR(dentry);
+   goto out_release;
+   }
+
+   /* Create the container directory, which also creates the container */
+   ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+   child = __d_cont(dentry);
+   dput(dentry);
+   if (ret) {
+   printk(KERN_INFO
+  "Failed to create container %s: %d\n", nodename,
+  ret);
+   goto out_release;
+   }
+
+   if (!child) {
+   printk(KERN_INFO
+  "Couldn't find new container %s\n", nodename);
+   ret = -ENOMEM;
+   goto out_release;
+   }
+
+   /* The container now exists. Retake container_mutex and check
+* that we're still in the same state that we thought we
+* were. */
+   mutex_lock(&container_mutex);
+   if ((root != subsys->root) ||
+   (parent != task_container(tsk, subsys->subsys_id))) {
+   /* Aargh, we raced ... */
+   mutex_unlock(&inode->i_mutex);
+
+

[PATCH 03/10] Task Containers(V11): Add fork()/exit() hooks

2007-07-20 Thread menage

This patch adds the necessary hooks to the fork() and exit() paths to ensure
that new children inherit their parent's container assignments, and that
exiting processes release reference counts on their containers.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 include/linux/container.h |6 ++
 kernel/container.c|  121 ++
 kernel/exit.c |2 
 kernel/fork.c |   14 -
 4 files changed, 141 insertions(+), 2 deletions(-)

Index: container-2.6.22-rc6-mm1/include/linux/container.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc6-mm1/include/linux/container.h
@@ -25,6 +25,9 @@ extern int container_init(void);
 extern void container_init_smp(void);
 extern void container_lock(void);
 extern void container_unlock(void);
+extern void container_fork(struct task_struct *p);
+extern void container_fork_callbacks(struct task_struct *p);
+extern void container_exit(struct task_struct *p, int run_callbacks);
 
 /* Per-subsystem/per-container state maintained by the system. */
 struct container_subsys_state {
@@ -215,6 +218,9 @@ int container_path(const struct containe
 static inline int container_init_early(void) { return 0; }
 static inline int container_init(void) { return 0; }
 static inline void container_init_smp(void) {}
+static inline void container_fork(struct task_struct *p) {}
+static inline void container_fork_callbacks(struct task_struct *p) {}
+static inline void container_exit(struct task_struct *p, int callbacks) {}
 
 static inline void container_lock(void) {}
 static inline void container_unlock(void) {}
Index: container-2.6.22-rc6-mm1/kernel/container.c
===
--- container-2.6.22-rc6-mm1.orig/kernel/container.c
+++ container-2.6.22-rc6-mm1/kernel/container.c
@@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_group, so the get/put
+ * operation simply takes a reference count on all the containers
+ * referenced by subsystems in this css_group. This can end up
+ * multiple-counting some containers, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * container once would require taking a global lock to ensure that no
+ * subsystems moved between hierarchies while we were doing so.
+ *
+ * Possible TODO: decide at boot time based on the number of
+ * registered subsystems and the number of CPUs or NUMA nodes whether
+ * it's better for performance to ref-count every subsystem, or to
+ * take a global lock and only add one ref count to each hierarchy.
+ */
+static void get_css_group(struct css_group *cg)
+{
+   int i;
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++)
+   atomic_inc(&cg->subsys[i]->container->count);
+}
+
+static void put_css_group(struct css_group *cg)
+{
+   int i;
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++)
+   atomic_dec(&cg->subsys[i]->container->count);
+}
+
 /*
  * There is one global container mutex. We also require taking
  * task_lock() when dereferencing a task's container subsys pointers.
@@ -1554,3 +1581,97 @@ int __init container_init(void)
 out:
return err;
 }
+
+/**
+ * container_fork - attach newly forked task to its parent's container.
+ * @child: pointer to task_struct of the newly forked child task.
+ *
+ * Description: A task inherits its parent's container at fork().
+ *
+ * A pointer to the shared css_group was automatically copied in
+ * fork.c by dup_task_struct().  However, we ignore that copy, since
+ * it was not made under the protection of RCU or container_mutex, so
+ * might no longer be a valid container pointer.  attach_task() might
+ * have already changed current->containers, allowing the previously
+ * referenced container to be removed and freed.
+ *
+ * At the point that container_fork() is called, 'current' is the parent
+ * task, and the passed argument 'child' points to the child task.
+ */
+void container_fork(struct task_struct *child)
+{
+   rcu_read_lock();
+   child->containers = rcu_dereference(current->containers);
+   get_css_group(&child->containers);
+   rcu_read_unlock();
+}
+
+/**
+ * container_fork_callbacks - called on a new task very soon before
+ * adding it to the tasklist. No need to take any locks since no-one
+ * can be operating on this task
+ */
+void container_fork_callbacks(struct task_struct *child)
+{
+   if (need_forkexit_callback) {
+   int i;
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+   struct container_subsys *ss = subsys[i];
+   if (ss->

[PATCH 09/10] Task Containers(V11): Example CPU accounting subsystem

2007-07-20 Thread menage

This example demonstrates how to use the generic container subsystem for a
simple resource tracker that counts, for the processes in a container, the
total CPU time used and the %CPU used in the last complete 10 second interval.

Portions contributed by Balbir Singh <[EMAIL PROTECTED]>

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 include/linux/container_subsys.h |6 +
 include/linux/cpu_acct.h |   14 ++
 init/Kconfig |7 +
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  186 +++
 kernel/sched.c   |   14 ++
 6 files changed, 225 insertions(+), 3 deletions(-)

Index: container-2.6.22-rc6-mm1/include/linux/container_subsys.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.22-rc6-mm1/include/linux/container_subsys.h
@@ -13,4 +13,10 @@ SUBSYS(cpuset)
 
 /* */
 
+#ifdef CONFIG_CONTAINER_CPUACCT
+SUBSYS(cpuacct)
+#endif
+
+/* */
+
 /* */
Index: container-2.6.22-rc6-mm1/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.22-rc6-mm1/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else /* !CONFIG_CONTAINER_CPUACCT: charging is a no-op */
+static inline void cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif /* CONFIG_CONTAINER_CPUACCT */
+
+#endif
Index: container-2.6.22-rc6-mm1/init/Kconfig
===
--- container-2.6.22-rc6-mm1.orig/init/Kconfig
+++ container-2.6.22-rc6-mm1/init/Kconfig
@@ -339,6 +339,13 @@ config PROC_PID_CPUSET
depends on CPUSETS
default y
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   depends on CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.22-rc6-mm1/kernel/Makefile
===
--- container-2.6.22-rc6-mm1.orig/kernel/Makefile
+++ container-2.6.22-rc6-mm1/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.22-rc6-mm1/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.22-rc6-mm1/kernel/cpu_acct.c
@@ -0,0 +1,186 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh
+ * ([EMAIL PROTECTED])
+ *
+ */
+
+/*
+ * Example container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   /* total time used by this class */
+   cputime64_t time;
+
+   /* time when next load calculation occurs */
+   u64 next_interval_check;
+
+   /* time used in current period */
+   cputime64_t current_interval_time;
+
+   /* time used in last period */
+   cputime64_t last_interval_time;
+};
+
+struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_of(task_subsys_state(task, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now)
+{
+   /* calculate the next interval boundary beyond the
+* current time */
+   do_div(now, INTERVAL);
+   return (now + 1) * INTERVAL;
+}
+
+static struct container_subsys_state *cpuacct_create(
+   struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+
+   if (!ca)
+   return ERR_PTR(-ENOMEM);
+   spin_lock_init(&ca->lock);
+   ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+   return &ca->css;
+}
+
+static void cpuacct_destroy(struct container_subsys *ss,
+   s

[PATCH 00/10] Task Containers(V11): Introduction

2007-07-20 Thread menage

ainers-whitespace.patch
containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships.patch
containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships-fix.patch
containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships-cpuset-zero-malloc-fix-for-new-containers.patch
containersv10-simple-debug-info-subsystem.patch
containersv10-simple-debug-info-subsystem-fix.patch
containersv10-simple-debug-info-subsystem-fix-2.patch
containersv10-support-for-automatic-userspace-release-agents.patch
containersv10-support-for-automatic-userspace-release-agents-whitespace.patch
add-containerstats-v3.patch
add-containerstats-v3-fix.patch
update-getdelays-to-become-containerstats-aware.patch
containers-implement-subsys-post_clone.patch
containers-implement-namespace-tracking-subsystem-v3.patch


Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 07/10] Task Containers(V11): Automatic userspace notification of idle containers

2007-07-20 Thread menage

This patch adds the following files to the container filesystem:

notify_on_release - configures/reports whether the container subsystem should
attempt to run a release script when this container becomes unused

release_agent - configures/reports the release agent to be used for
this hierarchy (top level in each hierarchy only)

releasable - reports whether this container would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)

To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; containers that need to have their release agents invoked by
the workqueue task are linked on to a list.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---

 include/linux/container.h |   11 -
 kernel/container.c|  425 +-
 2 files changed, 393 insertions(+), 43 deletions(-)

Index: container-2.6.22-rc6-mm1/include/linux/container.h
===
--- container-2.6.22-rc6-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc6-mm1/include/linux/container.h
@@ -77,10 +77,11 @@ static inline void css_get(struct contai
  * css_get()
  */
 
+extern void __css_put(struct container_subsys_state *css);
 static inline void css_put(struct container_subsys_state *css)
 {
if (!test_bit(CSS_ROOT, &css->flags))
-   atomic_dec(&css->refcnt);
+   __css_put(css);
 }
 
 struct container {
@@ -112,6 +113,13 @@ struct container {
 * tasks in this container. Protected by css_group_lock
 */
struct list_head css_groups;
+
+   /*
+* Linked list running through all containers that can
+* potentially be reaped by the release agent. Protected by
+* release_list_lock
+*/
+   struct list_head release_list;
 };
 
 /* A css_group is a structure holding pointers to a set of
@@ -285,7 +293,6 @@ struct task_struct *container_iter_next(
struct container_iter *it);
 void container_iter_end(struct container *cont, struct container_iter *it);
 
-
 #else /* !CONFIG_CONTAINERS */
 
 static inline int container_init_early(void) { return 0; }
Index: container-2.6.22-rc6-mm1/kernel/container.c
===
--- container-2.6.22-rc6-mm1.orig/kernel/container.c
+++ container-2.6.22-rc6-mm1/kernel/container.c
@@ -44,6 +44,8 @@
 #include 
 #include 
 
+static DEFINE_MUTEX(container_mutex);
+
 /* Generate an array of container subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -82,6 +84,13 @@ struct containerfs_root {
 
/* Hierarchy-specific flags */
unsigned long flags;
+
+   /* The path to use for release notifications. No locking
+* between setting and use - so if userspace updates this
+* while subcontainers exist, you could miss a
+* notification. We ensure that it's always a valid
+* NUL-terminated string */
+   char release_agent_path[PATH_MAX];
 };
 
 
@@ -109,7 +118,13 @@ static int need_forkexit_callback;
 
 /* bits in struct container flags field */
 enum {
+   /* Container is dead */
CONT_REMOVED,
+   /* Container has previously had a child container or a task,
+* but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+   CONT_RELEASABLE,
+   /* Container requires release notifications to userspace */
+   CONT_NOTIFY_ON_RELEASE,
 };
 
 /* convenient tests for these bits */
@@ -123,6 +138,19 @@ enum {
ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
+inline int container_is_releasable(const struct container *cont)
+{
+   const int bits =
+   (1 << CONT_RELEASABLE) |
+   (1 << CONT_NOTIFY_ON_RELEASE);
+   return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct container *cont)
+{
+   return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -134,6 +162,14 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of containers eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void container_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, container_release_agent);
+static void check_for_release(struct container *cont);
+
 /* Link structure for associating css_group objects with containers */
 struct cg_container_link {
/*
@@ -188,11 +224,8 @@ static int use_task_css_group_links;
 /*
  * unlink a css_group from the list and free it
  */
-static void release_css_group(

[PATCH 0/7] containers (V7): Generic Process Containers

2007-02-12 Thread menage

--

This is an update to my multi-hierarchy multi-subsystem generic
process containers patch. Changes since V6 (22nd December) include:

- updated to 2.6.20

- added more details about multiple hierarchy support in the
  documentation

- reduced the per-task memory overhead to one pointer (previously it
  was one pointer for each hierarchy). Now each task has
  a pointer to a container_group, which holds the pointers to the
  containers (one per active hierarchy) that the task is attached to
  and the associated per-subsystem state (one per active subsystem).
  This container group is shared (with reference counts) between all
  tasks that have the same set of container mappings.

- added API support for binding/unbinding subsystems to/from active
  hierarchies, by remounting with -oremount,<subsystem-list>. Currently
  this fails with EBUSY if the hierarchy has any child containers; full
  implementation support is left to a later patch.

- added a bind() subsystem callback to indicate when a subsystem is
  moved between hierarchies

- added container_clone(subsys, task), which creates a child container
  for the hierarchy that the specified subsystem is bound to, and
  moves the given task into that container. An example use of this
  would be in sys_unshare, which could, if the namespace container
  subsystem is active, create a child container when the new namespace
  is created.

- temporarily removed the "release agent" support. It's only currently
  used by CPUsets, and intrudes somewhat on the per-container
  reference counting. If necessary it can be re-added, either as a
  generic subsystem feature or a CPUset-specific feature, via a kernel
  thread that periodically polls containers that have been designated
  as notify_on_release to see if they are releasable

Generic Process Containers
--

There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
containers, and others.  These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.

Already existing in the kernel is the cpuset subsystem; this has a
process grouping mechanism that is mature, tested, and well documented
(particularly with regards to synchronization rules).

This patchset extracts the process grouping code from cpusets into a
generic container system, and makes the cpusets code a client of
the container system.

It also provides several example clients of the container system,
including ResGroups, BeanCounters and namespace proxy.

The change is implemented in three stages, plus four example
subsystems that aren't necessarily intended to be merged as part of
this patch set, but demonstrate the applicability of the framework.

1) extract the process grouping code from cpusets into a standalone system

2) remove the process grouping code from cpusets and hook into the
   container system

3) convert the container system to present a generic multi-hierarchy
   API, and make cpusets a client of that API

4) example of a simple CPU accounting container subsystem

5) example of implementing ResGroups and its numtasks controller over
   generic containers

6) example of implementing BeanCounters and its numfiles counter over
   generic containers

7) example of integrating the namespace isolation code (sys_unshare()
   or various clone flags) with generic containers, allowing virtual
   servers to take advantage of other resource control efforts.

The intention is that the various resource management and
virtualization efforts can also become container clients, with the
result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/7] containers (V7): Simple CPU accounting container subsystem

2007-02-12 Thread menage

This demonstrates how to use the generic container subsystem for a
simple resource tracker that counts the total CPU time used by all
processes in a container, during the time that they're members of the
container.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cpu_acct.h |   14 +++
 init/Kconfig |7 +
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  213 +++
 kernel/sched.c   |   14 ++-
 5 files changed, 246 insertions(+), 3 deletions(-)

Index: container-2.6.20/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.20/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else /* !CONFIG_CONTAINER_CPUACCT: charging is a no-op */
+static inline void cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif /* CONFIG_CONTAINER_CPUACCT */
+
+#endif
Index: container-2.6.20/init/Kconfig
===
--- container-2.6.20.orig/init/Kconfig
+++ container-2.6.20/init/Kconfig
@@ -290,6 +290,13 @@ config PROC_PID_CPUSET
depends on CPUSETS
default y
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   select CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.20/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.20/kernel/cpu_acct.c
@@ -0,0 +1,213 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh
+ * ([EMAIL PROTECTED])
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   /* total time used by this class */
+   cputime64_t time;
+
+   /* time when next load calculation occurs */
+   u64 next_interval_check;
+
+   /* time used in current period */
+   cputime64_t current_interval_time;
+
+   /* time used in last period */
+   cputime64_t last_interval_time;
+};
+
+static struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, &cpuacct_subsys),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_ca(task_container(task, &cpuacct_subsys));
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now) {
+   /* calculate the next interval boundary beyond the
+* current time */
+   do_div(now, INTERVAL);
+   return (now + 1) * INTERVAL;
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+   if (!ca)
+   return -ENOMEM;
+   spin_lock_init(&ca->lock);
+   ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+   cont->subsys[cpuacct_subsys.subsys_id] = &ca->css;
+   return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys *ss,
+   struct container *cont)
+{
+   kfree(container_ca(cont));
+}
+
+/* Lazily update the load calculation if necessary. Called with ca locked */
+static void cpuusage_update(struct cpuacct *ca)
+{
+   u64 now = get_jiffies_64();
+   /* If we're not due for an update, return */
+   if (ca->next_interval_check > now)
+   return;
+
+   if (ca->next_interval_check <= (now - INTERVAL)) {
+   /* If it's been more than an interval since the last
+* check, then catch up - the last interval must have
+* been zero load */
+   ca->last_interval_time = 0;
+   ca->next_interval_check = next_interval_boundary(now);
+   } else {
+   /* If a steal takes the last interval time negative,
+* then we just ignore it */
+   if ((s64)ca->current_interval_time > 0) {
+   ca->last_interval_time = ca->current_interval_time;
+   } else {
+   ca->last_interval_time = 0;
+   }
+   ca->next_interval_check += INTERVAL

[PATCH 5/7] containers (V7): Resource Groups over generic containers

2007-02-12 Thread menage

This patch provides the RG core and numtasks controller as container
subsystems, intended as an example of how to implement a more complex
resource control system over generic process containers. The changes
to the core involve primarily removing the group management, task
membership and configfs support and adding interface layers to talk to
the generic container layer instead.

Each resource controller becomes an independent container subsystem;
the RG core is essentially a library that the resource controllers can
use to provide the RG API to userspace. Rather than a single shares
and stats file in each group, there's a <controller>_shares and
a <controller>_stats file, each linked to the appropriate resource
controller.

 include/linux/moduleparam.h  |   12 -
 include/linux/numtasks.h |   28 ++
 include/linux/res_group.h|   87 
 include/linux/res_group_rc.h |   97 
 init/Kconfig |   22 ++
 kernel/Makefile  |1 
 kernel/fork.c|7 
 kernel/res_group/Makefile|2 
 kernel/res_group/local.h |   38 +++
 kernel/res_group/numtasks.c  |  467 +++
 kernel/res_group/res_group.c |  160 ++
 kernel/res_group/rgcs.c  |  302 +++
 kernel/res_group/shares.c|  228 
 13 files changed, 1447 insertions(+), 4 deletions(-)

Index: container-2.6.20/include/linux/moduleparam.h
===
--- container-2.6.20.orig/include/linux/moduleparam.h
+++ container-2.6.20/include/linux/moduleparam.h
@@ -78,11 +78,17 @@ struct kparam_array
 /* Helper functions: type is byte, short, ushort, int, uint, long,
ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
param_set_XXX and param_check_XXX. */
-#define module_param_named(name, value, type, perm)   \
-   param_check_##type(name, &(value));\
-   module_param_call(name, param_set_##type, param_get_##type, &value, 
perm); \
+#define module_param_named_call(name, value, type, set, perm)  \
+   param_check_##type(name, &(value)); \
+   module_param_call(name, set, param_get_##type, &(value), perm); \
__MODULE_PARM_TYPE(name, #type)
 
+#define module_param_named(name, value, type, perm)   \
+   module_param_named_call(name, value, type, param_set_##type, perm)
+
+#define module_param_set_call(name, type, setfn, perm) \
+   module_param_named_call(name, name, type, setfn, perm)
+
 #define module_param(name, type, perm) \
module_param_named(name, name, type, perm)
 
Index: container-2.6.20/include/linux/numtasks.h
===
--- /dev/null
+++ container-2.6.20/include/linux/numtasks.h
@@ -0,0 +1,28 @@
+/* numtasks.h - No. of tasks resource controller for Resource Groups
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides No. of tasks resource controller for Resource Groups
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#ifndef _LINUX_NUMTASKS_H
+#define _LINUX_NUMTASKS_H
+
+#ifdef CONFIG_RES_GROUPS_NUMTASKS
+#include 
+
+extern int numtasks_allow_fork(struct task_struct *);
+
+#else /* CONFIG_RES_GROUPS_NUMTASKS */
+
+#define numtasks_allow_fork(task) (0)
+
+#endif /* CONFIG_RES_GROUPS_NUMTASKS */
+#endif /* _LINUX_NUMTASKS_H */
Index: container-2.6.20/include/linux/res_group.h
===
--- /dev/null
+++ container-2.6.20/include/linux/res_group.h
@@ -0,0 +1,87 @@
+/*
+ *  res_group.h - Header file to be used by Resource Groups
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ * (C) Shailabh Nagar,  IBM Corp. 2003, 2004
+ * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides data structures, macros and kernel APIs
+ *
+ * More details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_RES_GROUP_H
+#define _LINUX_RES_GROUP_H
+
+#ifdef CONFIG_RES_GROUPS
+#include 
+#include 
+#include 
+#include 
+
+#define SHARE_UNCHANGED (-1) /* implicitly specified by userspace,
+* never stored in a resource group's
+* shares struct; never displayed */
+#define SHARE_UNSUPPORTED  (-2)

[PATCH 7/7] containers (V7): Container interface to nsproxy subsystem

2007-02-12 Thread menage

When a task enters a new namespace via a clone() or unshare(), a new
container is created and the task moves into it. Developed by Serge
Hallyn <[EMAIL PROTECTED]>, adapted by Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/nsproxy.h |6 ++
 init/Kconfig|9 +++
 kernel/Makefile |1 
 kernel/fork.c   |4 +
 kernel/ns_container.c   |  110 
 kernel/nsproxy.c|6 ++
 6 files changed, 136 insertions(+)

Index: container-2.6.20/include/linux/nsproxy.h
===
--- container-2.6.20.orig/include/linux/nsproxy.h
+++ container-2.6.20/include/linux/nsproxy.h
@@ -53,4 +53,10 @@ static inline void exit_task_namespaces(
put_nsproxy(ns);
}
 }
+#ifdef CONFIG_CONTAINER_NS
+int ns_container_clone(struct task_struct *tsk);
+#else
+static inline int ns_container_clone(struct task_struct *tsk) { return 0; }
+#endif
+
 #endif
Index: container-2.6.20/init/Kconfig
===
--- container-2.6.20.orig/init/Kconfig
+++ container-2.6.20/init/Kconfig
@@ -297,6 +297,15 @@ config CONTAINER_CPUACCT
  Provides a simple Resource Controller for monitoring the
  total CPU consumed by the tasks in a container
 
+config CONTAINER_NS
+bool "Namespace container subsystem"
+select CONTAINERS
+help
+  Provides a simple namespace container subsystem to
+  provide hierarchical naming of sets of namespaces,
+  for instance virtual servers and checkpoint/restart
+  jobs.
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.20/kernel/Makefile
===
--- container-2.6.20.orig/kernel/Makefile
+++ container-2.6.20/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
+obj-$(CONFIG_CONTAINER_NS) += ns_container.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.20/kernel/fork.c
===
--- container-2.6.20.orig/kernel/fork.c
+++ container-2.6.20/kernel/fork.c
@@ -1661,6 +1661,9 @@ asmlinkage long sys_unshare(unsigned lon
err = -ENOMEM;
goto bad_unshare_cleanup_ipc;
}
+   err = ns_container_clone(current);
+   if (err)
+   goto bad_unshare_cleanup_dupns;
}
 
if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
@@ -1715,6 +1718,7 @@ asmlinkage long sys_unshare(unsigned lon
task_unlock(current);
}
 
+ bad_unshare_cleanup_dupns:
if (new_nsproxy)
put_nsproxy(new_nsproxy);
 
Index: container-2.6.20/kernel/ns_container.c
===
--- /dev/null
+++ container-2.6.20/kernel/ns_container.c
@@ -0,0 +1,110 @@
+/*
+ * ns_container.c - namespace container subsystem
+ *
+ * Copyright IBM, 2006
+ */
+
+#include 
+#include 
+#include 
+
+struct nscont {
+   struct container_subsys_state css;
+   spinlock_t lock;
+};
+
+static struct container_subsys ns_subsys;
+
+static inline struct nscont *container_nscont(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, &ns_subsys),
+   struct nscont, css);
+}
+
+int ns_container_clone(struct task_struct *tsk)
+{
+   return container_clone(tsk, &ns_subsys);
+}
+
+/*
+ * Rules:
+ *   1. you can only enter a container which is a child of your current
+ * container
+ *   2. you can only place another process into a container if
+ * a. you have CAP_SYS_ADMIN
+ * b. your container is an ancestor of tsk's destination container
+ *   (hence either you are in the same container as tsk, or in an
+ *ancestor container thereof)
+ */
+int ns_can_attach(struct container_subsys *ss,
+ struct container *cont, struct task_struct *tsk)
+{
+   struct container *c;
+
+   if (current != tsk) {
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   if (!container_is_descendant(cont))
+   return -EPERM;
+   }
+
+   if (atomic_read(&cont->count) != 0)
+   return -EPERM;
+
+   c = task_container(tsk, &ns_subsys);
+   if (c && c != cont->parent)
+   return -EPERM;
+
+   return 0;
+}
+
+/*
+ * Rules: you can only create a container if
+ * 1. you are capable(CAP_SYS_ADMIN)
+ * 2. the target container i

[PATCH 6/7] containers (V7): BeanCounters over generic process containers

2007-02-12 Thread menage

This patch implements the BeanCounter resource control abstraction
over generic process containers. It contains the beancounter core
code, plus the numfiles resource counter. It doesn't currently contain
any of the memory tracking code or the code for switching beancounter
context in interrupts.

Currently all the beancounters resource counters are lumped into a
single hierarchy; ideally it would be possible for each resource
counter to be a separate container subsystem, allowing them to be
connected to different hierarchies.

---
 fs/file_table.c  |   11 +
 include/bc/beancounter.h |  192 
 include/bc/misc.h|   27 +++
 include/linux/fs.h   |3 
 init/Kconfig |4 
 init/main.c  |3 
 kernel/Makefile  |1 
 kernel/bc/Kconfig|   17 ++
 kernel/bc/Makefile   |7 
 kernel/bc/beancounter.c  |  371 +++
 kernel/bc/misc.c |   56 +++
 11 files changed, 691 insertions(+), 1 deletion(-)

Index: container-2.6.20/init/Kconfig
===
--- container-2.6.20.orig/init/Kconfig
+++ container-2.6.20/init/Kconfig
@@ -619,6 +619,10 @@ config STOP_MACHINE
  Need stop_machine() primitive.
 endmenu
 
+menu "Beancounters"
+source "kernel/bc/Kconfig"
+endmenu
+
 menu "Block layer"
 source "block/Kconfig"
 endmenu
Index: container-2.6.20/kernel/Makefile
===
--- container-2.6.20.orig/kernel/Makefile
+++ container-2.6.20/kernel/Makefile
@@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o
 
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+obj-$(CONFIG_BEANCOUNTERS) += bc/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
Index: container-2.6.20/kernel/bc/Kconfig
===
--- /dev/null
+++ container-2.6.20/kernel/bc/Kconfig
@@ -0,0 +1,17 @@
+config BEANCOUNTERS
+   bool "Enable resource accounting/control"
+   default n
+   select CONTAINERS
+   help
+ When Y this option provides accounting and allows configuring
+ limits for user's consumption of exhaustible system resources.
+ The most important resource controlled by this patch is unswappable
+ memory (either mlock'ed or used by internal kernel structures and
+ buffers). The main goal of this patch is to protect processes
+ from running short of important resources because of accidental
+ misbehavior of processes or malicious activity aiming to ``kill''
+ the system. It's worth mentioning that resource limits configured
+ by setrlimit(2) do not give an acceptable level of protection
+ because they cover only a small fraction of resources and work on a
+ per-process basis.  Per-process accounting doesn't prevent malicious
+ users from spawning a lot of resource-consuming processes.
Index: container-2.6.20/kernel/bc/Makefile
===
--- /dev/null
+++ container-2.6.20/kernel/bc/Makefile
@@ -0,0 +1,7 @@
+#
+# kernel/bc/Makefile
+#
+# Copyright (C) 2006 OpenVZ SWsoft Inc.
+#
+
+obj-y = beancounter.o misc.o
Index: container-2.6.20/include/bc/beancounter.h
===
--- /dev/null
+++ container-2.6.20/include/bc/beancounter.h
@@ -0,0 +1,192 @@
+/*
+ * include/bc/beancounter.h
+ *
+ * Copyright (C) 2006 OpenVZ SWsoft Inc
+ *
+ */
+
+#ifndef __BEANCOUNTER_H__
+#define __BEANCOUNTER_H__
+
+#include 
+
+enum {
+   BC_KMEMSIZE,
+   BC_PRIVVMPAGES,
+   BC_PHYSPAGES,
+   BC_NUMTASKS,
+   BC_NUMFILES,
+
+   BC_RESOURCES
+};
+
+struct bc_resource_parm {
+   unsigned long   barrier;
+   unsigned long   limit;
+   unsigned long   held;
+   unsigned long   minheld;
+   unsigned long   maxheld;
+   unsigned long   failcnt;
+
+};
+
+#ifdef __KERNEL__
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define BC_MAXVALUE((unsigned long)LONG_MAX)
+
+enum bc_severity {
+   BC_BARRIER,
+   BC_LIMIT,
+   BC_FORCE,
+};
+
+struct beancounter;
+
+#ifdef CONFIG_BEANCOUNTERS
+
+enum bc_attr_index {
+   BC_RES_HELD,
+   BC_RES_MAXHELD,
+   BC_RES_MINHELD,
+   BC_RES_BARRIER,
+   BC_RES_LIMIT,
+   BC_RES_FAILCNT,
+
+   BC_ATTRS
+};
+
+struct bc_resource {
+   char*bcr_name;
+   int  res_id;
+
+   int (*bcr_init)(struct beancounter *bc, int res);
+   int (*bcr_change)(struct beancounter *bc,
+   unsigned long new_bar, unsigned long new_lim);
+   void(*bcr_barrier_hit)(struct beancounter *bc);
+   int (*bcr_limit_hit)(struct beancounter *bc, unsigned long val,
+   unsigned long flags);
+

[PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code

2007-02-12 Thread menage

This patch creates a generic process container system based on (and
parallel to) the cpusets code.  At a coarse level it was created by
copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any
code that was cpuset-specific rather than applicable to any process
container subsystem.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 Documentation/containers.txt |  229 +++
 fs/proc/base.c   |7 
 include/linux/container.h|   96 +++
 include/linux/sched.h|5 
 init/Kconfig |9 
 init/main.c  |3 
 kernel/Makefile  |1 
 kernel/container.c   | 1343 +++
 kernel/exit.c|2 
 kernel/fork.c|3 
 10 files changed, 1697 insertions(+), 1 deletion(-)

Index: container-2.6.20/fs/proc/base.c
===
--- container-2.6.20.orig/fs/proc/base.c
+++ container-2.6.20/fs/proc/base.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1870,6 +1871,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score",  S_IRUGO, oom_score),
REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2151,6 +2155,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
Index: container-2.6.20/include/linux/container.h
===
--- /dev/null
+++ container-2.6.20/include/linux/container.h
@@ -0,0 +1,96 @@
+#ifndef _LINUX_CONTAINER_H
+#define _LINUX_CONTAINER_H
+/*
+ *  container interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ */
+
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINERS
+
+extern int number_of_containers;   /* How many containers are defined in 
system? */
+
+extern int container_init_early(void);
+extern int container_init(void);
+extern void container_init_smp(void);
+extern void container_fork(struct task_struct *p);
+extern void container_exit(struct task_struct *p);
+
+extern struct file_operations proc_container_operations;
+
+extern void container_lock(void);
+extern void container_unlock(void);
+
+extern void container_manage_lock(void);
+extern void container_manage_unlock(void);
+
+struct container {
+   unsigned long flags;/* "unsigned long" so bitops work */
+
+   /*
+* Count is atomic so can incr (fork) or decr (exit) without a lock.
+*/
+   atomic_t count; /* count tasks using this container */
+
+   /*
+* We link our 'sibling' struct into our parent's 'children'.
+* Our children link their 'sibling' into our 'children'.
+*/
+   struct list_head sibling;   /* my parent's children */
+   struct list_head children;  /* my children */
+
+   struct container *parent;   /* my parent */
+   struct dentry *dentry;  /* container fs entry */
+};
+
+/* struct cftype:
+ *
+ * The files in the container filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ * - the container to use in file->f_dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct inode;
+struct cftype {
+   char *name;
+   int private;
+   int (*open) (struct inode *inode, struct file *file);
+   ssize_t (*read) (struct container *cont, struct cftype *cft,
+struct file *file,
+char __user *buf, size_t nbytes, loff_t *ppos);
+   ssize_t (*write) (struct container *cont, struct cftype *cft,
+ struct file *file,
+ const char __user *buf, size_t nbytes, loff_t *ppos);
+   int (*release) (struct inode *inode, struct file *file);
+};
+
+int container_add_file(struct container *cont, const struct cftype *cft);
+
+int container_is_removed(const struct container *cont);
+
+#else /* !CONFIG_CONTAINERS */
+
+static inline int container_init_early(void) { return 0; }
+static inline int container_init(void) { return 0; }
+static

[PATCH 0/6] Multi-hierarchy Process Containers

2006-11-17 Thread menage


This is an update to my generic containers patch. The major change is
support for multiple hierarchies of containers (up to a limit
specified at build time).

- The mount options passed when mounting a container filesystem
  indicate the set of controllers/subsystems that are wanted in the
  hierarchy - e.g. "mount -t container -o cpuset,numtasks container /foo"

- Default is to try to mount all subsystems

- if a hierarchy with the requested set of subsystems already exists
  then its superblock is reused

- otherwise (as long as all the requested subsystems are currently not
  in use in any hierarchy) a new hierarchy is created.

- hierarchies with more than one container (i.e. with any children of
  the root container) persist even when unmounted;

- /proc/containers shows current hierarchy/subsystem details

- /proc/<pid>/container shows one line for each active hierarchy

Other changes include:

- ported to 2.6.19-rc5 

- per-subsystem/per-container state is no longer just a void * - it
  has some state maintained by the container framework (to handle
  moving subsystems in and out of hierarchies when they are created/released)

Note that this hasn't yet undergone intensive testing following the
multi-hierarchy introduction, but I wanted to get the basic idea out
for comments.

TODOs include:

- figuring out a nice way to handle release notifications now that
  there are multiple hierarchies

-

There have recently been various proposals floating around for
resource management/accounting subsystems in the kernel, including
Res Groups, User BeanCounters and others.  These all need the basic
abstraction of being able to group together multiple processes in an
aggregate, in order to track/limit the resources permitted to those
processes, and all implement this grouping in different ways.

Already existing in the kernel is the cpuset subsystem; this has a
process grouping mechanism that is mature, tested, and well documented
(particularly with regards to synchronization rules).

This patchset extracts the process grouping code from cpusets into a
generic container system, and makes the cpusets code a client of
the container system.

It also provides a very simple additional container subsystem to do
per-container CPU usage accounting; this is primarily to demonstrate
use of the container subsystem API, but is useful in its own right.

The change is implemented in five stages plus an additional example patch:

1) extract the process grouping code from cpusets into a standalone system

2) remove the process grouping code from cpusets and hook into the
   container system

3) convert the container system to present a generic multi-hierarchy
   API, and make cpusets a client of that API

4) add a simple CPU accounting container subsystem as an example

5) add support for fork/exit callbacks iff some subsystem is interested in them

6) example of implementing ResGroups and its numtasks controller over
   generic containers - not intended to be applied with this patch set

The intention is that the various resource management efforts can also
become container clients, with the result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the UBC memory controller

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel


Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/6] Add generic multi-subsystem API to containers

2006-11-17 Thread menage

This patch removes all cpuset-specific knowledge from the container
system, replacing it with a generic API that can be used by multiple
subsystems. Cpusets is adapted to be a container subsystem.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 Documentation/containers.txt |  224 ++--
 include/linux/container.h|   70 -
 include/linux/cpuset.h   |   16 -
 include/linux/mempolicy.h|   12 
 include/linux/sched.h|2 
 kernel/container.c   |  589 ++-
 kernel/cpuset.c  |  168 
 mm/mempolicy.c   |2 
 8 files changed, 852 insertions(+), 231 deletions(-)

Index: container-2.6.19-rc5/include/linux/container.h
===
--- container-2.6.19-rc5.orig/include/linux/container.h
+++ container-2.6.19-rc5/include/linux/container.h
@@ -14,8 +14,6 @@
 
 #ifdef CONFIG_CONTAINERS
 
-extern int number_of_containers;   /* How many containers are defined in 
system? */
-
 extern int container_init_early(void);
 extern int container_init(void);
 extern void container_init_smp(void);
@@ -30,6 +28,13 @@ extern void container_unlock(void);
 extern void container_manage_lock(void);
 extern void container_manage_unlock(void);
 
+struct containerfs_root;
+
+/* Per-subsystem/per-container state maintained by the system. */
+struct container_subsys_state {
+   struct container *container;
+};
+
 struct container {
unsigned long flags;/* "unsigned long" so bitops work */
 
@@ -46,11 +51,15 @@ struct container {
struct list_head children;  /* my children */
 
struct container *parent;   /* my parent */
-   struct dentry *dentry;  /* container fs entry */
+   struct dentry *dentry;  /* container fs entry */
 
-#ifdef CONFIG_CPUSETS
-   struct cpuset *cpuset;
-#endif
+   /* Private pointers for each registered subsystem */
+   struct container_subsys_state *subsys[CONFIG_MAX_CONTAINER_SUBSYS];
+
+   int hierarchy;
+
+   struct containerfs_root *root;
+   struct container *top_container;
 };
 
 /* struct cftype:
@@ -85,6 +94,55 @@ int container_add_file(struct container 
 int container_is_removed(const struct container *cont);
 void container_set_release_agent_path(const char *path);
 
+/* Container subsystem type. See Documentation/containers.txt for details */
+
+struct container_subsys {
+   int (*create)(struct container_subsys *ss,
+ struct container *cont);
+   void (*destroy)(struct container_subsys *ss, struct container *cont);
+   int (*can_attach)(struct container_subsys *ss,
+ struct container *cont, struct task_struct *tsk);
+   void (*attach)(struct container_subsys *ss, struct container *cont,
+   struct container *old_cont, struct task_struct *tsk);
+   void (*post_attach)(struct container_subsys *ss,
+   struct container *cont,
+   struct container *old_cont,
+   struct task_struct *tsk);
+   int (*populate)(struct container_subsys *ss,
+   struct container *cont);
+
+   int subsys_id;
+#define MAX_CONTAINER_TYPE_NAMELEN 32
+   const char *name;
+
+   /* Protected by RCU */
+   int hierarchy;
+
+   struct list_head sibling;
+};
+
+int container_register_subsys(struct container_subsys *subsys);
+
+static inline struct container_subsys_state *container_subsys_state(
+   struct container *cont,
+   struct container_subsys *ss)
+{
+   return cont->subsys[ss->subsys_id];
+}
+
+static inline struct container* task_container(struct task_struct *task,
+  struct container_subsys *ss)
+{
+   return rcu_dereference(task->container[ss->hierarchy]);
+}
+
+static inline struct container_subsys_state *task_subsys_state(
+   struct task_struct *task,
+   struct container_subsys *ss)
+{
+   return container_subsys_state(task_container(task, ss), ss);
+}
+
 #else /* !CONFIG_CONTAINERS */
 
 static inline int container_init_early(void) { return 0; }
Index: container-2.6.19-rc5/include/linux/cpuset.h
===
--- container-2.6.19-rc5.orig/include/linux/cpuset.h
+++ container-2.6.19-rc5/include/linux/cpuset.h
@@ -60,16 +60,7 @@ static inline int cpuset_do_slab_mem_spr
 
 extern void cpuset_track_online_nodes(void);
 
-extern int cpuset_can_attach_task(struct container *cont,
- struct task_struct *tsk);
-extern void cpuset_attach_task(struct container *cont,
-   struct task_struct *tsk);
-extern void cpuset_post_attach_task(struct container *cont,
-   struct container *oldcont,
-   struc

[PATCH 4/6] Simple CPU accounting container subsystem

2006-11-17 Thread menage

This demonstrates how to use the generic container subsystem for a
simple resource tracker that counts the total CPU time used by all
processes in a container, during the time that they're members of the
container.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cpu_acct.h |   14 +
 init/Kconfig |7 ++
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  117 +++
 kernel/sched.c   |6 ++
 5 files changed, 145 insertions(+)

Index: container-2.6.19-rc5/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.19-rc5/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
Index: container-2.6.19-rc5/init/Kconfig
===
--- container-2.6.19-rc5.orig/init/Kconfig
+++ container-2.6.19-rc5/init/Kconfig
@@ -263,6 +263,13 @@ config CPUSETS
 
  Say N if unsure.
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   select CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.19-rc5/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.19-rc5/kernel/cpu_acct.c
@@ -0,0 +1,117 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   cputime64_t time; // total time used by this class
+};
+
+static struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, &cpuacct_subsys),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_ca(task_container(task, &cpuacct_subsys));
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+   if (!ca) return -ENOMEM;
+   spin_lock_init(&ca->lock);
+   cont->subsys[cpuacct_subsys.subsys_id] = &ca->css;
+   return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys *ss,
+   struct container *cont)
+{
+   kfree(container_ca(cont));
+}
+
+static ssize_t cpuusage_read(struct container *cont,
+struct cftype *cft,
+struct file *file,
+char __user *buf,
+size_t nbytes, loff_t *ppos)
+{
+   struct cpuacct *ca = container_ca(cont);
+   cputime64_t time;
+   char usagebuf[64];
+   char *s = usagebuf;
+
+   spin_lock_irq(&ca->lock);
+   time = ca->time;
+   spin_unlock_irq(&ca->lock);
+
+   time *= 1000;
+   do_div(time, HZ);
+   s += sprintf(s, "%llu", (unsigned long long) time);
+
+   return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - 
usagebuf);
+}
+
+static struct cftype cft_usage = {
+   .name = "cpu_usage",
+   .read = cpuusage_read,
+};
+
+static int cpuacct_populate(struct container_subsys *ss,
+   struct container *cont)
+{
+   return container_add_file(cont, &cft_usage);
+}
+
+
+void cpuacct_charge(struct task_struct *task, cputime_t cputime) {
+
+   struct cpuacct *ca;
+   unsigned long flags;
+
+   if (cpuacct_subsys.subsys_id < 0) return;
+   rcu_read_lock();
+   ca = task_ca(task);
+   if (ca) {
+   spin_lock_irqsave(&ca->lock, flags);
+   ca->time = cputime64_add(ca->time, cputime);
+   spin_unlock_irqrestore(&ca->lock, flags);
+   }
+   rcu_read_unlock();
+}
+
+static struct container_subsys cpuacct_subsys = {
+   .name = "cpuacct",
+   .create = cpuacct_create,
+   .destroy = cpuacct_destroy,
+   .populate = cpuacct_populate,
+   .subsys_id = -1,
+};
+
+
+int __init init_cpuacct(void)
+{
+   int id = container_register_subsys(&cpuacct_subsys);
+   return id < 0 ? id : 0;
+}
+
+module_init(init_cpuac

[PATCH 5/6] Extension to container system to allow fork/exit callbacks

2006-11-17 Thread menage

This patch adds fork/exit callbacks to container subsystems, and
ensures that every registered container has received one fork callback
for each task running in the system, and one exit callback for each
task that exited since it was registered.

Since the fork/exit path is performance sensitive, an RCU-protected
flag indicates to the fork/exit hooks whether they need to take the
callback mutex and scan the list of registered subsystems to look for
fork/exit handlers.

 Documentation/containers.txt |   11 +
 include/linux/container.h|2 
 kernel/container.c   |   89 +--
 3 files changed, 98 insertions(+), 4 deletions(-)

Index: container-2.6.19-rc5/include/linux/container.h
===
--- container-2.6.19-rc5.orig/include/linux/container.h
+++ container-2.6.19-rc5/include/linux/container.h
@@ -108,6 +108,8 @@ struct container_subsys {
struct container *cont,
struct container *old_cont,
struct task_struct *tsk);
+   void (*fork)(struct container_subsys *ss, struct task_struct *task);
+   void (*exit)(struct container_subsys *ss, struct task_struct *task);
int (*populate)(struct container_subsys *ss,
struct container *cont);
 
Index: container-2.6.19-rc5/kernel/container.c
===
--- container-2.6.19-rc5.orig/kernel/container.c
+++ container-2.6.19-rc5/kernel/container.c
@@ -84,6 +84,21 @@ struct containerfs_root {
 
 static struct containerfs_root rootnode[CONFIG_MAX_CONTAINER_HIERARCHIES];
 
+/* This flag indicates whether tasks in the fork and exit paths should
+ * take callback_mutex and check for fork/exit handlers to call. This
+ * avoids us having to take locks in the fork/exit path if none of the
+ * subsystems need to be called.
+ *
+ * It is protected via RCU, with the invariant that a process in an
+ * rcu_read_lock() section will never see this as 0 if there are
+ * actually registered subsystems with a fork or exit
+ * handler. (Sometimes it may be 1 without there being any registered
+ * subsystems with such a handler, but such periods are safe and of
+ * short duration).
+ */
+
+static int need_forkexit_callback = 0;
+
 /* bits in struct container flags field */
 typedef enum {
CONT_REMOVED,
@@ -1505,11 +1520,40 @@ int container_register_subsys(struct con
goto out;
}
dummytop->subsys[subsys_count]->container = dummytop;
-   subsys[subsys_count++] = new_subsys;
+   mutex_lock(&callback_mutex);
+   /* If this is the first subsystem that requested a fork or
+* exit callback, tell our fork/exit hooks that they need to
+* grab callback_mutex on every invocation. If they are
+* running concurrently with this code, they will either not
+* see the change now and go straight on, or they will see it
+* and grab callback_mutex, which will deschedule them. Either
+* way once synchronize_rcu() returns we know that all current
+* and future forks will make the callbacks. */
+   if (!need_forkexit_callback &&
+   (new_subsys->fork || new_subsys->exit)) {
+   need_forkexit_callback = 1;
+   synchronize_rcu();
+   }
+
+   /* If this subsystem requested that it be notified with fork
+* events, we should send it one now for every process in the
+* system */
+   if (new_subsys->fork) {
+   struct task_struct *g, *p;
+
+   read_lock(&tasklist_lock);
+   do_each_thread(g, p) {
+   new_subsys->fork(new_subsys, p);
+   } while_each_thread(g, p);
+   read_unlock(&tasklist_lock);
+   }
 
+   subsys[subsys_count++] = new_subsys;
+   mutex_unlock(&callback_mutex);
  out:
-   mutex_unlock(&manage_mutex);
-   return retval;
+   mutex_unlock(&manage_mutex);
+   return retval;
+
 }
 
 /**
@@ -1532,7 +1576,16 @@ int container_register_subsys(struct con
 
 void container_fork(struct task_struct *child)
 {
-   int i;
+   int i, need_callback;
+
+   rcu_read_lock();
+   /* need_forkexit_callback will be true if we might need to do
+* a callback */
+   need_callback = rcu_dereference(need_forkexit_callback);
+   if (need_callback) {
+   rcu_read_unlock();
+   mutex_lock(&callback_mutex);
+   }
task_lock(current);
for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) {
struct container *cont = current->container[i];
@@ -1540,7 +1593,20 @@ void container_fork(struct task_struct *
child->container[i] = cont;
atomic_inc(&cont->count);
}
+   if (need_callback) {
+   for (i = 0; i < subsys_count; i++) {
+

[PATCH 6/6] Resource Groups over generic containers

2006-11-17 Thread menage

This patch provides the RG core and numtasks controller as container
subsystems, intended as an example of how to implement a more complex
resource control system over generic process containers. The changes
to the core involve primarily removing the group management, task
membership and configfs support and adding interface layers to talk to
the generic container layer instead.

Each resource controller becomes an independent container subsystem;
the RG core is essentially a library that the resource controllers can
use to provide the RG API to userspace. Rather than a single shares
and stats file in each group, there's a <controller>_shares and
a <controller>_stats file, each linked to the appropriate resource
controller.

 include/linux/moduleparam.h  |   12 -
 include/linux/numtasks.h |   28 ++
 include/linux/res_group.h|   87 
 include/linux/res_group_rc.h |   98 +
 init/Kconfig |   22 ++
 kernel/Makefile  |1 
 kernel/fork.c|7 
 kernel/res_group/Makefile|2 
 kernel/res_group/local.h |   38 +++
 kernel/res_group/numtasks.c  |  467 +++
 kernel/res_group/res_group.c |  162 ++
 kernel/res_group/rgcs.c  |  302 +++
 kernel/res_group/shares.c|  228 
 13 files changed, 1450 insertions(+), 4 deletions(-)

Index: container-2.6.19-rc5/include/linux/moduleparam.h
===
--- container-2.6.19-rc5.orig/include/linux/moduleparam.h
+++ container-2.6.19-rc5/include/linux/moduleparam.h
@@ -75,11 +75,17 @@ struct kparam_array
 /* Helper functions: type is byte, short, ushort, int, uint, long,
ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
param_set_XXX and param_check_XXX. */
-#define module_param_named(name, value, type, perm)   \
-   param_check_##type(name, &(value));\
-   module_param_call(name, param_set_##type, param_get_##type, &value, 
perm); \
+#define module_param_named_call(name, value, type, set, perm)  \
+   param_check_##type(name, &(value)); \
+   module_param_call(name, set, param_get_##type, &(value), perm); \
__MODULE_PARM_TYPE(name, #type)
 
+#define module_param_named(name, value, type, perm)   \
+   module_param_named_call(name, value, type, param_set_##type, perm)
+
+#define module_param_set_call(name, type, setfn, perm) \
+   module_param_named_call(name, name, type, setfn, perm)
+
 #define module_param(name, type, perm) \
module_param_named(name, name, type, perm)
 
Index: container-2.6.19-rc5/include/linux/numtasks.h
===
--- /dev/null
+++ container-2.6.19-rc5/include/linux/numtasks.h
@@ -0,0 +1,28 @@
+/* numtasks.h - No. of tasks resource controller for Resource Groups
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides No. of tasks resource controller for Resource Groups
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#ifndef _LINUX_NUMTASKS_H
+#define _LINUX_NUMTASKS_H
+
+#ifdef CONFIG_RES_GROUPS_NUMTASKS
+#include 
+
+extern int numtasks_allow_fork(struct task_struct *);
+
+#else /* CONFIG_RES_GROUPS_NUMTASKS */
+
+#define numtasks_allow_fork(task) (0)
+
+#endif /* CONFIG_RES_GROUPS_NUMTASKS */
+#endif /* _LINUX_NUMTASKS_H */
Index: container-2.6.19-rc5/include/linux/res_group.h
===
--- /dev/null
+++ container-2.6.19-rc5/include/linux/res_group.h
@@ -0,0 +1,87 @@
+/*
+ *  res_group.h - Header file to be used by Resource Groups
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ * (C) Shailabh Nagar,  IBM Corp. 2003, 2004
+ * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides data structures, macros and kernel APIs
+ *
+ * More details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_RES_GROUP_H
+#define _LINUX_RES_GROUP_H
+
+#ifdef CONFIG_RES_GROUPS
+#include 
+#include 
+#include 
+#include 
+
+#define SHARE_UNCHANGED(-1)/* implicitly specified by userspace,
+* never stored in a resource group's
+* shares struct; never displayed */
+#define

[PATCH 1/6] Generic container system abstracted from cpusets code

2006-11-17 Thread menage

This patch creates a generic process container system based on (and
parallel to) the cpusets code.  At a coarse level it was created by
copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any
code that was cpuset-specific rather than applicable to any process
container subsystem.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 Documentation/containers.txt |  229 +++
 fs/proc/base.c   |7 
 include/linux/container.h|   96 +++
 include/linux/sched.h|5 
 init/Kconfig |9 
 init/main.c  |3 
 kernel/Makefile  |1 
 kernel/container.c   | 1343 +++
 kernel/exit.c|2 
 kernel/fork.c|3 
 10 files changed, 1697 insertions(+), 1 deletion(-)

Index: container-2.6.19-rc5/fs/proc/base.c
===
--- container-2.6.19-rc5.orig/fs/proc/base.c
+++ container-2.6.19-rc5/fs/proc/base.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1782,6 +1783,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_SCHEDSTATS
INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
@@ -2056,6 +2060,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_SCHEDSTATS
INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
Index: container-2.6.19-rc5/include/linux/container.h
===
--- /dev/null
+++ container-2.6.19-rc5/include/linux/container.h
@@ -0,0 +1,96 @@
+#ifndef _LINUX_CONTAINER_H
+#define _LINUX_CONTAINER_H
+/*
+ *  container interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ */
+
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINERS
+
+extern int number_of_containers;   /* How many containers are defined in 
system? */
+
+extern int container_init_early(void);
+extern int container_init(void);
+extern void container_init_smp(void);
+extern void container_fork(struct task_struct *p);
+extern void container_exit(struct task_struct *p);
+
+extern struct file_operations proc_container_operations;
+
+extern void container_lock(void);
+extern void container_unlock(void);
+
+extern void container_manage_lock(void);
+extern void container_manage_unlock(void);
+
+struct container {
+   unsigned long flags;/* "unsigned long" so bitops work */
+
+   /*
+* Count is atomic so can incr (fork) or decr (exit) without a lock.
+*/
+   atomic_t count; /* count tasks using this container */
+
+   /*
+* We link our 'sibling' struct into our parent's 'children'.
+* Our children link their 'sibling' into our 'children'.
+*/
+   struct list_head sibling;   /* my parent's children */
+   struct list_head children;  /* my children */
+
+   struct container *parent;   /* my parent */
+   struct dentry *dentry;  /* container fs entry */
+};
+
+/* struct cftype:
+ *
+ * The files in the container filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ * - the container to use in file->f_dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct inode;
+struct cftype {
+   char *name;
+   int private;
+   int (*open) (struct inode *inode, struct file *file);
+   ssize_t (*read) (struct container *cont, struct cftype *cft,
+struct file *file,
+char __user *buf, size_t nbytes, loff_t *ppos);
+   ssize_t (*write) (struct container *cont, struct cftype *cft,
+ struct file *file,
+ const char __user *buf, size_t nbytes, loff_t *ppos);
+   int (*release) (struct inode *inode, struct file *file);
+};
+
+int container_add_file(struct container *cont, const struct cftype *cft);
+
+int container_is_removed(const struct container *cont);
+
+#else /* !CONFIG_CONTAINERS */
+
+static inline int container_init_early(void) { return 0; }
+static inline int container_init(void) { return 0; }
+static inline void container_init_smp(void) {}
+static inline void container_fork(s

[PATCH 05/10] Containers(V10): Add container_clone() interface

2007-05-29 Thread menage

This patch adds support for container_clone(), a speculative interface
for creating new containers intended to be used for systems such as
namespace unsharing.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |2 
 kernel/container.c|  123 ++
 2 files changed, 125 insertions(+)

Index: container-2.6.22-rc2-mm1/kernel/container.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -1616,3 +1616,126 @@ void container_exit(struct task_struct *
tsk->containers = init_task.containers;
task_unlock(tsk);
 }
+
+static atomic_t namecnt;
+static void get_unused_name(char *buf)
+{
+   sprintf(buf, "node%d", atomic_inc_return(&namecnt));
+}
+
+/**
+ * container_clone - duplicate the current container in the hierarchy
+ * that the given subsystem is attached to, and move this task into
+ * the new child
+ */
+int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
+{
+   struct dentry *dentry;
+   int ret = 0;
+   char nodename[32];
+   struct container *parent, *child;
+   struct inode *inode;
+   struct css_group *cg;
+   struct containerfs_root *root;
+
+   /* We shouldn't be called by an unregistered subsystem */
+   BUG_ON(!subsys->active);
+
+   /* First figure out what hierarchy and container we're dealing
+* with, and pin them so we can drop container_mutex */
+   mutex_lock(&container_mutex);
+ again:
+   root = subsys->root;
+   if (root == &rootnode) {
+   printk(KERN_INFO
+  "Not cloning container for unused subsystem %s\n",
+  subsys->name);
+   mutex_unlock(&container_mutex);
+   return 0;
+   }
+   cg = &tsk->containers;
+   parent = task_container(tsk, subsys->subsys_id);
+   /* Pin the hierarchy */
+   atomic_inc(&parent->root->sb->s_active);
+
+   mutex_unlock(&container_mutex);
+
+   /* Now do the VFS work to create a container */
+   get_unused_name(nodename);
+   inode = parent->dentry->d_inode;
+
+   /* Hold the parent directory mutex across this operation to
+* stop anyone else deleting the new container */
+   mutex_lock(&inode->i_mutex);
+   dentry = container_get_dentry(parent->dentry, nodename);
+   if (IS_ERR(dentry)) {
+   printk(KERN_INFO
+  "Couldn't allocate dentry for %s: %ld\n", nodename,
+  PTR_ERR(dentry));
+   ret = PTR_ERR(dentry);
+   goto out_release;
+   }
+
+   /* Create the container directory, which also creates the container */
+   ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+   child = __d_cont(dentry);
+   dput(dentry);
+   if (ret) {
+   printk(KERN_INFO
+  "Failed to create container %s: %d\n", nodename,
+  ret);
+   goto out_release;
+   }
+
+   if (!child) {
+   printk(KERN_INFO
+  "Couldn't find new container %s\n", nodename);
+   ret = -ENOMEM;
+   goto out_release;
+   }
+
+   /* The container now exists. Retake container_mutex and check
+* that we're still in the same state that we thought we
+* were. */
+   mutex_lock(&container_mutex);
+   if ((root != subsys->root) ||
+   (parent != task_container(tsk, subsys->subsys_id))) {
+   /* Aargh, we raced ... */
+   mutex_unlock(&inode->i_mutex);
+
+   deactivate_super(parent->root->sb);
+   /* The container is still accessible in the VFS, but
+* we're not going to try to rmdir() it at this
+* point. */
+   printk(KERN_INFO
+  "Race in container_clone() - leaking container %s\n",
+  nodename);
+   goto again;
+   }
+
+   /* All seems fine. Finish by moving the task into the new container */
+   ret = attach_task(child, tsk);
+   mutex_unlock(&container_mutex);
+
+ out_release:
+   mutex_unlock(&inode->i_mutex);
+   deactivate_super(parent->root->sb);
+   return ret;
+}
+
+/* See if "cont" is a descendant of the current task's container in
+ * the appropriate hierarchy */
+
+int container_is_descendant(const struct container *cont)
+{
+   int ret;
+   struct container *target;
+   int subsys_id;
+   get_first_subsys(cont, NULL, &subsys_id);
+   target = task_container(current, subsys_i

[PATCH 08/10] Containers(V10): Share css_group arrays between tasks with same container memberships

2007-05-29 Thread menage

This patch replaces the struct css_group embedded in task_struct with
a pointer; all tasks that have the same set of memberships across all
hierarchies will share a css_group object, and will be linked via
their css_groups field to the "tasks" list_head in the css_group.

Assuming that many tasks share the same container assignments, this
reduces overall space usage and keeps the size of the task_struct down
(three pointers added to task_struct compared to a non-containers
kernel, no matter how many subsystems are registered).

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 Documentation/containers.txt |   14 +
 include/linux/container.h|   93 ++-
 include/linux/sched.h|   33 --
 kernel/container.c   |  524 ---
 kernel/cpuset.c  |   15 -
 5 files changed, 553 insertions(+), 126 deletions(-)

Index: container-2.6.22-rc2-mm1/include/linux/container.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -29,6 +29,14 @@ extern void container_unlock(void);
 
 struct containerfs_root;
 
+/* Define the enumeration of all container subsystems */
+#define SUBSYS(_x) _x ## _subsys_id,
+enum container_subsys_id {
+#include 
+   CONTAINER_SUBSYS_COUNT
+};
+#undef SUBSYS
+
 /* Per-subsystem/per-container state maintained by the system. */
 struct container_subsys_state {
/* The container that this subsystem is attached to. Useful
@@ -85,6 +93,54 @@ struct container {
 
struct containerfs_root *root;
struct container *top_container;
+
+   /*
+* List of cg_container_links pointing at css_groups with
+* tasks in this container. Protected by css_group_lock
+*/
+   struct list_head css_groups;
+};
+
+/* A css_group is a structure holding pointers to a set of
+ * container_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec and a
+ * list_add()/del() can bump the reference count on the entire
+ * container set for a task.
+ */
+
+struct css_group {
+
+   /* Reference count */
+   struct kref ref;
+
+   /*
+* List running through all container groups. Protected by
+* css_group_lock
+*/
+   struct list_head list;
+
+   /*
+* List running through all tasks using this container
+* group. Protected by css_group_lock
+*/
+   struct list_head tasks;
+
+   /*
+* List of cg_container_link objects on link chains from
+* containers referenced from this css_group. Protected by
+* css_group_lock
+*/
+   struct list_head cg_links;
+
+   /* Set of subsystem states, one for each subsystem. NULL for
+* subsystems that aren't part of this hierarchy. These
+* pointers reduce the number of dereferences required to get
+* from a task to its state for a given container, but result
+* in increased space usage if tasks are in wildly different
+* groupings across different hierarchies. This array is
+* immutable after creation */
+   struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT];
+
 };
 
 /* struct cftype:
@@ -111,6 +167,10 @@ struct cftype {
ssize_t (*read) (struct container *cont, struct cftype *cft,
 struct file *file,
 char __user *buf, size_t nbytes, loff_t *ppos);
+   /*
+* read_uint() is a shortcut for the common case of returning a
+* single integer. Use it in place of read()
+*/
u64 (*read_uint) (struct container *cont, struct cftype *cft);
ssize_t (*write) (struct container *cont, struct cftype *cft,
  struct file *file,
@@ -131,15 +191,7 @@ int container_is_removed(const struct co
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
-int __container_task_count(const struct container *cont);
-static inline int container_task_count(const struct container *cont)
-{
-   int task_count;
-   rcu_read_lock();
-   task_count = __container_task_count(cont);
-   rcu_read_unlock();
-   return task_count;
-}
+int container_task_count(const struct container *cont);
 
 /* Return true if the container is a descendant of the current container */
 int container_is_descendant(const struct container *cont);
@@ -186,7 +238,7 @@ static inline struct container_subsys_st
 static inline struct container_subsys_state *task_subsys_state(
struct task_struct *task, int subsys_id)
 {
-   return rcu_dereference(task->containers.subsys[subsys_id]);
+   return rcu_dereference(task->containers->subsys[subsys_id]);
 }
 
 static inline struct container* task_container(struct task_struct *task,
@@ -199,6 +251,27 @@ int container_pat

[PATCH 04/10] Containers(V10): Add fork/exit hooks

2007-05-29 Thread menage

This patch adds the necessary hooks to the fork() and exit() paths to
ensure that new children inherit their parent's container assignments,
and that exiting processes release reference counts on their
containers.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |6 ++
 kernel/container.c|  128 ++
 kernel/exit.c |2 
 kernel/fork.c |   14 -
 4 files changed, 148 insertions(+), 2 deletions(-)

Index: container-2.6.22-rc2-mm1/kernel/exit.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/exit.c
+++ container-2.6.22-rc2-mm1/kernel/exit.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -935,6 +936,7 @@ fastcall void do_exit(long code)
__exit_fs(tsk);
exit_thread();
cpuset_exit(tsk);
+   container_exit(tsk, 1);
exit_keys(tsk);
 
if (group_dead && tsk->signal->leader)
Index: container-2.6.22-rc2-mm1/kernel/fork.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/fork.c
+++ container-2.6.22-rc2-mm1/kernel/fork.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -963,6 +964,7 @@ static struct task_struct *copy_process(
 {
int retval;
struct task_struct *p = NULL;
+   int container_callbacks_done = 0;
 
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1063,12 +1065,13 @@ static struct task_struct *copy_process(
p->io_wait = NULL;
p->audit_context = NULL;
cpuset_fork(p);
+   container_fork(p);
 #ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
-   goto bad_fork_cleanup_cpuset;
+   goto bad_fork_cleanup_container;
}
mpol_fix_fork_child_flag(p);
 #endif
@@ -1178,6 +1181,12 @@ static struct task_struct *copy_process(
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
 
+   /* Now that the task is set up, run container callbacks if
+* necessary. We need to run them before the task is visible
+* on the tasklist. */
+   container_fork_callbacks(p);
+   container_callbacks_done = 1;
+
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
 
@@ -1300,9 +1309,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_container:
 #endif
cpuset_exit(p);
+   container_exit(p, container_callbacks_done);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
Index: container-2.6.22-rc2-mm1/include/linux/container.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -18,6 +18,9 @@
 extern int container_init_early(void);
 extern int container_init(void);
 extern void container_init_smp(void);
+extern void container_fork(struct task_struct *p);
+extern void container_fork_callbacks(struct task_struct *p);
+extern void container_exit(struct task_struct *p, int run_callbacks);
 
 extern struct file_operations proc_container_operations;
 
@@ -199,6 +202,9 @@ int container_path(const struct containe
 static inline int container_init_early(void) { return 0; }
 static inline int container_init(void) { return 0; }
 static inline void container_init_smp(void) {}
+static inline void container_fork(struct task_struct *p) {}
+static inline void container_fork_callbacks(struct task_struct *p) {}
+static inline void container_exit(struct task_struct *p, int callbacks) {}
 
 static inline void container_lock(void) {}
 static inline void container_unlock(void) {}
Index: container-2.6.22-rc2-mm1/kernel/container.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -132,6 +132,36 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_group, so the get/put
+ * operation simply takes a reference count on all the containers
+ * referenced by subsystems in this css_group. This can end up
+ * multiple-counting some containers, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * container once would require taking

[PATCH 03/10] Containers(V10): Add tasks file interface

2007-05-29 Thread menage

This patch adds the per-directory "tasks" file for containerfs mounts;
this allows the user to determine which tasks are members of a
container by reading a container's "tasks", and to move a task into a
container by writing its pid to its "tasks".

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |   10 +
 kernel/container.c|  335 ++
 2 files changed, 345 insertions(+)

Index: container-2.6.22-rc2-mm1/include/linux/container.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -128,6 +128,16 @@ int container_is_removed(const struct co
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
+int __container_task_count(const struct container *cont);
+static inline int container_task_count(const struct container *cont)
+{
+   int task_count;
+   rcu_read_lock();
+   task_count = __container_task_count(cont);
+   rcu_read_unlock();
+   return task_count;
+}
+
 /* Return true if the container is a descendant of the current container */
 int container_is_descendant(const struct container *cont);
 
Index: container-2.6.22-rc2-mm1/kernel/container.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -679,6 +679,109 @@ static inline void get_first_subsys(cons
*subsys_id = test_ss->subsys_id;
 }
 
+/*
+ * Attach task 'tsk' to container 'cont'
+ *
+ * Call holding container_mutex.  May take task_lock of
+ * the task 'tsk' during call.
+ */
+
+static int attach_task(struct container *cont, struct task_struct *tsk)
+{
+   int retval = 0;
+   struct container_subsys *ss;
+   struct container *oldcont;
+   struct css_group *cg = &tsk->containers;
+   struct containerfs_root *root = cont->root;
+   int i;
+
+   int subsys_id;
+   get_first_subsys(cont, NULL, &subsys_id);
+
+   /* Nothing to do if the task is already in that container */
+   oldcont = task_container(tsk, subsys_id);
+   if (cont == oldcont)
+   return 0;
+
+   for_each_subsys(root, ss) {
+   if (ss->can_attach) {
+   retval = ss->can_attach(ss, cont, tsk);
+   if (retval) {
+   return retval;
+   }
+   }
+   }
+
+   task_lock(tsk);
+   if (tsk->flags & PF_EXITING) {
+   task_unlock(tsk);
+   return -ESRCH;
+   }
+   /* Update the css_group pointers for the subsystems in this
+* hierarchy */
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+   if (root->subsys_bits & (1ull << i)) {
+   /* Subsystem is in this hierarchy. So we want
+* the subsystem state from the new
+* container. Transfer the refcount from the
+* old to the new */
+   atomic_inc(&cont->count);
+   atomic_dec(&cg->subsys[i]->container->count);
+   rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+   }
+   }
+   task_unlock(tsk);
+
+   for_each_subsys(root, ss) {
+   if (ss->attach) {
+   ss->attach(ss, cont, oldcont, tsk);
+   }
+   }
+
+   synchronize_rcu();
+   return 0;
+}
+
+/*
+ * Attach task with pid 'pid' to container 'cont'. Call with
+ * container_mutex, may take task_lock of task
+ *
+ */
+
+static int attach_task_by_pid(struct container *cont, char *pidbuf)
+{
+   pid_t pid;
+   struct task_struct *tsk;
+   int ret;
+
+   if (sscanf(pidbuf, "%d", &pid) != 1)
+   return -EIO;
+
+   if (pid) {
+   rcu_read_lock();
+   tsk = find_task_by_pid(pid);
+   if (!tsk || tsk->flags & PF_EXITING) {
+   rcu_read_unlock();
+   return -ESRCH;
+   }
+   get_task_struct(tsk);
+   rcu_read_unlock();
+
+   if ((current->euid) && (current->euid != tsk->uid)
+   && (current->euid != tsk->suid)) {
+   put_task_struct(tsk);
+   return -EACCES;
+   }
+   } else {
+   tsk = current;
+   get_task_struct(tsk);
+   }
+
+   ret = attach_task(cont, tsk);
+   put_task_struct(tsk);
+   return ret;
+}
+
 /* The various types of f

[PATCH 06/10] Containers(V10): Add procfs interface

2007-05-29 Thread menage

This patch adds:

/proc/containers - general system info

/proc/*/container - per-task container membership info

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 fs/proc/base.c |7 ++
 kernel/container.c |  128 +
 2 files changed, 135 insertions(+)

Index: container-2.6.22-rc2-mm1/fs/proc/base.c
===
--- container-2.6.22-rc2-mm1.orig/fs/proc/base.c
+++ container-2.6.22-rc2-mm1/fs/proc/base.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2028,6 +2029,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score",  S_IRUGO, oom_score),
REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2319,6 +2323,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
Index: container-2.6.22-rc2-mm1/kernel/container.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -249,6 +249,7 @@ static int container_mkdir(struct inode 
 static int container_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int container_populate_dir(struct container *cont);
 static struct inode_operations container_dir_inode_operations;
+struct file_operations proc_containerstats_operations;
 
 static struct backing_dev_info container_backing_dev_info = {
.ra_pages = 0,  /* No readahead */
@@ -1504,6 +1505,7 @@ int __init container_init(void)
 {
int err;
int i;
+   struct proc_dir_entry *entry;
 
for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
struct container_subsys *ss = subsys[i];
@@ -1515,10 +1517,136 @@ int __init container_init(void)
if (err < 0)
goto out;
 
+   entry = create_proc_entry("containers", 0, NULL);
+   if (entry)
+   entry->proc_fops = &proc_containerstats_operations;
+
 out:
return err;
 }
 
+/*
+ * proc_container_show()
+ *  - Print task's container paths into seq_file, one line for each hierarchy
+ *  - Used for /proc/<pid>/container.
+ *  - No need to task_lock(tsk) on this tsk->container reference, as it
+ *doesn't really matter if tsk->container changes after we read it,
+ *and we take container_mutex, keeping attach_task() from changing it
+ *anyway.  No need to check that tsk->container != NULL, thanks to
+ *the_top_container_hack in container_exit(), which sets an exiting task's
+ *container to top_container.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_container_show(struct seq_file *m, void *v)
+{
+   struct pid *pid;
+   struct task_struct *tsk;
+   char *buf;
+   int retval;
+   struct containerfs_root *root;
+
+   retval = -ENOMEM;
+   buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   retval = -ESRCH;
+   pid = m->private;
+   tsk = get_pid_task(pid, PIDTYPE_PID);
+   if (!tsk)
+   goto out_free;
+
+   retval = 0;
+
+   mutex_lock(&container_mutex);
+
+   for_each_root(root) {
+   struct container_subsys *ss;
+   struct container *cont;
+   int subsys_id;
+   int count = 0;
+   /* Skip this hierarchy if it has no active subsystems */
+   if (!root->subsys_bits) continue;
+   for_each_subsys(root, ss) {
+   seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+   }
+   seq_putc(m, ':');
+   get_first_subsys(&root->top_container, NULL, &subsys_id);
+   cont = task_container(tsk, subsys_id);
+   retval = container_path(cont, buf, PAGE_SIZE);
+   if (retval < 0)
+   goto out_unlock;
+   seq_puts(m, buf);
+   seq_putc(m, '\n');
+   }
+
+out_unlock:
+   mutex_unlock(&container_mutex);
+   put_task_struct(tsk);
+out_free:
+   kfree(buf);
+out:
+   return retval;
+}
+
+static int container_open(struct inode *inode, struct file *file)
+{
+   struct pid *pid = PROC_I(inode)->pid;
+   return single_open(file, proc_container_show, pid);
+}

[PATCH 00/10] Containers(V10): Generic Process Containers

2007-05-29 Thread menage

This is an update to my multi-hierarchy multi-subsystem generic
process containers patch. Changes since V9 (April 27th) include:

- The patchset has been rebased over 2.6.22-rc2-mm1

- A lattice of lists linking tasks to their css_groups and css_groups
to their containers has been added to support more efficient iteration
across the member tasks of a container.

- Support for the cpusets "release agent" functionality has been added
back in; this is based on a workqueue concept similar to the changes
that Cliff Wickman has been pushing for supporting CPU hot-unplug.

- Several uses of tasklist_lock replaced by reliance on RCU

- Misc cleanups

- Tested with a tweaked version of PaulJ's cpuset_test script

Still TODO:

- decide whether "Containers" is an acceptable name for the system
given its usage by some other development groups, or whether something
else (ProcessSets? ResourceGroups? TaskGroups?) would be better. I'm
inclined to leave this political decision to Andrew/Linus once they're
happy with the technical aspects of the patches.

- add a hash-table based lookup for css_group objects.

- use seq_file properly in container tasks files to avoid having to
 allocate a big array for all the container's task pointers.

- lots more testing

- define standards for container file names

--

Generic Process Containers
--

There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
containers, and others.  These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.

Already existing in the kernel is the cpuset subsystem; this has a
process grouping mechanism that is mature, tested, and well documented
(particularly with regards to synchronization rules).

This patchset extracts the process grouping code from cpusets into a
generic container system, and makes the cpusets code a client of the
container system, along with a couple of simple example subsystems.

The patch set is structured as follows:

1) Basic container framework - filesystem and tracking structures

2) Simple CPU Accounting example subsystem

3) Support for the "tasks" control file

4) Hooks for fork() and exit()

5) Support for the container_clone() operation

6) Add /proc reporting interface

7) Make cpusets a container subsystem

8) Share container subsystem pointer arrays between tasks with the
   same assignments

9) Simple container debugging subsystem

10) Support for a userspace "release agent", similar to the cpusets
release agent functionality

The intention is that the various resource management and
virtualization efforts can also become container clients, with the
result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 09/10] Containers(V10): Simple debug info subsystem

2007-05-29 Thread menage

This example subsystem exports debugging information as an aid to
diagnosing refcount leaks, etc, in the container framework.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |4 +
 init/Kconfig |   10 
 kernel/Makefile  |1 
 kernel/container_debug.c |   89 +++
 4 files changed, 104 insertions(+)

Index: container-2.6.22-rc2-mm1/include/linux/container_subsys.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.22-rc2-mm1/include/linux/container_subsys.h
@@ -19,4 +19,8 @@ SUBSYS(cpuset)
 
 /* */
 
+#ifdef CONFIG_CONTAINER_DEBUG
+SUBSYS(debug)
+#endif
+
 /* */
Index: container-2.6.22-rc2-mm1/init/Kconfig
===
--- container-2.6.22-rc2-mm1.orig/init/Kconfig
+++ container-2.6.22-rc2-mm1/init/Kconfig
@@ -306,6 +306,16 @@ config LOG_BUF_SHIFT
 config CONTAINERS
bool
 
+config CONTAINER_DEBUG
+   bool "Example debug container subsystem"
+   select CONTAINERS
+   help
+ This option enables a simple container subsystem that
+ exports useful debugging information about the containers
+ framework
+
+ Say N if unsure
+
 config CPUSETS
bool "Cpuset support"
depends on SMP
Index: container-2.6.22-rc2-mm1/kernel/container_debug.c
===
--- /dev/null
+++ container-2.6.22-rc2-mm1/kernel/container_debug.c
@@ -0,0 +1,89 @@
+/*
+ * kernel/container_debug.c - Example container subsystem that
+ * exposes debug info
+ *
+ * Copyright (C) Google Inc, 2007
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED])
+ *
+ */
+
+#include 
+#include 
+
+static int debug_create(struct container_subsys *ss, struct container *cont)
+{
+   struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+   if (!css)
+   return -ENOMEM;
+   cont->subsys[debug_subsys_id] = css;
+   return 0;
+}
+
+static void debug_destroy(struct container_subsys *ss, struct container *cont)
+{
+   kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 container_refcount_read(struct container *cont, struct cftype *cft)
+{
+   return atomic_read(&cont->count);
+}
+
+static u64 taskcount_read(struct container *cont, struct cftype *cft)
+{
+   u64 count;
+   container_lock();
+   count = container_task_count(cont);
+   container_unlock();
+   return count;
+}
+
+static u64 current_css_group_read(struct container *cont, struct cftype *cft)
+{
+   return (u64)(unsigned long)current->containers; /* 32-bit safe cast */
+}
+
+static u64 current_css_group_refcount_read(struct container *cont,
+  struct cftype *cft)
+{
+   u64 count;
+   rcu_read_lock();
+   count = atomic_read(&current->containers->ref.refcount);
+   rcu_read_unlock();
+   return count;
+}
+
+static struct cftype files[] =  {
+   {
+   .name = "debug.container_refcount",
+   .read_uint = container_refcount_read,
+   },
+   {
+   .name = "debug.taskcount",
+   .read_uint = taskcount_read,
+   },
+
+   {
+   .name = "debug.current_css_group",
+   .read_uint = current_css_group_read,
+   },
+
+   {
+   .name = "debug.current_css_group_refcount",
+   .read_uint = current_css_group_refcount_read,
+   },
+};
+
+static int debug_populate(struct container_subsys *ss, struct container *cont)
+{
+   return container_add_files(cont, files, ARRAY_SIZE(files));
+}
+
+struct container_subsys debug_subsys = {
+   .name = "debug",
+   .create = debug_create,
+   .destroy = debug_destroy,
+   .populate = debug_populate,
+   .subsys_id = debug_subsys_id,
+};
Index: container-2.6.22-rc2-mm1/kernel/Makefile
===
--- container-2.6.22-rc2-mm1.orig/kernel/Makefile
+++ container-2.6.22-rc2-mm1/kernel/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
+obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o

--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 02/10] Containers(V10): Example CPU accounting subsystem

2007-05-29 Thread menage

This example demonstrates how to use the generic container subsystem
for a simple resource tracker that counts, for the processes in a
container, the total CPU time used and the %CPU used in the last
complete 10 second interval.

Portions contributed by Balbir Singh <[EMAIL PROTECTED]>

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |6 +
 include/linux/cpu_acct.h |   14 ++
 init/Kconfig |7 +
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  185 +++
 kernel/sched.c   |   14 ++
 6 files changed, 224 insertions(+), 3 deletions(-)

Index: container-2.6.22-rc2-mm1/include/linux/container_subsys.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.22-rc2-mm1/include/linux/container_subsys.h
@@ -7,4 +7,10 @@
 
 /* */
 
+#ifdef CONFIG_CONTAINER_CPUACCT
+SUBSYS(cpuacct)
+#endif
+
+/* */
+
 /* */
Index: container-2.6.22-rc2-mm1/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.22-rc2-mm1/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
Index: container-2.6.22-rc2-mm1/init/Kconfig
===
--- container-2.6.22-rc2-mm1.orig/init/Kconfig
+++ container-2.6.22-rc2-mm1/init/Kconfig
@@ -337,6 +337,13 @@ config SYSFS_DEPRECATED
  If you are using a distro that was released in 2006 or later,
  it should be safe to say N here.
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   select CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.22-rc2-mm1/kernel/Makefile
===
--- container-2.6.22-rc2-mm1.orig/kernel/Makefile
+++ container-2.6.22-rc2-mm1/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.22-rc2-mm1/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.22-rc2-mm1/kernel/cpu_acct.c
@@ -0,0 +1,185 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh
+ * ([EMAIL PROTECTED])
+ *
+ */
+
+/*
+ * Example container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   /* total time used by this class */
+   cputime64_t time;
+
+   /* time when next load calculation occurs */
+   u64 next_interval_check;
+
+   /* time used in current period */
+   cputime64_t current_interval_time;
+
+   /* time used in last period */
+   cputime64_t last_interval_time;
+};
+
+struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_of(task_subsys_state(task, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now) {
+   /* calculate the next interval boundary beyond the
+* current time */
+   do_div(now, INTERVAL);
+   return (now + 1) * INTERVAL;
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+   if (!ca)
+   return -ENOMEM;
+   spin_lock_init(&ca->lock);
+   ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+   cont->subsys[cpuacct_subsys_id] = &ca->css;
+   return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys

[PATCH 10/10] Containers(V10): Support for automatic userspace release agents

2007-05-29 Thread menage

This patch adds the following files to the container filesystem:

notify_on_release - configures/reports whether the container subsystem
should attempt to run a release script when this container becomes
unused

release_agent - configures/reports the release agent to be used for
this hierarchy (top level in each hierarchy only)

releasable - reports whether this container would have been
auto-released if notify_on_release was true and a release agent was
configured (mainly useful for debugging)

To avoid locking issues, invoking the userspace release agent is done
via a workqueue task; containers that need to have their release
agents invoked by the workqueue task are linked on to a list.

When the "cpuset" filesystem is mounted, it automatically sets the
hierarchy's release agent to be /sbin/cpuset_release_agent for
backward-compatibility with existing cpusets users.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
---
 include/linux/container.h |   15 +
 kernel/container.c|  364 ++
 kernel/cpuset.c   |5 
 3 files changed, 348 insertions(+), 36 deletions(-)

Index: container-2.6.22-rc2-mm1/include/linux/container.h
===
--- container-2.6.22-rc2-mm1.orig/include/linux/container.h
+++ container-2.6.22-rc2-mm1/include/linux/container.h
@@ -64,11 +64,7 @@ static inline void css_get(struct contai
  * css_put() should be called to release a reference taken by
  * css_get()
  */
-
-static inline void css_put(struct container_subsys_state *css)
-{
-   atomic_dec(&css->refcnt);
-}
+void css_put(struct container_subsys_state *css);
 
 struct container {
unsigned long flags;/* "unsigned long" so bitops work */
@@ -99,6 +95,13 @@ struct container {
 * tasks in this container. Protected by css_group_lock
 */
struct list_head css_groups;
+
+   /*
+* Linked list running through all containers that can
+* potentially be reaped by the release agent. Protected by
+* container_mutex
+*/
+   struct list_head release_list;
 };
 
 /* A css_group is a structure holding pointers to a set of
@@ -271,6 +274,8 @@ struct task_struct *container_iter_next(
struct container_iter *it);
 void container_iter_end(struct container *cont, struct container_iter *it);
 
+void container_set_release_agent_path(struct container_subsys *ss,
+ const char *path);
 
 #else /* !CONFIG_CONTAINERS */
 
Index: container-2.6.22-rc2-mm1/kernel/container.c
===
--- container-2.6.22-rc2-mm1.orig/kernel/container.c
+++ container-2.6.22-rc2-mm1/kernel/container.c
@@ -62,6 +62,8 @@
 
 #define CONTAINER_SUPER_MAGIC  0x27e0eb
 
+static DEFINE_MUTEX(container_mutex);
+
 /* Generate an array of container subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -89,6 +91,13 @@ struct containerfs_root {
 
/* A list running through the mounted hierarchies */
struct list_head root_list;
+
+   /* The path to use for release notifications. No locking
+* between setting and use - so if userspace updates this
+* while subcontainers exist, you could miss a
+* notification. We ensure that it's always a valid
+* NUL-terminated string */
+   char release_agent_path[PATH_MAX];
 };
 
 
@@ -115,7 +124,13 @@ static int need_forkexit_callback = 0;
 
 /* bits in struct container flags field */
 typedef enum {
+   /* Container is dead */
CONT_REMOVED,
+   /* Container has previously had a child container or a task,
+* but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+   CONT_RELEASABLE,
+   /* Container requires release notifications to userspace */
+   CONT_NOTIFY_ON_RELEASE,
 } container_flagbits_t;
 
 /* convenient tests for these bits */
@@ -124,6 +139,19 @@ inline int container_is_removed(const st
return test_bit(CONT_REMOVED, &cont->flags);
 }
 
+inline int container_is_releasable(const struct container *cont)
+{
+   const int bits =
+   (1 << CONT_RELEASABLE) |
+   (1 << CONT_NOTIFY_ON_RELEASE);
+   return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct container *cont)
+{
+   return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /* for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy */
 #define for_each_subsys(_root, _ss) \
@@ -133,6 +161,12 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of containers eligible for automatic release */
+static LIST_HEAD(release_list);
+static void container_release_agent(struct wor

[PATCH 0/7] Containers (V8): Generic Process Containers

2007-04-06 Thread menage

--

This is an update to my multi-hierarchy multi-subsystem generic
process containers patch. Changes since V7 (12th Feb) include:

- Removed the config-time choice of the number of supported
hierarchies - this is now completely dynamic; new hierarchies are
allocated on demand, and freed when no longer in use.

- Subsystems are now registered at compile-time in
linux/container_subsys.h. This allows for faster access to subsystem
state since the id is a compile-time constant, so there's only a
single extra pointer dereference compared to having a pointer directly
in the task_struct. It also avoids wasting space with unused subsystem
pointers.

- Removed the container pointers from container_group - this results
in a structure very similar to Srivatsa Vaddagiri's rcfs
approach. (RCFS uses the nsproxy object rather than the
container_group object; merging container_group and nsproxy would be
pretty straightforward if desired).

- Removed callback_mutex from container subsystem to be purely back in
the cpuset subsystem. Renamed manage_mutex to container_mutex.

- Condensed post_attach_task() into attach_task() now that
callback_mutex is purely within cpuset.c

- Simplified the container_subsys_state reference counting - stricter
rules on liveness make adding reference counts cheaper.

Still TODO:

- decide whether "Containers" is an acceptable name for the system
given its usage by some other development groups, or whether something
else (ProcessSets? ResourceGroups?) would be better 

- decide whether merging container_group and nsproxy is desirable

- add a hash-table based lookup for container_group objects.

- use seq_file properly in container tasks files (and also in
cpuset_attach_task) to avoid having to allocate a big array for all
the container's task pointers.

- add back support for the "release agent" functionality

- lots more testing

- define standards for container file names

Generic Process Containers
--

There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
containers, and others.  These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.

Already existing in the kernel is the cpuset subsystem; this has a
process grouping mechanism that is mature, tested, and well documented
(particularly with regards to synchronization rules).

This patchset extracts the process grouping code from cpusets into a
generic container system, and makes the cpusets code a client of
the container system.

It also provides several example clients of the container system,
including ResGroups, BeanCounters and namespace proxy.

The change is implemented in three implementation patches, plus four example
subsystems that aren't necessarily intended to be merged as part of
this patch set, but demonstrate the applicability of the framework.

1) extract the process grouping code from cpusets into a standalone system

2) remove the process grouping code from cpusets and hook into the
  container system

3) convert the container system to present a generic multi-hierarchy
  API, and make cpusets a client of that API

4) example of a simple CPU accounting container subsystem. Useful as a
  boilerplate for people implementing their own subsystems.

5) example of implementing ResGroups and its numtasks controller over
  generic containers

6) example of implementing BeanCounters and its numfiles counter over
  generic containers

7) example of integrating the namespace isolation code (sys_unshare()
  or various clone flags) with generic containers, allowing virtual
  servers to take advantage of other resource control efforts.

The intention is that the various resource management and
virtualization efforts can also become container clients, with the
result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 6/7] Containers (V8): BeanCounters over generic process containers

2007-04-06 Thread menage

;
+
+static int bc_populate(struct container_subsys *ss, struct container *cont)
+{
+   int err;
+   int attr, res;
+   for (res = 0; res < BC_RESOURCES; res++) {
+   struct bc_resource *bcr = bc_resources[res];
+
+   for (attr = 0; attr < BC_ATTRS; attr++) {
+   struct cftype *cft = &bcr->cft_attrs[attr];
+   if (!cft->name[0]) continue;
+   err = container_add_file(cont, cft);
+   if (err < 0) return err;
+   }
+   }
+   return 0;
+}
+
+struct container_subsys bc_subsys = {
+   .name = "bc",
+   .create = bc_create,
+   .destroy = bc_destroy,
+   .populate = bc_populate,
+   .subsys_id = bc_subsys_id,
+   .early_init = 1,
+};
+
+EXPORT_SYMBOL(bc_resources);
+EXPORT_SYMBOL(init_bc);
+EXPORT_SYMBOL(bc_change_param);
Index: container-2.6.20-new/include/bc/misc.h
===
--- /dev/null
+++ container-2.6.20-new/include/bc/misc.h
@@ -0,0 +1,27 @@
+/*
+ * include/bc/misc.h
+ *
+ * Copyright (C) 2006 OpenVZ SWsoft Inc
+ *
+ */
+
+#ifndef __BC_MISC_H__
+#define __BC_MISC_H__
+
+struct file;
+
+#ifdef CONFIG_BEANCOUNTERS
+int __must_check bc_file_charge(struct file *);
+void bc_file_uncharge(struct file *);
+#else
+static inline int __must_check bc_file_charge(struct file *f)
+{
+   return 0;
+}
+
+static inline void bc_file_uncharge(struct file *f)
+{
+}
+#endif
+
+#endif
Index: container-2.6.20-new/kernel/bc/misc.c
===
--- /dev/null
+++ container-2.6.20-new/kernel/bc/misc.c
@@ -0,0 +1,57 @@
+
+#include 
+#include 
+#include 
+
+int bc_file_charge(struct file *file)
+{
+   int sev;
+   struct beancounter *bc;
+
+   rcu_read_lock();
+   bc = get_exec_bc();
+   css_get(&bc->css);
+   rcu_read_unlock();
+
+   sev = (capable(CAP_SYS_ADMIN) ? BC_LIMIT : BC_BARRIER);
+
+   if (bc_charge(bc, BC_NUMFILES, 1, sev)) {
+   css_put(&bc->css);
+   return -EMFILE;
+   }
+
+   file->f_bc = bc;
+   return 0;
+}
+
+void bc_file_uncharge(struct file *file)
+{
+   struct beancounter *bc;
+
+   bc = file->f_bc;
+   bc_uncharge(bc, BC_NUMFILES, 1);
+   css_put(&bc->css);
+}
+
+#define BC_NUMFILES_BARRIER 256
+#define BC_NUMFILES_LIMIT  512
+
+static int bc_files_init(struct beancounter *bc, int i)
+{
+   bc_init_resource(&bc->bc_parms[BC_NUMFILES],
+   BC_NUMFILES_BARRIER, BC_NUMFILES_LIMIT);
+   return 0;
+}
+
+static struct bc_resource bc_files_resource = {
+   .bcr_name = "numfiles",
+   .bcr_init = bc_files_init,
+};
+
+static int __init bc_misc_init_resource(void)
+{
+   bc_register_resource(BC_NUMFILES, &bc_files_resource);
+   return 0;
+}
+
+__initcall(bc_misc_init_resource);
Index: container-2.6.20-new/fs/file_table.c
===
--- container-2.6.20-new.orig/fs/file_table.c
+++ container-2.6.20-new/fs/file_table.c
@@ -22,6 +22,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 
 /* sysctl tunables... */
@@ -43,6 +45,7 @@ static inline void file_free_rcu(struct 
 static inline void file_free(struct file *f)
 {
percpu_counter_dec(&nr_files);
+   bc_file_uncharge(f);
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -107,8 +110,10 @@ struct file *get_empty_filp(void)
if (f == NULL)
goto fail;
 
-   percpu_counter_inc(&nr_files);
memset(f, 0, sizeof(*f));
+   if (bc_file_charge(f))
+   goto fail_charge;
+   percpu_counter_inc(&nr_files);
if (security_file_alloc(f))
goto fail_sec;
 
@@ -135,6 +140,10 @@ fail_sec:
file_free(f);
 fail:
return NULL;
+
+ fail_charge:
+   kmem_cache_free(filp_cachep, f);
+   return NULL;
 }
 
 EXPORT_SYMBOL(get_empty_filp);
Index: container-2.6.20-new/include/linux/fs.h
===
--- container-2.6.20-new.orig/include/linux/fs.h
+++ container-2.6.20-new/include/linux/fs.h
@@ -739,6 +739,9 @@ struct file {
spinlock_t  f_ep_lock;
 #endif /* #ifdef CONFIG_EPOLL */
struct address_space*f_mapping;
+#ifdef CONFIG_BEANCOUNTERS
+   struct beancounter  *f_bc;
+#endif
 };
 extern spinlock_t files_lock;
 #define file_list_lock() spin_lock(&files_lock);
Index: container-2.6.20-new/include/bc/task.h
===
--- /dev/null
+++ container-2.6.20-new/include/bc/task.h
@@ -0,0 +1,64 @@
+/*
+ * include/bc/task.h
+ *
+ * Copyright (C) 2007 OpenVZ SWsoft Inc
+ * Adapted by Paul Menage <[EMAIL PROTECTED]> for generic containers
+ *
+ */
+
+#ifndef __

[PATCH 4/7] Containers (V8): Simple CPU accounting container subsystem

2007-04-06 Thread menage

This example demonstrates how to use the generic container subsystem
for a simple resource tracker that counts, for the processes in a
container, the total CPU time used and the %CPU used in the last
complete 10 second interval.

Portions contributed by Balbir Singh <[EMAIL PROTECTED]>

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |6 +
 include/linux/cpu_acct.h |   14 ++
 init/Kconfig |7 +
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  204 +++
 kernel/sched.c   |   14 ++
 6 files changed, 243 insertions(+), 3 deletions(-)

Index: container-2.6.20-new/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.20-new/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
Index: container-2.6.20-new/init/Kconfig
===
--- container-2.6.20-new.orig/init/Kconfig
+++ container-2.6.20-new/init/Kconfig
@@ -278,6 +278,13 @@ config PROC_PID_CPUSET
depends on CPUSETS
default y
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   select CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.20-new/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.20-new/kernel/cpu_acct.c
@@ -0,0 +1,204 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh
+ * ([EMAIL PROTECTED])
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   /* total time used by this class */
+   cputime64_t time;
+
+   /* time when next load calculation occurs */
+   u64 next_interval_check;
+
+   /* time used in current period */
+   cputime64_t current_interval_time;
+
+   /* time used in last period */
+   cputime64_t last_interval_time;
+};
+
+struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_ca(task_container(task, cpuacct_subsys_id));
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now) {
+   /* calculate the next interval boundary beyond the
+* current time */
+   do_div(now, INTERVAL);
+   return (now + 1) * INTERVAL;
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+   if (!ca)
+   return -ENOMEM;
+   spin_lock_init(&ca->lock);
+   ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+   cont->subsys[cpuacct_subsys.subsys_id] = &ca->css;
+   return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys *ss,
+   struct container *cont)
+{
+   kfree(container_ca(cont));
+}
+
+/* Lazily update the load calculation if necessary. Called with ca locked */
+static void cpuusage_update(struct cpuacct *ca)
+{
+   u64 now = get_jiffies_64();
+   /* If we're not due for an update, return */
+   if (ca->next_interval_check > now)
+   return;
+
+   if (ca->next_interval_check <= (now - INTERVAL)) {
+   /* If it's been more than an interval since the last
+* check, then catch up - the last interval must have
+* been zero load */
+   ca->last_interval_time = 0;
+   ca->next_interval_check = next_interval_boundary(now);
+   } else {
+   /* If a steal takes the last interval time negative,
+* then we just ignore it */
+   if ((s64)ca->current_interval_time > 0) {
+   ca->last_interval_time = ca->current_interval_time;
+   }

[PATCH 1/7] Containers (V8): Generic container system abstracted from cpusets code

2007-04-06 Thread menage

This patch creates a generic process container system based on (and
parallel to) the cpusets code.  At a coarse level it was created by
copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any
code that was cpuset-specific rather than applicable to any process
container subsystem.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 Documentation/containers.txt |  229 +++
 fs/proc/base.c   |7 
 include/linux/container.h|   96 +++
 include/linux/sched.h|5 
 init/Kconfig |9 
 init/main.c  |3 
 kernel/Makefile  |1 
 kernel/container.c   | 1260 +++
 kernel/exit.c|2 
 kernel/fork.c|3 
 10 files changed, 1614 insertions(+), 1 deletion(-)

Index: container-2.6.20-new/fs/proc/base.c
===
--- container-2.6.20-new.orig/fs/proc/base.c
+++ container-2.6.20-new/fs/proc/base.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1870,6 +1871,9 @@ static struct pid_entry tgid_base_stuff[
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score",  S_IRUGO, oom_score),
REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2151,6 +2155,9 @@ static struct pid_entry tid_base_stuff[]
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
Index: container-2.6.20-new/include/linux/container.h
===
--- /dev/null
+++ container-2.6.20-new/include/linux/container.h
@@ -0,0 +1,96 @@
+#ifndef _LINUX_CONTAINER_H
+#define _LINUX_CONTAINER_H
+/*
+ *  container interface
+ *
+ *  Copyright (C) 2003 BULL SA
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ */
+
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINERS
+
+extern int number_of_containers;   /* How many containers are defined in system? */
+
+extern int container_init_early(void);
+extern int container_init(void);
+extern void container_init_smp(void);
+extern void container_fork(struct task_struct *p);
+extern void container_exit(struct task_struct *p);
+
+extern struct file_operations proc_container_operations;
+
+extern void container_lock(void);
+extern void container_unlock(void);
+
+extern void container_manage_lock(void);
+extern void container_manage_unlock(void);
+
+struct container {
+   unsigned long flags;/* "unsigned long" so bitops work */
+
+   /*
+* Count is atomic so can incr (fork) or decr (exit) without a lock.
+*/
+   atomic_t count; /* count tasks using this container */
+
+   /*
+* We link our 'sibling' struct into our parent's 'children'.
+* Our children link their 'sibling' into our 'children'.
+*/
+   struct list_head sibling;   /* my parent's children */
+   struct list_head children;  /* my children */
+
+   struct container *parent;   /* my parent */
+   struct dentry *dentry;  /* container fs entry */
+};
+
+/* struct cftype:
+ *
+ * The files in the container filesystem mostly have a very simple read/write
+ * handling, some common function will take care of it. Nevertheless some cases
+ * (read tasks) are special and therefore I define this structure for every
+ * kind of file.
+ *
+ *
+ * When reading/writing to a file:
+ * - the container to use in file->f_dentry->d_parent->d_fsdata
+ * - the 'cftype' of the file is file->f_dentry->d_fsdata
+ */
+
+struct inode;
+struct cftype {
+   char *name;
+   int private;
+   int (*open) (struct inode *inode, struct file *file);
+   ssize_t (*read) (struct container *cont, struct cftype *cft,
+struct file *file,
+char __user *buf, size_t nbytes, loff_t *ppos);
+   ssize_t (*write) (struct container *cont, struct cftype *cft,
+ struct file *file,
+ const char __user *buf, size_t nbytes, loff_t *ppos);
+   int (*release) (struct inode *inode, struct file *file);
+};
+
+int container_add_file(struct container *cont, const struct cftype *cft);
+
+int container_is_removed(const struct container *cont);
+
+#else /* !CONFIG_CONTAINERS */
+
+static inline int container_init_early(void) { return 0; }
+static inline int container_init

[PATCH 7/7] Containers (V8): Container interface to nsproxy subsystem

2007-04-06 Thread menage

This is intended as a simple illustration of how a virtual server
system could be integrated with generic containers, and hence take
advantage of other resource-control efforts. A real implementation
would probably allow parameters such as configuring what kinds of
namespace creations triggered new containers, etc.

When a task enters a new namespace via a clone() or unshare(), a new
container is created and the task moves into it. Developed by Serge
Hallyn <[EMAIL PROTECTED]>, adapted by Paul Menage <[EMAIL PROTECTED]>

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |6 ++
 include/linux/nsproxy.h  |6 ++
 init/Kconfig |9 +++
 kernel/Makefile  |1 
 kernel/fork.c|4 +
 kernel/ns_container.c|   99 +++
 kernel/nsproxy.c |6 ++
 7 files changed, 131 insertions(+)

Index: container-2.6.20-new/include/linux/nsproxy.h
===
--- container-2.6.20-new.orig/include/linux/nsproxy.h
+++ container-2.6.20-new/include/linux/nsproxy.h
@@ -53,4 +53,10 @@ static inline void exit_task_namespaces(
put_nsproxy(ns);
}
 }
+#ifdef CONFIG_CONTAINER_NS
+int ns_container_clone(struct task_struct *tsk);
+#else
+static inline int ns_container_clone(struct task_struct *tsk) { return 0; }
+#endif
+
 #endif
Index: container-2.6.20-new/kernel/Makefile
===
--- container-2.6.20-new.orig/kernel/Makefile
+++ container-2.6.20-new/kernel/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
+obj-$(CONFIG_CONTAINER_NS) += ns_container.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.20-new/kernel/fork.c
===
--- container-2.6.20-new.orig/kernel/fork.c
+++ container-2.6.20-new/kernel/fork.c
@@ -1668,6 +1668,9 @@ asmlinkage long sys_unshare(unsigned lon
err = -ENOMEM;
goto bad_unshare_cleanup_ipc;
}
+   err = ns_container_clone(current);
+   if (err)
+   goto bad_unshare_cleanup_dupns;
}
 
if (new_fs || new_ns || new_mm || new_fd || new_ulist ||
@@ -1722,6 +1725,7 @@ asmlinkage long sys_unshare(unsigned lon
task_unlock(current);
}
 
+ bad_unshare_cleanup_dupns:
if (new_nsproxy)
put_nsproxy(new_nsproxy);
 
Index: container-2.6.20-new/kernel/ns_container.c
===
--- /dev/null
+++ container-2.6.20-new/kernel/ns_container.c
@@ -0,0 +1,99 @@
+/*
+ * ns_container.c - namespace container subsystem
+ *
+ * Copyright IBM, 2006
+ */
+
+#include 
+#include 
+#include 
+
+struct nscont {
+   struct container_subsys_state css;
+   spinlock_t lock;
+};
+
+struct container_subsys ns_subsys;
+
+static inline struct nscont *container_nscont(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, ns_subsys_id),
+   struct nscont, css);
+}
+
+int ns_container_clone(struct task_struct *tsk)
+{
+   return container_clone(tsk, &ns_subsys);
+}
+
+/*
+ * Rules:
+ *   1. you can only enter a container which is a child of your current
+ * container
+ *   2. you can only place another process into a container if
+ * a. you have CAP_SYS_ADMIN
+ * b. your container is an ancestor of tsk's destination container
+ *   (hence either you are in the same container as tsk, or in an
+ *ancestor container thereof)
+ */
+int ns_can_attach(struct container_subsys *ss,
+ struct container *cont, struct task_struct *tsk)
+{
+   struct container *c;
+
+   if (current != tsk) {
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   if (!container_is_descendant(cont))
+   return -EPERM;
+   }
+
+   if (container_task_count(cont) != 0)
+   return -EPERM;
+
+   c = task_container(tsk, ns_subsys_id);
+   if (c && c != cont->parent)
+   return -EPERM;
+
+   return 0;
+}
+
+/*
+ * Rules: you can only create a container if
+ * 1. you are capable(CAP_SYS_ADMIN)
+ * 2. the target container is a descendant of your own container
+ */
+static int ns_create(struct container_subsys *ss, struct container *cont)
+{
+   struct nscont *ns;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+   if (cont->parent && !container_is_descendant(cont))
+

[PATCH 5/7] Containers (V8): Resource Groups over generic containers

2007-04-06 Thread menage

This patch provides the RG core and numtasks controller as container
subsystems, intended as an example of how to implement a more complex
resource control system over generic process containers. The changes
to the core involve primarily removing the group management, task
membership and configfs support and adding interface layers to talk to
the generic container layer instead.

Each resource controller becomes an independent container subsystem;
the RG core is essentially a library that the resource controllers can
use to provide the RG API to userspace. Rather than a single shares
and stats file in each group, there's a <controller>_shares and
a <controller>_stats file, each linked to the appropriate resource
controller.

 include/linux/container_subsys.h |6 
 include/linux/moduleparam.h  |   12 -
 include/linux/numtasks.h |   28 ++
 include/linux/res_group.h|   86 +++
 include/linux/res_group_rc.h |  125 ++
 init/Kconfig |   22 +
 kernel/Makefile  |1 
 kernel/fork.c|7 
 kernel/res_group/Makefile|2 
 kernel/res_group/local.h |   38 +++
 kernel/res_group/numtasks.c  |  451 +++
 kernel/res_group/res_group.c |  135 +++
 kernel/res_group/rgcs.c  |  302 ++
 kernel/res_group/shares.c|  228 +++
 14 files changed, 1439 insertions(+), 4 deletions(-)

Index: container-2.6.20-new/include/linux/moduleparam.h
===
--- container-2.6.20-new.orig/include/linux/moduleparam.h
+++ container-2.6.20-new/include/linux/moduleparam.h
@@ -78,11 +78,17 @@ struct kparam_array
 /* Helper functions: type is byte, short, ushort, int, uint, long,
ulong, charp, bool or invbool, or XXX if you define param_get_XXX,
param_set_XXX and param_check_XXX. */
-#define module_param_named(name, value, type, perm)   \
-   param_check_##type(name, &(value));\
-   module_param_call(name, param_set_##type, param_get_##type, &value, 
perm); \
+#define module_param_named_call(name, value, type, set, perm)  \
+   param_check_##type(name, &(value)); \
+   module_param_call(name, set, param_get_##type, &(value), perm); \
__MODULE_PARM_TYPE(name, #type)
 
+#define module_param_named(name, value, type, perm)   \
+   module_param_named_call(name, value, type, param_set_##type, perm)
+
+#define module_param_set_call(name, type, setfn, perm) \
+   module_param_named_call(name, name, type, setfn, perm)
+
 #define module_param(name, type, perm) \
module_param_named(name, name, type, perm)
 
Index: container-2.6.20-new/include/linux/numtasks.h
===
--- /dev/null
+++ container-2.6.20-new/include/linux/numtasks.h
@@ -0,0 +1,28 @@
+/* numtasks.h - No. of tasks resource controller for Resource Groups
+ *
+ * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides No. of tasks resource controller for Resource Groups
+ *
+ * Latest version, more details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+#ifndef _LINUX_NUMTASKS_H
+#define _LINUX_NUMTASKS_H
+
+#ifdef CONFIG_RES_GROUPS_NUMTASKS
+#include 
+
+extern int numtasks_allow_fork(struct task_struct *);
+
+#else /* CONFIG_RES_GROUPS_NUMTASKS */
+
+#define numtasks_allow_fork(task) (0)
+
+#endif /* CONFIG_RES_GROUPS_NUMTASKS */
+#endif /* _LINUX_NUMTASKS_H */
Index: container-2.6.20-new/include/linux/res_group.h
===
--- /dev/null
+++ container-2.6.20-new/include/linux/res_group.h
@@ -0,0 +1,86 @@
+/*
+ *  res_group.h - Header file to be used by Resource Groups
+ *
+ * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004
+ * (C) Shailabh Nagar,  IBM Corp. 2003, 2004
+ * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005
+ *
+ * Provides data structures, macros and kernel APIs
+ *
+ * More details at http://ckrm.sf.net
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#ifndef _LINUX_RES_GROUP_H
+#define _LINUX_RES_GROUP_H
+
+#ifdef CONFIG_RES_GROUPS
+#include 
+#include 
+#include 
+#include 
+
+#define SHARE_UNCHANGED(-1)/* implicitly specified by userspace,
+* never stored in a resource group'
+

[PATCH 5/9] Containers (V9): Add container_clone() interface

2007-04-27 Thread menage

This patch adds support for container_clone(), a speculative interface
for creating new containers, intended for use by systems such as
namespace unsharing.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |2 
 kernel/container.c|  121 ++
 2 files changed, 123 insertions(+)

Index: container-2.6.21-rc7-mm1/kernel/container.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/container.c
+++ container-2.6.21-rc7-mm1/kernel/container.c
@@ -1619,3 +1619,124 @@ void container_exit(struct task_struct *
tsk->containers = init_task.containers;
task_unlock(tsk);
 }
+
+static atomic_t namecnt;
+static void get_unused_name(char *buf) {
+   sprintf(buf, "node%d", atomic_inc_return(&namecnt));
+}
+
+/**
+ * container_clone - duplicate the current container in the hierarchy
+ * that the given subsystem is attached to, and move this task into
+ * the new child
+ */
+int container_clone(struct task_struct *tsk, struct container_subsys *subsys)
+{
+   struct dentry *dentry;
+   int ret = 0;
+   char nodename[32];
+   struct container *parent, *child;
+   struct inode *inode;
+   struct css_group *cg;
+   struct containerfs_root *root;
+
+   /* We shouldn't be called by an unregistered subsystem */
+   BUG_ON(!subsys->active);
+
+   /* First figure out what hierarchy and container we're dealing
+* with, and pin them so we can drop container_mutex */
+   mutex_lock(&container_mutex);
+ again:
+   root = subsys->root;
+   if (root == &rootnode) {
+   printk(KERN_INFO
+  "Not cloning container for unused subsystem %s\n",
+  subsys->name);
+   mutex_unlock(&container_mutex);
+   return 0;
+   }
+   cg = &tsk->containers;
+   parent = task_container(tsk, subsys->subsys_id);
+   /* Pin the hierarchy */
+   atomic_inc(&parent->root->sb->s_active);
+
+   mutex_unlock(&container_mutex);
+
+   /* Now do the VFS work to create a container */
+   get_unused_name(nodename);
+   inode = parent->dentry->d_inode;
+
+   /* Hold the parent directory mutex across this operation to
+* stop anyone else deleting the new container */
+   mutex_lock(&inode->i_mutex);
+   dentry = container_get_dentry(parent->dentry, nodename);
+   if (IS_ERR(dentry)) {
+   printk(KERN_INFO
+  "Couldn't allocate dentry for %s: %ld\n", nodename,
+  PTR_ERR(dentry));
+   ret = PTR_ERR(dentry);
+   goto out_release;
+   }
+
+   /* Create the container directory, which also creates the container */
+   ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+   child = __d_cont(dentry);
+   dput(dentry);
+   if (ret) {
+   printk(KERN_INFO
+  "Failed to create container %s: %d\n", nodename,
+  ret);
+   goto out_release;
+   }
+
+   if (!child) {
+   printk(KERN_INFO
+  "Couldn't find new container %s\n", nodename);
+   ret = -ENOMEM;
+   goto out_release;
+   }
+
+   /* The container now exists. Retake container_mutex and check
+* that we're still in the same state that we thought we
+* were. */
+   mutex_lock(&container_mutex);
+   if ((root != subsys->root) ||
+   (parent != task_container(tsk, subsys->subsys_id))) {
+   /* Aargh, we raced ... */
+   mutex_unlock(&inode->i_mutex);
+
+   deactivate_super(parent->root->sb);
+   /* The container is still accessible in the VFS, but
+* we're not going to try to rmdir() it at this
+* point. */
+   printk(KERN_INFO
+  "Race in container_clone() - leaking container %s\n",
+  nodename);
+   goto again;
+   }
+
+   /* All seems fine. Finish by moving the task into the new container */
+   ret = attach_task(child, tsk);
+   mutex_unlock(&container_mutex);
+
+ out_release:
+   mutex_unlock(&inode->i_mutex);
+   deactivate_super(parent->root->sb);
+   return ret;
+}
+
+/* See if "cont" is a descendant of the current task's container in
+ * the appropriate hierarchy */
+
+int container_is_descendant(const struct container *cont) {
+   int ret;
+   struct container *target;
+   int subsys_id;
+   get_first_subsys(cont, NULL, &subsys_id);
+   target = task_container(current, subsys_i

[PATCH 0/9] Containers (V9): Generic Process Containers

2007-04-27 Thread menage

--

This is an update to my multi-hierarchy multi-subsystem generic
process containers patch. Changes since V8 (April 6th) include:

- The patchset has been rebased over 2.6.21-rc7-mm1

- The patchset has been restructured based on feedback; more
  functionality is now split out into separate patches where
  practical.

- The container_group structure has been renamed css_group since this
  is more descriptive of its true function

- Added a simplified file registration interface, and a simple
  interface for the common operation of returning a single number to
  userspace from a container control file

- Added a simple "debug" subsystem that is both an example of how to
  use the container system and a useful debugging tool for checking
  reference counts, etc.

Still TODO:

- decide whether "Containers" is an acceptable name for the system
given its usage by some other development groups, or whether something
else (ProcessSets? ResourceGroups? TaskGroups?) would be better

- decide whether merging css_group and nsproxy is desirable

- add a hash-table based lookup for css_group objects.

- use seq_file properly in container tasks files to avoid having to
  allocate a big array for all the container's task pointers.

- add back support for the "release agent" functionality

- lots more testing

- define standards for container file names

Generic Process Containers
--

There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
containers, and others.  These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.

Already existing in the kernel is the cpuset subsystem; this has a
process grouping mechanism that is mature, tested, and well documented
(particularly with regards to synchronization rules).

This patchset extracts the process grouping code from cpusets into a
generic container system, and makes the cpusets code a client of the
container system, along with a couple of simple example subsystems.

The patch set is structured as follows:

1) Basic container framework - filesystem and tracking structures

2) Simple CPU Accounting example subsystem

3) Support for the "tasks" control file

4) Hooks for fork() and exit()

5) Support for the container_clone() operation

6) Add /proc reporting interface

7) Make cpusets a container subsystem

8) Share container subsystem pointer arrays between tasks with the
   same assignments

9) Simple container debugging subsystem

The intention is that the various resource management and
virtualization efforts can also become container clients, with the
result that:

- the userspace APIs are (somewhat) normalised

- it's easier to test out e.g. the ResGroups CPU controller in
 conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.

- the additional kernel footprint of any of the competing resource
 management systems is substantially reduced, since it doesn't need
 to provide process grouping/containment, hence improving their
 chances of getting into the kernel

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 3/9] Containers (V9): Add tasks file interface

2007-04-27 Thread menage

This patch adds the per-directory "tasks" file for containerfs mounts;
this allows the user to determine which tasks are members of a
container by reading a container's "tasks", and to move a task into a
container by writing its pid to its "tasks".

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |2 
 kernel/container.c|  344 ++
 2 files changed, 346 insertions(+)

Index: container-2.6.21-rc7-mm1/include/linux/container.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/container.h
+++ container-2.6.21-rc7-mm1/include/linux/container.h
@@ -128,6 +128,8 @@ int container_is_removed(const struct co
 
 int container_path(const struct container *cont, char *buf, int buflen);
 
+int container_task_count(const struct container *cont);
+
 /* Return true if the container is a descendant of the current container */
 int container_is_descendant(const struct container *cont);
 
Index: container-2.6.21-rc7-mm1/kernel/container.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/container.c
+++ container-2.6.21-rc7-mm1/kernel/container.c
@@ -676,6 +676,111 @@ static inline void get_first_subsys(cons
*subsys_id = test_ss->subsys_id;
 }
 
+/*
+ * Attach task 'tsk' to container 'cont'
+ *
+ * Call holding container_mutex.  May take task_lock of
+ * the task 'tsk' during call.
+ */
+
+static int attach_task(struct container *cont, struct task_struct *tsk)
+{
+   int retval = 0;
+   struct container_subsys *ss;
+   struct container *oldcont;
+   struct css_group *cg = &tsk->containers;
+   struct containerfs_root *root = cont->root;
+   int i;
+
+   int subsys_id;
+   get_first_subsys(cont, NULL, &subsys_id);
+
+   /* Nothing to do if the task is already in that container */
+   oldcont = task_container(tsk, subsys_id);
+   if (cont == oldcont)
+   return 0;
+
+   for_each_subsys(root, ss) {
+   if (ss->can_attach) {
+   retval = ss->can_attach(ss, cont, tsk);
+   if (retval) {
+   return retval;
+   }
+   }
+   }
+
+   task_lock(tsk);
+   if (tsk->flags & PF_EXITING) {
+   task_unlock(tsk);
+   return -ESRCH;
+   }
+   /* Update the css_group pointers for the subsystems in this
+* hierarchy */
+   for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
+   if (root->subsys_bits & (1ull << i)) {
+   /* Subsystem is in this hierarchy. So we want
+* the subsystem state from the new
+* container. Transfer the refcount from the
+* old to the new */
+   atomic_inc(&cont->count);
+   atomic_dec(&cg->subsys[i]->container->count);
+   rcu_assign_pointer(cg->subsys[i], cont->subsys[i]);
+   }
+   }
+   task_unlock(tsk);
+
+   for_each_subsys(root, ss) {
+   if (ss->attach) {
+   ss->attach(ss, cont, oldcont, tsk);
+   }
+   }
+
+   synchronize_rcu();
+   return 0;
+}
+
+/*
+ * Attach task with pid 'pid' to container 'cont'. Call with
+ * container_mutex, may take task_lock of task
+ *
+ */
+
+static int attach_task_by_pid(struct container *cont, char *pidbuf)
+{
+   pid_t pid;
+   struct task_struct *tsk;
+   int ret;
+
+   if (sscanf(pidbuf, "%d", &pid) != 1)
+   return -EIO;
+
+   if (pid) {
+   read_lock(&tasklist_lock);
+
+   tsk = find_task_by_pid(pid);
+   if (!tsk || tsk->flags & PF_EXITING) {
+   read_unlock(&tasklist_lock);
+   return -ESRCH;
+   }
+
+   get_task_struct(tsk);
+   read_unlock(&tasklist_lock);
+
+   if ((current->euid) && (current->euid != tsk->uid)
+   && (current->euid != tsk->suid)) {
+   put_task_struct(tsk);
+   return -EACCES;
+   }
+   } else {
+   tsk = current;
+   get_task_struct(tsk);
+   }
+
+   ret = attach_task(cont, tsk);
+   put_task_struct(tsk);
+   return ret;
+}
+
 /* The various types of files and directories in a container file system */
 
 typedef enum {
@@ -684,6 +789,54 @@ typedef enum {
FILE_TASKLIST,
 } container_filetype_t;
 
+static ssize_t container_common_file_write(struct container *cont,
+

[PATCH 9/9] Containers (V9): Simple debug info subsystem

2007-04-27 Thread menage

This example subsystem exports debugging information as an aid to
diagnosing refcount leaks, etc, in the container framework.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |4 +
 init/Kconfig |   10 
 kernel/Makefile  |1 
 kernel/container_debug.c |   89 +++
 4 files changed, 104 insertions(+)

Index: container-2.6.21-rc7-mm1/include/linux/container_subsys.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.21-rc7-mm1/include/linux/container_subsys.h
@@ -19,4 +19,8 @@ SUBSYS(cpuset)
 
 /* */
 
+#ifdef CONFIG_CONTAINER_DEBUG
+SUBSYS(debug)
+#endif
+
 /* */
Index: container-2.6.21-rc7-mm1/init/Kconfig
===
--- container-2.6.21-rc7-mm1.orig/init/Kconfig
+++ container-2.6.21-rc7-mm1/init/Kconfig
@@ -291,6 +291,16 @@ config IKCONFIG_PROC
 config CONTAINERS
bool
 
+config CONTAINER_DEBUG
+   bool "Example debug container subsystem"
+   select CONTAINERS
+   help
+ This option enables a simple container subsystem that
+ exports useful debugging information about the containers
+ framework
+
+ Say N if unsure
+
 config CPUSETS
bool "Cpuset support"
depends on SMP
Index: container-2.6.21-rc7-mm1/kernel/container_debug.c
===
--- /dev/null
+++ container-2.6.21-rc7-mm1/kernel/container_debug.c
@@ -0,0 +1,89 @@
+/*
+ * kernel/container_debug.c - Example container subsystem that
+ * exposes debug info
+ *
+ * Copyright (C) Google Inc, 2007
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED])
+ *
+ */
+
+#include 
+#include 
+
+static int debug_create(struct container_subsys *ss, struct container *cont)
+{
+   struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+   if (!css)
+   return -ENOMEM;
+   cont->subsys[debug_subsys_id] = css;
+   return 0;
+}
+
+static void debug_destroy(struct container_subsys *ss, struct container *cont)
+{
+   kfree(cont->subsys[debug_subsys_id]);
+}
+
+static u64 container_refcount_read(struct container *cont, struct cftype *cft)
+{
+   return atomic_read(&cont->count);
+}
+
+static u64 taskcount_read(struct container *cont, struct cftype *cft)
+{
+   u64 count;
+   container_lock();
+   count = container_task_count(cont);
+   container_unlock();
+   return count;
+}
+
+static u64 current_css_group_read(struct container *cont, struct cftype *cft)
+{
+   return (u64) current->containers;
+}
+
+static u64 current_css_group_refcount_read(struct container *cont,
+  struct cftype *cft)
+{
+   u64 count;
+   rcu_read_lock();
+   count = atomic_read(&current->containers->ref.refcount);
+   rcu_read_unlock();
+   return count;
+}
+
+static struct cftype files[] =  {
+   {
+   .name = "debug.container_refcount",
+   .read_uint = container_refcount_read,
+   },
+   {
+   .name = "debug.taskcount",
+   .read_uint = taskcount_read,
+   },
+
+   {
+   .name = "debug.current_css_group",
+   .read_uint = current_css_group_read,
+   },
+
+   {
+   .name = "debug.current_css_group_refcount",
+   .read_uint = current_css_group_refcount_read,
+   },
+};
+
+static int debug_populate(struct container_subsys *ss, struct container *cont)
+{
+   return container_add_files(cont, files, ARRAY_SIZE(files));
+}
+
+struct container_subsys debug_subsys = {
+   .name = "debug",
+   .create = debug_create,
+   .destroy = debug_destroy,
+   .populate = debug_populate,
+   .subsys_id = debug_subsys_id,
+};
Index: container-2.6.21-rc7-mm1/kernel/Makefile
===
--- container-2.6.21-rc7-mm1.orig/kernel/Makefile
+++ container-2.6.21-rc7-mm1/kernel/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
+obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o

--
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 4/9] Containers (V9): Add fork/exit hooks

2007-04-27 Thread menage

This patch adds the necessary hooks to the fork() and exit() paths to
ensure that new children inherit their parent's container assignments,
and that exiting processes release reference counts on their
containers.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |6 ++
 kernel/container.c|  126 ++
 kernel/exit.c |2 
 kernel/fork.c |   14 -
 4 files changed, 146 insertions(+), 2 deletions(-)

Index: container-2.6.21-rc7-mm1/kernel/exit.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/exit.c
+++ container-2.6.21-rc7-mm1/kernel/exit.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -939,6 +940,7 @@ fastcall NORET_TYPE void do_exit(long co
__exit_fs(tsk);
exit_thread();
cpuset_exit(tsk);
+   container_exit(tsk, 1);
exit_keys(tsk);
 
if (group_dead && tsk->signal->leader)
Index: container-2.6.21-rc7-mm1/kernel/fork.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/fork.c
+++ container-2.6.21-rc7-mm1/kernel/fork.c
@@ -30,6 +30,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -962,6 +963,7 @@ static struct task_struct *copy_process(
 {
int retval;
struct task_struct *p = NULL;
+   int container_callbacks_done = 0;
 
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
@@ -1061,12 +1063,13 @@ static struct task_struct *copy_process(
p->io_wait = NULL;
p->audit_context = NULL;
cpuset_fork(p);
+   container_fork(p);
 #ifdef CONFIG_NUMA
p->mempolicy = mpol_copy(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
-   goto bad_fork_cleanup_cpuset;
+   goto bad_fork_cleanup_container;
}
mpol_fix_fork_child_flag(p);
 #endif
@@ -1176,6 +1179,12 @@ static struct task_struct *copy_process(
/* Perform scheduler related setup. Assign this task to a CPU. */
sched_fork(p, clone_flags);
 
+   /* Now that the task is set up, run container callbacks if
+* necessary. We need to run them before the task is visible
+* on the tasklist. */
+   container_fork_callbacks(p);
+   container_callbacks_done = 1;
+
/* Need tasklist lock for parent etc handling! */
write_lock_irq(&tasklist_lock);
 
@@ -1298,9 +1307,10 @@ bad_fork_cleanup_security:
 bad_fork_cleanup_policy:
 #ifdef CONFIG_NUMA
mpol_free(p->mempolicy);
-bad_fork_cleanup_cpuset:
+bad_fork_cleanup_container:
 #endif
cpuset_exit(p);
+   container_exit(p, container_callbacks_done);
delayacct_tsk_free(p);
if (p->binfmt)
module_put(p->binfmt->module);
Index: container-2.6.21-rc7-mm1/include/linux/container.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/container.h
+++ container-2.6.21-rc7-mm1/include/linux/container.h
@@ -18,6 +18,9 @@
 extern int container_init_early(void);
 extern int container_init(void);
 extern void container_init_smp(void);
+extern void container_fork(struct task_struct *p);
+extern void container_fork_callbacks(struct task_struct *p);
+extern void container_exit(struct task_struct *p, int run_callbacks);
 
 extern struct file_operations proc_container_operations;
 
@@ -191,6 +194,9 @@ int container_path(const struct containe
 static inline int container_init_early(void) { return 0; }
 static inline int container_init(void) { return 0; }
 static inline void container_init_smp(void) {}
+static inline void container_fork(struct task_struct *p) {}
+static inline void container_fork_callbacks(struct task_struct *p) {}
+static inline void container_exit(struct task_struct *p, int callbacks) {}
 
 static inline void container_lock(void) {}
 static inline void container_unlock(void) {}
Index: container-2.6.21-rc7-mm1/kernel/container.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/container.c
+++ container-2.6.21-rc7-mm1/kernel/container.c
@@ -132,6 +132,34 @@ list_for_each_entry(_ss, &_root->subsys_
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* Each task_struct has an embedded css_group, so the get/put
+ * operation simply takes a reference count on all the containers
+ * referenced by subsystems in this css_group. This can end up
+ * multiple-counting some containers, but that's OK - the ref-count is
+ * just a busy/not-busy indicator; ensuring that we only count each
+ * container once would requi

[PATCH 6/9] Containers (V9): Add procfs interface

2007-04-27 Thread menage

This patch adds:

/proc/containers - general system info

/proc/*/container - per-task container membership info

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 fs/proc/base.c |7 ++
 kernel/container.c |  128 +
 2 files changed, 135 insertions(+)

Index: container-2.6.21-rc7-mm1/fs/proc/base.c
===
--- container-2.6.21-rc7-mm1.orig/fs/proc/base.c
+++ container-2.6.21-rc7-mm1/fs/proc/base.c
@@ -68,6 +68,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1980,6 +1981,9 @@ static const struct pid_entry tgid_base_
 #ifdef CONFIG_CPUSETS
REG("cpuset", S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score",  S_IRUGO, oom_score),
REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
@@ -2270,6 +2274,9 @@ static const struct pid_entry tid_base_s
 #ifdef CONFIG_CPUSETS
REG("cpuset",S_IRUGO, cpuset),
 #endif
+#ifdef CONFIG_CONTAINERS
+   REG("container",  S_IRUGO, container),
+#endif
INF("oom_score", S_IRUGO, oom_score),
REG("oom_adj",   S_IRUGO|S_IWUSR, oom_adjust),
 #ifdef CONFIG_AUDITSYSCALL
Index: container-2.6.21-rc7-mm1/kernel/container.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/container.c
+++ container-2.6.21-rc7-mm1/kernel/container.c
@@ -247,6 +247,7 @@ static int container_mkdir(struct inode 
 static int container_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int container_populate_dir(struct container *cont);
 static struct inode_operations container_dir_inode_operations;
+struct file_operations proc_containerstats_operations;
 
 static struct backing_dev_info container_backing_dev_info = {
.ra_pages = 0,  /* No readahead */
@@ -1507,6 +1508,7 @@ int __init container_init(void)
 {
int err;
int i;
+   struct proc_dir_entry *entry;
 
for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) {
struct container_subsys *ss = subsys[i];
@@ -1518,10 +1520,136 @@ int __init container_init(void)
if (err < 0)
goto out;
 
+   entry = create_proc_entry("containers", 0, NULL);
+   if (entry)
+   entry->proc_fops = &proc_containerstats_operations;
+
 out:
return err;
 }
 
+/*
+ * proc_container_show()
+ *  - Print task's container paths into seq_file, one line for each hierarchy
+ *  - Used for /proc/<pid>/container.
+ *  - No need to task_lock(tsk) on this tsk->container reference, as it
+ *doesn't really matter if tsk->container changes after we read it,
+ *and we take container_mutex, keeping attach_task() from changing it
+ *anyway.  No need to check that tsk->container != NULL, thanks to
+ *the_top_container_hack in container_exit(), which sets an exiting task's
+ *container to top_container.
+ */
+
+/* TODO: Use a proper seq_file iterator */
+static int proc_container_show(struct seq_file *m, void *v)
+{
+   struct pid *pid;
+   struct task_struct *tsk;
+   char *buf;
+   int retval;
+   struct containerfs_root *root;
+
+   retval = -ENOMEM;
+   buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+   if (!buf)
+   goto out;
+
+   retval = -ESRCH;
+   pid = m->private;
+   tsk = get_pid_task(pid, PIDTYPE_PID);
+   if (!tsk)
+   goto out_free;
+
+   retval = 0;
+
+   mutex_lock(&container_mutex);
+
+   for_each_root(root) {
+   struct container_subsys *ss;
+   struct container *cont;
+   int subsys_id;
+   int count = 0;
+   /* Skip this hierarchy if it has no active subsystems */
+   if (!root->subsys_bits) continue;
+   for_each_subsys(root, ss) {
+   seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
+   }
+   seq_putc(m, ':');
+   get_first_subsys(&root->top_container, NULL, &subsys_id);
+   cont = task_container(tsk, subsys_id);
+   retval = container_path(cont, buf, PAGE_SIZE);
+   if (retval < 0)
+   goto out_unlock;
+   seq_puts(m, buf);
+   seq_putc(m, '\n');
+   }
+
+out_unlock:
+   mutex_unlock(&container_mutex);
+   put_task_struct(tsk);
+out_free:
+   kfree(buf);
+out:
+   return retval;
+}
+
+static int container_open(struct inode *inode, struct file *file)
+{
+   struct pid *pid = PROC_I(inode)->pid;
+   return single_open(file, proc_container_show, pid);
+}

[PATCH 2/9] Containers (V9): Example CPU accounting subsystem

2007-04-27 Thread menage

This example demonstrates how to use the generic container subsystem
for a simple resource tracker that counts, for the processes in a
container, the total CPU time used and the %CPU used in the last
complete 10 second interval.

Portions contributed by Balbir Singh <[EMAIL PROTECTED]>

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container_subsys.h |6 +
 include/linux/cpu_acct.h |   14 ++
 init/Kconfig |7 +
 kernel/Makefile  |1 
 kernel/cpu_acct.c|  185 +++
 kernel/sched.c   |   14 ++
 6 files changed, 224 insertions(+), 3 deletions(-)

Index: container-2.6.21-rc7-mm1/include/linux/container_subsys.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/container_subsys.h
+++ container-2.6.21-rc7-mm1/include/linux/container_subsys.h
@@ -7,4 +7,10 @@
 
 /* */
 
+#ifdef CONFIG_CONTAINER_CPUACCT
+SUBSYS(cpuacct)
+#endif
+
+/* */
+
 /* */
Index: container-2.6.21-rc7-mm1/include/linux/cpu_acct.h
===
--- /dev/null
+++ container-2.6.21-rc7-mm1/include/linux/cpu_acct.h
@@ -0,0 +1,14 @@
+
+#ifndef _LINUX_CPU_ACCT_H
+#define _LINUX_CPU_ACCT_H
+
+#include 
+#include 
+
+#ifdef CONFIG_CONTAINER_CPUACCT
+extern void cpuacct_charge(struct task_struct *, cputime_t cputime);
+#else
+static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {}
+#endif
+
+#endif
Index: container-2.6.21-rc7-mm1/init/Kconfig
===
--- container-2.6.21-rc7-mm1.orig/init/Kconfig
+++ container-2.6.21-rc7-mm1/init/Kconfig
@@ -322,6 +322,13 @@ config SYSFS_DEPRECATED
  If you are using a distro that was released in 2006 or later,
  it should be safe to say N here.
 
+config CONTAINER_CPUACCT
+   bool "Simple CPU accounting container subsystem"
+   select CONTAINERS
+   help
+ Provides a simple Resource Controller for monitoring the
+ total CPU consumed by the tasks in a container
+
 config RELAY
bool "Kernel->user space relay support (formerly relayfs)"
help
Index: container-2.6.21-rc7-mm1/kernel/Makefile
===
--- container-2.6.21-rc7-mm1.orig/kernel/Makefile
+++ container-2.6.21-rc7-mm1/kernel/Makefile
@@ -38,6 +38,7 @@ obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_CONTAINERS) += container.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
Index: container-2.6.21-rc7-mm1/kernel/cpu_acct.c
===
--- /dev/null
+++ container-2.6.21-rc7-mm1/kernel/cpu_acct.c
@@ -0,0 +1,185 @@
+/*
+ * kernel/cpu_acct.c - CPU accounting container subsystem
+ *
+ * Copyright (C) Google Inc, 2006
+ *
+ * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh
+ * ([EMAIL PROTECTED])
+ *
+ */
+
+/*
+ * Container subsystem for reporting total CPU usage of tasks in a
+ * container, along with percentage load over a time interval
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+struct cpuacct {
+   struct container_subsys_state css;
+   spinlock_t lock;
+   /* total time used by this class */
+   cputime64_t time;
+
+   /* time when next load calculation occurs */
+   u64 next_interval_check;
+
+   /* time used in current period */
+   cputime64_t current_interval_time;
+
+   /* time used in last period */
+   cputime64_t last_interval_time;
+};
+
+struct container_subsys cpuacct_subsys;
+
+static inline struct cpuacct *container_ca(struct container *cont)
+{
+   return container_of(container_subsys_state(cont, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+static inline struct cpuacct *task_ca(struct task_struct *task)
+{
+   return container_of(task_subsys_state(task, cpuacct_subsys_id),
+   struct cpuacct, css);
+}
+
+#define INTERVAL (HZ * 10)
+
+static inline u64 next_interval_boundary(u64 now) {
+   /* calculate the next interval boundary beyond the
+* current time */
+   do_div(now, INTERVAL);
+   return (now + 1) * INTERVAL;
+}
+
+static int cpuacct_create(struct container_subsys *ss, struct container *cont)
+{
+   struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+   if (!ca)
+   return -ENOMEM;
+   spin_lock_init(&ca->lock);
+   ca->next_interval_check = next_interval_boundary(get_jiffies_64());
+   cont->subsys[cpuacct_subsys_id] = &ca->css;
+   return 0;
+}
+
+static void cpuacct_destroy(struct container_subsys

[PATCH 8/9] Containers (V9): Share css_group arrays between tasks with same container memberships

2007-04-27 Thread menage

This patch replaces the struct css_group embedded in task_struct with
a pointer; all tasks with the same set of memberships across all
hierarchies will share a css_group object.

The css_group used by init isn't refcounted, since it can't ever be
freed; this speeds up fork/exit for any systems that have containers
compiled in but haven't actually created any containers other than the
default one.

With more than one registered subsystem, this reduces the number of
atomic inc/dec operations required when tasks fork/exit;

Assuming that many tasks share the same container assignments, this
reduces overall space usage and keeps the size of the task_struct down
(only one pointer added to task_struct compared to a non-containers
kernel).

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/container.h |   35 ++
 include/linux/sched.h |   30 -
 kernel/container.c|  248 +-
 3 files changed, 238 insertions(+), 75 deletions(-)

Index: container-2.6.21-rc7-mm1/include/linux/container.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/container.h
+++ container-2.6.21-rc7-mm1/include/linux/container.h
@@ -29,6 +29,14 @@ extern void container_unlock(void);
 
 struct containerfs_root;
 
+/* Define the enumeration of all container subsystems */
+#define SUBSYS(_x) _x ## _subsys_id,
+enum container_subsys_id {
+#include 
+   CONTAINER_SUBSYS_COUNT
+};
+#undef SUBSYS
+
 /* Per-subsystem/per-container state maintained by the system. */
 struct container_subsys_state {
/* The container that this subsystem is attached to. Useful
@@ -87,6 +95,31 @@ struct container {
struct container *top_container;
 };
 
+/* A css_group is a structure holding pointers to a set of
+ * container_subsys_state objects. This saves space in the task struct
+ * object and speeds up fork()/exit(), since a single inc/dec can bump
+ * the reference count on the entire container set for a task.
+ */
+
+struct css_group {
+
+   /* Reference count */
+   struct kref ref;
+
+   /* List running through all container groups */
+   struct list_head list;
+
+   /* Set of subsystem states, one for each subsystem. NULL for
+* subsystems that aren't part of this hierarchy. These
+* pointers reduce the number of dereferences required to get
+* from a task to its state for a given container, but result
+* in increased space usage if tasks are in wildly different
+* groupings across different hierarchies. This array is
+* immutable after creation */
+   struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT];
+
+};
+
 /* struct cftype:
  *
  * The files in the container filesystem mostly have a very simple read/write
@@ -178,7 +211,7 @@ static inline struct container_subsys_st
 static inline struct container_subsys_state *task_subsys_state(
struct task_struct *task, int subsys_id)
 {
-   return rcu_dereference(task->containers.subsys[subsys_id]);
+   return rcu_dereference(task->containers->subsys[subsys_id]);
 }
 
 static inline struct container* task_container(struct task_struct *task,
Index: container-2.6.21-rc7-mm1/include/linux/sched.h
===
--- container-2.6.21-rc7-mm1.orig/include/linux/sched.h
+++ container-2.6.21-rc7-mm1/include/linux/sched.h
@@ -818,34 +818,6 @@ struct uts_namespace;
 
 struct prio_array;
 
-#ifdef CONFIG_CONTAINERS
-
-#define SUBSYS(_x) _x ## _subsys_id,
-enum container_subsys_id {
-#include 
-   CONTAINER_SUBSYS_COUNT
-};
-#undef SUBSYS
-
-/* A css_group is a structure holding pointers to a set of
- * container_subsys_state objects.
- */
-
-struct css_group {
-
-   /* Set of subsystem states, one for each subsystem. NULL for
-* subsystems that aren't part of this hierarchy. These
-* pointers reduce the number of dereferences required to get
-* from a task to its state for a given container, but result
-* in increased space usage if tasks are in wildly different
-* groupings across different hierarchies. This array is
-* immutable after creation */
-   struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT];
-
-};
-
-#endif /* CONFIG_CONTAINERS */
-
 struct task_struct {
volatile long state;/* -1 unrunnable, 0 runnable, >0 stopped */
struct thread_info *thread_info;
@@ -1098,7 +1070,7 @@ struct task_struct {
int cpuset_mem_spread_rotor;
 #endif
 #ifdef CONFIG_CONTAINERS
-   struct css_group containers;
+   struct css_group *containers;
 #endif
struct robust_list_head __user *robust_list;
 #ifdef CONFIG_COMPAT
Index: container-2.6.21-rc7-mm1/kernel/container.c
===
--- container-2.6.21-rc7-mm1.orig/kernel/cont

Re: [RFC] Default child of a cgroup

2008-01-31 Thread Paul Menage

On Jan 30, 2008 6:40 PM, Srivatsa Vaddagiri <[EMAIL PROTECTED]> wrote:
>
> Here are some questions that arise in this picture:
>
> 1. What is the relationship of the task-group in A/tasks with the
>task-group in A/a1/tasks? In otherwords do they form siblings
>of the same parent A?

I'd argue the same as Balbir - tasks in A/tasks are children of A
and are siblings of a1, a2, etc.

>
> 2. Somewhat related to the above question, how much resource should the
>task-group A/a1/tasks get in relation to A/tasks? Is it 1/2 of parent
>A's share or 1/(1 + N) of parent A's share (where N = number of tasks
>in A/tasks)?

Each process in A should have a scheduler weight that's derived from
its static_prio field. Similarly each subgroup of A will have a
scheduler weight that's determined by its cpu.shares value. So the cpu
share of any child (be it a task or a subgroup) would be equal to its
own weight divided by the sum of weights of all children.

So yes, if a task in A forks lots of children, those children could
end up getting a disproportionate amount of the CPU compared to tasks
in A/a1 - but that's the same as the situation without cgroups. If you
want to control cpu usage between different sets of processes in A,
they should be in sibling cgroups, not directly in A.

Is there a restriction in CFS that stops a given group from
simultaneously holding tasks and sub-groups? If so, couldn't we change
CFS to make it possible rather than enforcing awkward restrictions on
cgroups?

If we really can't change CFS in that way, then an alternative would
be similar to Peter's suggestion - make cpu_cgroup_can_attach() fail
if the cgroup has children, and make cpu_cgroup_create() fail if the
cgroup has any tasks - that way you limit the restriction to just the
hierarchy that has CFS attached to it, rather than generically for all
cgroups

BTW, I noticed this code in cpu_cgroup_create():

/* we support only 1-level deep hierarchical scheduler atm */
if (cgrp->parent->parent)
return ERR_PTR(-EINVAL);

Is anyone working on allowing more levels?

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC] Default child of a cgroup

2008-02-01 Thread Paul Menage

On Jan 31, 2008 11:58 PM, Peter Zijlstra <[EMAIL PROTECTED]> wrote:
> > Is there a restriction in CFS that stops a given group from
> > simultaneously holding tasks and sub-groups? If so, couldn't we change
> > CFS to make it possible rather than enforcing awkward restrictions on
> > cgroups?
>
> I think it is possible, just way more work than the proposed hack.

Seems to me like the right thing to do though.

>
> > If we really can't change CFS in that way, then an alternative would
> > be similar to Peter's suggestion - make cpu_cgroup_can_attach() fail
> > if the cgroup has children, and make cpu_cgroup_create() fail if the
> > cgroup has any tasks - that way you limit the restriction to just the
> > hierarchy that has CFS attached to it, rather than generically for all
> > cgroups
>
> Agreed.
>

Actually, I realised later that this is impossible - since the root
cgroup will have tasks initially, there'd be no way to create the
first child cgroup in the CFS hierarchy.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Add MS_BIND_FLAGS mount flag

2008-02-12 Thread Paul Menage


From: Paul Menage <[EMAIL PROTECTED]>

Add a new mount() flag, MS_BIND_FLAGS.

MS_BIND_FLAGS indicates that a bind mount should take its per-mount flags
from the arguments passed to mount() rather than from the source
mountpoint.

This flag allows you to create a bind mount with the desired per-mount
flags in a single operation, rather than having to do a bind mount
followed by a remount, which is fiddly and can block for non-trivial
periods of time (on sb->s_umount?).

For recursive bind mounts, only the root of the tree being bound
inherits the per-mount flags from the mount() arguments; sub-mounts
inherit their per-mount flags from the source tree as usual.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>


---
fs/namespace.c |   36 +---
include/linux/fs.h |2 ++
2 files changed, 27 insertions(+), 11 deletions(-)

Index: 2.6.24-mm1-bindflags/fs/namespace.c
===
--- 2.6.24-mm1-bindflags.orig/fs/namespace.c
+++ 2.6.24-mm1-bindflags/fs/namespace.c
@@ -512,13 +512,13 @@ static struct vfsmount *skip_mnt_tree(st
}

static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
-   int flag)
+ int flag, int mnt_flags)
{
struct super_block *sb = old->mnt_sb;
struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);

if (mnt) {
-   mnt->mnt_flags = old->mnt_flags;
+   mnt->mnt_flags = mnt_flags;
atomic_inc(&sb->s_active);
mnt->mnt_sb = sb;
mnt->mnt_root = dget(root);
@@ -1095,8 +1095,9 @@ static int lives_below_in_same_fs(struct
}
}

-struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
-   int flag)
+static struct vfsmount *
+__copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+   int flag, int mnt_flags)
{
struct vfsmount *res, *p, *q, *r, *s;
struct nameidata nd;
@@ -1104,7 +1105,7 @@ struct vfsmount *copy_tree(struct vfsmou
if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt))
return NULL;

-   res = q = clone_mnt(mnt, dentry, flag);
+   res = q = clone_mnt(mnt, dentry, flag, mnt_flags);
if (!q)
goto Enomem;
q->mnt_mountpoint = mnt->mnt_mountpoint;
@@ -1126,7 +1127,7 @@ struct vfsmount *copy_tree(struct vfsmou
p = s;
nd.path.mnt = q;
nd.path.dentry = p->mnt_mountpoint;
-   q = clone_mnt(p, p->mnt_root, flag);
+   q = clone_mnt(p, p->mnt_root, flag, p->mnt_flags);
if (!q)
goto Enomem;
spin_lock(&vfsmount_lock);
@@ -1146,6 +1147,11 @@ Enomem:
}
return NULL;
}
+struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
+  int flag)
+{
+   return __copy_tree(mnt, dentry, flag, mnt->mnt_flags);
+}

struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
{
@@ -1320,7 +1326,8 @@ static int do_change_type(struct nameida
/*
 * do loopback mount.
 */
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, int flags,
+  int mnt_flags)
{
struct nameidata old_nd;
struct vfsmount *mnt = NULL;
@@ -1342,10 +1349,15 @@ static int do_loopback(struct nameidata 
		goto out;


err = -ENOMEM;
-   if (recurse)
-   mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0);
+   /* Use the source mount flags unless the user passed MS_BIND_FLAGS */
+   if (!(flags & MS_BIND_FLAGS))
+   mnt_flags = old_nd.path.mnt->mnt_flags;
+   if (flags & MS_REC)
+   mnt = __copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0,
+ mnt_flags);
else
-   mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0);
+   mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0,
+   mnt_flags);

if (!mnt)
goto out;
@@ -1874,7 +1886,9 @@ long do_mount(char *dev_name, char *dir_
retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
data_page);
else if (flags & MS_BIND)
-   retval = do_loopback(&nd, dev_name, flags & MS_REC);
+   retval = do_loopback(&nd, dev_name,
+flags & (MS_REC | MS_BIND_FLAGS),
+mnt_flags);
else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))

Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Paul Menage

On Thu, Feb 14, 2008 at 12:30 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote:
>  > For recursive bind mounts, only the root of the tree being bound
>  > inherits the per-mount flags from the mount() arguments; sub-mounts
>  > inherit their per-mount flags from the source tree as usual.
>
>  This is rather strange behavior.  I think it would be much better, if
>  setting mount flags would work for recursive operations as well.  Also
>  what we really need is not resetting all the mount flags to some
>  predetermined values, but to be able to set or clear each flag
>  individually.

This is certainly true, but as you observe below it's a fair bit more
fiddly to specify in the API. I wasn't sure how much people use recursive
bind mounts, so I figured I'd throw out this simpler version first.

>
>  For example, with the per-mount-read-only thing the most useful
>  application would be to just set the read-only flag and leave the
>  others alone.
>
>  And this is where we usually conclude, that a new userspace mount API
>  is long overdue.  So for starters, how about a new syscall for bind
>  mounts:
>
>  int mount_bind(const char *src, const char *dst, unsigned flags,
>  unsigned mnt_flags);

The "flags" argument could be the same as for regular mount, and
contain the mnt_flags - so the extra argument could maybe usefully be
a "mnt_flags_mask", to indicate which flags we actually care about
overriding.

What would happen when an existing super-block flag changes to become
a per-mount flag (e.g. per-mount read-only)? I think that would just
fit in with the "mask" idea, as long as we complained if any bits in
mnt_flags_mask weren't actually per-mount settable.

Being able to mask/set mount flags might be useful on a remount too,
since there's no clean way to get the existing mount flags for a mount
other than by scanning /proc/mounts. So an alternative to a separate
system call would be a new mnt_flag_mask argument to mount() (whose
presence would be indicated by a flag bit being set in the main flags)
which would be used to control which bits were set or cleared for
remount/bind calls. Seems a bit wasteful of bits though. If we turned
"flags" into an (optionally) 64-bit argument then we'd have plenty of
bits to be able to specify both a "set" bit and a "mask" bit for each,
without needing a new syscall.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Paul Menage

On Wed, Feb 13, 2008 at 10:02 PM, Christoph Hellwig <[EMAIL PROTECTED]> wrote:
>
>  I think this concept is reasonable, but I don't think MS_BIND_FLAGS
>  is a descriptive name for this flag.  MS_EXPLICIT_FLAGS might be better
>  but still isn't optimal.
>

MS_BIND_FLAGS_OVERRIDE ?

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Paul Menage

[ cc: linux-fsdevel ]

On Thu, Feb 14, 2008 at 7:22 AM, Paul Menage <[EMAIL PROTECTED]> wrote:
> On Wed, Feb 13, 2008 at 10:02 PM, Christoph Hellwig <[EMAIL PROTECTED]> wrote:
>  >
>  >  I think this concept is reasonable, but I don't think MS_BIND_FLAGS
>  >  is a descriptive name for this flag.  MS_EXPLICIT_FLAGS might be better
>  >  but still isn't optimal.
>  >
>
>  MS_BIND_FLAGS_OVERRIDE ?
>
>  Paul
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Paul Menage

On Thu, Feb 14, 2008 at 8:03 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote:
>  > The "flags" argument could be the same as for regular mount, and
>  > contain the mnt_flags - so the extra argument could maybe usefully be
>  > a "mnt_flags_mask", to indicate which flags we actually care about
>  > overriding.
>
>  The way I imagined it, is that mnt_flags is a mask, and the operation
>  (determined by flags) is either:
>
>   - set bits in mask
>   - clear bits in mask (or not in mask)
>   - set flags to mask
>
>  It doesn't allow setting some bits, clearing some others, and leaving
>  alone the rest.  But I think such flexibility isn't really needed.

I think I'd suggest something like:

new_mnt->mnt_flags = (old_mnt->mnt_flags & ~arg_mask) | (arg_flags & arg_mask)

>  Maybe instead of messing with masks, it's better to introduce a
>  get_flags() or a more general mount_stat() operation, and let
>  userspace deal with setting and clearing flags, just as we do for
>  stat/chmod?
>
>  So we'd have
>
>   mount_stat(path, stat);
>   mount_bind(from, to, flags);
>   mount_set_flags(path, flags);
>   mount_move(from, to);
>
>  and perhaps
>
>   mount_remount(path, opt_string, flags);

Sounds reasonable to me. But it wouldn't directly solve the "do a
recursive bind mount setting the MS_READONLY flag on all children"
problem, so we'd need some of the earlier suggestions too.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH] Add MS_BIND_FLAGS mount flag

2008-02-14 Thread Paul Menage

On Thu, Feb 14, 2008 at 9:31 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote:
>
>  I deliberately not used the MS_* flags, which is currently a messy mix
>  of things with totally different meanings.
>
>  Does this solve all the issues?

We should add a size parameter either in the mount_params or as a
final argument, for future extensibility.

And we might as well include MNT_READONLY in the API on the assumption
that per-mount readonly will be available soon.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH] Add linux-fsdevel to VFS entry in MAINTAINERS

2008-02-14 Thread Paul Menage


Add linux-fsdevel to the VFS entry in MAINTAINERS

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
MAINTAINERS |1 +
1 file changed, 1 insertion(+)

Index: 2.6.24-mm1-bindflags/MAINTAINERS
===
--- 2.6.24-mm1-bindflags.orig/MAINTAINERS
+++ 2.6.24-mm1-bindflags/MAINTAINERS
@@ -1616,6 +1616,7 @@ S:Maintained
FILESYSTEMS (VFS and infrastructure)
P:  Alexander Viro
M:  [EMAIL PROTECTED]
+L: [EMAIL PROTECTED]
S:  Maintained

FIREWIRE SUBSYSTEM (drivers/firewire, )
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 1/7] CGroup API: Add cgroup.api control file

2008-02-15 Thread Paul Menage

Add a cgroup.api control file in every cgroup directory. This reports
for each control file the type of data represented by that control
file, and a user-friendly description of the contents.

A secondary effect of this patch is to add the "cgroup." prefix in
front of all cgroup-provided control files. This will reduce the
chance of future control files clashing with user-provided names.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   21 +++
 kernel/cgroup.c|  133 ++---
 2 files changed, 148 insertions(+), 6 deletions(-)

Index: cgroupmap-2.6.24-mm1/include/linux/cgroup.h
===
--- cgroupmap-2.6.24-mm1.orig/include/linux/cgroup.h
+++ cgroupmap-2.6.24-mm1/include/linux/cgroup.h
@@ -179,12 +179,33 @@ struct css_set {
  * - the 'cftype' of the file is file->f_dentry->d_fsdata
  */
 
+/*
+ * The various types of control file that are reported in the
+ * cgroup.api file. "String" is a catch-all default, but should only
+ * be used for special cases. If you use the appropriate accessors
+ * (such as "read_uint") in your control file, then you can leave this
+ * as 0 (CGROUP_FILE_UNKNOWN) and let cgroup figure out the right type.
+ */
+enum cgroup_file_type {
+   CGROUP_FILE_UNKNOWN = 0,
+   CGROUP_FILE_VOID,
+   CGROUP_FILE_U64,
+   CGROUP_FILE_STRING,
+};
+
 #define MAX_CFTYPE_NAME 64
 struct cftype {
/* By convention, the name should begin with the name of the
 * subsystem, followed by a period */
char name[MAX_CFTYPE_NAME];
int private;
+
+   /* The type of a file - reported in the cgroup.api file */
+   enum cgroup_file_type type;
+
+   /* Human-readable description of the file */
+   const char *desc;
+
int (*open) (struct inode *inode, struct file *file);
ssize_t (*read) (struct cgroup *cont, struct cftype *cft,
 struct file *file,
Index: cgroupmap-2.6.24-mm1/kernel/cgroup.c
===
--- cgroupmap-2.6.24-mm1.orig/kernel/cgroup.c
+++ cgroupmap-2.6.24-mm1/kernel/cgroup.c
@@ -1301,6 +1301,7 @@ enum cgroup_filetype {
FILE_NOTIFY_ON_RELEASE,
FILE_RELEASABLE,
FILE_RELEASE_AGENT,
+   FILE_API,
 };
 
 static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft,
@@ -1611,17 +1612,21 @@ static int cgroup_create_dir(struct cgro
 }
 
 int cgroup_add_file(struct cgroup *cgrp,
-  struct cgroup_subsys *subsys,
-  const struct cftype *cft)
+   struct cgroup_subsys *subsys,
+   const struct cftype *cft)
 {
struct dentry *dir = cgrp->dentry;
struct dentry *dentry;
int error;
 
char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
-   if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
-   strcpy(name, subsys->name);
-   strcat(name, ".");
+   if (!test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+   if (subsys) {
+   strcpy(name, subsys->name);
+   strcat(name, ".");
+   } else {
+   strcpy(name, "cgroup.");
+   }
}
strcat(name, cft->name);
BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
@@ -2126,6 +2131,110 @@ static u64 cgroup_read_releasable(struct
return test_bit(CGRP_RELEASABLE, &cgrp->flags);
 }
 
+static const struct file_operations cgroup_api_file_operations = {
+   .read = seq_read,
+   .llseek = seq_lseek,
+   .release = seq_release,
+};
+
+/*
+ * cgroup.api is a file in each cgroup directory that gives the types
+ * and descriptions of the various control files in that directory.
+ */
+
+static struct dentry *cgroup_api_advance(struct dentry *d, int advance)
+{
+   struct dentry *parent = d->d_parent;
+   struct list_head *l = &d->d_u.d_child;
+   while (true) {
+   if (advance)
+   l = l->next;
+   advance = true;
+   /* Did we reach the end of the directory? */
+   if (l == &parent->d_subdirs)
+   return NULL;
+   d = container_of(l, struct dentry, d_u.d_child);
+   /* Skip cgroup subdirectories */
+   if (d->d_inode && S_ISREG(d->d_inode->i_mode))
+   return d;
+   }
+}
+
+static void *cgroup_api_start(struct seq_file *sf, loff_t *pos)
+{
+   struct dentry *parent = sf->private;
+   struct dentry *d;
+   loff_t l = 0;
+   spin_lock(&dcache_lock);
+   if (list_empty(&parent->d_subdirs))
+

[RFC][PATCH 5/7] CGroup API: Use read_uint in memory controller

2008-02-15 Thread Paul Menage

Update the memory controller to use read_uint for its
limit/usage/failcnt control files, calling the new
res_counter_read_uint() function. This allows the files to show up as
u64 rather than string in the cgroup.api file.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c
===
--- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c
+++ cgroupmap-2.6.24-mm1/mm/memcontrol.c
@@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf,
return 0;
 }
 
-static ssize_t mem_cgroup_read(struct cgroup *cont,
-   struct cftype *cft, struct file *file,
-   char __user *userbuf, size_t nbytes, loff_t *ppos)
+static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
-   return res_counter_read(&mem_cgroup_from_cont(cont)->res,
-   cft->private, userbuf, nbytes, ppos,
-   NULL);
+   return res_counter_read_uint(&mem_cgroup_from_cont(cont)->res,
+cft->private);
 }
 
 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
@@ -1006,18 +1003,18 @@ static struct cftype mem_cgroup_files[] 
{
.name = "usage_in_bytes",
.private = RES_USAGE,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "limit_in_bytes",
.private = RES_LIMIT,
.write = mem_cgroup_write,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "failcnt",
.private = RES_FAILCNT,
-   .read = mem_cgroup_read,
+   .read_uint = mem_cgroup_read,
},
{
.name = "force_empty",

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 6/7] CGroup API: Use descriptions for memory controller API files

2008-02-15 Thread Paul Menage

This patch adds descriptions to the memory controller API files to
indicate that the usage/limit are in bytes; the names of the control
files can then be simplified to usage/limit.

Also removes the unnecessary mem_force_empty_read() function

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   21 +
 1 file changed, 5 insertions(+), 16 deletions(-)

Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c
===
--- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c
+++ cgroupmap-2.6.24-mm1/mm/memcontrol.c
@@ -950,19 +950,6 @@ static ssize_t mem_force_empty_write(str
return ret;
 }
 
-/*
- * Note: This should be removed if cgroup supports write-only file.
- */
-
-static ssize_t mem_force_empty_read(struct cgroup *cont,
-   struct cftype *cft,
-   struct file *file, char __user *userbuf,
-   size_t nbytes, loff_t *ppos)
-{
-   return -EINVAL;
-}
-
-
 static const struct mem_cgroup_stat_desc {
const char *msg;
u64 unit;
@@ -1001,15 +988,17 @@ static int mem_control_stat_show(struct 
 
 static struct cftype mem_cgroup_files[] = {
{
-   .name = "usage_in_bytes",
+   .name = "usage",
.private = RES_USAGE,
.read_uint = mem_cgroup_read,
+   .desc = "Memory usage in bytes",
},
{
-   .name = "limit_in_bytes",
+   .name = "limit",
.private = RES_LIMIT,
.write = mem_cgroup_write,
.read_uint = mem_cgroup_read,
+   .desc = "Memory limit in bytes",
},
{
.name = "failcnt",
@@ -1019,7 +1008,7 @@ static struct cftype mem_cgroup_files[] 
{
.name = "force_empty",
.write = mem_force_empty_write,
-   .read = mem_force_empty_read,
+   .desc = "Write to this file to forget all memory charges"
},
{
.name = "stat",

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 0/7] CGroup API: More structured API for CGroups control files

2008-02-15 Thread Paul Menage


This set of patches makes the Control Groups API more structured and
self-describing.

1) Allows control files to be associated with data types such as
"u64", "string", "map", etc. These types show up in a new cgroup.api
file in each cgroup directory, along with a user-readable
string. Files that use cgroup-provided data accessors have these file
types inferred automatically.

2) Moves various files in cpusets and the memory controller from using
custom-written file handlers to cgroup-defined handlers

3) Adds the "cgroup." prefix for existing cgroup-provided control
files (tasks, release_agent, releasable, notify_on_release). Given
that we've already had 2.6.24 go out without this prefix, I guess this
could be a little contentious - but it seems like a good move to
prevent name clashes in the future. (Note that this doesn't affect
mounting the legacy cpuset filesystem, since the compatibility layer
disables all prefixes when mounted with filesystem type "cpuset"). If
people object too strongly, we could just make this the case for *new*
cgroup API files, but I think this is a case where consistency would
be better than compatibility - I'd be surprised if anyone has written
major legacy apps yet that rely on 2.6.24 cgroup control file names.


There are various motivations for this:

1) We said at Kernel Summit '07 that the cgroup API wouldn't be
allowed to spiral into an arbitrary mess of ad-hoc APIs. Having simple
ways to represent common data types makes this easier. (E.g. one
standard way to report a map of string,u64 pairs to userspace.)

2) People were divided on the issue of binary APIs versus ASCII APIs
for control groups. Compatibility with the existing cpusets system,
and ease of experimentation, were two important reasons for going with
the current ASCII API. But by having structured control files, we can
open the path towards having more efficient binary APIs for simpler
and more efficient programmatic access too, without any additional
modifications required from the subsystems themselves.

My plans for this potential binary API are a little hazy at this
point, but they might go something like opening a cgroup.bin file in a
cgroup directory, and writing the names of the control files that you
were interested in; then a read on that file handle would return the
contents of the given control files in a single read in a simple
binary format. (Better suggestions are welcome). Regardless, getting a
good typing/structure on the control files is an important first step
if we want to go in that direction.

3) The memory controller currently has files with the "_in_bytes"
suffix, on the grounds that otherwise it's not obvious to a new user
what they represent. By moving the description to an auto-generated API
file, we can remove this (IMO) inelegant suffix.


--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 3/7] CGroup API: Use cgroup map for memcontrol stats file

2008-02-15 Thread Paul Menage

Remove the seq_file boilerplate used to construct the memcontrol stats
map, and instead use the new map representation for cgroup control
files

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 mm/memcontrol.c |   30 ++
 1 file changed, 6 insertions(+), 24 deletions(-)

Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c
===
--- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c
+++ cgroupmap-2.6.24-mm1/mm/memcontrol.c
@@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc
[MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, },
 };
 
-static int mem_control_stat_show(struct seq_file *m, void *arg)
+static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb)
 {
-   struct cgroup *cont = m->private;
struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
struct mem_cgroup_stat *stat = &mem_cont->stat;
int i;
@@ -986,8 +986,7 @@ static int mem_control_stat_show(struct 
 
val = mem_cgroup_read_stat(stat, i);
val *= mem_cgroup_stat_desc[i].unit;
-   seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg,
-   (long long)val);
+   cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
}
/* showing # of active pages */
{
@@ -997,29 +996,12 @@ static int mem_control_stat_show(struct 
MEM_CGROUP_ZSTAT_INACTIVE);
active = mem_cgroup_get_all_zonestat(mem_cont,
MEM_CGROUP_ZSTAT_ACTIVE);
-   seq_printf(m, "active %ld\n", (active) * PAGE_SIZE);
-   seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE);
+   cb->fill(cb, "active", (active) * PAGE_SIZE);
+   cb->fill(cb, "inactive", (inactive) * PAGE_SIZE);
}
return 0;
 }
 
-static const struct file_operations mem_control_stat_file_operations = {
-   .read = seq_read,
-   .llseek = seq_lseek,
-   .release = single_release,
-};
-
-static int mem_control_stat_open(struct inode *unused, struct file *file)
-{
-   /* XXX __d_cont */
-   struct cgroup *cont = file->f_dentry->d_parent->d_fsdata;
-
-   file->f_op = &mem_control_stat_file_operations;
-   return single_open(file, mem_control_stat_show, cont);
-}
-
-
-
 static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
@@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] 
},
{
.name = "stat",
-   .open = mem_control_stat_open,
+   .read_map = mem_control_stat_show,
},
 };
 

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 2/7] CGroup API: Add cgroup map data type

2008-02-15 Thread Paul Menage

Adds a new type of supported control file representation, a map from
strings to u64 values.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/cgroup.h |   19 +++
 kernel/cgroup.c|   61 -
 2 files changed, 79 insertions(+), 1 deletion(-)

Index: cgroupmap-2.6.24-mm1/include/linux/cgroup.h
===
--- cgroupmap-2.6.24-mm1.orig/include/linux/cgroup.h
+++ cgroupmap-2.6.24-mm1/include/linux/cgroup.h
@@ -191,6 +191,17 @@ enum cgroup_file_type {
CGROUP_FILE_VOID,
CGROUP_FILE_U64,
CGROUP_FILE_STRING,
+   CGROUP_FILE_MAP,
+};
+
+/*
+ * cgroup_map_cb is an abstract callback API for reporting map-valued
+ * control files
+ */
+
+struct cgroup_map_cb {
+   int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value);
+   void *state;
 };
 
 #define MAX_CFTYPE_NAME 64
@@ -215,6 +226,14 @@ struct cftype {
 * single integer. Use it in place of read()
 */
u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
+   /*
+* read_map() is used for defining a map of key/value
+* pairs. It should call cb->fill(cb, key, value) for each
+* entry.
+*/
+   int (*read_map) (struct cgroup *cont, struct cftype *cft,
+struct cgroup_map_cb *cb);
+
ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
  struct file *file,
  const char __user *buf, size_t nbytes, loff_t *ppos);
Index: cgroupmap-2.6.24-mm1/kernel/cgroup.c
===
--- cgroupmap-2.6.24-mm1.orig/kernel/cgroup.c
+++ cgroupmap-2.6.24-mm1/kernel/cgroup.c
@@ -1488,6 +1488,46 @@ static ssize_t cgroup_file_read(struct f
return -EINVAL;
 }
 
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+
+struct cgroup_seqfile_state {
+   struct cftype *cft;
+   struct cgroup *cgroup;
+};
+
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+   struct seq_file *sf = cb->state;
+   return seq_printf(sf, "%s: %llu\n", key, value);
+}
+
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+   struct cgroup_seqfile_state *state = m->private;
+   struct cftype *cft = state->cft;
+   struct cgroup_map_cb cb = {
+   .fill = cgroup_map_add,
+   .state = m,
+   };
+   if (cft->read_map) {
+   return cft->read_map(state->cgroup, cft, &cb);
+   } else {
+   BUG();
+   }
+}
+
+int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+   struct seq_file *seq = file->private_data;
+   kfree(seq->private);
+   return single_release(inode, file);
+}
+
+static struct file_operations cgroup_seqfile_operations;
+
 static int cgroup_file_open(struct inode *inode, struct file *file)
 {
int err;
@@ -1500,7 +1540,18 @@ static int cgroup_file_open(struct inode
cft = __d_cft(file->f_dentry);
if (!cft)
return -ENODEV;
-   if (cft->open)
+   if (cft->read_map) {
+   struct cgroup_seqfile_state *state =
+   kzalloc(sizeof(*state), GFP_USER);
+   if (!state)
+   return -ENOMEM;
+   state->cft = cft;
+   state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+   file->f_op = &cgroup_seqfile_operations;
+   err = single_open(file, cgroup_seqfile_show, state);
+   if (err < 0)
+   kfree(state);
+   } else if (cft->open)
err = cft->open(inode, file);
else
err = 0;
@@ -1539,6 +1590,12 @@ static struct file_operations cgroup_fil
.release = cgroup_file_release,
 };
 
+static struct file_operations cgroup_seqfile_operations = {
+   .read = seq_read,
+   .llseek = seq_lseek,
+   .release = cgroup_seqfile_release,
+};
+
 static struct inode_operations cgroup_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = cgroup_mkdir,
@@ -2206,6 +2263,8 @@ static int cgroup_api_show(struct seq_fi
if (type == CGROUP_FILE_UNKNOWN) {
if (cft->read_uint)
type = CGROUP_FILE_U64;
+   else if (cft->read_map)
+   type = CGROUP_FILE_MAP;
else if (cft->read)
type = CGROUP_FILE_STRING;
else if (!cft->open)

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API

2008-02-15 Thread Paul Menage

Many of the cpusets control files are simple integer values, which
don't require the overhead of memory allocations for reads and writes.

Move the handlers for these control files into cpuset_read_uint() and
cpuset_write_uint(). This also has the advantage that the control
files show up as "u64" rather than "string" in the cgroup.api file.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  158 +---
 1 file changed, 83 insertions(+), 75 deletions(-)

Index: cgroupmap-2.6.24-mm1/kernel/cpuset.c
===
--- cgroupmap-2.6.24-mm1.orig/kernel/cpuset.c
+++ cgroupmap-2.6.24-mm1/kernel/cpuset.c
@@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void
 }
 
 /*
- * Call with cgroup_mutex held.
- */
-
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-   if (simple_strtoul(buf, NULL, 10) != 0)
-   cpuset_memory_pressure_enabled = 1;
-   else
-   cpuset_memory_pressure_enabled = 0;
-   return 0;
-}
-
-/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  * CS_SCHED_LOAD_BALANCE,
@@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable
  * Call with cgroup_mutex held.
  */
 
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+  int turning_on)
 {
-   int turning_on;
struct cpuset trialcs;
int err;
int cpus_nonempty, balance_flag_changed;
 
-   turning_on = (simple_strtoul(buf, NULL, 10) != 0);
-
trialcs = *cs;
if (turning_on)
set_bit(bit, &trialcs.flags);
@@ -1247,44 +1232,66 @@ static ssize_t cpuset_common_file_write(
case FILE_MEMLIST:
retval = update_nodemask(cs, buffer);
break;
+   default:
+   retval = -EINVAL;
+   goto out2;
+   }
+
+   if (retval == 0)
+   retval = nbytes;
+out2:
+   cgroup_unlock();
+out1:
+   kfree(buffer);
+   return retval;
+}
+
+static int cpuset_write_uint(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+   int retval = 0;
+   struct cpuset *cs = cgroup_cs(cgrp);
+   cpuset_filetype_t type = cft->private;
+
+   cgroup_lock();
+
+   if (cgroup_is_removed(cgrp)) {
+   cgroup_unlock();
+   return -ENODEV;
+   }
+
+   switch (type) {
case FILE_CPU_EXCLUSIVE:
-   retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
-   retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
-   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
-   retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+   retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
-   retval = update_memory_pressure_enabled(cs, buffer);
+   cpuset_memory_pressure_enabled = val;
break;
case FILE_MEMORY_PRESSURE:
retval = -EACCES;
break;
case FILE_SPREAD_PAGE:
-   retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+   retval = update_flag(CS_SPREAD_PAGE, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
-   retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+   retval = update_flag(CS_SPREAD_SLAB, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
-   goto out2;
+   break;
}
-
-   if (retval == 0)
-   retval = nbytes;
-out2:
cgroup_unlock();
-out1:
-   kfree(buffer);
-   return retval;
+   return -EINVAL;
 }
 
 /*
@@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
break;
-   case FILE_CPU_EXCLUSIVE:
-   *s++ = is_cpu_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_MEM_EXCLUSIVE:
-   *s++ = is_mem_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_SCHED_LOAD_BALANCE:

[RFC][PATCH 4/7] CGroup API: Add res_counter_read_uint()

2008-02-15 Thread Paul Menage

Adds a function for returning the value of a resource counter member,
in a form suitable for use in a cgroup read_uint control file method.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 include/linux/res_counter.h |1 +
 kernel/res_counter.c|5 +
 2 files changed, 6 insertions(+)

Index: cgroupmap-2.6.24-mm1/include/linux/res_counter.h
===
--- cgroupmap-2.6.24-mm1.orig/include/linux/res_counter.h
+++ cgroupmap-2.6.24-mm1/include/linux/res_counter.h
@@ -54,6 +54,7 @@ struct res_counter {
 ssize_t res_counter_read(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*read_strategy)(unsigned long long val, char *s));
+u64 res_counter_read_uint(struct res_counter *counter, int member);
 ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *buf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *buf, unsigned long long *val));
Index: cgroupmap-2.6.24-mm1/kernel/res_counter.c
===
--- cgroupmap-2.6.24-mm1.orig/kernel/res_counter.c
+++ cgroupmap-2.6.24-mm1/kernel/res_counter.c
@@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun
pos, buf, s - buf);
 }
 
+u64 res_counter_read_uint(struct res_counter *counter, int member)
+{
+   return *res_counter_member(counter, member);
+}
+
 ssize_t res_counter_write(struct res_counter *counter, int member,
const char __user *userbuf, size_t nbytes, loff_t *pos,
int (*write_strategy)(char *st_buf, unsigned long long *val))

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file

2008-02-16 Thread Paul Menage

On Feb 16, 2008 2:07 AM, Balbir Singh <[EMAIL PROTECTED]> wrote:
> Paul Menage wrote:
>
> Hi, Paul,
>
> Do we need to use a cgroup.api file? Why not keep up to date documentation and
> get users to use that. I fear that, cgroup.api will not be kept up-to-date,
> leading to confusion.

The cgroup.api file isn't meant to give complete documentation for a
control file, simply a brief indication of its usage.

The aim is that most bits of the information reported in cgroup.api
are auto-generated, so there shouldn't be problems with it getting
out-of-date.

Is it just the space used by the documentation string that you're
objecting to? The other function of the file is to declare a type for
each variable.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 0/7] CGroup API: More structured API for CGroups control files

2008-02-16 Thread Paul Menage

On Feb 16, 2008 1:31 AM, Li Zefan <[EMAIL PROTECTED]> wrote:
>
> I don't quite catch what you mean. Cgroup does support write-only/read-only
> files. For a write-only file, just set .write and .write_uint to be NULL,
> similar for a read-only file.
>
> Do I miss something?
>

I suppose we could infer from the lack of any write handlers that we
should give the file in the filesystem a mode of 444 rather than 644.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API

2008-02-17 Thread Paul Menage

On Feb 16, 2008 7:29 PM, Paul Jackson <[EMAIL PROTECTED]> wrote:
>
> From: Paul Jackson <[EMAIL PROTECTED]>
>
> Strip all trailing whitespace (such as carriage returns)
> when parsing integer writes to cgroup files, not just
> one trailing newline if present.

Sounds like a good idea to me. Thanks for this.

>
> Signed-off-by: Paul Jackson <[EMAIL PROTECTED]>
> Cc: Paul Menage <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

>
> ---
>  kernel/cgroup.c |5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
>
> --- 2.6.24-mm1.orig/kernel/cgroup.c 2008-02-16 04:20:33.0 -0800
> +++ 2.6.24-mm1/kernel/cgroup.c  2008-02-16 19:00:41.207478218 -0800
> @@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_uint(struct
> return -EFAULT;
>
> buffer[nbytes] = 0; /* nul-terminate */
> -
> -   /* strip newline if necessary */
> -   if (nbytes && (buffer[nbytes-1] == '\n'))
> -   buffer[nbytes-1] = 0;
> +   strstrip(buffer);   /* strip -just- trailing whitespace */
> val = simple_strtoull(buffer, &end, 0);
> if (*end)
> return -EINVAL;
>
>
> --
>   I won't rest till it's the best ...
>   Programmer, Linux Scalability
>   Paul Jackson <[EMAIL PROTECTED]> 1.940.382.4214
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API

2008-02-17 Thread Paul Menage

On Feb 17, 2008 9:28 AM, Paul Jackson <[EMAIL PROTECTED]> wrote:
>
> I'm figuring it would be easiest if you just threw this
> little change into your hopper for the bigger changes
> you're making

OK, will do.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: Improve init/Kconfig help descriptions [PATCH 6/9]

2008-02-19 Thread Paul Menage

On Feb 19, 2008 7:12 AM, Nick Andrew <[EMAIL PROTECTED]> wrote:
>  config CGROUPS
> bool "Control Group support"
> help
> - This option will let you use process cgroup subsystems
> - such as Cpusets
> + Control Groups enables processes to be tracked and grouped
> + into "cgroups". This enables you, for example, to associate
> + cgroups with certain CPU sets using "cpusets".
>
> - Say N if unsure.
> + When enabled, a new filesystem type "cgroup" is available
> + and can be mounted to control cpusets.

How about:

... cpusets and other resource/behaviour controllers.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file

2008-02-19 Thread Paul Menage

On Feb 19, 2008 1:57 PM, Paul Jackson <[EMAIL PROTECTED]> wrote:
>
> Finally, it goes against the one thingie per file (at most, one scalar
> vector) that has worked well for us when tried.

Right, I like the idea of keeping things simple. But if you're going
to accept that a vector is useful, then it seems reasonable that some
other *simple* structured datatypes can be useful. An N-element
key/value map (a la /proc/meminfo) is, I think, nicer than having to
read values from N separate files.

>
> As to the motivations Paul M gives:
>  1) Avoid "an arbitrary mess of ad-hoc APIs":
> We can still do that, whether or not we "self-document" these
> API's in this manner.

We can, but this file makes it more clear what control files have a
well-defined API and which are just returning some ad-hoc string.

I guess it's not essential, I just figured that if we had that
information, it made sense to make it available to userspace. I guess
I'm happy with dropping the actual exposed cgroup.api file for now as
long as we can work towards reducing the number of control files that
just return strings, and make use of the structured output such as
read_uint() more.

>  2) binary APIs versus ASCII APIs:
> Well, I have an ASCII API bias, not surprising.  But I'd
> suggest not doing things "in anticipation" of some future
> fuzzy binary API support.  Wait until that day actually arrives.

I have a reasonably clear idea of how we can do the binary API.

That's mostly for a separate RFC. But for example, reading a map via
the binary API would be able to just return a list of values since the
keys could be parsed once from the ascii map (provided that the
subsystem guaranteed that the map keys and their order wouldn't change
between reboots).

>  3) The memory controller currently has files with the "_in_bytes":
> The traditional way to handle this is Documentation and man
> pages; good enough for my granddad, good enough for me ;).

I've tried submitting patches to remove the in_bytes suffix and just
rely on the documentation, and people didn't seem to like it ...

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file

2008-02-19 Thread Paul Menage

On Feb 18, 2008 1:45 AM, Li Zefan <[EMAIL PROTECTED]> wrote:
> >
>
> But we don't have /proc/proc.api or /sys/sysfs.api ...

True. And /proc is a bit of a mess. Having a similar API file for
sysfs sounds like a good idea to me.

>
> And is it better to describe the debug subsystem too?
>

Yes, probably, but that would be a separate patch to the debug
subsystem itself, not the main cgroups code.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 3/7] cgroup: clean up cgroup.h

2008-02-19 Thread Paul Menage

On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote:
> - replace old name 'cont' with 'cgrp' (Paul Menage did this cleanup for
>   cgroup.c in commit bd89aabc6761de1c35b154fe6f914a445d301510)
> - remove a duplicate declaration of cgroup_path()
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

> ---
>  include/linux/cgroup.h |   48 
> +++-
>  1 files changed, 23 insertions(+), 25 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index 2ebf7af..028ba3b 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -186,15 +186,15 @@ struct cftype {
> char name[MAX_CFTYPE_NAME];
> int private;
> int (*open) (struct inode *inode, struct file *file);
> -   ssize_t (*read) (struct cgroup *cont, struct cftype *cft,
> +   ssize_t (*read) (struct cgroup *cgrp, struct cftype *cft,
>  struct file *file,
>  char __user *buf, size_t nbytes, loff_t *ppos);
> /*
>  * read_uint() is a shortcut for the common case of returning a
>  * single integer. Use it in place of read()
>  */
> -   u64 (*read_uint) (struct cgroup *cont, struct cftype *cft);
> -   ssize_t (*write) (struct cgroup *cont, struct cftype *cft,
> +   u64 (*read_uint) (struct cgroup *cgrp, struct cftype *cft);
> +   ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft,
>   struct file *file,
>   const char __user *buf, size_t nbytes, loff_t 
> *ppos);
>
> @@ -203,7 +203,7 @@ struct cftype {
>  * a single integer (as parsed by simple_strtoull) from
>  * userspace. Use in place of write(); return 0 or error.
>  */
> -   int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val);
> +   int (*write_uint) (struct cgroup *cgrp, struct cftype *cft, u64 val);
>
> int (*release) (struct inode *inode, struct file *file);
>  };
> @@ -218,41 +218,41 @@ struct cgroup_scanner {
>
>  /* Add a new file to the given cgroup directory. Should only be
>   * called by subsystems from within a populate() method */
> -int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys,
> +int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
>const struct cftype *cft);
>
>  /* Add a set of new files to the given cgroup directory. Should
>   * only be called by subsystems from within a populate() method */
> -int cgroup_add_files(struct cgroup *cont,
> +int cgroup_add_files(struct cgroup *cgrp,
> struct cgroup_subsys *subsys,
> const struct cftype cft[],
> int count);
>
> -int cgroup_is_removed(const struct cgroup *cont);
> +int cgroup_is_removed(const struct cgroup *cgrp);
>
> -int cgroup_path(const struct cgroup *cont, char *buf, int buflen);
> +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
>
> -int cgroup_task_count(const struct cgroup *cont);
> +int cgroup_task_count(const struct cgroup *cgrp);
>
>  /* Return true if the cgroup is a descendant of the current cgroup */
> -int cgroup_is_descendant(const struct cgroup *cont);
> +int cgroup_is_descendant(const struct cgroup *cgrp);
>
>  /* Control Group subsystem type. See Documentation/cgroups.txt for details */
>
>  struct cgroup_subsys {
> struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
> - struct cgroup *cont);
> -   void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cont);
> -   void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont);
> + struct cgroup *cgrp);
> +   void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> +   void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
> int (*can_attach)(struct cgroup_subsys *ss,
> - struct cgroup *cont, struct task_struct *tsk);
> -   void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont,
> -   struct cgroup *old_cont, struct task_struct *tsk);
> + struct cgroup *cgrp, struct task_struct *tsk);
> +   void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp,
> +   struct cgroup *old_cgrp, struct task_struct *tsk);
> void (*fork)(struct cgroup_subsys *ss, struct task_struct *task);
> void (*exit)(struct cgroup_subsys *ss, struct task_struct *task);
> int (*populate)(struc

Re: [PATCH 5/7] cgroup: fix subsys bitops

2008-02-19 Thread Paul Menage

On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote:
> Cgroup uses unsigned long for subsys bitops, not unsigned long long.
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

> ---
>  kernel/cgroup.c |4 ++--
>  1 files changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index aa76bbd..e8c8e58 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -320,7 +320,7 @@ static struct css_set *find_existing_css_set(
> /* Built the set of subsystem state objects that we want to
>  * see in the new css_set */
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -   if (root->subsys_bits & (1ull << i)) {
> +   if (root->subsys_bits & (1UL << i)) {
> /* Subsystem is in this hierarchy. So we want
>  * the subsystem state from the new
>  * cgroup */
> @@ -696,7 +696,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
> added_bits = final_bits & ~root->actual_subsys_bits;
> /* Check that any added subsystems are currently free */
> for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
> -   unsigned long long bit = 1ull << i;
> +   unsigned long bit = 1UL << i;
> struct cgroup_subsys *ss = subsys[i];
> if (!(bit & added_bits))
> continue;
> --
> 1.5.4.rc3
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 4/7] cgroup: fix memory leak in cgroup_get_sb()

2008-02-19 Thread Paul Menage

On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote:
> opts.release_agent is not kfree()ed in all necessary places.
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

Good catch, although hopefully something that would be extremely rare
in practice.

Thanks,

Paul

> ---
>  kernel/cgroup.c |5 -
>  1 files changed, 4 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 0c35022..aa76bbd 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -961,8 +961,11 @@ static int cgroup_get_sb(struct file_system_type 
> *fs_type,
> }
>
> root = kzalloc(sizeof(*root), GFP_KERNEL);
> -   if (!root)
> +   if (!root) {
> +   if (opts.release_agent)
> +   kfree(opts.release_agent);
> return -ENOMEM;
> +   }
>
> init_cgroup_root(root);
> root->subsys_bits = opts.subsys_bits;
> --
> 1.5.4.rc3
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 1/7] cgroup: fix and update documentation

2008-02-19 Thread Paul Menage

On Feb 18, 2008 12:39 AM, Li Zefan <[EMAIL PROTECTED]> wrote:
> Misc fixes and updates, make the doc consistent with current
> cgroup implementation.
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

Thanks for these cleanups.

Paul

> ---
>  Documentation/cgroups.txt |   66 ++--
>  1 files changed, 33 insertions(+), 33 deletions(-)
>
> diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt
> index 42d7c4c..31d12e2 100644
>
> --- a/Documentation/cgroups.txt
> +++ b/Documentation/cgroups.txt
> @@ -28,7 +28,7 @@ CONTENTS:
>  4. Questions
>
>  1. Control Groups
> -==
> +=
>
>  1.1 What are cgroups ?
>  --
> @@ -143,10 +143,10 @@ proliferation of such cgroups.
>
>  Also lets say that the administrator would like to give enhanced network
>  access temporarily to a student's browser (since it is night and the user
> -wants to do online gaming :)  OR give one of the students simulation
> +wants to do online gaming :))  OR give one of the students simulation
>  apps enhanced CPU power,
>
> -With ability to write pids directly to resource classes, its just a
> +With ability to write pids directly to resource classes, it's just a
>  matter of :
>
> # echo pid > /mnt/network//tasks
> @@ -227,10 +227,13 @@ Each cgroup is represented by a directory in the cgroup 
> file system
>  containing the following files describing that cgroup:
>
>   - tasks: list of tasks (by pid) attached to that cgroup
> - - notify_on_release flag: run /sbin/cgroup_release_agent on exit?
> + - releasable flag: cgroup currently removeable?
> + - notify_on_release flag: run the release agent on exit?
> + - release_agent: the path to use for release notifications (this file
> +   exists in the top cgroup only)
>
>  Other subsystems such as cpusets may add additional files in each
> -cgroup dir
> +cgroup dir.
>
>  New cgroups are created using the mkdir system call or shell
>  command.  The properties of a cgroup, such as its flags, are
> @@ -257,7 +260,7 @@ performance.
>  To allow access from a cgroup to the css_sets (and hence tasks)
>  that comprise it, a set of cg_cgroup_link objects form a lattice;
>  each cg_cgroup_link is linked into a list of cg_cgroup_links for
> -a single cgroup on its cont_link_list field, and a list of
> +a single cgroup on its cgrp_link_list field, and a list of
>  cg_cgroup_links for a single css_set on its cg_link_list.
>
>  Thus the set of tasks in a cgroup can be listed by iterating over
> @@ -271,9 +274,6 @@ for cgroups, with a minimum of additional kernel code.
>  1.4 What does notify_on_release do ?
>  
>
> -*** notify_on_release is disabled in the current patch set. It will be
> -*** reactivated in a future patch in a less-intrusive manner
> -
>  If the notify_on_release flag is enabled (1) in a cgroup, then
>  whenever the last task in the cgroup leaves (exits or attaches to
>  some other cgroup) and the last child cgroup of that cgroup
> @@ -360,8 +360,8 @@ Now you want to do something with this cgroup.
>
>  In this directory you can find several files:
>  # ls
> -notify_on_release release_agent tasks
> -(plus whatever files are added by the attached subsystems)
> +notify_on_release releasable tasks
> +(plus whatever files added by the attached subsystems)
>
>  Now attach your shell to this cgroup:
>  # /bin/echo $$ > tasks
> @@ -404,19 +404,13 @@ with a subsystem id which will be assigned by the 
> cgroup system.
>  Other fields in the cgroup_subsys object include:
>
>  - subsys_id: a unique array index for the subsystem, indicating which
> -  entry in cgroup->subsys[] this subsystem should be
> -  managing. Initialized by cgroup_register_subsys(); prior to this
> -  it should be initialized to -1
> +  entry in cgroup->subsys[] this subsystem should be managing.
>
> -- hierarchy: an index indicating which hierarchy, if any, this
> -  subsystem is currently attached to. If this is -1, then the
> -  subsystem is not attached to any hierarchy, and all tasks should be
> -  considered to be members of the subsystem's top_cgroup. It should
> -  be initialized to -1.
> +- name: should be initialized to a unique subsystem name. Should be
> +  no longer than MAX_CGROUP_TYPE_NAMELEN.
>
> -- name: should be initialized to a unique subsystem name prior to
> -  calling cgroup_register_subsystem. Should be no longer than
> -  MAX_CGROUP_TYPE_NAMELEN
> +- early_init: indicate if the subsystem needs early initialization
> +  at system boot.
>
>  Each cgroup object created by the sy

Re: Improve init/Kconfig help descriptions [PATCH 6/9]

2008-02-19 Thread Paul Menage

On Feb 19, 2008 6:54 PM, Nick Andrew <[EMAIL PROTECTED]> wrote:
>
> config CGROUPS
> bool "Control Group support"
> help
>   Control Groups enables processes to be tracked and grouped
>   into "cgroups". This enables you, for example, to associate
>   cgroups with certain CPU sets using "cpusets".
>
>   When enabled, a new filesystem type "cgroup" is available
>   and can be mounted to control cpusets and other
>   resource/behaviour controllers.
>
>   See  for more information.
>
>   If unsure, say N.
>
>
> I don't think that description is as clear as it could be. From
> the non-kernel-developer point of view, that is.

Originally this wasn't a user-selectable config value; it was
auto-selected by any subsystem that needed it. I think that was nicer
from a user-experience point of view, and it would eliminate the need
for this documentation, but there were concerns that auto-selection
triggered unspecified brokenness in the Kbuild system.

>
> Re "other resource/behaviour controllers", what in particular?
> I take it that our current controllers are cpusets, scheduler,
> CPU accounting and Resource counters?

Resource counters aren't a resource controller, they're a helper
library. The others are good examples, as is the memory controller
that's just been added to 2.6.25.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 6/7] cgroup: remove duplicate code in find_css_set()

2008-02-19 Thread Paul Menage

On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote:
> The list head res->tasks gets initialized twice in find_css_set().
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>

Acked-by: Paul Menage <[EMAIL PROTECTED]>

> ---
>  kernel/cgroup.c |1 -
>  1 files changed, 0 insertions(+), 1 deletions(-)
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index e8c8e58..71cf961 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -473,7 +473,6 @@ static struct css_set *find_css_set(
> /* Link this cgroup group into the list */
> list_add(&res->list, &init_css_set.list);
> css_set_count++;
> -   INIT_LIST_HEAD(&res->tasks);
> write_unlock(&css_set_lock);
>
> return res;
> --
> 1.5.4.rc3
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 2/7] cgroup: fix comments

2008-02-19 Thread Paul Menage

On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote:
> fix:
> - comments about need_forkexit_callback
> - comments about release agent
> - typo and comment style, etc.
>
> Signed-off-by: Li Zefan <[EMAIL PROTECTED]>
> ---
>  include/linux/cgroup.h |2 +-
>  kernel/cgroup.c|   44 +---
>  2 files changed, 22 insertions(+), 24 deletions(-)
>
> diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
> index ff9055f..2ebf7af 100644
> --- a/include/linux/cgroup.h
> +++ b/include/linux/cgroup.h
> @@ -175,7 +175,7 @@ struct css_set {
>   *
>   *
>   * When reading/writing to a file:
> - * - the cgroup to use in file->f_dentry->d_parent->d_fsdata
> + * - the cgroup to use is file->f_dentry->d_parent->d_fsdata
>   * - the 'cftype' of the file is file->f_dentry->d_fsdata
>   */
>
> diff --git a/kernel/cgroup.c b/kernel/cgroup.c
> index 4766bb6..0c35022 100644
> --- a/kernel/cgroup.c
> +++ b/kernel/cgroup.c
> @@ -113,9 +113,9 @@ static int root_count;
>  #define dummytop (&rootnode.top_cgroup)
>
>  /* This flag indicates whether tasks in the fork and exit paths should
> - * take callback_mutex and check for fork/exit handlers to call. This
> - * avoids us having to do extra work in the fork/exit path if none of the
> - * subsystems need to be called.
> + * check for fork/exit handlers to call. This avoids us having to do
> + * extra work in the fork/exit path if none of the subsystems need to
> + * be called.
>   */
>  static int need_forkexit_callback;
>
> @@ -507,8 +507,8 @@ static struct css_set *find_css_set(
>   * critical pieces of code here.  The exception occurs on cgroup_exit(),
>   * when a task in a notify_on_release cgroup exits.  Then cgroup_mutex
>   * is taken, and if the cgroup count is zero, a usermode call made
> - * to /sbin/cgroup_release_agent with the name of the cgroup (path
> - * relative to the root of cgroup file system) as the argument.
> + * to the release agent with the name of the cgroup (path relative to
> + * the root of cgroup file system) as the argument.
>   *
>   * A cgroup can only be deleted if both its 'count' of using tasks
>   * is zero, and its list of 'children' cgroups is empty.  Since all
> @@ -521,7 +521,7 @@ static struct css_set *find_css_set(
>   *
>   * The need for this exception arises from the action of
>   * cgroup_attach_task(), which overwrites one tasks cgroup pointer with
> - * another.  It does so using cgroup_mutexe, however there are
> + * another.  It does so using cgroup_mutex, however there are
>   * several performance critical places that need to reference
>   * task->cgroup without the expense of grabbing a system global
>   * mutex.  Therefore except as noted below, when dereferencing or, as
> @@ -1192,7 +1192,7 @@ static void get_first_subsys(const struct cgroup *cgrp,
>   * Attach task 'tsk' to cgroup 'cgrp'
>   *
>   * Call holding cgroup_mutex.  May take task_lock of
> - * the task 'pid' during call.
> + * the task 'tsk' during call.
>   */
>  int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
>  {
> @@ -1584,12 +1584,11 @@ static int cgroup_create_file(struct dentry *dentry, 
> int mode,
>  }
>
>  /*

I think that docbook-style function comments need /** at the start of
the comment block.

Thanks,

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 2/2] Cpusets API: Update cpusets to use cgroup structured file API

2008-02-19 Thread Paul Menage

Many of the cpusets control files are simple integer values, which
don't require the overhead of memory allocations for reads and writes.

Move the handlers for these control files into cpuset_read_uint() and
cpuset_write_uint(). This also has the advantage that the control
files show up as "u64" rather than "string" in the cgroup.api file.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 kernel/cpuset.c |  156 +---
 1 file changed, 82 insertions(+), 74 deletions(-)

Index: cpusets-2.6.25-rc2-mm1/kernel/cpuset.c
===
--- cpusets-2.6.25-rc2-mm1.orig/kernel/cpuset.c
+++ cpusets-2.6.25-rc2-mm1/kernel/cpuset.c
@@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void
 }
 
 /*
- * Call with cgroup_mutex held.
- */
-
-static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
-{
-   if (simple_strtoul(buf, NULL, 10) != 0)
-   cpuset_memory_pressure_enabled = 1;
-   else
-   cpuset_memory_pressure_enabled = 0;
-   return 0;
-}
-
-/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  * CS_SCHED_LOAD_BALANCE,
@@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable
  * Call with cgroup_mutex held.
  */
 
-static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+  int turning_on)
 {
-   int turning_on;
struct cpuset trialcs;
int err;
int cpus_nonempty, balance_flag_changed;
 
-   turning_on = (simple_strtoul(buf, NULL, 10) != 0);
-
trialcs = *cs;
if (turning_on)
set_bit(bit, &trialcs.flags);
@@ -1247,43 +1232,65 @@ static ssize_t cpuset_common_file_write(
case FILE_MEMLIST:
retval = update_nodemask(cs, buffer);
break;
+   default:
+   retval = -EINVAL;
+   goto out2;
+   }
+
+   if (retval == 0)
+   retval = nbytes;
+out2:
+   cgroup_unlock();
+out1:
+   kfree(buffer);
+   return retval;
+}
+
+static int cpuset_write_uint(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+   int retval = 0;
+   struct cpuset *cs = cgroup_cs(cgrp);
+   cpuset_filetype_t type = cft->private;
+
+   cgroup_lock();
+
+   if (cgroup_is_removed(cgrp)) {
+   cgroup_unlock();
+   return -ENODEV;
+   }
+
+   switch (type) {
case FILE_CPU_EXCLUSIVE:
-   retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_CPU_EXCLUSIVE, cs, val);
break;
case FILE_MEM_EXCLUSIVE:
-   retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
+   retval = update_flag(CS_MEM_EXCLUSIVE, cs, val);
break;
case FILE_SCHED_LOAD_BALANCE:
-   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+   retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
break;
case FILE_MEMORY_MIGRATE:
-   retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+   retval = update_flag(CS_MEMORY_MIGRATE, cs, val);
break;
case FILE_MEMORY_PRESSURE_ENABLED:
-   retval = update_memory_pressure_enabled(cs, buffer);
+   cpuset_memory_pressure_enabled = !!val;
break;
case FILE_MEMORY_PRESSURE:
retval = -EACCES;
break;
case FILE_SPREAD_PAGE:
-   retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+   retval = update_flag(CS_SPREAD_PAGE, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
case FILE_SPREAD_SLAB:
-   retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+   retval = update_flag(CS_SPREAD_SLAB, cs, val);
cs->mems_generation = cpuset_mems_generation++;
break;
default:
retval = -EINVAL;
-   goto out2;
+   break;
}
-
-   if (retval == 0)
-   retval = nbytes;
-out2:
cgroup_unlock();
-out1:
-   kfree(buffer);
return retval;
 }
 
@@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s
case FILE_MEMLIST:
s += cpuset_sprintf_memlist(s, cs);
break;
-   case FILE_CPU_EXCLUSIVE:
-   *s++ = is_cpu_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_MEM_EXCLUSIVE:
-   *s++ = is_mem_exclusive(cs) ? '1' : '0';
-   break;
-   case FILE_SCHED_LOAD_BALANCE:
-

[PATCH 1/2] Cpusets API: From: Paul Jackson <[EMAIL PROTECTED]>

2008-02-19 Thread Paul Menage

Strip all trailing whitespace in cgroup_write_uint

This removes the need for people to remember to pass the -n flag to
echo when writing values to cgroup control files.

Signed-off-by: Paul Menage <[EMAIL PROTECTED]>

---
 kernel/cgroup.c |5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

Index: cpusets-2.6.25-rc2-mm1/kernel/cgroup.c
===
--- cpusets-2.6.25-rc2-mm1.orig/kernel/cgroup.c
+++ cpusets-2.6.25-rc2-mm1/kernel/cgroup.c
@@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_uint(struct 
return -EFAULT;
 
buffer[nbytes] = 0; /* nul-terminate */
-
-   /* strip newline if necessary */
-   if (nbytes && (buffer[nbytes-1] == '\n'))
-   buffer[nbytes-1] = 0;
+   strstrip(buffer);
val = simple_strtoull(buffer, &end, 0);
if (*end)
return -EINVAL;

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

[PATCH 0/2] Cpusets API: Update Cpusets control files

2008-02-19 Thread Paul Menage

This pair of patches simplifies the cpusets read/write path for the
control files that consist of simple integers.

--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file

2008-02-19 Thread Paul Menage

On Feb 19, 2008 9:17 PM, Paul Jackson <[EMAIL PROTECTED]> wrote:
>
> Perhaps my primary concern with these *.api files was that I did not
> understand who or what the critical use or user was; who found this
> essential, not just nice to have.
>

Right now, no-one would find it essential. If/when a binary API is
added, I guess I'll resurrect this part of the patchset.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Re: [PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups

2008-02-19 Thread Paul Menage

On Feb 19, 2008 9:48 PM, YAMAMOTO Takashi <[EMAIL PROTECTED]> wrote:
>
> it changes the format from "%s %lld" to "%s: %llu", right?
> why?
>

The colon is for consistency with maps in /proc. I think it also makes it
slightly more readable.

For %lld versus %llu - I think that cgroup resource APIs are much more
likely to need to report unsigned rather than signed values. In the
case of the memory.stat file, that's certainly the case.

But I guess there's an argument to be made that nothing's likely to
need the final 64th bit of an unsigned value, whereas the ability to
report negative numbers could potentially be useful for some cgroups.

Paul
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

1 2 3 4 5 >

1 - 100 of 407 matches

Mail list logo