[PATCH 2/2] cgroup map files: Use cgroup map for memcontrol stats file
Remove the seq_file boilerplate used to construct the memcontrol stats map, and instead use the new map representation for cgroup control files Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 30 ++ 1 file changed, 6 insertions(+), 24 deletions(-) Index: cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c === --- cgroupmap-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c @@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, }; -static int mem_control_stat_show(struct seq_file *m, void *arg) +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb) { - struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); struct mem_cgroup_stat *stat = &mem_cont->stat; int i; @@ -986,8 +986,7 @@ static int mem_control_stat_show(struct val = mem_cgroup_read_stat(stat, i); val *= mem_cgroup_stat_desc[i].unit; - seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, - (long long)val); + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); } /* showing # of active pages */ { @@ -997,29 +996,12 @@ static int mem_control_stat_show(struct MEM_CGROUP_ZSTAT_INACTIVE); active = mem_cgroup_get_all_zonestat(mem_cont, MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + cb->fill(cb, "active", (active) * PAGE_SIZE); + cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); } return 0; } -static const struct file_operations mem_control_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_stat_open(struct inode *unused, struct file *file) -{ - /* XXX __d_cont */ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_stat_file_operations; - return single_open(file, mem_control_stat_show, cont); -} - - - static struct cftype 
mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] }, { .name = "stat", - .open = mem_control_stat_open, + .read_map = mem_control_stat_show, }, }; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] cgroup map files: Add cgroup map data type
Adds a new type of supported control file representation, a map from strings to u64 values. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 19 +++ kernel/cgroup.c| 59 - 2 files changed, 77 insertions(+), 1 deletion(-) Index: cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h === --- cgroupmap-2.6.25-rc2-mm1.orig/include/linux/cgroup.h +++ cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h @@ -166,6 +166,16 @@ struct css_set { }; +/* + * cgroup_map_cb is an abstract callback API for reporting map-valued + * control files + */ + +struct cgroup_map_cb { + int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); + void *state; +}; + /* struct cftype: * * The files in the cgroup filesystem mostly have a very simple read/write @@ -194,6 +204,15 @@ struct cftype { * single integer. Use it in place of read() */ u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); + /* +* read_map() is used for defining a map of key/value +* pairs. It should call cb->fill(cb, key, value) for each +* entry. The key/value pairs (and their ordering) should not +* change between reboots. +*/ + int (*read_map) (struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb); + ssize_t (*write) (struct cgroup *cont, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); Index: cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroupmap-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1487,6 +1487,46 @@ static ssize_t cgroup_file_read(struct f return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. 
+ */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s: %llu\n", key, value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + if (cft->read_map) { + return cft->read_map(state->cgroup, cft, &cb); + } else { + BUG(); + } +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1499,7 +1539,18 @@ static int cgroup_file_open(struct inode cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; @@ -1538,6 +1589,12 @@ static struct file_operations cgroup_fil .release = cgroup_file_release, }; +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups
These patches add a new cgroup control file output type - a map from strings to u64 values - and make use of it for the memory controller "stat" file. It is intended for use when the subsystem wants to return a collection of values that are related in some way, for which a separate control file for each value would make the reporting unwieldy. The advantages of this are: - more standardized output from control files that report similarly-structured data - less boilerplate required in cgroup subsystems - simplifies transition to a future efficient cgroups binary API Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] ResCounter: Use read_uint in memory controller
Update the memory controller to use read_uint for its limit/usage/failcnt control files, calling the new res_counter_read_uint() function. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) Index: rescounter-2.6.25-rc2-mm1/mm/memcontrol.c === --- rescounter-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ rescounter-2.6.25-rc2-mm1/mm/memcontrol.c @@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf, return 0; } -static ssize_t mem_cgroup_read(struct cgroup *cont, - struct cftype *cft, struct file *file, - char __user *userbuf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - NULL); + return res_counter_read_uint(&mem_cgroup_from_cont(cont)->res, +cft->private); } static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, @@ -1024,18 +1021,18 @@ static struct cftype mem_cgroup_files[] { .name = "usage_in_bytes", .private = RES_USAGE, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "limit_in_bytes", .private = RES_LIMIT, .write = mem_cgroup_write, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "failcnt", .private = RES_FAILCNT, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "force_empty", -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] ResCounter: Add res_counter_read_uint and use it in memory cgroup
These patches simplify the code required to report values from a res_counter object in a cgroups control file. The first patch adds res_counter_read_uint, which simply reports the current value for a res_counter member. The second replaces the existing mem_cgroup_read() with a simpler version that calls res_counter_read_uint(). Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] ResCounter: Add res_counter_read_uint()
Adds a function for returning the value of a resource counter member, in a form suitable for use in a cgroup read_uint control file method. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/res_counter.h |1 + kernel/res_counter.c|5 + 2 files changed, 6 insertions(+) Index: rescounter-2.6.25-rc2-mm1/include/linux/res_counter.h === --- rescounter-2.6.25-rc2-mm1.orig/include/linux/res_counter.h +++ rescounter-2.6.25-rc2-mm1/include/linux/res_counter.h @@ -54,6 +54,7 @@ struct res_counter { ssize_t res_counter_read(struct res_counter *counter, int member, const char __user *buf, size_t nbytes, loff_t *pos, int (*read_strategy)(unsigned long long val, char *s)); +u64 res_counter_read_uint(struct res_counter *counter, int member); ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *buf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *buf, unsigned long long *val)); Index: rescounter-2.6.25-rc2-mm1/kernel/res_counter.c === --- rescounter-2.6.25-rc2-mm1.orig/kernel/res_counter.c +++ rescounter-2.6.25-rc2-mm1/kernel/res_counter.c @@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun pos, buf, s - buf); } +u64 res_counter_read_uint(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} + ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *userbuf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *st_buf, unsigned long long *val)) -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 1/2] cgroup map files: Add cgroup map data type
Adds a new type of supported control file representation, a map from strings to u64 values. The map type is printed in a similar format to /proc/meminfo or /proc/<pid>/status, i.e. "$key: $value\n" Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 19 +++ kernel/cgroup.c| 59 - 2 files changed, 77 insertions(+), 1 deletion(-) Index: cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h === --- cgroupmap-2.6.25-rc2-mm1.orig/include/linux/cgroup.h +++ cgroupmap-2.6.25-rc2-mm1/include/linux/cgroup.h @@ -166,6 +166,16 @@ struct css_set { }; +/* + * cgroup_map_cb is an abstract callback API for reporting map-valued + * control files + */ + +struct cgroup_map_cb { + int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); + void *state; +}; + /* struct cftype: * * The files in the cgroup filesystem mostly have a very simple read/write @@ -194,6 +204,15 @@ struct cftype { * single integer. Use it in place of read() */ u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); + /* +* read_map() is used for defining a map of key/value +* pairs. It should call cb->fill(cb, key, value) for each +* entry. The key/value pairs (and their ordering) should not +* change between reboots. +*/ + int (*read_map) (struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb); + ssize_t (*write) (struct cgroup *cont, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); Index: cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroupmap-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroupmap-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1487,6 +1487,46 @@ static ssize_t cgroup_file_read(struct f return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. 
+ */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s %llu\n", key, value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + if (cft->read_map) { + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + return cft->read_map(state->cgroup, cft, &cb); + } else { + BUG(); + } +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1499,7 +1539,18 @@ static int cgroup_file_open(struct inode cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; @@ -1538,6 +1589,12 @@ static struct file_operations cgroup_fil .release = cgroup_file_release, }; +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups
[ Updated from the previous version to remove the colon from the map output ] These patches add a new cgroup control file output type - a map from strings to u64 values - and make use of it for the memory controller "stat" file. It is intended for use when the subsystem wants to return a collection of values that are related in some way, for which a separate control file for each value would make the reporting unwieldy. The advantages of this are: - more standardized output from control files that report similarly-structured data that needs to be parsed programmatically - less boilerplate required in cgroup subsystems - simplifies transition to a future efficient cgroups binary API Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] cgroup map files: Use cgroup map for memcontrol stats file
Remove the seq_file boilerplate used to construct the memcontrol stats map, and instead use the new map representation for cgroup control files Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 30 ++ 1 file changed, 6 insertions(+), 24 deletions(-) Index: cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c === --- cgroupmap-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ cgroupmap-2.6.25-rc2-mm1/mm/memcontrol.c @@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, }; -static int mem_control_stat_show(struct seq_file *m, void *arg) +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb) { - struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); struct mem_cgroup_stat *stat = &mem_cont->stat; int i; @@ -986,8 +986,7 @@ static int mem_control_stat_show(struct val = mem_cgroup_read_stat(stat, i); val *= mem_cgroup_stat_desc[i].unit; - seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, - (long long)val); + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); } /* showing # of active pages */ { @@ -997,29 +996,12 @@ static int mem_control_stat_show(struct MEM_CGROUP_ZSTAT_INACTIVE); active = mem_cgroup_get_all_zonestat(mem_cont, MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + cb->fill(cb, "active", (active) * PAGE_SIZE); + cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); } return 0; } -static const struct file_operations mem_control_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_stat_open(struct inode *unused, struct file *file) -{ - /* XXX __d_cont */ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_stat_file_operations; - return single_open(file, mem_control_stat_show, cont); -} - - - static struct cftype 
mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] }, { .name = "stat", - .open = mem_control_stat_open, + .read_map = mem_control_stat_show, }, }; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 06/10] CGroup API files: Add cgroup map data type
Adds a new type of supported control file representation, a map from strings to u64 values. Each map entry is printed as a line in a similar format to /proc/vmstat, i.e. "$key $value\n" Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 19 + kernel/cgroup.c| 53 - 2 files changed, 71 insertions(+), 1 deletion(-) Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h === --- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h +++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h @@ -166,6 +166,16 @@ struct css_set { }; +/* + * cgroup_map_cb is an abstract callback API for reporting map-valued + * control files + */ + +struct cgroup_map_cb { + int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); + void *state; +}; + /* struct cftype: * * The files in the cgroup filesystem mostly have a very simple read/write @@ -194,6 +204,15 @@ struct cftype { * single integer. Use it in place of read() */ u64 (*read_u64) (struct cgroup *cont, struct cftype *cft); + /* +* read_map() is used for defining a map of key/value +* pairs. It should call cb->fill(cb, key, value) for each +* entry. The key/value pairs (and their ordering) should not +* change between reboots. +*/ + int (*read_map) (struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb); + ssize_t (*write) (struct cgroup *cont, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1484,6 +1484,46 @@ static ssize_t cgroup_file_read(struct f return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. 
+ */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + return cft->read_map(state->cgroup, cft, &cb); +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1496,7 +1536,18 @@ static int cgroup_file_open(struct inode cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 00/10] CGroup API files: Various cleanup to CGroup control files
This patchset is a roll-up of the non-controversial items of the various patches that I've sent out recently, fixed according to the feedback received. In summary they are: - general rename of read_uint/write_uint to read_u64/write_u64 - use these methods for cpusets and memory controller files - add a read_map cgroup file method, and use it in the memory controller - move the "releasable" control file to the debug subsystem - make the debug subsystem config option default to "n" The only user-visible changes are the movement of the "releasable" file and the fact that some write_u64()-based control files are now more forgiving of additional whitespace at the end of their input. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/10] CGroup API files: Add res_counter_read_u64()
Adds a function for returning the value of a resource counter member, in a form suitable for use in a cgroup read_u64 control file method. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/res_counter.h |5 - kernel/res_counter.c|5 + 2 files changed, 9 insertions(+), 1 deletion(-) Index: cgroup-2.6.25-rc2-mm1/include/linux/res_counter.h === --- cgroup-2.6.25-rc2-mm1.orig/include/linux/res_counter.h +++ cgroup-2.6.25-rc2-mm1/include/linux/res_counter.h @@ -39,8 +39,9 @@ struct res_counter { spinlock_t lock; }; -/* +/** * Helpers to interact with userspace + * res_counter_read_u64() - returns the value of the specified member. * res_counter_read/_write - put/get the specified fields from the * res_counter struct to/from the user * @@ -51,6 +52,8 @@ struct res_counter { * @pos: and the offset. */ +u64 res_counter_read_u64(struct res_counter *counter, int member); + ssize_t res_counter_read(struct res_counter *counter, int member, const char __user *buf, size_t nbytes, loff_t *pos, int (*read_strategy)(unsigned long long val, char *s)); Index: cgroup-2.6.25-rc2-mm1/kernel/res_counter.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/res_counter.c +++ cgroup-2.6.25-rc2-mm1/kernel/res_counter.c @@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun pos, buf, s - buf); } +u64 res_counter_read_u64(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} + ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *userbuf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *st_buf, unsigned long long *val)) -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 04/10] CGroup API files: Strip all trailing whitespace in cgroup_write_u64
This removes the need for people to remember to pass the -n flag to echo when writing values to cgroup control files. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- kernel/cgroup.c |5 + 1 file changed, 1 insertion(+), 4 deletions(-) Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_u64(struct c return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; + strstrip(buffer); val = simple_strtoull(buffer, &end, 0); if (*end) return -EINVAL; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 08/10] CGroup API files: Drop mem_cgroup_force_empty()
This function isn't needed - a NULL pointer in the cftype read function will result in the same EINVAL response to userspace. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 14 -- 1 file changed, 14 deletions(-) Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c === --- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c @@ -950,19 +950,6 @@ static ssize_t mem_force_empty_write(str return ret; } -/* - * Note: This should be removed if cgroup supports write-only file. - */ - -static ssize_t mem_force_empty_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, char __user *userbuf, - size_t nbytes, loff_t *ppos) -{ - return -EINVAL; -} - - static const struct mem_cgroup_stat_desc { const char *msg; u64 unit; @@ -1019,7 +1006,6 @@ static struct cftype mem_cgroup_files[] { .name = "force_empty", .write = mem_force_empty_write, - .read = mem_force_empty_read, }, { .name = "stat", -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 07/10] CGroup API files: Use cgroup map for memcontrol stats file
Remove the seq_file boilerplate used to construct the memcontrol stats map, and instead use the new map representation for cgroup control files Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 30 ++ 1 file changed, 6 insertions(+), 24 deletions(-) Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c === --- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c @@ -971,9 +971,9 @@ static const struct mem_cgroup_stat_desc [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, }; -static int mem_control_stat_show(struct seq_file *m, void *arg) +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb) { - struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); struct mem_cgroup_stat *stat = &mem_cont->stat; int i; @@ -983,8 +983,7 @@ static int mem_control_stat_show(struct val = mem_cgroup_read_stat(stat, i); val *= mem_cgroup_stat_desc[i].unit; - seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, - (long long)val); + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); } /* showing # of active pages */ { @@ -994,29 +993,12 @@ static int mem_control_stat_show(struct MEM_CGROUP_ZSTAT_INACTIVE); active = mem_cgroup_get_all_zonestat(mem_cont, MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + cb->fill(cb, "active", (active) * PAGE_SIZE); + cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); } return 0; } -static const struct file_operations mem_control_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_stat_open(struct inode *unused, struct file *file) -{ - /* XXX __d_cont */ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_stat_file_operations; - return single_open(file, mem_control_stat_show, cont); -} - - - static struct cftype mem_cgroup_files[] = { { 
.name = "usage_in_bytes", @@ -1041,7 +1023,7 @@ static struct cftype mem_cgroup_files[] }, { .name = "stat", - .open = mem_control_stat_open, + .read_map = mem_control_stat_show, }, }; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 05/10] CGroup API files: Update cpusets to use cgroup structured file API
Many of the cpusets control files are simple integer values, which don't require the overhead of memory allocations for reads and writes. Move the handlers for these control files into cpuset_read_u64() and cpuset_write_u64(). Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- kernel/cpuset.c | 155 +--- 1 file changed, 81 insertions(+), 74 deletions(-) Index: cgroup-2.6.25-rc2-mm1/kernel/cpuset.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cpuset.c +++ cgroup-2.6.25-rc2-mm1/kernel/cpuset.c @@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void } /* - * Call with cgroup_mutex held. - */ - -static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) -{ - if (simple_strtoul(buf, NULL, 10) != 0) - cpuset_memory_pressure_enabled = 1; - else - cpuset_memory_pressure_enabled = 0; - return 0; -} - -/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, * CS_SCHED_LOAD_BALANCE, @@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable * Call with cgroup_mutex held. 
*/ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) { - int turning_on; struct cpuset trialcs; int err; int cpus_nonempty, balance_flag_changed; - turning_on = (simple_strtoul(buf, NULL, 10) != 0); - trialcs = *cs; if (turning_on) set_bit(bit, &trialcs.flags); @@ -1247,43 +1232,65 @@ static ssize_t cpuset_common_file_write( case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + cgroup_unlock(); +out1: + kfree(buffer); + return retval; +} + +static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + + switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: - retval = update_memory_pressure_enabled(cs, buffer); + cpuset_memory_pressure_enabled = !!val; break; case FILE_MEMORY_PRESSURE: retval = -EACCES; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + retval = update_flag(CS_SPREAD_PAGE, cs, val); cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + retval = 
update_flag(CS_SPREAD_SLAB, cs, val); cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; - goto out2; + break; } - - if (retval == 0) - retval = nbytes; -out2: cgroup_unlock(); -out1: - kfree(buffer); return retval; } @@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_CPU_EXCLUSIVE: - *s++ = is_cpu_exclusive(cs) ? '1' : '0'; - break; - case FILE_MEM_EXCLUSIVE: - *s++ = is_mem_exclusive(cs) ? '1' : '0'; - break; - case FILE_SCHED_LOAD_BALANCE: - *s++ = is_sched_load_balance(cs) ? '1' : '0'; - break; - case FILE_MEMORY_MIGRATE: -
[PATCH 09/10] CGroup API files: Move "releasable" to cgroup_debug subsystem
The "releasable" control file provided by the cgroup framework exports the state of a per-cgroup flag that's related to the notify-on-release feature. This isn't really generally useful, unless you're trying to debug this particular feature of cgroups. This patch moves the "releasable" file to the cgroup_debug subsystem. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 11 +++ kernel/cgroup.c| 23 --- kernel/cgroup_debug.c | 12 +++- 3 files changed, 22 insertions(+), 24 deletions(-) Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h === --- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h +++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h @@ -88,6 +88,17 @@ static inline void css_put(struct cgroup __css_put(css); } +/* bits in struct cgroup flags field */ +enum { + /* Control Group is dead */ + CGRP_REMOVED, + /* Control Group has previously had a child cgroup or a task, +* but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ + CGRP_RELEASABLE, + /* Control Group requires release notifications to userspace */ + CGRP_NOTIFY_ON_RELEASE, +}; + struct cgroup { unsigned long flags;/* "unsigned long" so bitops work */ Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c @@ -119,17 +119,6 @@ static int root_count; */ static int need_forkexit_callback; -/* bits in struct cgroup flags field */ -enum { - /* Control Group is dead */ - CGRP_REMOVED, - /* Control Group has previously had a child cgroup or a task, -* but no longer (only if CGRP_NOTIFY_ON_RELEASE is set) */ - CGRP_RELEASABLE, - /* Control Group requires release notifications to userspace */ - CGRP_NOTIFY_ON_RELEASE, -}; - /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) { @@ -1299,7 +1288,6 @@ enum cgroup_filetype { FILE_DIR, FILE_TASKLIST, FILE_NOTIFY_ON_RELEASE, - FILE_RELEASABLE, FILE_RELEASE_AGENT, }; @@ -2169,11 +2157,6 @@ static u64 
cgroup_read_notify_on_release return notify_on_release(cgrp); } -static u64 cgroup_read_releasable(struct cgroup *cgrp, struct cftype *cft) -{ - return test_bit(CGRP_RELEASABLE, &cgrp->flags); -} - /* * for the common functions, 'private' gives the type of file */ @@ -2193,12 +2176,6 @@ static struct cftype files[] = { .write = cgroup_common_file_write, .private = FILE_NOTIFY_ON_RELEASE, }, - - { - .name = "releasable", - .read_u64 = cgroup_read_releasable, - .private = FILE_RELEASABLE, - } }; static struct cftype cft_release_agent = { Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup_debug.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup_debug.c +++ cgroup-2.6.25-rc2-mm1/kernel/cgroup_debug.c @@ -1,5 +1,5 @@ /* - * kernel/ccontainer_debug.c - Example cgroup subsystem that + * kernel/cgroup_debug.c - Example cgroup subsystem that * exposes debug info * * Copyright (C) Google Inc, 2007 @@ -62,6 +62,11 @@ static u64 current_css_set_refcount_read return count; } +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + static struct cftype files[] = { { .name = "cgroup_refcount", @@ -81,6 +86,11 @@ static struct cftype files[] = { .name = "current_css_set_refcount", .read_u64 = current_css_set_refcount_read, }, + + { + .name = "releasable", + .read_u64 = releasable_read, + } }; static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 10/10] CGroup API files: Make CGROUP_DEBUG default to off
The cgroup debug subsystem isn't generally useful for users. It should default to "n". Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- init/Kconfig |1 + 1 file changed, 1 insertion(+) Index: cgroup-2.6.25-rc2-mm1/init/Kconfig === --- cgroup-2.6.25-rc2-mm1.orig/init/Kconfig +++ cgroup-2.6.25-rc2-mm1/init/Kconfig @@ -284,6 +284,7 @@ config CGROUPS config CGROUP_DEBUG bool "Example debug cgroup subsystem" depends on CGROUPS + default n help This option enables a simple cgroup subsystem that exports useful debugging information about the cgroups -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 01/10] CGroup API files: Rename read/write_uint methods to read/write_u64
Several people have justifiably complained that the "_uint" suffix is inappropriate for functions that handle u64 values, so this patch just renames all these functions and their users to have the suffix _u64. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h |8 kernel/cgroup.c| 32 kernel/cgroup_debug.c |8 kernel/sched.c | 18 +- 4 files changed, 33 insertions(+), 33 deletions(-) Index: cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h === --- cgroup-2.6.25-rc2-mm1.orig/include/linux/cgroup.h +++ cgroup-2.6.25-rc2-mm1/include/linux/cgroup.h @@ -190,20 +190,20 @@ struct cftype { struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); /* -* read_uint() is a shortcut for the common case of returning a +* read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ - u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); + u64 (*read_u64) (struct cgroup *cont, struct cftype *cft); ssize_t (*write) (struct cgroup *cont, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); /* -* write_uint() is a shortcut for the common case of accepting +* write_u64() is a shortcut for the common case of accepting * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. 
*/ - int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val); + int (*write_u64) (struct cgroup *cont, struct cftype *cft, u64 val); int (*release) (struct inode *inode, struct file *file); }; Index: cgroup-2.6.25-rc2-mm1/kernel/cgroup.c === --- cgroup-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cgroup-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1303,10 +1303,10 @@ enum cgroup_filetype { FILE_RELEASE_AGENT, }; -static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, -struct file *file, -const char __user *userbuf, -size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_u64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) { char buffer[64]; int retval = 0; @@ -1330,7 +1330,7 @@ static ssize_t cgroup_write_uint(struct return -EINVAL; /* Pass to subsystem */ - retval = cft->write_uint(cgrp, cft, val); + retval = cft->write_u64(cgrp, cft, val); if (!retval) retval = nbytes; return retval; @@ -1411,18 +1411,18 @@ static ssize_t cgroup_file_write(struct return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); - if (cft->write_uint) - return cgroup_write_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_u64) + return cgroup_write_u64(cgrp, cft, file, buf, nbytes, ppos); return -EINVAL; } -static ssize_t cgroup_read_uint(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { char tmp[64]; - u64 val = cft->read_uint(cgrp, cft); + u64 val = cft->read_u64(cgrp, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); @@ -1482,8 +1482,8 @@ static ssize_t cgroup_file_read(struct f if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); - if 
(cft->read_uint) - return cgroup_read_uint(cgrp, cft, file, buf, nbytes, ppos); + if (cft->read_u64) + return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2141,14 +2141,14 @@ static struct cftype files[] = { { .name = "notify_on_release", - .read_uint = cgroup_read_notify_on_release, + .read_u64 = cgroup_read_notify_on_release, .write = cgr
[PATCH 03/10] CGroup API files: Use read_u64 in memory controller
Update the memory controller to use read_u64 for its limit/usage/failcnt control files, calling the new res_counter_read_u64() function. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) Index: cgroup-2.6.25-rc2-mm1/mm/memcontrol.c === --- cgroup-2.6.25-rc2-mm1.orig/mm/memcontrol.c +++ cgroup-2.6.25-rc2-mm1/mm/memcontrol.c @@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf, return 0; } -static ssize_t mem_cgroup_read(struct cgroup *cont, - struct cftype *cft, struct file *file, - char __user *userbuf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - NULL); + return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, + cft->private); } static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, @@ -1024,18 +1021,18 @@ static struct cftype mem_cgroup_files[] { .name = "usage_in_bytes", .private = RES_USAGE, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read, }, { .name = "limit_in_bytes", .private = RES_LIMIT, .write = mem_cgroup_write, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read, }, { .name = "failcnt", .private = RES_FAILCNT, - .read = mem_cgroup_read, + .read_u64 = mem_cgroup_read, }, { .name = "force_empty", -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/10] Task Containers(V11): Add tasks file interface
This patch adds the per-directory "tasks" file for containerfs mounts; this allows the user to determine which tasks are members of a container by reading a container's "tasks", and to move a task into a container by writing its pid to its "tasks". Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h | 10 + kernel/container.c| 359 +- 2 files changed, 368 insertions(+), 1 deletion(-) Index: container-2.6.22-rc6-mm1/include/linux/container.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container.h +++ container-2.6.22-rc6-mm1/include/linux/container.h @@ -144,6 +144,16 @@ int container_is_removed(const struct co int container_path(const struct container *cont, char *buf, int buflen); +int __container_task_count(const struct container *cont); +static inline int container_task_count(const struct container *cont) +{ + int task_count; + rcu_read_lock(); + task_count = __container_task_count(cont); + rcu_read_unlock(); + return task_count; +} + /* Return true if the container is a descendant of the current container */ int container_is_descendant(const struct container *cont); Index: container-2.6.22-rc6-mm1/kernel/container.c === --- container-2.6.22-rc6-mm1.orig/kernel/container.c +++ container-2.6.22-rc6-mm1/kernel/container.c @@ -40,7 +40,7 @@ #include #include #include - +#include #include /* Generate an array of container subsystem pointers */ @@ -704,6 +704,127 @@ int container_path(const struct containe return 0; } +/* + * Return the first subsystem attached to a container's hierarchy, and + * its subsystem id. 
+ */ + +static void get_first_subsys(const struct container *cont, + struct container_subsys_state **css, int *subsys_id) +{ + const struct containerfs_root *root = cont->root; + const struct container_subsys *test_ss; + BUG_ON(list_empty(&root->subsys_list)); + test_ss = list_entry(root->subsys_list.next, +struct container_subsys, sibling); + if (css) { + *css = cont->subsys[test_ss->subsys_id]; + BUG_ON(!*css); + } + if (subsys_id) + *subsys_id = test_ss->subsys_id; +} + +/* + * Attach task 'tsk' to container 'cont' + * + * Call holding container_mutex. May take task_lock of + * the task 'pid' during call. + */ +static int attach_task(struct container *cont, struct task_struct *tsk) +{ + int retval = 0; + struct container_subsys *ss; + struct container *oldcont; + struct css_group *cg = &tsk->containers; + struct containerfs_root *root = cont->root; + int i; + int subsys_id; + + get_first_subsys(cont, NULL, &subsys_id); + + /* Nothing to do if the task is already in that container */ + oldcont = task_container(tsk, subsys_id); + if (cont == oldcont) + return 0; + + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cont, tsk); + if (retval) { + return retval; + } + } + } + + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + return -ESRCH; + } + /* Update the css_group pointers for the subsystems in this +* hierarchy */ + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { + if (root->subsys_bits & (1ull << i)) { + /* Subsystem is in this hierarchy. So we want +* the subsystem state from the new +* container. Transfer the refcount from the +* old to the new */ + atomic_inc(&cont->count); + atomic_dec(&cg->subsys[i]->container->count); + rcu_assign_pointer(cg->subsys[i], cont->subsys[i]); + } + } + task_unlock(tsk); + + for_each_subsys(root, ss) { + if (ss->attach) { + ss->attach(ss, cont, oldcont, tsk); + } + } + + synchronize_rcu(); + return 0; +} + +/* + * Attach task with pid 'pid' to container 'cont'. 
Call with + * container_mutex, may take task_lock of task + */ +static int attach_task_by_pid(struct container *cont, char *pidbuf) +{ + pid_t pid; + struct task_struct *tsk; + int ret; + + if (sscanf(pidbuf, "%d", &pid) != 1) +
[PATCH 05/10] Task Containers(V11): Add procfs interface
This patch adds: /proc/containers - general system info /proc/*/container - per-task container membership info Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- fs/proc/base.c|7 ++ include/linux/container.h |2 kernel/container.c| 132 ++ 3 files changed, 141 insertions(+) Index: container-2.6.22-rc6-mm1/fs/proc/base.c === --- container-2.6.22-rc6-mm1.orig/fs/proc/base.c +++ container-2.6.22-rc6-mm1/fs/proc/base.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -2050,6 +2051,9 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2341,6 +2345,9 @@ static const struct pid_entry tid_base_s #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL Index: container-2.6.22-rc6-mm1/kernel/container.c === --- container-2.6.22-rc6-mm1.orig/kernel/container.c +++ container-2.6.22-rc6-mm1/kernel/container.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -247,6 +248,7 @@ static int container_mkdir(struct inode static int container_rmdir(struct inode *unused_dir, struct dentry *dentry); static int container_populate_dir(struct container *cont); static struct inode_operations container_dir_inode_operations; +static struct file_operations proc_containerstats_operations; static struct inode *container_new_inode(mode_t mode, struct super_block *sb) { @@ -1567,6 +1569,7 @@ int __init container_init(void) { int err; int i; + struct proc_dir_entry *entry; for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { struct container_subsys *ss = subsys[i]; @@ -1578,10 +1581,139 @@ int __init 
container_init(void) if (err < 0) goto out; + entry = create_proc_entry("containers", 0, NULL); + if (entry) + entry->proc_fops = &proc_containerstats_operations; + out: return err; } +/* + * proc_container_show() + * - Print task's container paths into seq_file, one line for each hierarchy + * - Used for /proc/<pid>/container. + * - No need to task_lock(tsk) on this tsk->container reference, as it + *doesn't really matter if tsk->container changes after we read it, + *and we take container_mutex, keeping attach_task() from changing it + *anyway. No need to check that tsk->container != NULL, thanks to + *the_top_container_hack in container_exit(), which sets an exiting tasks + *container to top_container. + */ + +/* TODO: Use a proper seq_file iterator */ +static int proc_container_show(struct seq_file *m, void *v) +{ + struct pid *pid; + struct task_struct *tsk; + char *buf; + int retval; + struct containerfs_root *root; + + retval = -ENOMEM; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = 0; + + mutex_lock(&container_mutex); + + for_each_root(root) { + struct container_subsys *ss; + struct container *cont; + int subsys_id; + int count = 0; + + /* Skip this hierarchy if it has no active subsystems */ + if (!root->actual_subsys_bits) + continue; + for_each_subsys(root, ss) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + seq_putc(m, ':'); + get_first_subsys(&root->top_container, NULL, &subsys_id); + cont = task_container(tsk, subsys_id); + retval = container_path(cont, buf, PAGE_SIZE); + if (retval < 0) + goto out_unlock; + seq_puts(m, buf); + seq_putc(m, '\n'); + } + +out_unlock: + mutex_unlock(&container_mutex); + put_task_struct(tsk); +out_free: + kfree(buf); +out: + return retval; +} + +static int container_open(struct inode *inode, struct file *file)
[PATCH 06/10] Task Containers(V11): Shared container subsystem group arrays
This patch replaces the struct css_group embedded in task_struct with a pointer; all tasks that have the same set of memberships across all hierarchies will share a css_group object, and will be linked via their css_groups field to the "tasks" list_head in the css_group. Assuming that many tasks share the same container assignments, this reduces overall space usage and keeps the size of the task_struct down (three pointers added to task_struct compared to a non-containers kernel, no matter how many subsystems are registered). Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 14 include/linux/container.h| 89 +- include/linux/sched.h| 33 -- kernel/container.c | 606 +-- kernel/fork.c|1 5 files changed, 620 insertions(+), 123 deletions(-) Index: container-2.6.22-rc6-mm1/Documentation/containers.txt === --- container-2.6.22-rc6-mm1.orig/Documentation/containers.txt +++ container-2.6.22-rc6-mm1/Documentation/containers.txt @@ -176,7 +176,9 @@ Containers extends the kernel as follows subsystem state is something that's expected to happen frequently and in performance-critical code, whereas operations that require a task's actual container assignments (in particular, moving between - containers) are less common. + containers) are less common. A linked list runs through the cg_list + field of each task_struct using the css_group, anchored at + css_group->tasks. - A container hierarchy filesystem can be mounted for browsing and manipulation from user space. @@ -252,6 +254,16 @@ linear search to locate an appropriate e very efficient. A future version will use a hash table for better performance. +To allow access from a container to the css_groups (and hence tasks) +that comprise it, a set of cg_container_link objects form a lattice; +each cg_container_link is linked into a list of cg_container_links for +a single container on its cont_link_list field, and a list of +cg_container_links for a single css_group on its cg_link_list. 
+ +Thus the set of tasks in a container can be listed by iterating over +each css_group that references the container, and sub-iterating over +each css_group's task set. + The use of a Linux virtual file system (vfs) to represent the container hierarchy provides for a familiar permission and name space for containers, with a minimum of additional kernel code. Index: container-2.6.22-rc6-mm1/include/linux/container.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container.h +++ container-2.6.22-rc6-mm1/include/linux/container.h @@ -27,10 +27,19 @@ extern void container_lock(void); extern void container_unlock(void); extern void container_fork(struct task_struct *p); extern void container_fork_callbacks(struct task_struct *p); +extern void container_post_fork(struct task_struct *p); extern void container_exit(struct task_struct *p, int run_callbacks); extern struct file_operations proc_container_operations; +/* Define the enumeration of all container subsystems */ +#define SUBSYS(_x) _x ## _subsys_id, +enum container_subsys_id { +#include + CONTAINER_SUBSYS_COUNT +}; +#undef SUBSYS + /* Per-subsystem/per-container state maintained by the system. */ struct container_subsys_state { /* The container that this subsystem is attached to. Useful @@ -97,6 +106,52 @@ struct container { struct containerfs_root *root; struct container *top_container; + + /* +* List of cg_container_links pointing at css_groups with +* tasks in this container. Protected by css_group_lock +*/ + struct list_head css_groups; +}; + +/* A css_group is a structure holding pointers to a set of + * container_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire + * container set for a task. + */ + +struct css_group { + + /* Reference count */ + struct kref ref; + + /* +* List running through all container groups. 
Protected by +* css_group_lock +*/ + struct list_head list; + + /* +* List running through all tasks using this container +* group. Protected by css_group_lock +*/ + struct list_head tasks; + + /* +* List of cg_container_link objects on link chains from +* containers referenced from this css_group. Protected by +* css_group_lock +*/ + struct list_head cg_links; + + /* +* Set of subsystem states, one for each subsystem. This array +* is immutable after creation apart from the init_css_group +* during su
[PATCH 10/10] Task Containers(V11): Simple task container debug info subsystem
This example subsystem exports debugging information as an aid to diagnosing refcount leaks, etc, in the container framework. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |4 + init/Kconfig | 10 kernel/Makefile |1 kernel/container_debug.c | 97 +++ 4 files changed, 112 insertions(+) Index: container-2.6.22-rc6-mm1/include/linux/container_subsys.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container_subsys.h +++ container-2.6.22-rc6-mm1/include/linux/container_subsys.h @@ -19,4 +19,8 @@ SUBSYS(cpuacct) /* */ +#ifdef CONFIG_CONTAINER_DEBUG +SUBSYS(debug) +#endif + /* */ Index: container-2.6.22-rc6-mm1/init/Kconfig === --- container-2.6.22-rc6-mm1.orig/init/Kconfig +++ container-2.6.22-rc6-mm1/init/Kconfig @@ -303,6 +303,16 @@ config CONTAINERS Say N if unsure. +config CONTAINER_DEBUG + bool "Example debug container subsystem" + depends on CONTAINERS + help + This option enables a simple container subsystem that + exports useful debugging information about the containers + framework + + Say N if unsure + config CPUSETS bool "Cpuset support" depends on SMP && CONTAINERS Index: container-2.6.22-rc6-mm1/kernel/Makefile === --- container-2.6.22-rc6-mm1.orig/kernel/Makefile +++ container-2.6.22-rc6-mm1/kernel/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o +obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o Index: container-2.6.22-rc6-mm1/kernel/container_debug.c === --- /dev/null +++ container-2.6.22-rc6-mm1/kernel/container_debug.c @@ -0,0 +1,97 @@ +/* + * kernel/ccontainer_debug.c - Example container subsystem that + * exposes debug info + * + * Copyright (C) Google Inc, 2007 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) + * + */ + +#include +#include +#include +#include + +#include 
+ +static struct container_subsys_state *debug_create(struct container_subsys *ss, + struct container *cont) +{ + struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct container_subsys *ss, struct container *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 container_refcount_read(struct container *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 taskcount_read(struct container *cont, struct cftype *cft) +{ + u64 count; + + container_lock(); + count = container_task_count(cont); + container_unlock(); + return count; +} + +static u64 current_css_group_read(struct container *cont, struct cftype *cft) +{ + return (u64)(long)current->containers; +} + +static u64 current_css_group_refcount_read(struct container *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&current->containers->ref.refcount); + rcu_read_unlock(); + return count; +} + +static struct cftype files[] = { + { + .name = "container_refcount", + .read_uint = container_refcount_read, + }, + { + .name = "taskcount", + .read_uint = taskcount_read, + }, + + { + .name = "current_css_group", + .read_uint = current_css_group_read, + }, + + { + .name = "current_css_group_refcount", + .read_uint = current_css_group_refcount_read, + }, +}; + +static int debug_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_files(cont, ss, files, ARRAY_SIZE(files)); +} + +struct container_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 04/10] Task Containers(V11): Add container_clone() interface.
This patch adds support for container_clone(), a way to create new containers intended to be used for systems such as namespace unsharing. A new subsystem callback, post_clone(), is added to allow subsystems to automatically configure cloned containers. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt |7 ++ include/linux/container.h|3 kernel/container.c | 135 +++ 3 files changed, 145 insertions(+) Index: container-2.6.22-rc6-mm1/include/linux/container.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container.h +++ container-2.6.22-rc6-mm1/include/linux/container.h @@ -174,6 +174,7 @@ struct container_subsys { void (*exit)(struct container_subsys *ss, struct task_struct *task); int (*populate)(struct container_subsys *ss, struct container *cont); + void (*post_clone)(struct container_subsys *ss, struct container *cont); void (*bind)(struct container_subsys *ss, struct container *root); int subsys_id; int active; @@ -213,6 +214,8 @@ static inline struct container* task_con int container_path(const struct container *cont, char *buf, int buflen); +int container_clone(struct task_struct *tsk, struct container_subsys *ss); + #else /* !CONFIG_CONTAINERS */ static inline int container_init_early(void) { return 0; } Index: container-2.6.22-rc6-mm1/kernel/container.c === --- container-2.6.22-rc6-mm1.orig/kernel/container.c +++ container-2.6.22-rc6-mm1/kernel/container.c @@ -1675,3 +1675,138 @@ void container_exit(struct task_struct * tsk->containers = init_task.containers; task_unlock(tsk); } + +/** + * container_clone - duplicate the current container in the hierarchy + * that the given subsystem is attached to, and move this task into + * the new child + */ +int container_clone(struct task_struct *tsk, struct container_subsys *subsys) +{ + struct dentry *dentry; + int ret = 0; + char nodename[MAX_CONTAINER_TYPE_NAMELEN]; + struct container *parent, *child; + struct inode *inode; + struct css_group *cg; + struct containerfs_root *root; 
+ struct container_subsys *ss; + + /* We shouldn't be called by an unregistered subsystem */ + BUG_ON(!subsys->active); + + /* First figure out what hierarchy and container we're dealing +* with, and pin them so we can drop container_mutex */ + mutex_lock(&container_mutex); + again: + root = subsys->root; + if (root == &rootnode) { + printk(KERN_INFO + "Not cloning container for unused subsystem %s\n", + subsys->name); + mutex_unlock(&container_mutex); + return 0; + } + cg = &tsk->containers; + parent = task_container(tsk, subsys->subsys_id); + + snprintf(nodename, MAX_CONTAINER_TYPE_NAMELEN, "node_%d", tsk->pid); + + /* Pin the hierarchy */ + atomic_inc(&parent->root->sb->s_active); + + mutex_unlock(&container_mutex); + + /* Now do the VFS work to create a container */ + inode = parent->dentry->d_inode; + + /* Hold the parent directory mutex across this operation to +* stop anyone else deleting the new container */ + mutex_lock(&inode->i_mutex); + dentry = container_get_dentry(parent->dentry, nodename); + if (IS_ERR(dentry)) { + printk(KERN_INFO + "Couldn't allocate dentry for %s: %ld\n", nodename, + PTR_ERR(dentry)); + ret = PTR_ERR(dentry); + goto out_release; + } + + /* Create the container directory, which also creates the container */ + ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + child = __d_cont(dentry); + dput(dentry); + if (ret) { + printk(KERN_INFO + "Failed to create container %s: %d\n", nodename, + ret); + goto out_release; + } + + if (!child) { + printk(KERN_INFO + "Couldn't find new container %s\n", nodename); + ret = -ENOMEM; + goto out_release; + } + + /* The container now exists. Retake container_mutex and check +* that we're still in the same state that we thought we +* were. */ + mutex_lock(&container_mutex); + if ((root != subsys->root) || + (parent != task_container(tsk, subsys->subsys_id))) { + /* Aargh, we raced ... */ + mutex_unlock(&inode->i_mutex); + +
[PATCH 03/10] Task Containers(V11): Add fork()/exit() hooks
This patch adds the necessary hooks to the fork() and exit() paths to ensure that new children inherit their parent's container assignments, and that exiting processes release reference counts on their containers. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |6 ++ kernel/container.c| 121 ++ kernel/exit.c |2 kernel/fork.c | 14 - 4 files changed, 141 insertions(+), 2 deletions(-) Index: container-2.6.22-rc6-mm1/include/linux/container.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container.h +++ container-2.6.22-rc6-mm1/include/linux/container.h @@ -25,6 +25,9 @@ extern int container_init(void); extern void container_init_smp(void); extern void container_lock(void); extern void container_unlock(void); +extern void container_fork(struct task_struct *p); +extern void container_fork_callbacks(struct task_struct *p); +extern void container_exit(struct task_struct *p, int run_callbacks); /* Per-subsystem/per-container state maintained by the system. */ struct container_subsys_state { @@ -215,6 +218,9 @@ int container_path(const struct containe static inline int container_init_early(void) { return 0; } static inline int container_init(void) { return 0; } static inline void container_init_smp(void) {} +static inline void container_fork(struct task_struct *p) {} +static inline void container_fork_callbacks(struct task_struct *p) {} +static inline void container_exit(struct task_struct *p, int callbacks) {} static inline void container_lock(void) {} static inline void container_unlock(void) {} Index: container-2.6.22-rc6-mm1/kernel/container.c === --- container-2.6.22-rc6-mm1.orig/kernel/container.c +++ container-2.6.22-rc6-mm1/kernel/container.c @@ -132,6 +132,33 @@ list_for_each_entry(_ss, &_root->subsys_ #define for_each_root(_root) \ list_for_each_entry(_root, &roots, root_list) +/* Each task_struct has an embedded css_group, so the get/put + * operation simply takes a reference count on all the containers + * referenced by 
subsystems in this css_group. This can end up + * multiple-counting some containers, but that's OK - the ref-count is + * just a busy/not-busy indicator; ensuring that we only count each + * container once would require taking a global lock to ensure that no + * subsystems moved between hierarchies while we were doing so. + * + * Possible TODO: decide at boot time based on the number of + * registered subsystems and the number of CPUs or NUMA nodes whether + * it's better for performance to ref-count every subsystem, or to + * take a global lock and only add one ref count to each hierarchy. + */ +static void get_css_group(struct css_group *cg) +{ + int i; + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) + atomic_inc(&cg->subsys[i]->container->count); +} + +static void put_css_group(struct css_group *cg) +{ + int i; + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) + atomic_dec(&cg->subsys[i]->container->count); +} + /* * There is one global container mutex. We also require taking * task_lock() when dereferencing a task's container subsys pointers. @@ -1554,3 +1581,97 @@ int __init container_init(void) out: return err; } + +/** + * container_fork - attach newly forked task to its parents container. + * @tsk: pointer to task_struct of forking parent process. + * + * Description: A task inherits its parent's container at fork(). + * + * A pointer to the shared css_group was automatically copied in + * fork.c by dup_task_struct(). However, we ignore that copy, since + * it was not made under the protection of RCU or container_mutex, so + * might no longer be a valid container pointer. attach_task() might + * have already changed current->container, allowing the previously + * referenced container to be removed and freed. + * + * At the point that container_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. 
+ */ +void container_fork(struct task_struct *child) +{ + rcu_read_lock(); + child->containers = rcu_dereference(current->containers); + get_css_group(&child->containers); + rcu_read_unlock(); +} + +/** + * container_fork_callbacks - called on a new task very soon before + * adding it to the tasklist. No need to take any locks since no-one + * can be operating on this task + */ +void container_fork_callbacks(struct task_struct *child) +{ + if (need_forkexit_callback) { + int i; + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { + struct container_subsys *ss = subsys[i]; + if (ss->
[PATCH 09/10] Task Containers(V11): Example CPU accounting subsystem
This example demonstrates how to use the generic container subsystem for a simple resource tracker that counts, for the processes in a container, the total CPU time used and the %CPU used in the last complete 10 second interval. Portions contributed by Balbir Singh <[EMAIL PROTECTED]> Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |6 + include/linux/cpu_acct.h | 14 ++ init/Kconfig |7 + kernel/Makefile |1 kernel/cpu_acct.c| 186 +++ kernel/sched.c | 14 ++ 6 files changed, 225 insertions(+), 3 deletions(-) Index: container-2.6.22-rc6-mm1/include/linux/container_subsys.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container_subsys.h +++ container-2.6.22-rc6-mm1/include/linux/container_subsys.h @@ -13,4 +13,10 @@ SUBSYS(cpuset) /* */ +#ifdef CONFIG_CONTAINER_CPUACCT +SUBSYS(cpuacct) +#endif + +/* */ + /* */ Index: container-2.6.22-rc6-mm1/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.22-rc6-mm1/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.22-rc6-mm1/init/Kconfig === --- container-2.6.22-rc6-mm1.orig/init/Kconfig +++ container-2.6.22-rc6-mm1/init/Kconfig @@ -339,6 +339,13 @@ config PROC_PID_CPUSET depends on CPUSETS default y +config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + depends on CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.22-rc6-mm1/kernel/Makefile === --- container-2.6.22-rc6-mm1.orig/kernel/Makefile +++ container-2.6.22-rc6-mm1/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_KEXEC) += kexec.o 
obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o Index: container-2.6.22-rc6-mm1/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.22-rc6-mm1/kernel/cpu_acct.c @@ -0,0 +1,186 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh + * ([EMAIL PROTECTED]) + * + */ + +/* + * Example container subsystem for reporting total CPU usage of tasks in a + * container, along with percentage load over a time interval + */ + +#include +#include +#include +#include + +#include + +struct cpuacct { + struct container_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_of(task_subsys_state(task, cpuacct_subsys_id), + struct cpuacct, css); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) +{ + /* calculate the next interval boundary beyond the +* current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static struct container_subsys_state *cpuacct_create( + struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + + if (!ca) + return ERR_PTR(-ENOMEM); + 
spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + return &ca->css; +} + +static void cpuacct_destroy(struct container_subsys *ss, + s
[PATCH 00/10] Task Containers(V11): Introduction
ainers-whitespace.patch containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships.patch containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships-fix.patch containersv10-share-css_group-arrays-between-tasks-with-same-container-memberships-cpuset-zero-malloc-fix-for-new-containers.patch containersv10-simple-debug-info-subsystem.patch containersv10-simple-debug-info-subsystem-fix.patch containersv10-simple-debug-info-subsystem-fix-2.patch containersv10-support-for-automatic-userspace-release-agents.patch containersv10-support-for-automatic-userspace-release-agents-whitespace.patch add-containerstats-v3.patch add-containerstats-v3-fix.patch update-getdelays-to-become-containerstats-aware.patch containers-implement-subsys-post_clone.patch containers-implement-namespace-tracking-subsystem-v3.patch Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 07/10] Task Containers(V11): Automatic userspace notification of idle containers
This patch adds the following files to the container filesystem: notify_on_release - configures/reports whether the container subsystem should attempt to run a release script when this container becomes unused release_agent - configures/reports the release agent to be used for this hierarchy (top level in each hierarchy only) releasable - reports whether this container would have been auto-released if notify_on_release was true and a release agent was configured (mainly useful for debugging) To avoid locking issues, invoking the userspace release agent is done via a workqueue task; containers that need to have their release agents invoked by the workqueue task are linked on to a list. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h | 11 - kernel/container.c| 425 +- 2 files changed, 393 insertions(+), 43 deletions(-) Index: container-2.6.22-rc6-mm1/include/linux/container.h === --- container-2.6.22-rc6-mm1.orig/include/linux/container.h +++ container-2.6.22-rc6-mm1/include/linux/container.h @@ -77,10 +77,11 @@ static inline void css_get(struct contai * css_get() */ +extern void __css_put(struct container_subsys_state *css); static inline void css_put(struct container_subsys_state *css) { if (!test_bit(CSS_ROOT, &css->flags)) - atomic_dec(&css->refcnt); + __css_put(css); } struct container { @@ -112,6 +113,13 @@ struct container { * tasks in this container. Protected by css_group_lock */ struct list_head css_groups; + + /* +* Linked list running through all containers that can +* potentially be reaped by the release agent. 
Protected by +* release_list_lock +*/ + struct list_head release_list; }; /* A css_group is a structure holding pointers to a set of @@ -285,7 +293,6 @@ struct task_struct *container_iter_next( struct container_iter *it); void container_iter_end(struct container *cont, struct container_iter *it); - #else /* !CONFIG_CONTAINERS */ static inline int container_init_early(void) { return 0; } Index: container-2.6.22-rc6-mm1/kernel/container.c === --- container-2.6.22-rc6-mm1.orig/kernel/container.c +++ container-2.6.22-rc6-mm1/kernel/container.c @@ -44,6 +44,8 @@ #include #include +static DEFINE_MUTEX(container_mutex); + /* Generate an array of container subsystem pointers */ #define SUBSYS(_x) &_x ## _subsys, @@ -82,6 +84,13 @@ struct containerfs_root { /* Hierarchy-specific flags */ unsigned long flags; + + /* The path to use for release notifications. No locking +* between setting and use - so if userspace updates this +* while subcontainers exist, you could miss a +* notification. We ensure that it's always a valid +* NUL-terminated string */ + char release_agent_path[PATH_MAX]; }; @@ -109,7 +118,13 @@ static int need_forkexit_callback; /* bits in struct container flags field */ enum { + /* Container is dead */ CONT_REMOVED, + /* Container has previously had a child container or a task, +* but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */ + CONT_RELEASABLE, + /* Container requires release notifications to userspace */ + CONT_NOTIFY_ON_RELEASE, }; /* convenient tests for these bits */ @@ -123,6 +138,19 @@ enum { ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ }; +inline int container_is_releasable(const struct container *cont) +{ + const int bits = + (1 << CONT_RELEASABLE) | + (1 << CONT_NOTIFY_ON_RELEASE); + return (cont->flags & bits) == bits; +} + +inline int notify_on_release(const struct container *cont) +{ + return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags); +} + /* * for_each_subsys() allows you to iterate on each subsystem attached to * 
an active hierarchy @@ -134,6 +162,14 @@ list_for_each_entry(_ss, &_root->subsys_ #define for_each_root(_root) \ list_for_each_entry(_root, &roots, root_list) +/* the list of containers eligible for automatic release. Protected by + * release_list_lock */ +static LIST_HEAD(release_list); +static DEFINE_SPINLOCK(release_list_lock); +static void container_release_agent(struct work_struct *work); +static DECLARE_WORK(release_agent_work, container_release_agent); +static void check_for_release(struct container *cont); + /* Link structure for associating css_group objects with containers */ struct cg_container_link { /* @@ -188,11 +224,8 @@ static int use_task_css_group_links; /* * unlink a css_group from the list and free it */ -static void release_css_group(
[PATCH 0/7] containers (V7): Generic Process Containers
-- This is an update to my multi-hierarchy multi-subsystem generic process containers patch. Changes since V6 (22nd December) include: - updated to 2.6.20 - added more details about multiple hierarchy support in the documentation - reduced the per-task memory overhead to one pointer (previously it was one pointer for each hierarchy). Now each task has a pointer to a container_group, which holds the pointers to the containers (one per active hierarchy) that the task is attached to and the associated per-subsystem state (one per active subsystem). This container group is shared (with reference counts) between all tasks that have the same set of container mappings. - added API support for binding/unbinding subsystems to/from active hierarchies, by remounting with -oremount,<subsystem-list>. Currently this fails with EBUSY if the hierarchy has any child containers; full implementation support is left to a later patch. - added a bind() subsystem callback to indicate when a subsystem is moved between hierarchies - added container_clone(subsys, task), which creates a child container for the hierarchy that the specified subsystem is bound to, and moves the given task into that container. An example use of this would be in sys_unshare, which could, if the namespace container subsystem is active, create a child container when the new namespace is created. - temporarily removed the "release agent" support. It's only currently used by CPUsets, and intrudes somewhat on the per-container reference counting. If necessary it can be re-added, either as a generic subsystem feature or a CPUset-specific feature, via a kernel thread that periodically polls containers that have been designated as notify_on_release to see if they are releasable. Generic Process Containers -- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy containers, and others.
These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. Already existing in the kernel is the cpuset subsystem; this has a process grouping mechanism that is mature, tested, and well documented (particularly with regards to synchronization rules). This patchset extracts the process grouping code from cpusets into a generic container system, and makes the cpusets code a client of the container system. It also provides several example clients of the container system, including ResGroups, BeanCounters and namespace proxy. The change is implemented in three stages, plus four example subsystems that aren't necessarily intended to be merged as part of this patch set, but demonstrate the applicability of the framework. 1) extract the process grouping code from cpusets into a standalone system 2) remove the process grouping code from cpusets and hook into the container system 3) convert the container system to present a generic multi-hierarchy API, and make cpusets a client of that API 4) example of a simple CPU accounting container subsystem 5) example of implementing ResGroups and its numtasks controller over generic containers 6) example of implementing BeanCounters and its numfiles counter over generic containers 7) example of integrating the namespace isolation code (sys_unshare() or various clone flags) with generic containers, allowing virtual servers to take advantage of other resource control efforts. The intention is that the various resource management and virtualization efforts can also become container clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test out e.g. 
the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel Signed-off-by: Paul Menage <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/7] containers (V7): Simple CPU accounting container subsystem
This demonstrates how to use the generic container subsystem for a simple resource tracker that counts the total CPU time used by all processes in a container, during the time that they're members of the container. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cpu_acct.h | 14 +++ init/Kconfig |7 + kernel/Makefile |1 kernel/cpu_acct.c| 213 +++ kernel/sched.c | 14 ++- 5 files changed, 246 insertions(+), 3 deletions(-) Index: container-2.6.20/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.20/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.20/init/Kconfig === --- container-2.6.20.orig/init/Kconfig +++ container-2.6.20/init/Kconfig @@ -290,6 +290,13 @@ config PROC_PID_CPUSET depends on CPUSETS default y +config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + select CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.20/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.20/kernel/cpu_acct.c @@ -0,0 +1,213 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh + * ([EMAIL PROTECTED]) + * + */ + +/* + * Container subsystem for reporting total CPU usage of tasks in a + * container, along with percentage load over a time interval + */ + +#include +#include +#include +#include + +struct cpuacct { + struct container_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next 
load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +static struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, &cpuacct_subsys), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_ca(task_container(task, &cpuacct_subsys)); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) { + /* calculate the next interval boundary beyond the +* current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static int cpuacct_create(struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return -ENOMEM; + spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + cont->subsys[cpuacct_subsys.subsys_id] = &ca->css; + return 0; +} + +static void cpuacct_destroy(struct container_subsys *ss, + struct container *cont) +{ + kfree(container_ca(cont)); +} + +/* Lazily update the load calculation if necessary. 
Called with ca locked */ +static void cpuusage_update(struct cpuacct *ca) +{ + u64 now = get_jiffies_64(); + /* If we're not due for an update, return */ + if (ca->next_interval_check > now) + return; + + if (ca->next_interval_check <= (now - INTERVAL)) { + /* If it's been more than an interval since the last +* check, then catch up - the last interval must have +* been zero load */ + ca->last_interval_time = 0; + ca->next_interval_check = next_interval_boundary(now); + } else { + /* If a steal takes the last interval time negative, +* then we just ignore it */ + if ((s64)ca->current_interval_time > 0) { + ca->last_interval_time = ca->current_interval_time; + } else { + ca->last_interval_time = 0; + } + ca->next_interval_check += INTERVAL
[PATCH 5/7] containers (V7): Resource Groups over generic containers
This patch provides the RG core and numtasks controller as container subsystems, intended as an example of how to implement a more complex resource control system over generic process containers. The changes to the core involve primarily removing the group management, task membership and configfs support and adding interface layers to talk to the generic container layer instead. Each resource controller becomes an independent container subsystem; the RG core is essentially a library that the resource controllers can use to provide the RG API to userspace. Rather than a single shares and stats file in each group, there's a <controller>_shares and a <controller>_stats file, each linked to the appropriate resource controller. include/linux/moduleparam.h | 12 - include/linux/numtasks.h | 28 ++ include/linux/res_group.h| 87 include/linux/res_group_rc.h | 97 init/Kconfig | 22 ++ kernel/Makefile |1 kernel/fork.c|7 kernel/res_group/Makefile|2 kernel/res_group/local.h | 38 +++ kernel/res_group/numtasks.c | 467 +++ kernel/res_group/res_group.c | 160 ++ kernel/res_group/rgcs.c | 302 +++ kernel/res_group/shares.c| 228 13 files changed, 1447 insertions(+), 4 deletions(-) Index: container-2.6.20/include/linux/moduleparam.h === --- container-2.6.20.orig/include/linux/moduleparam.h +++ container-2.6.20/include/linux/moduleparam.h @@ -78,11 +78,17 @@ struct kparam_array /* Helper functions: type is byte, short, ushort, int, uint, long, ulong, charp, bool or invbool, or XXX if you define param_get_XXX, param_set_XXX and param_check_XXX.
*/ -#define module_param_named(name, value, type, perm) \ - param_check_##type(name, &(value));\ - module_param_call(name, param_set_##type, param_get_##type, &value, perm); \ +#define module_param_named_call(name, value, type, set, perm) \ + param_check_##type(name, &(value)); \ + module_param_call(name, set, param_get_##type, &(value), perm); \ __MODULE_PARM_TYPE(name, #type) +#define module_param_named(name, value, type, perm) \ + module_param_named_call(name, value, type, param_set_##type, perm) + +#define module_param_set_call(name, type, setfn, perm) \ + module_param_named_call(name, name, type, setfn, perm) + #define module_param(name, type, perm) \ module_param_named(name, name, type, perm) Index: container-2.6.20/include/linux/numtasks.h === --- /dev/null +++ container-2.6.20/include/linux/numtasks.h @@ -0,0 +1,28 @@ +/* numtasks.h - No. of tasks resource controller for Resource Groups + * + * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides No. of tasks resource controller for Resource Groups + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ +#ifndef _LINUX_NUMTASKS_H +#define _LINUX_NUMTASKS_H + +#ifdef CONFIG_RES_GROUPS_NUMTASKS +#include + +extern int numtasks_allow_fork(struct task_struct *); + +#else /* CONFIG_RES_GROUPS_NUMTASKS */ + +#define numtasks_allow_fork(task) (0) + +#endif /* CONFIG_RES_GROUPS_NUMTASKS */ +#endif /* _LINUX_NUMTASKS_H */ Index: container-2.6.20/include/linux/res_group.h === --- /dev/null +++ container-2.6.20/include/linux/res_group.h @@ -0,0 +1,87 @@ +/* + * res_group.h - Header file to be used by Resource Groups + * + * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 + * (C) Shailabh Nagar, IBM Corp. 
2003, 2004 + * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides data structures, macros and kernel APIs + * + * More details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _LINUX_RES_GROUP_H +#define _LINUX_RES_GROUP_H + +#ifdef CONFIG_RES_GROUPS +#include +#include +#include +#include + +#define SHARE_UNCHANGED(-1)/* implicitly specified by userspace, +* never stored in a resource group' +* shares struct; never displayed */ +#define SHARE_UNSUPPORTED (-2)
[PATCH 7/7] containers (V7): Container interface to nsproxy subsystem
When a task enters a new namespace via a clone() or unshare(), a new container is created and the task moves into it. Developed by Serge Hallyn <[EMAIL PROTECTED]>, adapted by Paul Menage <[EMAIL PROTECTED]> --- include/linux/nsproxy.h |6 ++ init/Kconfig|9 +++ kernel/Makefile |1 kernel/fork.c |4 + kernel/ns_container.c | 110 kernel/nsproxy.c|6 ++ 6 files changed, 136 insertions(+) Index: container-2.6.20/include/linux/nsproxy.h === --- container-2.6.20.orig/include/linux/nsproxy.h +++ container-2.6.20/include/linux/nsproxy.h @@ -53,4 +53,10 @@ static inline void exit_task_namespaces( put_nsproxy(ns); } } +#ifdef CONFIG_CONTAINER_NS +int ns_container_clone(struct task_struct *tsk); +#else +static inline int ns_container_clone(struct task_struct *tsk) { return 0; } +#endif + #endif Index: container-2.6.20/init/Kconfig === --- container-2.6.20.orig/init/Kconfig +++ container-2.6.20/init/Kconfig @@ -297,6 +297,15 @@ config CONTAINER_CPUACCT Provides a simple Resource Controller for monitoring the total CPU consumed by the tasks in a container +config CONTAINER_NS +bool "Namespace container subsystem" +select CONTAINERS +help + Provides a simple namespace container subsystem to + provide hierarchical naming of sets of namespaces, + for instance virtual servers and checkpoint/restart + jobs. 
+ config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.20/kernel/Makefile === --- container-2.6.20.orig/kernel/Makefile +++ container-2.6.20/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o +obj-$(CONFIG_CONTAINER_NS) += ns_container.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o Index: container-2.6.20/kernel/fork.c === --- container-2.6.20.orig/kernel/fork.c +++ container-2.6.20/kernel/fork.c @@ -1661,6 +1661,9 @@ asmlinkage long sys_unshare(unsigned lon err = -ENOMEM; goto bad_unshare_cleanup_ipc; } + err = ns_container_clone(current); + if (err) + goto bad_unshare_cleanup_dupns; } if (new_fs || new_ns || new_mm || new_fd || new_ulist || @@ -1715,6 +1718,7 @@ asmlinkage long sys_unshare(unsigned lon task_unlock(current); } + bad_unshare_cleanup_dupns: if (new_nsproxy) put_nsproxy(new_nsproxy); Index: container-2.6.20/kernel/ns_container.c === --- /dev/null +++ container-2.6.20/kernel/ns_container.c @@ -0,0 +1,110 @@ +/* + * ns_container.c - namespace container subsystem + * + * Copyright IBM, 2006 + */ + +#include +#include +#include + +struct nscont { + struct container_subsys_state css; + spinlock_t lock; +}; + +static struct container_subsys ns_subsys; + +static inline struct nscont *container_nscont(struct container *cont) +{ + return container_of(container_subsys_state(cont, &ns_subsys), + struct nscont, css); +} + +int ns_container_clone(struct task_struct *tsk) +{ + return container_clone(tsk, &ns_subsys); +} + +/* + * Rules: + * 1. you can only enter a container which is a child of your current + * container + * 2. you can only place another process into a container if + * a. you have CAP_SYS_ADMIN + * b. 
your container is an ancestor of tsk's destination container + * (hence either you are in the same container as tsk, or in an + *ancestor container thereof) + */ +int ns_can_attach(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk) +{ + struct container *c; + + if (current != tsk) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!container_is_descendant(cont)) + return -EPERM; + } + + if (atomic_read(&cont->count) != 0) + return -EPERM; + + c = task_container(tsk, &ns_subsys); + if (c && c != cont->parent) + return -EPERM; + + return 0; +} + +/* + * Rules: you can only create a container if + * 1. you are capable(CAP_SYS_ADMIN) + * 2. the target container i
[PATCH 6/7] containers (V7): BeanCounters over generic process containers
This patch implements the BeanCounter resource control abstraction over generic process containers. It contains the beancounter core code, plus the numfiles resource counter. It doesn't currently contain any of the memory tracking code or the code for switching beancounter context in interrupts. Currently all the beancounters resource counters are lumped into a single hierarchy; ideally it would be possible for each resource counter to be a separate container subsystem, allowing them to be connected to different hierarchies. --- fs/file_table.c | 11 + include/bc/beancounter.h | 192 include/bc/misc.h| 27 +++ include/linux/fs.h |3 init/Kconfig |4 init/main.c |3 kernel/Makefile |1 kernel/bc/Kconfig| 17 ++ kernel/bc/Makefile |7 kernel/bc/beancounter.c | 371 +++ kernel/bc/misc.c | 56 +++ 11 files changed, 691 insertions(+), 1 deletion(-) Index: container-2.6.20/init/Kconfig === --- container-2.6.20.orig/init/Kconfig +++ container-2.6.20/init/Kconfig @@ -619,6 +619,10 @@ config STOP_MACHINE Need stop_machine() primitive. endmenu +menu "Beancounters" +source "kernel/bc/Kconfig" +endmenu + menu "Block layer" source "block/Kconfig" endmenu Index: container-2.6.20/kernel/Makefile === --- container-2.6.20.orig/kernel/Makefile +++ container-2.6.20/kernel/Makefile @@ -12,6 +12,7 @@ obj-y = sched.o fork.o exec_domain.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ +obj-$(CONFIG_BEANCOUNTERS) += bc/ obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) Index: container-2.6.20/kernel/bc/Kconfig === --- /dev/null +++ container-2.6.20/kernel/bc/Kconfig @@ -0,0 +1,17 @@ +config BEANCOUNTERS + bool "Enable resource accounting/control" + default n + select CONTAINERS + help + When Y this option provides accounting and allows configuring + limits for user's consumption of exhaustible system resources. 
+ The most important resource controlled by this patch is unswappable + memory (either mlock'ed or used by internal kernel structures and + buffers). The main goal of this patch is to protect processes + from running short of important resources because of accidental + misbehavior of processes or malicious activity aiming to ``kill'' + the system. It's worth mentioning that resource limits configured + by setrlimit(2) do not give an acceptable level of protection + because they cover only a small fraction of resources and work on a + per-process basis. Per-process accounting doesn't prevent malicious + users from spawning a lot of resource-consuming processes. Index: container-2.6.20/kernel/bc/Makefile === --- /dev/null +++ container-2.6.20/kernel/bc/Makefile @@ -0,0 +1,7 @@ +# +# kernel/bc/Makefile +# +# Copyright (C) 2006 OpenVZ SWsoft Inc. +# + +obj-y = beancounter.o misc.o Index: container-2.6.20/include/bc/beancounter.h === --- /dev/null +++ container-2.6.20/include/bc/beancounter.h @@ -0,0 +1,192 @@ +/* + * include/bc/beancounter.h + * + * Copyright (C) 2006 OpenVZ SWsoft Inc + * + */ + +#ifndef __BEANCOUNTER_H__ +#define __BEANCOUNTER_H__ + +#include + +enum { + BC_KMEMSIZE, + BC_PRIVVMPAGES, + BC_PHYSPAGES, + BC_NUMTASKS, + BC_NUMFILES, + + BC_RESOURCES +}; + +struct bc_resource_parm { + unsigned long barrier; + unsigned long limit; + unsigned long held; + unsigned long minheld; + unsigned long maxheld; + unsigned long failcnt; + +}; + +#ifdef __KERNEL__ + +#include +#include +#include +#include +#include + +#define BC_MAXVALUE((unsigned long)LONG_MAX) + +enum bc_severity { + BC_BARRIER, + BC_LIMIT, + BC_FORCE, +}; + +struct beancounter; + +#ifdef CONFIG_BEANCOUNTERS + +enum bc_attr_index { + BC_RES_HELD, + BC_RES_MAXHELD, + BC_RES_MINHELD, + BC_RES_BARRIER, + BC_RES_LIMIT, + BC_RES_FAILCNT, + + BC_ATTRS +}; + +struct bc_resource { + char*bcr_name; + int res_id; + + int (*bcr_init)(struct beancounter *bc, int res); + int (*bcr_change)(struct beancounter 
*bc, + unsigned long new_bar, unsigned long new_lim); + void(*bcr_barrier_hit)(struct beancounter *bc); + int (*bcr_limit_hit)(struct beancounter *bc, unsigned long val, + unsigned long flags); +
[PATCH 1/7] containers (V7): Generic container system abstracted from cpusets code
This patch creates a generic process container system based on (and parallel to) the cpusets code. At a coarse level it was created by copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any code that was cpuset-specific rather than applicable to any process container subsystem. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 229 +++ fs/proc/base.c |7 include/linux/container.h| 96 +++ include/linux/sched.h|5 init/Kconfig |9 init/main.c |3 kernel/Makefile |1 kernel/container.c | 1343 +++ kernel/exit.c|2 kernel/fork.c|3 10 files changed, 1697 insertions(+), 1 deletion(-) Index: container-2.6.20/fs/proc/base.c === --- container-2.6.20.orig/fs/proc/base.c +++ container-2.6.20/fs/proc/base.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -1870,6 +1871,9 @@ static struct pid_entry tgid_base_stuff[ #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2151,6 +2155,9 @@ static struct pid_entry tid_base_stuff[] #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL Index: container-2.6.20/include/linux/container.h === --- /dev/null +++ container-2.6.20/include/linux/container.h @@ -0,0 +1,96 @@ +#ifndef _LINUX_CONTAINER_H +#define _LINUX_CONTAINER_H +/* + * container interface + * + * Copyright (C) 2003 BULL SA + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + */ + +#include +#include +#include + +#ifdef CONFIG_CONTAINERS + +extern int number_of_containers; /* How many containers are defined in system? 
*/ + +extern int container_init_early(void); +extern int container_init(void); +extern void container_init_smp(void); +extern void container_fork(struct task_struct *p); +extern void container_exit(struct task_struct *p); + +extern struct file_operations proc_container_operations; + +extern void container_lock(void); +extern void container_unlock(void); + +extern void container_manage_lock(void); +extern void container_manage_unlock(void); + +struct container { + unsigned long flags;/* "unsigned long" so bitops work */ + + /* +* Count is atomic so can incr (fork) or decr (exit) without a lock. +*/ + atomic_t count; /* count tasks using this container */ + + /* +* We link our 'sibling' struct into our parent's 'children'. +* Our children link their 'sibling' into our 'children'. +*/ + struct list_head sibling; /* my parent's children */ + struct list_head children; /* my children */ + + struct container *parent; /* my parent */ + struct dentry *dentry; /* container fs entry */ +}; + +/* struct cftype: + * + * The files in the container filesystem mostly have a very simple read/write + * handling, some common function will take care of it. Nevertheless some cases + * (read tasks) are special and therefore I define this structure for every + * kind of file. 
+ * + * + * When reading/writing to a file: + * - the container to use in file->f_dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_dentry->d_fsdata + */ + +struct inode; +struct cftype { + char *name; + int private; + int (*open) (struct inode *inode, struct file *file); + ssize_t (*read) (struct container *cont, struct cftype *cft, +struct file *file, +char __user *buf, size_t nbytes, loff_t *ppos); + ssize_t (*write) (struct container *cont, struct cftype *cft, + struct file *file, + const char __user *buf, size_t nbytes, loff_t *ppos); + int (*release) (struct inode *inode, struct file *file); +}; + +int container_add_file(struct container *cont, const struct cftype *cft); + +int container_is_removed(const struct container *cont); + +#else /* !CONFIG_CONTAINERS */ + +static inline int container_init_early(void) { return 0; } +static inline int container_init(void) { return 0; } +static
[PATCH 0/6] Multi-hierarchy Process Containers
This is an update to my generic containers patch. The major change is support for multiple hierarchies of containers (up to a limit specified at build time). - The mount options passed when mounting a container filesystem indicate the set of controllers/subsystems that are wanted in the hierarchy - e.g. "mount -t container -o cpuset,numtasks container /foo" - Default is to try to mount all subsystems - if a hierarchy with the requested set of subsystems already exists then its superblock is reused - otherwise (as long as all the requested subsystems are currently not in use in any hierarchy) a new hierarchy is created. - hierarchies with more than one container (i.e. with any children of the root container) persist even when unmounted; - /proc/containers shows current hierarchy/subsystem details - /proc/<pid>/container shows one line for each active hierarchy Other changes include: - ported to 2.6.19-rc5 - per-subsystem/per-container state is no longer just a void * - it has some state maintained by the container framework (to handle moving subsystems in and out of hierarchies when they are created/released) Note that this hasn't yet undergone intensive testing following the multi-hierarchy introduction, but I wanted to get the basic idea out for comments. TODOs include: - figuring out a nice way to handle release notifications now that there are multiple hierarchies - There have recently been various proposals floating around for resource management/accounting subsystems in the kernel, including Res Groups, User BeanCounters and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, and all implement this grouping in different ways. Already existing in the kernel is the cpuset subsystem; this has a process grouping mechanism that is mature, tested, and well documented (particularly with regard to synchronization rules). 
This patchset extracts the process grouping code from cpusets into a generic container system, and makes the cpusets code a client of the container system. It also provides a very simple additional container subsystem to do per-container CPU usage accounting; this is primarily to demonstrate use of the container subsystem API, but is useful in its own right. The change is implemented in five stages plus an additional example patch: 1) extract the process grouping code from cpusets into a standalone system 2) remove the process grouping code from cpusets and hook into the container system 3) convert the container system to present a generic multi-hierarchy API, and make cpusets a client of that API 4) add a simple CPU accounting container subsystem as an example 5) add support for fork/exit callbacks iff some subsystem is interested in them 6) example of implementing ResGroups and its numtasks controller over generic containers - not intended to be applied with this patch set The intention is that the various resource management efforts can also become container clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test out e.g. the ResGroups CPU controller in conjunction with the UBC memory controller - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel Signed-off-by: Paul Menage <[EMAIL PROTECTED]> -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/6] Add generic multi-subsystem API to containers
This patch removes all cpuset-specific knowledge from the container system, replacing it with a generic API that can be used by multiple subsystems. Cpusets is adapted to be a container subsystem. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 224 ++-- include/linux/container.h| 70 - include/linux/cpuset.h | 16 - include/linux/mempolicy.h| 12 include/linux/sched.h|2 kernel/container.c | 589 ++- kernel/cpuset.c | 168 mm/mempolicy.c |2 8 files changed, 852 insertions(+), 231 deletions(-) Index: container-2.6.19-rc5/include/linux/container.h === --- container-2.6.19-rc5.orig/include/linux/container.h +++ container-2.6.19-rc5/include/linux/container.h @@ -14,8 +14,6 @@ #ifdef CONFIG_CONTAINERS -extern int number_of_containers; /* How many containers are defined in system? */ - extern int container_init_early(void); extern int container_init(void); extern void container_init_smp(void); @@ -30,6 +28,13 @@ extern void container_unlock(void); extern void container_manage_lock(void); extern void container_manage_unlock(void); +struct containerfs_root; + +/* Per-subsystem/per-container state maintained by the system. 
*/ +struct container_subsys_state { + struct container *container; +}; + struct container { unsigned long flags;/* "unsigned long" so bitops work */ @@ -46,11 +51,15 @@ struct container { struct list_head children; /* my children */ struct container *parent; /* my parent */ - struct dentry *dentry; /* container fs entry */ + struct dentry *dentry; /* container fs entry */ -#ifdef CONFIG_CPUSETS - struct cpuset *cpuset; -#endif + /* Private pointers for each registered subsystem */ + struct container_subsys_state *subsys[CONFIG_MAX_CONTAINER_SUBSYS]; + + int hierarchy; + + struct containerfs_root *root; + struct container *top_container; }; /* struct cftype: @@ -85,6 +94,55 @@ int container_add_file(struct container int container_is_removed(const struct container *cont); void container_set_release_agent_path(const char *path); +/* Container subsystem type. See Documentation/containers.txt for details */ + +struct container_subsys { + int (*create)(struct container_subsys *ss, + struct container *cont); + void (*destroy)(struct container_subsys *ss, struct container *cont); + int (*can_attach)(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk); + void (*attach)(struct container_subsys *ss, struct container *cont, + struct container *old_cont, struct task_struct *tsk); + void (*post_attach)(struct container_subsys *ss, + struct container *cont, + struct container *old_cont, + struct task_struct *tsk); + int (*populate)(struct container_subsys *ss, + struct container *cont); + + int subsys_id; +#define MAX_CONTAINER_TYPE_NAMELEN 32 + const char *name; + + /* Protected by RCU */ + int hierarchy; + + struct list_head sibling; +}; + +int container_register_subsys(struct container_subsys *subsys); + +static inline struct container_subsys_state *container_subsys_state( + struct container *cont, + struct container_subsys *ss) +{ + return cont->subsys[ss->subsys_id]; +} + +static inline struct container* task_container(struct task_struct *task, + 
struct container_subsys *ss) +{ + return rcu_dereference(task->container[ss->hierarchy]); +} + +static inline struct container_subsys_state *task_subsys_state( + struct task_struct *task, + struct container_subsys *ss) +{ + return container_subsys_state(task_container(task, ss), ss); +} + #else /* !CONFIG_CONTAINERS */ static inline int container_init_early(void) { return 0; } Index: container-2.6.19-rc5/include/linux/cpuset.h === --- container-2.6.19-rc5.orig/include/linux/cpuset.h +++ container-2.6.19-rc5/include/linux/cpuset.h @@ -60,16 +60,7 @@ static inline int cpuset_do_slab_mem_spr extern void cpuset_track_online_nodes(void); -extern int cpuset_can_attach_task(struct container *cont, - struct task_struct *tsk); -extern void cpuset_attach_task(struct container *cont, - struct task_struct *tsk); -extern void cpuset_post_attach_task(struct container *cont, - struct container *oldcont, - struc
[PATCH 4/6] Simple CPU accounting container subsystem
This demonstrates how to use the generic container subsystem for a simple resource tracker that counts the total CPU time used by all processes in a container, during the time that they're members of the container. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cpu_acct.h | 14 + init/Kconfig |7 ++ kernel/Makefile |1 kernel/cpu_acct.c| 117 +++ kernel/sched.c |6 ++ 5 files changed, 145 insertions(+) Index: container-2.6.19-rc5/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.19-rc5/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.19-rc5/init/Kconfig === --- container-2.6.19-rc5.orig/init/Kconfig +++ container-2.6.19-rc5/init/Kconfig @@ -263,6 +263,13 @@ config CPUSETS Say N if unsure. +config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + select CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.19-rc5/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.19-rc5/kernel/cpu_acct.c @@ -0,0 +1,117 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + */ + +/* + * Container subsystem for reporting total CPU usage of tasks in a + * container. 
+ */ + +#include +#include +#include +#include + +struct cpuacct { + struct container_subsys_state css; + spinlock_t lock; + cputime64_t time; // total time used by this class +}; + +static struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, &cpuacct_subsys), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_ca(task_container(task, &cpuacct_subsys)); +} + +static int cpuacct_create(struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) return -ENOMEM; + spin_lock_init(&ca->lock); + cont->subsys[cpuacct_subsys.subsys_id] = &ca->css; + return 0; +} + +static void cpuacct_destroy(struct container_subsys *ss, + struct container *cont) +{ + kfree(container_ca(cont)); +} + +static ssize_t cpuusage_read(struct container *cont, +struct cftype *cft, +struct file *file, +char __user *buf, +size_t nbytes, loff_t *ppos) +{ + struct cpuacct *ca = container_ca(cont); + cputime64_t time; + char usagebuf[64]; + char *s = usagebuf; + + spin_lock_irq(&ca->lock); + time = ca->time; + spin_unlock_irq(&ca->lock); + + time *= 1000; + do_div(time, HZ); + s += sprintf(s, "%llu", (unsigned long long) time); + + return simple_read_from_buffer(buf, nbytes, ppos, usagebuf, s - usagebuf); +} + +static struct cftype cft_usage = { + .name = "cpu_usage", + .read = cpuusage_read, +}; + +static int cpuacct_populate(struct container_subsys *ss, + struct container *cont) +{ + return container_add_file(cont, &cft_usage); +} + + +void cpuacct_charge(struct task_struct *task, cputime_t cputime) { + + struct cpuacct *ca; + unsigned long flags; + + if (cpuacct_subsys.subsys_id < 0) return; + rcu_read_lock(); + ca = task_ca(task); + if (ca) { + spin_lock_irqsave(&ca->lock, flags); + ca->time = cputime64_add(ca->time, cputime); + spin_unlock_irqrestore(&ca->lock, 
flags); + } + rcu_read_unlock(); +} + +static struct container_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = -1, +}; + + +int __init init_cpuacct(void) +{ + int id = container_register_subsys(&cpuacct_subsys); + return id < 0 ? id : 0; +} + +module_init(init_cpuac
[PATCH 5/6] Extension to container system to allow fork/exit callbacks
This patch adds fork/exit callbacks to container subsystems, and ensures that every registered container has received one fork callback for each task running in the system, and one exit callback for each task that exited since it was registered. Since the fork/exit path is performance sensitive, an RCU-protected flag indicates to the fork/exit hooks whether they need to take the callback mutex and scan the list of registered subsystems to look for fork/exit handlers. Documentation/containers.txt | 11 + include/linux/container.h|2 kernel/container.c | 89 +-- 3 files changed, 98 insertions(+), 4 deletions(-) Index: container-2.6.19-rc5/include/linux/container.h === --- container-2.6.19-rc5.orig/include/linux/container.h +++ container-2.6.19-rc5/include/linux/container.h @@ -108,6 +108,8 @@ struct container_subsys { struct container *cont, struct container *old_cont, struct task_struct *tsk); + void (*fork)(struct container_subsys *ss, struct task_struct *task); + void (*exit)(struct container_subsys *ss, struct task_struct *task); int (*populate)(struct container_subsys *ss, struct container *cont); Index: container-2.6.19-rc5/kernel/container.c === --- container-2.6.19-rc5.orig/kernel/container.c +++ container-2.6.19-rc5/kernel/container.c @@ -84,6 +84,21 @@ struct containerfs_root { static struct containerfs_root rootnode[CONFIG_MAX_CONTAINER_HIERARCHIES]; +/* This flag indicates whether tasks in the fork and exit paths should + * take callback_mutex and check for fork/exit handlers to call. This + * avoids us having to take locks in the fork/exit path if none of the + * subsystems need to be called. + * + * It is protected via RCU, with the invariant that a process in an + * rcu_read_lock() section will never see this as 0 if there are + * actually registered subsystems with a fork or exit + * handler. (Sometimes it may be 1 without there being any registered + * subsystems with such a handler, but such periods are safe and of + * short duration). 
+ */ + +static int need_forkexit_callback = 0; + /* bits in struct container flags field */ typedef enum { CONT_REMOVED, @@ -1505,11 +1520,40 @@ int container_register_subsys(struct con goto out; } dummytop->subsys[subsys_count]->container = dummytop; - subsys[subsys_count++] = new_subsys; + mutex_lock(&callback_mutex); + /* If this is the first subsystem that requested a fork or +* exit callback, tell our fork/exit hooks that they need to +* grab callback_mutex on every invocation. If they are +* running concurrently with this code, they will either not +* see the change now and go straight on, or they will see it +* and grab callback_mutex, which will deschedule them. Either +* way once synchronize_rcu() returns we know that all current +* and future forks will make the callbacks. */ + if (!need_forkexit_callback && + (new_subsys->fork || new_subsys->exit)) { + need_forkexit_callback = 1; + synchronize_rcu(); + } + + /* If this subsystem requested that it be notified with fork +* events, we should send it one now for every process in the +* system */ + if (new_subsys->fork) { + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + new_subsys->fork(new_subsys, p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } + subsys[subsys_count++] = new_subsys; + mutex_unlock(&callback_mutex); out: - mutex_unlock(&manage_mutex); - return retval; + mutex_unlock(&manage_mutex); + return retval; + } /** @@ -1532,7 +1576,16 @@ int container_register_subsys(struct con void container_fork(struct task_struct *child) { - int i; + int i, need_callback; + + rcu_read_lock(); + /* need_forkexit_callback will be true if we might need to do +* a callback */ + need_callback = rcu_dereference(need_forkexit_callback); + if (need_callback) { + rcu_read_unlock(); + mutex_lock(&callback_mutex); + } task_lock(current); for (i = 0; i < CONFIG_MAX_CONTAINER_HIERARCHIES; i++) { struct container *cont = current->container[i]; @@ -1540,7 +1593,20 @@ void 
container_fork(struct task_struct * child->container[i] = cont; atomic_inc(&cont->count); } + if (need_callback) { + for (i = 0; i < subsys_count; i++) { +
[PATCH 6/6] Resource Groups over generic containers
This patch provides the RG core and numtasks controller as container subsystems, intended as an example of how to implement a more complex resource control system over generic process containers. The changes to the core involve primarily removing the group management, task membership and configfs support and adding interface layers to talk to the generic container layer instead. Each resource controller becomes an independent container subsystem; the RG core is essentially a library that the resource controllers can use to provide the RG API to userspace. Rather than a single shares and stats file in each group, there's a _shares and a _stats file, each linked to the appropriate resource controller. include/linux/moduleparam.h | 12 - include/linux/numtasks.h | 28 ++ include/linux/res_group.h| 87 include/linux/res_group_rc.h | 98 + init/Kconfig | 22 ++ kernel/Makefile |1 kernel/fork.c|7 kernel/res_group/Makefile|2 kernel/res_group/local.h | 38 +++ kernel/res_group/numtasks.c | 467 +++ kernel/res_group/res_group.c | 162 ++ kernel/res_group/rgcs.c | 302 +++ kernel/res_group/shares.c| 228 13 files changed, 1450 insertions(+), 4 deletions(-) Index: container-2.6.19-rc5/include/linux/moduleparam.h === --- container-2.6.19-rc5.orig/include/linux/moduleparam.h +++ container-2.6.19-rc5/include/linux/moduleparam.h @@ -75,11 +75,17 @@ struct kparam_array /* Helper functions: type is byte, short, ushort, int, uint, long, ulong, charp, bool or invbool, or XXX if you define param_get_XXX, param_set_XXX and param_check_XXX. 
*/ -#define module_param_named(name, value, type, perm) \ - param_check_##type(name, &(value));\ - module_param_call(name, param_set_##type, param_get_##type, &value, perm); \ +#define module_param_named_call(name, value, type, set, perm) \ + param_check_##type(name, &(value)); \ + module_param_call(name, set, param_get_##type, &(value), perm); \ __MODULE_PARM_TYPE(name, #type) +#define module_param_named(name, value, type, perm) \ + module_param_named_call(name, value, type, param_set_##type, perm) + +#define module_param_set_call(name, type, setfn, perm) \ + module_param_named_call(name, name, type, setfn, perm) + #define module_param(name, type, perm) \ module_param_named(name, name, type, perm) Index: container-2.6.19-rc5/include/linux/numtasks.h === --- /dev/null +++ container-2.6.19-rc5/include/linux/numtasks.h @@ -0,0 +1,28 @@ +/* numtasks.h - No. of tasks resource controller for Resource Groups + * + * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides No. of tasks resource controller for Resource Groups + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ +#ifndef _LINUX_NUMTASKS_H +#define _LINUX_NUMTASKS_H + +#ifdef CONFIG_RES_GROUPS_NUMTASKS +#include + +extern int numtasks_allow_fork(struct task_struct *); + +#else /* CONFIG_RES_GROUPS_NUMTASKS */ + +#define numtasks_allow_fork(task) (0) + +#endif /* CONFIG_RES_GROUPS_NUMTASKS */ +#endif /* _LINUX_NUMTASKS_H */ Index: container-2.6.19-rc5/include/linux/res_group.h === --- /dev/null +++ container-2.6.19-rc5/include/linux/res_group.h @@ -0,0 +1,87 @@ +/* + * res_group.h - Header file to be used by Resource Groups + * + * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 + * (C) Shailabh Nagar, IBM Corp. 
2003, 2004 + * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides data structures, macros and kernel APIs + * + * More details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _LINUX_RES_GROUP_H +#define _LINUX_RES_GROUP_H + +#ifdef CONFIG_RES_GROUPS +#include +#include +#include +#include + +#define SHARE_UNCHANGED(-1)/* implicitly specified by userspace, +* never stored in a resource group' +* shares struct; never displayed */ +#define
[PATCH 1/6] Generic container system abstracted from cpusets code
This patch creates a generic process container system based on (and parallel to) the cpusets code. At a coarse level it was created by copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any code that was cpuset-specific rather than applicable to any process container subsystem. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 229 +++ fs/proc/base.c |7 include/linux/container.h| 96 +++ include/linux/sched.h|5 init/Kconfig |9 init/main.c |3 kernel/Makefile |1 kernel/container.c | 1343 +++ kernel/exit.c|2 kernel/fork.c|3 10 files changed, 1697 insertions(+), 1 deletion(-) Index: container-2.6.19-rc5/fs/proc/base.c === --- container-2.6.19-rc5.orig/fs/proc/base.c +++ container-2.6.19-rc5/fs/proc/base.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -1782,6 +1783,9 @@ static struct pid_entry tgid_base_stuff[ #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif @@ -2056,6 +2060,9 @@ static struct pid_entry tid_base_stuff[] #ifdef CONFIG_SCHEDSTATS INF("schedstat", S_IRUGO, pid_schedstat), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif Index: container-2.6.19-rc5/include/linux/container.h === --- /dev/null +++ container-2.6.19-rc5/include/linux/container.h @@ -0,0 +1,96 @@ +#ifndef _LINUX_CONTAINER_H +#define _LINUX_CONTAINER_H +/* + * container interface + * + * Copyright (C) 2003 BULL SA + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + */ + +#include +#include +#include + +#ifdef CONFIG_CONTAINERS + +extern int number_of_containers; /* How many containers are defined in system? 
*/ + +extern int container_init_early(void); +extern int container_init(void); +extern void container_init_smp(void); +extern void container_fork(struct task_struct *p); +extern void container_exit(struct task_struct *p); + +extern struct file_operations proc_container_operations; + +extern void container_lock(void); +extern void container_unlock(void); + +extern void container_manage_lock(void); +extern void container_manage_unlock(void); + +struct container { + unsigned long flags;/* "unsigned long" so bitops work */ + + /* +* Count is atomic so can incr (fork) or decr (exit) without a lock. +*/ + atomic_t count; /* count tasks using this container */ + + /* +* We link our 'sibling' struct into our parent's 'children'. +* Our children link their 'sibling' into our 'children'. +*/ + struct list_head sibling; /* my parent's children */ + struct list_head children; /* my children */ + + struct container *parent; /* my parent */ + struct dentry *dentry; /* container fs entry */ +}; + +/* struct cftype: + * + * The files in the container filesystem mostly have a very simple read/write + * handling, some common function will take care of it. Nevertheless some cases + * (read tasks) are special and therefore I define this structure for every + * kind of file. 
+ * + * + * When reading/writing to a file: + * - the container to use in file->f_dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_dentry->d_fsdata + */ + +struct inode; +struct cftype { + char *name; + int private; + int (*open) (struct inode *inode, struct file *file); + ssize_t (*read) (struct container *cont, struct cftype *cft, +struct file *file, +char __user *buf, size_t nbytes, loff_t *ppos); + ssize_t (*write) (struct container *cont, struct cftype *cft, + struct file *file, + const char __user *buf, size_t nbytes, loff_t *ppos); + int (*release) (struct inode *inode, struct file *file); +}; + +int container_add_file(struct container *cont, const struct cftype *cft); + +int container_is_removed(const struct container *cont); + +#else /* !CONFIG_CONTAINERS */ + +static inline int container_init_early(void) { return 0; } +static inline int container_init(void) { return 0; } +static inline void container_init_smp(void) {} +static inline void container_fork(s
[PATCH 05/10] Containers(V10): Add container_clone() interface
This patch adds support for container_clone(), a speculative interface to creating new containers intended to be used for systems such as namespace unsharing. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |2 kernel/container.c| 123 ++ 2 files changed, 125 insertions(+) Index: container-2.6.22-rc2-mm1/kernel/container.c === --- container-2.6.22-rc2-mm1.orig/kernel/container.c +++ container-2.6.22-rc2-mm1/kernel/container.c @@ -1616,3 +1616,126 @@ void container_exit(struct task_struct * tsk->containers = init_task.containers; task_unlock(tsk); } + +static atomic_t namecnt; +static void get_unused_name(char *buf) +{ + sprintf(buf, "node%d", atomic_inc_return(&namecnt)); +} + +/** + * container_clone - duplicate the current container in the hierarchy + * that the given subsystem is attached to, and move this task into + * the new child + */ +int container_clone(struct task_struct *tsk, struct container_subsys *subsys) +{ + struct dentry *dentry; + int ret = 0; + char nodename[32]; + struct container *parent, *child; + struct inode *inode; + struct css_group *cg; + struct containerfs_root *root; + + /* We shouldn't be called by an unregistered subsystem */ + BUG_ON(!subsys->active); + + /* First figure out what hierarchy and container we're dealing +* with, and pin them so we can drop container_mutex */ + mutex_lock(&container_mutex); + again: + root = subsys->root; + if (root == &rootnode) { + printk(KERN_INFO + "Not cloning container for unused subsystem %s\n", + subsys->name); + mutex_unlock(&container_mutex); + return 0; + } + cg = &tsk->containers; + parent = task_container(tsk, subsys->subsys_id); + /* Pin the hierarchy */ + atomic_inc(&parent->root->sb->s_active); + + mutex_unlock(&container_mutex); + + /* Now do the VFS work to create a container */ + get_unused_name(nodename); + inode = parent->dentry->d_inode; + + /* Hold the parent directory mutex across this operation to +* stop anyone else deleting the new container */ + 
mutex_lock(&inode->i_mutex); + dentry = container_get_dentry(parent->dentry, nodename); + if (IS_ERR(dentry)) { + printk(KERN_INFO + "Couldn't allocate dentry for %s: %ld\n", nodename, + PTR_ERR(dentry)); + ret = PTR_ERR(dentry); + goto out_release; + } + + /* Create the container directory, which also creates the container */ + ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + child = __d_cont(dentry); + dput(dentry); + if (ret) { + printk(KERN_INFO + "Failed to create container %s: %d\n", nodename, + ret); + goto out_release; + } + + if (!child) { + printk(KERN_INFO + "Couldn't find new container %s\n", nodename); + ret = -ENOMEM; + goto out_release; + } + + /* The container now exists. Retake container_mutex and check +* that we're still in the same state that we thought we +* were. */ + mutex_lock(&container_mutex); + if ((root != subsys->root) || + (parent != task_container(tsk, subsys->subsys_id))) { + /* Aargh, we raced ... */ + mutex_unlock(&inode->i_mutex); + + deactivate_super(parent->root->sb); + /* The container is still accessible in the VFS, but +* we're not going to try to rmdir() it at this +* point. */ + printk(KERN_INFO + "Race in container_clone() - leaking container %s\n", + nodename); + goto again; + } + + /* All seems fine. Finish by moving the task into the new container */ + ret = attach_task(child, tsk); + mutex_unlock(&container_mutex); + + out_release: + mutex_unlock(&inode->i_mutex); + deactivate_super(parent->root->sb); + return ret; +} + +/* See if "cont" is a descendant of the current task's container in + * the appropriate hierarchy */ + +int container_is_descendant(const struct container *cont) +{ + int ret; + struct container *target; + int subsys_id; + get_first_subsys(cont, NULL, &subsys_id); + target = task_container(current, subsys_i
[PATCH 08/10] Containers(V10): Share css_group arrays between tasks with same container memberships
This patch replaces the struct css_group embedded in task_struct with a pointer; all tasks that have the same set of memberships across all hierarchies will share a css_group object, and will be linked via their css_groups field to the "tasks" list_head in the css_group. Assuming that many tasks share the same container assignments, this reduces overall space usage and keeps the size of the task_struct down (three pointers added to task_struct compared to a non-containers kernel, no matter how many subsystems are registered). Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 14 + include/linux/container.h| 93 ++- include/linux/sched.h| 33 -- kernel/container.c | 524 --- kernel/cpuset.c | 15 - 5 files changed, 553 insertions(+), 126 deletions(-) Index: container-2.6.22-rc2-mm1/include/linux/container.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container.h +++ container-2.6.22-rc2-mm1/include/linux/container.h @@ -29,6 +29,14 @@ extern void container_unlock(void); struct containerfs_root; +/* Define the enumeration of all container subsystems */ +#define SUBSYS(_x) _x ## _subsys_id, +enum container_subsys_id { +#include + CONTAINER_SUBSYS_COUNT +}; +#undef SUBSYS + /* Per-subsystem/per-container state maintained by the system. */ struct container_subsys_state { /* The container that this subsystem is attached to. Useful @@ -85,6 +93,54 @@ struct container { struct containerfs_root *root; struct container *top_container; + + /* +* List of cg_container_links pointing at css_groups with +* tasks in this container. Protected by css_group_lock +*/ + struct list_head css_groups; +}; + +/* A css_group is a structure holding pointers to a set of + * container_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec and a + * list_add()/del() can bump the reference count on the entire + * container set for a task. 
+ */ + +struct css_group { + + /* Reference count */ + struct kref ref; + + /* +* List running through all container groups. Protected by +* css_group_lock +*/ + struct list_head list; + + /* +* List running through all tasks using this container +* group. Protected by css_group_lock +*/ + struct list_head tasks; + + /* +* List of cg_container_link objects on link chains from +* containers referenced from this css_group. Protected by +* css_group_lock +*/ + struct list_head cg_links; + + /* Set of subsystem states, one for each subsystem. NULL for +* subsystems that aren't part of this hierarchy. These +* pointers reduce the number of dereferences required to get +* from a task to its state for a given container, but result +* in increased space usage if tasks are in wildly different +* groupings across different hierarchies. This array is +* immutable after creation */ + struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT]; + }; /* struct cftype: @@ -111,6 +167,10 @@ struct cftype { ssize_t (*read) (struct container *cont, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); + /* +* read_uint() is a shortcut for the common case of returning a +* single integer. 
Use it in place of read() +*/ u64 (*read_uint) (struct container *cont, struct cftype *cft); ssize_t (*write) (struct container *cont, struct cftype *cft, struct file *file, @@ -131,15 +191,7 @@ int container_is_removed(const struct co int container_path(const struct container *cont, char *buf, int buflen); -int __container_task_count(const struct container *cont); -static inline int container_task_count(const struct container *cont) -{ - int task_count; - rcu_read_lock(); - task_count = __container_task_count(cont); - rcu_read_unlock(); - return task_count; -} +int container_task_count(const struct container *cont); /* Return true if the container is a descendant of the current container */ int container_is_descendant(const struct container *cont); @@ -186,7 +238,7 @@ static inline struct container_subsys_st static inline struct container_subsys_state *task_subsys_state( struct task_struct *task, int subsys_id) { - return rcu_dereference(task->containers.subsys[subsys_id]); + return rcu_dereference(task->containers->subsys[subsys_id]); } static inline struct container* task_container(struct task_struct *task, @@ -199,6 +251,27 @@ int container_pat
[PATCH 04/10] Containers(V10): Add fork/exit hooks
This patch adds the necessary hooks to the fork() and exit() paths to ensure that new children inherit their parent's container assignments, and that exiting processes release reference counts on their containers. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |6 ++ kernel/container.c| 128 ++ kernel/exit.c |2 kernel/fork.c | 14 - 4 files changed, 148 insertions(+), 2 deletions(-) Index: container-2.6.22-rc2-mm1/kernel/exit.c === --- container-2.6.22-rc2-mm1.orig/kernel/exit.c +++ container-2.6.22-rc2-mm1/kernel/exit.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -935,6 +936,7 @@ fastcall void do_exit(long code) __exit_fs(tsk); exit_thread(); cpuset_exit(tsk); + container_exit(tsk, 1); exit_keys(tsk); if (group_dead && tsk->signal->leader) Index: container-2.6.22-rc2-mm1/kernel/fork.c === --- container-2.6.22-rc2-mm1.orig/kernel/fork.c +++ container-2.6.22-rc2-mm1/kernel/fork.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -963,6 +964,7 @@ static struct task_struct *copy_process( { int retval; struct task_struct *p = NULL; + int container_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1063,12 +1065,13 @@ static struct task_struct *copy_process( p->io_wait = NULL; p->audit_context = NULL; cpuset_fork(p); + container_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cpuset; + goto bad_fork_cleanup_container; } mpol_fix_fork_child_flag(p); #endif @@ -1178,6 +1181,12 @@ static struct task_struct *copy_process( /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + /* Now that the task is set up, run container callbacks if +* necessary. We need to run them before the task is visible +* on the tasklist. 
*/ + container_fork_callbacks(p); + container_callbacks_done = 1; + /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1300,9 +1309,10 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); -bad_fork_cleanup_cpuset: +bad_fork_cleanup_container: #endif cpuset_exit(p); + container_exit(p, container_callbacks_done); delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); Index: container-2.6.22-rc2-mm1/include/linux/container.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container.h +++ container-2.6.22-rc2-mm1/include/linux/container.h @@ -18,6 +18,9 @@ extern int container_init_early(void); extern int container_init(void); extern void container_init_smp(void); +extern void container_fork(struct task_struct *p); +extern void container_fork_callbacks(struct task_struct *p); +extern void container_exit(struct task_struct *p, int run_callbacks); extern struct file_operations proc_container_operations; @@ -199,6 +202,9 @@ int container_path(const struct containe static inline int container_init_early(void) { return 0; } static inline int container_init(void) { return 0; } static inline void container_init_smp(void) {} +static inline void container_fork(struct task_struct *p) {} +static inline void container_fork_callbacks(struct task_struct *p) {} +static inline void container_exit(struct task_struct *p, int callbacks) {} static inline void container_lock(void) {} static inline void container_unlock(void) {} Index: container-2.6.22-rc2-mm1/kernel/container.c === --- container-2.6.22-rc2-mm1.orig/kernel/container.c +++ container-2.6.22-rc2-mm1/kernel/container.c @@ -132,6 +132,36 @@ list_for_each_entry(_ss, &_root->subsys_ #define for_each_root(_root) \ list_for_each_entry(_root, &roots, root_list) +/* Each task_struct has an embedded css_group, so the get/put + * operation simply takes a reference count on all the containers + * referenced by subsystems in this css_group. 
This can end up + * multiple-counting some containers, but that's OK - the ref-count is + * just a busy/not-busy indicator; ensuring that we only count each + * container once would require taking
[PATCH 03/10] Containers(V10): Add tasks file interface
This patch adds the per-directory "tasks" file for containerfs mounts; this allows the user to determine which tasks are members of a container by reading a container's "tasks", and to move a task into a container by writing its pid to its "tasks". Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h | 10 + kernel/container.c| 335 ++ 2 files changed, 345 insertions(+) Index: container-2.6.22-rc2-mm1/include/linux/container.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container.h +++ container-2.6.22-rc2-mm1/include/linux/container.h @@ -128,6 +128,16 @@ int container_is_removed(const struct co int container_path(const struct container *cont, char *buf, int buflen); +int __container_task_count(const struct container *cont); +static inline int container_task_count(const struct container *cont) +{ + int task_count; + rcu_read_lock(); + task_count = __container_task_count(cont); + rcu_read_unlock(); + return task_count; +} + /* Return true if the container is a descendant of the current container */ int container_is_descendant(const struct container *cont); Index: container-2.6.22-rc2-mm1/kernel/container.c === --- container-2.6.22-rc2-mm1.orig/kernel/container.c +++ container-2.6.22-rc2-mm1/kernel/container.c @@ -679,6 +679,109 @@ static inline void get_first_subsys(cons *subsys_id = test_ss->subsys_id; } +/* + * Attach task 'tsk' to container 'cont' + * + * Call holding container_mutex. May take task_lock of + * the task 'pid' during call. 
+ */ + +static int attach_task(struct container *cont, struct task_struct *tsk) +{ + int retval = 0; + struct container_subsys *ss; + struct container *oldcont; + struct css_group *cg = &tsk->containers; + struct containerfs_root *root = cont->root; + int i; + + int subsys_id; + get_first_subsys(cont, NULL, &subsys_id); + + /* Nothing to do if the task is already in that container */ + oldcont = task_container(tsk, subsys_id); + if (cont == oldcont) + return 0; + + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cont, tsk); + if (retval) { + return retval; + } + } + } + + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + return -ESRCH; + } + /* Update the css_group pointers for the subsystems in this +* hierarchy */ + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { + if (root->subsys_bits & (1ull << i)) { + /* Subsystem is in this hierarchy. So we want +* the subsystem state from the new +* container. Transfer the refcount from the +* old to the new */ + atomic_inc(&cont->count); + atomic_dec(&cg->subsys[i]->container->count); + rcu_assign_pointer(cg->subsys[i], cont->subsys[i]); + } + } + task_unlock(tsk); + + for_each_subsys(root, ss) { + if (ss->attach) { + ss->attach(ss, cont, oldcont, tsk); + } + } + + synchronize_rcu(); + return 0; +} + +/* + * Attach task with pid 'pid' to container 'cont'. 
Call with + * container_mutex, may take task_lock of task + * + */ + +static int attach_task_by_pid(struct container *cont, char *pidbuf) +{ + pid_t pid; + struct task_struct *tsk; + int ret; + + if (sscanf(pidbuf, "%d", &pid) != 1) + return -EIO; + + if (pid) { + rcu_read_lock(); + tsk = find_task_by_pid(pid); + if (!tsk || tsk->flags & PF_EXITING) { + rcu_read_unlock(); + return -ESRCH; + } + get_task_struct(tsk); + rcu_read_unlock(); + + if ((current->euid) && (current->euid != tsk->uid) + && (current->euid != tsk->suid)) { + put_task_struct(tsk); + return -EACCES; + } + } else { + tsk = current; + get_task_struct(tsk); + } + + ret = attach_task(cont, tsk); + put_task_struct(tsk); + return ret; +} + /* The various types of f
[PATCH 06/10] Containers(V10): Add procfs interface
This patch adds: /proc/containers - general system info /proc/*/container - per-task container membership info Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- fs/proc/base.c |7 ++ kernel/container.c | 128 + 2 files changed, 135 insertions(+) Index: container-2.6.22-rc2-mm1/fs/proc/base.c === --- container-2.6.22-rc2-mm1.orig/fs/proc/base.c +++ container-2.6.22-rc2-mm1/fs/proc/base.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -2028,6 +2029,9 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2319,6 +2323,9 @@ static const struct pid_entry tid_base_s #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL Index: container-2.6.22-rc2-mm1/kernel/container.c === --- container-2.6.22-rc2-mm1.orig/kernel/container.c +++ container-2.6.22-rc2-mm1/kernel/container.c @@ -249,6 +249,7 @@ static int container_mkdir(struct inode static int container_rmdir(struct inode *unused_dir, struct dentry *dentry); static int container_populate_dir(struct container *cont); static struct inode_operations container_dir_inode_operations; +struct file_operations proc_containerstats_operations; static struct backing_dev_info container_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -1504,6 +1505,7 @@ int __init container_init(void) { int err; int i; + struct proc_dir_entry *entry; for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { struct container_subsys *ss = subsys[i]; @@ -1515,10 +1517,136 @@ int __init container_init(void) if (err < 0) goto out; + entry = create_proc_entry("containers", 0, NULL); + if (entry) + 
entry->proc_fops = &proc_containerstats_operations; + out: return err; } +/* + * proc_container_show() + * - Print task's container paths into seq_file, one line for each hierarchy + * - Used for /proc/<pid>/container. + * - No need to task_lock(tsk) on this tsk->container reference, as it + *doesn't really matter if tsk->container changes after we read it, + *and we take container_mutex, keeping attach_task() from changing it + *anyway. No need to check that tsk->container != NULL, thanks to + *the_top_container_hack in container_exit(), which sets an exiting task's + *container to top_container. + */ + +/* TODO: Use a proper seq_file iterator */ +static int proc_container_show(struct seq_file *m, void *v) +{ + struct pid *pid; + struct task_struct *tsk; + char *buf; + int retval; + struct containerfs_root *root; + + retval = -ENOMEM; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = 0; + + mutex_lock(&container_mutex); + + for_each_root(root) { + struct container_subsys *ss; + struct container *cont; + int subsys_id; + int count = 0; + /* Skip this hierarchy if it has no active subsystems */ + if (!root->subsys_bits) continue; + for_each_subsys(root, ss) { + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + } + seq_putc(m, ':'); + get_first_subsys(&root->top_container, NULL, &subsys_id); + cont = task_container(tsk, subsys_id); + retval = container_path(cont, buf, PAGE_SIZE); + if (retval < 0) + goto out_unlock; + seq_puts(m, buf); + seq_putc(m, '\n'); + } + +out_unlock: + mutex_unlock(&container_mutex); + put_task_struct(tsk); +out_free: + kfree(buf); +out: + return retval; +} + +static int container_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_container_show, pid); +}
[PATCH 00/10] Containers(V10): Generic Process Containers
This is an update to my multi-hierarchy multi-subsystem generic process containers patch. Changes since V9 (April 27th) include: - The patchset has been rebased over 2.6.22-rc2-mm1 - A lattice of lists linking tasks to their css_groups and css_groups to their containers has been added to support more efficient iteration across the member tasks of a container. - Support for the cpusets "release agent" functionality has been added back in; this is based on a workqueue concept similar to the changes that Cliff Wickman has been pushing for supporting CPU hot-unplug. - Several uses of tasklist_lock replaced by reliance on RCU - Misc cleanups - Tested with a tweaked version of PaulJ's cpuset_test script Still TODO: - decide whether "Containers" is an acceptable name for the system given its usage by some other development groups, or whether something else (ProcessSets? ResourceGroups? TaskGroups?) would be better. I'm inclined to leave this political decision to Andrew/Linus once they're happy with the technical aspects of the patches. - add a hash-table based lookup for css_group objects. - use seq_file properly in container tasks files to avoid having to allocate a big array for all the container's task pointers. - lots more testing - define standards for container file names -- Generic Process Containers -- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy containers, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. 
Already existing in the kernel is the cpuset subsystem; this has a process grouping mechanism that is mature, tested, and well documented (particularly with regards to synchronization rules). This patchset extracts the process grouping code from cpusets into a generic container system, and makes the cpusets code a client of the container system, along with a couple of simple example subsystems. The patch set is structured as follows: 1) Basic container framework - filesystem and tracking structures 2) Simple CPU Accounting example subsystem 3) Support for the "tasks" control file 4) Hooks for fork() and exit() 5) Support for the container_clone() operation 6) Add /proc reporting interface 7) Make cpusets a container subsystem 8) Share container subsystem pointer arrays between tasks with the same assignments 9) Simple container debugging subsystem 10) Support for a userspace "release agent", similar to the cpusets release agent functionality The intention is that the various resource management and virtualization efforts can also become container clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test out e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel Signed-off-by: Paul Menage <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 09/10] Containers(V10): Simple debug info subsystem
This example subsystem exports debugging information as an aid to diagnosing refcount leaks, etc, in the container framework. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |4 + init/Kconfig | 10 kernel/Makefile |1 kernel/container_debug.c | 89 +++ 4 files changed, 104 insertions(+) Index: container-2.6.22-rc2-mm1/include/linux/container_subsys.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container_subsys.h +++ container-2.6.22-rc2-mm1/include/linux/container_subsys.h @@ -19,4 +19,8 @@ SUBSYS(cpuset) /* */ +#ifdef CONFIG_CONTAINER_DEBUG +SUBSYS(debug) +#endif + /* */ Index: container-2.6.22-rc2-mm1/init/Kconfig === --- container-2.6.22-rc2-mm1.orig/init/Kconfig +++ container-2.6.22-rc2-mm1/init/Kconfig @@ -306,6 +306,16 @@ config LOG_BUF_SHIFT config CONTAINERS bool +config CONTAINER_DEBUG + bool "Example debug container subsystem" + select CONTAINERS + help + This option enables a simple container subsystem that + exports useful debugging information about the containers + framework + + Say N if unsure + config CPUSETS bool "Cpuset support" depends on SMP Index: container-2.6.22-rc2-mm1/kernel/container_debug.c === --- /dev/null +++ container-2.6.22-rc2-mm1/kernel/container_debug.c @@ -0,0 +1,89 @@ +/* + * kernel/container_debug.c - Example container subsystem that + * exposes debug info + * + * Copyright (C) Google Inc, 2007 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) + * + */ + +#include +#include + +static int debug_create(struct container_subsys *ss, struct container *cont) +{ + struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) + return -ENOMEM; + cont->subsys[debug_subsys_id] = css; + return 0; +} + +static void debug_destroy(struct container_subsys *ss, struct container *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 container_refcount_read(struct container *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 
taskcount_read(struct container *cont, struct cftype *cft) +{ + u64 count; + container_lock(); + count = container_task_count(cont); + container_unlock(); + return count; +} + +static u64 current_css_group_read(struct container *cont, struct cftype *cft) +{ + return (u64) current->containers; +} + +static u64 current_css_group_refcount_read(struct container *cont, + struct cftype *cft) +{ + u64 count; + rcu_read_lock(); + count = atomic_read(&current->containers->ref.refcount); + rcu_read_unlock(); + return count; +} + +static struct cftype files[] = { + { + .name = "debug.container_refcount", + .read_uint = container_refcount_read, + }, + { + .name = "debug.taskcount", + .read_uint = taskcount_read, + }, + + { + .name = "debug.current_css_group", + .read_uint = current_css_group_read, + }, + + { + .name = "debug.current_css_group_refcount", + .read_uint = current_css_group_refcount_read, + }, +}; + +static int debug_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_files(cont, files, ARRAY_SIZE(files)); +} + +struct container_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; Index: container-2.6.22-rc2-mm1/kernel/Makefile === --- container-2.6.22-rc2-mm1.orig/kernel/Makefile +++ container-2.6.22-rc2-mm1/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o +obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 02/10] Containers(V10): Example CPU accounting subsystem
This example demonstrates how to use the generic container subsystem for a simple resource tracker that counts, for the processes in a container, the total CPU time used and the %CPU used in the last complete 10 second interval. Portions contributed by Balbir Singh <[EMAIL PROTECTED]> Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |6 + include/linux/cpu_acct.h | 14 ++ init/Kconfig |7 + kernel/Makefile |1 kernel/cpu_acct.c| 185 +++ kernel/sched.c | 14 ++ 6 files changed, 224 insertions(+), 3 deletions(-) Index: container-2.6.22-rc2-mm1/include/linux/container_subsys.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container_subsys.h +++ container-2.6.22-rc2-mm1/include/linux/container_subsys.h @@ -7,4 +7,10 @@ /* */ +#ifdef CONFIG_CONTAINER_CPUACCT +SUBSYS(cpuacct) +#endif + +/* */ + /* */ Index: container-2.6.22-rc2-mm1/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.22-rc2-mm1/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.22-rc2-mm1/init/Kconfig === --- container-2.6.22-rc2-mm1.orig/init/Kconfig +++ container-2.6.22-rc2-mm1/init/Kconfig @@ -337,6 +337,13 @@ config SYSFS_DEPRECATED If you are using a distro that was released in 2006 or later, it should be safe to say N here. 
+config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + select CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.22-rc2-mm1/kernel/Makefile === --- container-2.6.22-rc2-mm1.orig/kernel/Makefile +++ container-2.6.22-rc2-mm1/kernel/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o Index: container-2.6.22-rc2-mm1/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.22-rc2-mm1/kernel/cpu_acct.c @@ -0,0 +1,185 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh + * ([EMAIL PROTECTED]) + * + */ + +/* + * Example container subsystem for reporting total CPU usage of tasks in a + * container, along with percentage load over a time interval + */ + +#include +#include +#include +#include + +struct cpuacct { + struct container_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_of(task_subsys_state(task, 
cpuacct_subsys_id), + struct cpuacct, css); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) { + /* calculate the next interval boundary beyond the +* current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static int cpuacct_create(struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return -ENOMEM; + spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + cont->subsys[cpuacct_subsys_id] = &ca->css; + return 0; +} + +static void cpuacct_destroy(struct container_subsys
[PATCH 10/10] Containers(V10): Support for automatic userspace release agents
This patch adds the following files to the container filesystem: notify_on_release - configures/reports whether the container subsystem should attempt to run a release script when this container becomes unused release_agent - configures/reports the release agent to be used for this hierarchy (top level in each hierarchy only) releasable - reports whether this container would have been auto-released if notify_on_release was true and a release agent was configured (mainly useful for debugging) To avoid locking issues, invoking the userspace release agent is done via a workqueue task; containers that need to have their release agents invoked by the workqueue task are linked on to a list. When the "cpuset" filesystem is mounted, it automatically sets the hierarchy's release agent to be /sbin/cpuset_release_agent for backward-compatibility with existing cpusets users. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h | 15 + kernel/container.c| 364 ++ kernel/cpuset.c |5 3 files changed, 348 insertions(+), 36 deletions(-) Index: container-2.6.22-rc2-mm1/include/linux/container.h === --- container-2.6.22-rc2-mm1.orig/include/linux/container.h +++ container-2.6.22-rc2-mm1/include/linux/container.h @@ -64,11 +64,7 @@ static inline void css_get(struct contai * css_put() should be called to release a reference taken by * css_get() */ - -static inline void css_put(struct container_subsys_state *css) -{ - atomic_dec(&css->refcnt); -} +void css_put(struct container_subsys_state *css); struct container { unsigned long flags;/* "unsigned long" so bitops work */ @@ -99,6 +95,13 @@ struct container { * tasks in this container. Protected by css_group_lock */ struct list_head css_groups; + + /* +* Linked list running through all containers that can +* potentially be reaped by the release agent. 
Protected by +* container_mutex +*/ + struct list_head release_list; }; /* A css_group is a structure holding pointers to a set of @@ -271,6 +274,8 @@ struct task_struct *container_iter_next( struct container_iter *it); void container_iter_end(struct container *cont, struct container_iter *it); +void container_set_release_agent_path(struct container_subsys *ss, + const char *path); #else /* !CONFIG_CONTAINERS */ Index: container-2.6.22-rc2-mm1/kernel/container.c === --- container-2.6.22-rc2-mm1.orig/kernel/container.c +++ container-2.6.22-rc2-mm1/kernel/container.c @@ -62,6 +62,8 @@ #define CONTAINER_SUPER_MAGIC 0x27e0eb +static DEFINE_MUTEX(container_mutex); + /* Generate an array of container subsystem pointers */ #define SUBSYS(_x) &_x ## _subsys, @@ -89,6 +91,13 @@ struct containerfs_root { /* A list running through the mounted hierarchies */ struct list_head root_list; + + /* The path to use for release notifications. No locking +* between setting and use - so if userspace updates this +* while subcontainers exist, you could miss a +* notification. 
We ensure that it's always a valid +* NUL-terminated string */ + char release_agent_path[PATH_MAX]; }; @@ -115,7 +124,13 @@ static int need_forkexit_callback = 0; /* bits in struct container flags field */ typedef enum { + /* Container is dead */ CONT_REMOVED, + /* Container has previously had a child container or a task, +* but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */ + CONT_RELEASABLE, + /* Container requires release notifications to userspace */ + CONT_NOTIFY_ON_RELEASE, } container_flagbits_t; /* convenient tests for these bits */ @@ -124,6 +139,19 @@ inline int container_is_removed(const st return test_bit(CONT_REMOVED, &cont->flags); } +inline int container_is_releasable(const struct container *cont) +{ + const int bits = + (1 << CONT_RELEASABLE) | + (1 << CONT_NOTIFY_ON_RELEASE); + return (cont->flags & bits) == bits; +} + +inline int notify_on_release(const struct container *cont) +{ + return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags); +} + /* for_each_subsys() allows you to iterate on each subsystem attached to * an active hierarchy */ #define for_each_subsys(_root, _ss) \ @@ -133,6 +161,12 @@ list_for_each_entry(_ss, &_root->subsys_ #define for_each_root(_root) \ list_for_each_entry(_root, &roots, root_list) +/* the list of containers eligible for automatic release */ +static LIST_HEAD(release_list); +static void container_release_agent(struct wor
[PATCH 0/7] Containers (V8): Generic Process Containers
-- This is an update to my multi-hierarchy multi-subsystem generic process containers patch. Changes since V7 (12th Feb) include: - Removed the config-time choice of the number of supported hierarchies - this is now completely dynamic; new hierarchies are allocated on demand, and freed when no longer in use. - Subsystems are now registered at compile-time in linux/container_subsys.h. This allows for faster access to subsystem state since the id is a compile-time constant, so there's only a single extra pointer dereference compared to having a pointer directly in the task_struct. It also avoids wasting space with unused subsystem pointers. - Removed the container pointers from container_group - this results in a structure very similar to Srivatsa Vaddagiri's rcfs approach. (RCFS uses the nsproxy object rather than the container_group object; merging container_group and nsproxy would be pretty straightforward if desired). - Moved callback_mutex out of the container subsystem so that it is once again purely within the cpuset subsystem. Renamed manage_mutex to container_mutex. - Condensed post_attach_task() into attach_task() now that callback_mutex is purely within cpuset.c - Simplified the container_subsys_state reference counting - stricter rules on liveness make adding reference counts cheaper. Still TODO: - decide whether "Containers" is an acceptable name for the system given its usage by some other development groups, or whether something else (ProcessSets? ResourceGroups?) would be better - decide whether merging container_group and nsproxy is desirable - add a hash-table based lookup for container_group objects. - use seq_file properly in container tasks files (and also in cpuset_attach_task) to avoid having to allocate a big array for all the container's task pointers. 
- add back support for the "release agent" functionality - lots more testing - define standards for container file names Generic Process Containers -- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy containers, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. Already existing in the kernel is the cpuset subsystem; this has a process grouping mechanism that is mature, tested, and well documented (particularly with regards to synchronization rules). This patchset extracts the process grouping code from cpusets into a generic container system, and makes the cpusets code a client of the container system. It also provides several example clients of the container system, including ResGroups, BeanCounters and namespace proxy. The change is implemented in three implementation patches, plus four example subsystems that aren't necessarily intended to be merged as part of this patch set, but demonstrate the applicability of the framework. 1) extract the process grouping code from cpusets into a standalone system 2) remove the process grouping code from cpusets and hook into the container system 3) convert the container system to present a generic multi-hierarchy API, and make cpusets a client of that API 4) example of a simple CPU accounting container subsystem. Useful as a boilerplate for people implementing their own subsystems. 
5) example of implementing ResGroups and its numtasks controller over generic containers 6) example of implementing BeanCounters and its numfiles counter over generic containers 7) example of integrating the namespace isolation code (sys_unshare() or various clone flags) with generic containers, allowing virtual servers to take advantage of other resource control efforts. The intention is that the various resource management and virtualization efforts can also become container clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test out e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel Signed-off-by: Paul Menage <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 6/7] Containers (V8): BeanCounters over generic process containers
; + +static int bc_populate(struct container_subsys *ss, struct container *cont) +{ + int err; + int attr, res; + for (res = 0; res < BC_RESOURCES; res++) { + struct bc_resource *bcr = bc_resources[res]; + + for (attr = 0; attr < BC_ATTRS; attr++) { + struct cftype *cft = &bcr->cft_attrs[attr]; + if (!cft->name[0]) continue; + err = container_add_file(cont, cft); + if (err < 0) return err; + } + } + return 0; +} + +struct container_subsys bc_subsys = { + .name = "bc", + .create = bc_create, + .destroy = bc_destroy, + .populate = bc_populate, + .subsys_id = bc_subsys_id, + .early_init = 1, +}; + +EXPORT_SYMBOL(bc_resources); +EXPORT_SYMBOL(init_bc); +EXPORT_SYMBOL(bc_change_param); Index: container-2.6.20-new/include/bc/misc.h === --- /dev/null +++ container-2.6.20-new/include/bc/misc.h @@ -0,0 +1,27 @@ +/* + * include/bc/misc.h + * + * Copyright (C) 2006 OpenVZ SWsoft Inc + * + */ + +#ifndef __BC_MISC_H__ +#define __BC_MISC_H__ + +struct file; + +#ifdef CONFIG_BEANCOUNTERS +int __must_check bc_file_charge(struct file *); +void bc_file_uncharge(struct file *); +#else +static inline int __must_check bc_file_charge(struct file *f) +{ + return 0; +} + +static inline void bc_file_uncharge(struct file *f) +{ +} +#endif + +#endif Index: container-2.6.20-new/kernel/bc/misc.c === --- /dev/null +++ container-2.6.20-new/kernel/bc/misc.c @@ -0,0 +1,57 @@ + +#include +#include +#include + +int bc_file_charge(struct file *file) +{ + int sev; + struct beancounter *bc; + + rcu_read_lock(); + bc = get_exec_bc(); + css_get(&bc->css); + rcu_read_unlock(); + + sev = (capable(CAP_SYS_ADMIN) ? 
BC_LIMIT : BC_BARRIER); + + if (bc_charge(bc, BC_NUMFILES, 1, sev)) { + css_put(&bc->css); + return -EMFILE; + } + + file->f_bc = bc; + return 0; +} + +void bc_file_uncharge(struct file *file) +{ + struct beancounter *bc; + + bc = file->f_bc; + bc_uncharge(bc, BC_NUMFILES, 1); + css_put(&bc->css); +} + +#define BC_NUMFILES_BARRIER256 +#define BC_NUMFILES_LIMIT 512 + +static int bc_files_init(struct beancounter *bc, int i) +{ + bc_init_resource(&bc->bc_parms[BC_NUMFILES], + BC_NUMFILES_BARRIER, BC_NUMFILES_LIMIT); + return 0; +} + +static struct bc_resource bc_files_resource = { + .bcr_name = "numfiles", + .bcr_init = bc_files_init, +}; + +static int __init bc_misc_init_resource(void) +{ + bc_register_resource(BC_NUMFILES, &bc_files_resource); + return 0; +} + +__initcall(bc_misc_init_resource); Index: container-2.6.20-new/fs/file_table.c === --- container-2.6.20-new.orig/fs/file_table.c +++ container-2.6.20-new/fs/file_table.c @@ -22,6 +22,8 @@ #include #include +#include + #include /* sysctl tunables... 
*/ @@ -43,6 +45,7 @@ static inline void file_free_rcu(struct static inline void file_free(struct file *f) { percpu_counter_dec(&nr_files); + bc_file_uncharge(f); call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); } @@ -107,8 +110,10 @@ struct file *get_empty_filp(void) if (f == NULL) goto fail; - percpu_counter_inc(&nr_files); memset(f, 0, sizeof(*f)); + if (bc_file_charge(f)) + goto fail_charge; + percpu_counter_inc(&nr_files); if (security_file_alloc(f)) goto fail_sec; @@ -135,6 +140,10 @@ fail_sec: file_free(f); fail: return NULL; + + fail_charge: + kmem_cache_free(filp_cachep, f); + return NULL; } EXPORT_SYMBOL(get_empty_filp); Index: container-2.6.20-new/include/linux/fs.h === --- container-2.6.20-new.orig/include/linux/fs.h +++ container-2.6.20-new/include/linux/fs.h @@ -739,6 +739,9 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space*f_mapping; +#ifdef CONFIG_BEANCOUNTERS + struct beancounter *f_bc; +#endif }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); Index: container-2.6.20-new/include/bc/task.h === --- /dev/null +++ container-2.6.20-new/include/bc/task.h @@ -0,0 +1,64 @@ +/* + * include/bc/task.h + * + * Copyright (C) 2007 OpenVZ SWsoft Inc + * Adapted by Paul Menage <[EMAIL PROTECTED]> for generic containers + * + */ + +#ifndef __
[PATCH 4/7] Containers (V8): Simple CPU accounting container subsystem
This example demonstrates how to use the generic container subsystem for a simple resource tracker that counts, for the processes in a container, the total CPU time used and the %CPU used in the last complete 10 second interval. Portions contributed by Balbir Singh <[EMAIL PROTECTED]> Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |6 + include/linux/cpu_acct.h | 14 ++ init/Kconfig |7 + kernel/Makefile |1 kernel/cpu_acct.c| 204 +++ kernel/sched.c | 14 ++ 6 files changed, 243 insertions(+), 3 deletions(-) Index: container-2.6.20-new/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.20-new/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.20-new/init/Kconfig === --- container-2.6.20-new.orig/init/Kconfig +++ container-2.6.20-new/init/Kconfig @@ -278,6 +278,13 @@ config PROC_PID_CPUSET depends on CPUSETS default y +config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + select CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.20-new/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.20-new/kernel/cpu_acct.c @@ -0,0 +1,204 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh + * ([EMAIL PROTECTED]) + * + */ + +/* + * Container subsystem for reporting total CPU usage of tasks in a + * container, along with percentage load over a time interval + */ + +#include +#include +#include +#include + +struct cpuacct { + 
struct container_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_ca(task_container(task, cpuacct_subsys_id)); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) { + /* calculate the next interval boundary beyond the +* current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static int cpuacct_create(struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return -ENOMEM; + spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + cont->subsys[cpuacct_subsys.subsys_id] = &ca->css; + return 0; +} + +static void cpuacct_destroy(struct container_subsys *ss, + struct container *cont) +{ + kfree(container_ca(cont)); +} + +/* Lazily update the load calculation if necessary. 
Called with ca locked */ +static void cpuusage_update(struct cpuacct *ca) +{ + u64 now = get_jiffies_64(); + /* If we're not due for an update, return */ + if (ca->next_interval_check > now) + return; + + if (ca->next_interval_check <= (now - INTERVAL)) { + /* If it's been more than an interval since the last +* check, then catch up - the last interval must have +* been zero load */ + ca->last_interval_time = 0; + ca->next_interval_check = next_interval_boundary(now); + } else { + /* If a steal takes the last interval time negative, +* then we just ignore it */ + if ((s64)ca->current_interval_time > 0) { + ca->last_interval_time = ca->current_interval_time; + }
[PATCH 1/7] Containers (V8): Generic container system abstracted from cpusets code
This patch creates a generic process container system based on (and parallel to) the cpusets code. At a coarse level it was created by copying kernel/cpuset.c, doing s/cpuset/container/g, and stripping out any code that was cpuset-specific rather than applicable to any process container subsystem. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- Documentation/containers.txt | 229 +++ fs/proc/base.c |7 include/linux/container.h| 96 +++ include/linux/sched.h|5 init/Kconfig |9 init/main.c |3 kernel/Makefile |1 kernel/container.c | 1260 +++ kernel/exit.c|2 kernel/fork.c|3 10 files changed, 1614 insertions(+), 1 deletion(-) Index: container-2.6.20-new/fs/proc/base.c === --- container-2.6.20-new.orig/fs/proc/base.c +++ container-2.6.20-new/fs/proc/base.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -1870,6 +1871,9 @@ static struct pid_entry tgid_base_stuff[ #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2151,6 +2155,9 @@ static struct pid_entry tid_base_stuff[] #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL Index: container-2.6.20-new/include/linux/container.h === --- /dev/null +++ container-2.6.20-new/include/linux/container.h @@ -0,0 +1,96 @@ +#ifndef _LINUX_CONTAINER_H +#define _LINUX_CONTAINER_H +/* + * container interface + * + * Copyright (C) 2003 BULL SA + * Copyright (C) 2004-2006 Silicon Graphics, Inc. + * + */ + +#include +#include +#include + +#ifdef CONFIG_CONTAINERS + +extern int number_of_containers; /* How many containers are defined in system? 
*/ + +extern int container_init_early(void); +extern int container_init(void); +extern void container_init_smp(void); +extern void container_fork(struct task_struct *p); +extern void container_exit(struct task_struct *p); + +extern struct file_operations proc_container_operations; + +extern void container_lock(void); +extern void container_unlock(void); + +extern void container_manage_lock(void); +extern void container_manage_unlock(void); + +struct container { + unsigned long flags;/* "unsigned long" so bitops work */ + + /* +* Count is atomic so can incr (fork) or decr (exit) without a lock. +*/ + atomic_t count; /* count tasks using this container */ + + /* +* We link our 'sibling' struct into our parent's 'children'. +* Our children link their 'sibling' into our 'children'. +*/ + struct list_head sibling; /* my parent's children */ + struct list_head children; /* my children */ + + struct container *parent; /* my parent */ + struct dentry *dentry; /* container fs entry */ +}; + +/* struct cftype: + * + * The files in the container filesystem mostly have a very simple read/write + * handling, some common function will take care of it. Nevertheless some cases + * (read tasks) are special and therefore I define this structure for every + * kind of file. 
+ * + * + * When reading/writing to a file: + * - the container to use in file->f_dentry->d_parent->d_fsdata + * - the 'cftype' of the file is file->f_dentry->d_fsdata + */ + +struct inode; +struct cftype { + char *name; + int private; + int (*open) (struct inode *inode, struct file *file); + ssize_t (*read) (struct container *cont, struct cftype *cft, +struct file *file, +char __user *buf, size_t nbytes, loff_t *ppos); + ssize_t (*write) (struct container *cont, struct cftype *cft, + struct file *file, + const char __user *buf, size_t nbytes, loff_t *ppos); + int (*release) (struct inode *inode, struct file *file); +}; + +int container_add_file(struct container *cont, const struct cftype *cft); + +int container_is_removed(const struct container *cont); + +#else /* !CONFIG_CONTAINERS */ + +static inline int container_init_early(void) { return 0; } +static inline int container_init
[PATCH 7/7] Containers (V8): Container interface to nsproxy subsystem
This is intended as a simple illustration of how a virtual server system could be integrated with generic containers, and hence take advantage of other resource-control efforts. A real implementation would probably allow parameters such as configuring what kinds of namespace creations triggered new containers, etc. When a task enters a new namespace via a clone() or unshare(), a new container is created and the task moves into it. Developed by Serge Hallyn <[EMAIL PROTECTED]>, adapted by Paul Menage <[EMAIL PROTECTED]> Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |6 ++ include/linux/nsproxy.h |6 ++ init/Kconfig |9 +++ kernel/Makefile |1 kernel/fork.c|4 + kernel/ns_container.c| 99 +++ kernel/nsproxy.c |6 ++ 7 files changed, 131 insertions(+) Index: container-2.6.20-new/include/linux/nsproxy.h === --- container-2.6.20-new.orig/include/linux/nsproxy.h +++ container-2.6.20-new/include/linux/nsproxy.h @@ -53,4 +53,10 @@ static inline void exit_task_namespaces( put_nsproxy(ns); } } +#ifdef CONFIG_CONTAINER_NS +int ns_container_clone(struct task_struct *tsk); +#else +static inline int ns_container_clone(struct task_struct *tsk) { return 0; } +#endif + #endif Index: container-2.6.20-new/kernel/Makefile === --- container-2.6.20-new.orig/kernel/Makefile +++ container-2.6.20-new/kernel/Makefile @@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o +obj-$(CONFIG_CONTAINER_NS) += ns_container.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o Index: container-2.6.20-new/kernel/fork.c === --- container-2.6.20-new.orig/kernel/fork.c +++ container-2.6.20-new/kernel/fork.c @@ -1668,6 +1668,9 @@ asmlinkage long sys_unshare(unsigned lon err = -ENOMEM; goto bad_unshare_cleanup_ipc; } + err = ns_container_clone(current); + if (err) + goto 
bad_unshare_cleanup_dupns; } if (new_fs || new_ns || new_mm || new_fd || new_ulist || @@ -1722,6 +1725,7 @@ asmlinkage long sys_unshare(unsigned lon task_unlock(current); } + bad_unshare_cleanup_dupns: if (new_nsproxy) put_nsproxy(new_nsproxy); Index: container-2.6.20-new/kernel/ns_container.c === --- /dev/null +++ container-2.6.20-new/kernel/ns_container.c @@ -0,0 +1,99 @@ +/* + * ns_container.c - namespace container subsystem + * + * Copyright IBM, 2006 + */ + +#include +#include +#include + +struct nscont { + struct container_subsys_state css; + spinlock_t lock; +}; + +struct container_subsys ns_subsys; + +static inline struct nscont *container_nscont(struct container *cont) +{ + return container_of(container_subsys_state(cont, ns_subsys_id), + struct nscont, css); +} + +int ns_container_clone(struct task_struct *tsk) +{ + return container_clone(tsk, &ns_subsys); +} + +/* + * Rules: + * 1. you can only enter a container which is a child of your current + * container + * 2. you can only place another process into a container if + * a. you have CAP_SYS_ADMIN + * b. your container is an ancestor of tsk's destination container + * (hence either you are in the same container as tsk, or in an + *ancestor container thereof) + */ +int ns_can_attach(struct container_subsys *ss, + struct container *cont, struct task_struct *tsk) +{ + struct container *c; + + if (current != tsk) { + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!container_is_descendant(cont)) + return -EPERM; + } + + if (container_task_count(cont) != 0) + return -EPERM; + + c = task_container(tsk, ns_subsys_id); + if (c && c != cont->parent) + return -EPERM; + + return 0; +} + +/* + * Rules: you can only create a container if + * 1. you are capable(CAP_SYS_ADMIN) + * 2. 
the target container is a descendant of your own container + */ +static int ns_create(struct container_subsys *ss, struct container *cont) +{ + struct nscont *ns; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (cont->parent && !container_is_descendant(cont)) +
[PATCH 5/7] Containers (V8): Resource Groups over generic containers
This patch provides the RG core and numtasks controller as container subsystems, intended as an example of how to implement a more complex resource control system over generic process containers. The changes to the core involve primarily removing the group management, task membership and configfs support and adding interface layers to talk to the generic container layer instead. Each resource controller becomes an independent container subsystem; the RG core is essentially a library that the resource controllers can use to provide the RG API to userspace. Rather than a single shares and stats file in each group, there's a _shares and a _stats file, each linked to the appropriate resource controller. include/linux/container_subsys.h |6 include/linux/moduleparam.h | 12 - include/linux/numtasks.h | 28 ++ include/linux/res_group.h| 86 +++ include/linux/res_group_rc.h | 125 ++ init/Kconfig | 22 + kernel/Makefile |1 kernel/fork.c|7 kernel/res_group/Makefile|2 kernel/res_group/local.h | 38 +++ kernel/res_group/numtasks.c | 451 +++ kernel/res_group/res_group.c | 135 +++ kernel/res_group/rgcs.c | 302 ++ kernel/res_group/shares.c| 228 +++ 14 files changed, 1439 insertions(+), 4 deletions(-) Index: container-2.6.20-new/include/linux/moduleparam.h === --- container-2.6.20-new.orig/include/linux/moduleparam.h +++ container-2.6.20-new/include/linux/moduleparam.h @@ -78,11 +78,17 @@ struct kparam_array /* Helper functions: type is byte, short, ushort, int, uint, long, ulong, charp, bool or invbool, or XXX if you define param_get_XXX, param_set_XXX and param_check_XXX. 
*/ -#define module_param_named(name, value, type, perm) \ - param_check_##type(name, &(value));\ - module_param_call(name, param_set_##type, param_get_##type, &value, perm); \ +#define module_param_named_call(name, value, type, set, perm) \ + param_check_##type(name, &(value)); \ + module_param_call(name, set, param_get_##type, &(value), perm); \ __MODULE_PARM_TYPE(name, #type) +#define module_param_named(name, value, type, perm) \ + module_param_named_call(name, value, type, param_set_##type, perm) + +#define module_param_set_call(name, type, setfn, perm) \ + module_param_named_call(name, name, type, setfn, perm) + #define module_param(name, type, perm) \ module_param_named(name, name, type, perm) Index: container-2.6.20-new/include/linux/numtasks.h === --- /dev/null +++ container-2.6.20-new/include/linux/numtasks.h @@ -0,0 +1,28 @@ +/* numtasks.h - No. of tasks resource controller for Resource Groups + * + * Copyright (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides No. of tasks resource controller for Resource Groups + * + * Latest version, more details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ +#ifndef _LINUX_NUMTASKS_H +#define _LINUX_NUMTASKS_H + +#ifdef CONFIG_RES_GROUPS_NUMTASKS +#include + +extern int numtasks_allow_fork(struct task_struct *); + +#else /* CONFIG_RES_GROUPS_NUMTASKS */ + +#define numtasks_allow_fork(task) (0) + +#endif /* CONFIG_RES_GROUPS_NUMTASKS */ +#endif /* _LINUX_NUMTASKS_H */ Index: container-2.6.20-new/include/linux/res_group.h === --- /dev/null +++ container-2.6.20-new/include/linux/res_group.h @@ -0,0 +1,86 @@ +/* + * res_group.h - Header file to be used by Resource Groups + * + * Copyright (C) Hubertus Franke, IBM Corp. 2003, 2004 + * (C) Shailabh Nagar, IBM Corp. 
2003, 2004 + * (C) Chandra Seetharaman, IBM Corp. 2003, 2004, 2005 + * + * Provides data structures, macros and kernel APIs + * + * More details at http://ckrm.sf.net + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + */ + +#ifndef _LINUX_RES_GROUP_H +#define _LINUX_RES_GROUP_H + +#ifdef CONFIG_RES_GROUPS +#include +#include +#include +#include + +#define SHARE_UNCHANGED(-1)/* implicitly specified by userspace, +* never stored in a resource group' +
[PATCH 5/9] Containers (V9): Add container_clone() interface
This patch adds support for container_clone(), a speculative interface for creating new containers, intended for use by systems such as namespace unsharing. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |2 kernel/container.c| 121 ++ 2 files changed, 123 insertions(+) Index: container-2.6.21-rc7-mm1/kernel/container.c === --- container-2.6.21-rc7-mm1.orig/kernel/container.c +++ container-2.6.21-rc7-mm1/kernel/container.c @@ -1619,3 +1619,124 @@ void container_exit(struct task_struct * tsk->containers = init_task.containers; task_unlock(tsk); } + +static atomic_t namecnt; +static void get_unused_name(char *buf) { + sprintf(buf, "node%d", atomic_inc_return(&namecnt)); +} + +/** + * container_clone - duplicate the current container in the hierarchy + * that the given subsystem is attached to, and move this task into + * the new child + */ +int container_clone(struct task_struct *tsk, struct container_subsys *subsys) +{ + struct dentry *dentry; + int ret = 0; + char nodename[32]; + struct container *parent, *child; + struct inode *inode; + struct css_group *cg; + struct containerfs_root *root; + + /* We shouldn't be called by an unregistered subsystem */ + BUG_ON(!subsys->active); + + /* First figure out what hierarchy and container we're dealing +* with, and pin them so we can drop container_mutex */ + mutex_lock(&container_mutex); + again: + root = subsys->root; + if (root == &rootnode) { + printk(KERN_INFO + "Not cloning container for unused subsystem %s\n", + subsys->name); + mutex_unlock(&container_mutex); + return 0; + } + cg = &tsk->containers; + parent = task_container(tsk, subsys->subsys_id); + /* Pin the hierarchy */ + atomic_inc(&parent->root->sb->s_active); + + mutex_unlock(&container_mutex); + + /* Now do the VFS work to create a container */ + get_unused_name(nodename); + inode = parent->dentry->d_inode; + + /* Hold the parent directory mutex across this operation to +* stop anyone else deleting the new container */ + 
mutex_lock(&inode->i_mutex); + dentry = container_get_dentry(parent->dentry, nodename); + if (IS_ERR(dentry)) { + printk(KERN_INFO + "Couldn't allocate dentry for %s: %ld\n", nodename, + PTR_ERR(dentry)); + ret = PTR_ERR(dentry); + goto out_release; + } + + /* Create the container directory, which also creates the container */ + ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755); + child = __d_cont(dentry); + dput(dentry); + if (ret) { + printk(KERN_INFO + "Failed to create container %s: %d\n", nodename, + ret); + goto out_release; + } + + if (!child) { + printk(KERN_INFO + "Couldn't find new container %s\n", nodename); + ret = -ENOMEM; + goto out_release; + } + + /* The container now exists. Retake container_mutex and check +* that we're still in the same state that we thought we +* were. */ + mutex_lock(&container_mutex); + if ((root != subsys->root) || + (parent != task_container(tsk, subsys->subsys_id))) { + /* Aargh, we raced ... */ + mutex_unlock(&inode->i_mutex); + + deactivate_super(parent->root->sb); + /* The container is still accessible in the VFS, but +* we're not going to try to rmdir() it at this +* point. */ + printk(KERN_INFO + "Race in container_clone() - leaking container %s\n", + nodename); + goto again; + } + + /* All seems fine. Finish by moving the task into the new container */ + ret = attach_task(child, tsk); + mutex_unlock(&container_mutex); + + out_release: + mutex_unlock(&inode->i_mutex); + deactivate_super(parent->root->sb); + return ret; +} + +/* See if "cont" is a descendant of the current task's container in + * the appropriate hierarchy */ + +int container_is_descendant(const struct container *cont) { + int ret; + struct container *target; + int subsys_id; + get_first_subsys(cont, NULL, &subsys_id); + target = task_container(current, subsys_i
[PATCH 0/9] Containers (V9): Generic Process Containers
-- This is an update to my multi-hierarchy multi-subsystem generic process containers patch. Changes since V8 (April 6th) include: - The patchset has been rebased over 2.6.21-rc7-mm1 - The patchset has been restructured based on feedback; more functionality is now split out into separate patches where practical. - The container_group structure has been renamed css_group since this is more descriptive of its true function - Added a simplified file registration interface, and a simple interface for the common operation of returning a single number to userspace from a container control file - Added a simple "debug" subsystem that is both an example of how to use the container system and a useful debugging tool for checking reference counts, etc. Still TODO: - decide whether "Containers" is an acceptable name for the system given its usage by some other development groups, or whether something else (ProcessSets? ResourceGroups? TaskGroups?) would be better - decide whether merging css_group and nsproxy is desirable - add a hash-table based lookup for css_group objects. - use seq_file properly in container tasks files to avoid having to allocate a big array for all the container's task pointers. - add back support for the "release agent" functionality - lots more testing - define standards for container file names Generic Process Containers -- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy containers, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. 
Already existing in the kernel is the cpuset subsystem; this has a process grouping mechanism that is mature, tested, and well documented (particularly with regards to synchronization rules). This patchset extracts the process grouping code from cpusets into a generic container system, and makes the cpusets code a client of the container system, along with a couple of simple example subsystems. The patch set is structured as follows: 1) Basic container framework - filesystem and tracking structures 2) Simple CPU Accounting example subsystem 3) Support for the "tasks" control file 4) Hooks for fork() and exit() 5) Support for the container_clone() operation 6) Add /proc reporting interface 7) Make cpusets a container subsystem 8) Share container subsystem pointer arrays between tasks with the same assignments 9) Simple container debugging subsystem The intention is that the various resource management and virtualization efforts can also become container clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test out e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel Signed-off-by: Paul Menage <[EMAIL PROTECTED]> - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 3/9] Containers (V9): Add tasks file interface
This patch adds the per-directory "tasks" file for containerfs mounts; this allows the user to determine which tasks are members of a container by reading a container's "tasks", and to move a task into a container by writing its pid to its "tasks". Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |2 kernel/container.c| 344 ++ 2 files changed, 346 insertions(+) Index: container-2.6.21-rc7-mm1/include/linux/container.h === --- container-2.6.21-rc7-mm1.orig/include/linux/container.h +++ container-2.6.21-rc7-mm1/include/linux/container.h @@ -128,6 +128,8 @@ int container_is_removed(const struct co int container_path(const struct container *cont, char *buf, int buflen); +int container_task_count(const struct container *cont); + /* Return true if the container is a descendant of the current container */ int container_is_descendant(const struct container *cont); Index: container-2.6.21-rc7-mm1/kernel/container.c === --- container-2.6.21-rc7-mm1.orig/kernel/container.c +++ container-2.6.21-rc7-mm1/kernel/container.c @@ -676,6 +676,111 @@ static inline void get_first_subsys(cons *subsys_id = test_ss->subsys_id; } +/* + * Attach task 'tsk' to container 'cont' + * + * Call holding container_mutex. May take task_lock of + * the task 'pid' during call. 
+ */ + +static int attach_task(struct container *cont, struct task_struct *tsk) +{ + int retval = 0; + struct container_subsys *ss; + struct container *oldcont; + struct css_group *cg = &tsk->containers; + struct containerfs_root *root = cont->root; + int i; + + int subsys_id; + get_first_subsys(cont, NULL, &subsys_id); + + /* Nothing to do if the task is already in that container */ + oldcont = task_container(tsk, subsys_id); + if (cont == oldcont) + return 0; + + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cont, tsk); + if (retval) { + return retval; + } + } + } + + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + return -ESRCH; + } + /* Update the css_group pointers for the subsystems in this +* hierarchy */ + for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { + if (root->subsys_bits & (1ull << i)) { + /* Subsystem is in this hierarchy. So we want +* the subsystem state from the new +* container. Transfer the refcount from the +* old to the new */ + atomic_inc(&cont->count); + atomic_dec(&cg->subsys[i]->container->count); + rcu_assign_pointer(cg->subsys[i], cont->subsys[i]); + } + } + task_unlock(tsk); + + for_each_subsys(root, ss) { + if (ss->attach) { + ss->attach(ss, cont, oldcont, tsk); + } + } + + synchronize_rcu(); + return 0; +} + +/* + * Attach task with pid 'pid' to container 'cont'. 
Call with + * container_mutex, may take task_lock of task + * + */ + +static int attach_task_by_pid(struct container *cont, char *pidbuf) +{ + pid_t pid; + struct task_struct *tsk; + int ret; + + if (sscanf(pidbuf, "%d", &pid) != 1) + return -EIO; + + if (pid) { + read_lock(&tasklist_lock); + + tsk = find_task_by_pid(pid); + if (!tsk || tsk->flags & PF_EXITING) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + + get_task_struct(tsk); + read_unlock(&tasklist_lock); + + if ((current->euid) && (current->euid != tsk->uid) + && (current->euid != tsk->suid)) { + put_task_struct(tsk); + return -EACCES; + } + } else { + tsk = current; + get_task_struct(tsk); + } + + ret = attach_task(cont, tsk); + put_task_struct(tsk); + return ret; +} + /* The various types of files and directories in a container file system */ typedef enum { @@ -684,6 +789,54 @@ typedef enum { FILE_TASKLIST, } container_filetype_t; +static ssize_t container_common_file_write(struct container *cont, +
[PATCH 9/9] Containers (V9): Simple debug info subsystem
This example subsystem exports debugging information as an aid to diagnosing refcount leaks, etc, in the container framework. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |4 + init/Kconfig | 10 kernel/Makefile |1 kernel/container_debug.c | 89 +++ 4 files changed, 104 insertions(+) Index: container-2.6.21-rc7-mm1/include/linux/container_subsys.h === --- container-2.6.21-rc7-mm1.orig/include/linux/container_subsys.h +++ container-2.6.21-rc7-mm1/include/linux/container_subsys.h @@ -19,4 +19,8 @@ SUBSYS(cpuset) /* */ +#ifdef CONFIG_CONTAINER_DEBUG +SUBSYS(debug) +#endif + /* */ Index: container-2.6.21-rc7-mm1/init/Kconfig === --- container-2.6.21-rc7-mm1.orig/init/Kconfig +++ container-2.6.21-rc7-mm1/init/Kconfig @@ -291,6 +291,16 @@ config IKCONFIG_PROC config CONTAINERS bool +config CONTAINER_DEBUG + bool "Example debug container subsystem" + select CONTAINERS + help + This option enables a simple container subsystem that + exports useful debugging information about the containers + framework + + Say N if unsure + config CPUSETS bool "Cpuset support" depends on SMP Index: container-2.6.21-rc7-mm1/kernel/container_debug.c === --- /dev/null +++ container-2.6.21-rc7-mm1/kernel/container_debug.c @@ -0,0 +1,89 @@ +/* + * kernel/ccontainer_debug.c - Example container subsystem that + * exposes debug info + * + * Copyright (C) Google Inc, 2007 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) + * + */ + +#include +#include + +static int debug_create(struct container_subsys *ss, struct container *cont) +{ + struct container_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + if (!css) + return -ENOMEM; + cont->subsys[debug_subsys_id] = css; + return 0; +} + +static void debug_destroy(struct container_subsys *ss, struct container *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 container_refcount_read(struct container *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 
taskcount_read(struct container *cont, struct cftype *cft) +{ + u64 count; + container_lock(); + count = container_task_count(cont); + container_unlock(); + return count; +} + +static u64 current_css_group_read(struct container *cont, struct cftype *cft) +{ + return (u64) current->containers; +} + +static u64 current_css_group_refcount_read(struct container *cont, + struct cftype *cft) +{ + u64 count; + rcu_read_lock(); + count = atomic_read(&current->containers->ref.refcount); + rcu_read_unlock(); + return count; +} + +static struct cftype files[] = { + { + .name = "debug.container_refcount", + .read_uint = container_refcount_read, + }, + { + .name = "debug.taskcount", + .read_uint = taskcount_read, + }, + + { + .name = "debug.current_css_group", + .read_uint = current_css_group_read, + }, + + { + .name = "debug.current_css_group_refcount", + .read_uint = current_css_group_refcount_read, + }, +}; + +static int debug_populate(struct container_subsys *ss, struct container *cont) +{ + return container_add_files(cont, files, ARRAY_SIZE(files)); +} + +struct container_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; Index: container-2.6.21-rc7-mm1/kernel/Makefile === --- container-2.6.21-rc7-mm1.orig/kernel/Makefile +++ container-2.6.21-rc7-mm1/kernel/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o +obj-$(CONFIG_CONTAINER_DEBUG) += container_debug.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o -- - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 4/9] Containers (V9): Add fork/exit hooks
This patch adds the necessary hooks to the fork() and exit() paths to ensure that new children inherit their parent's container assignments, and that exiting processes release reference counts on their containers. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h |6 ++ kernel/container.c| 126 ++ kernel/exit.c |2 kernel/fork.c | 14 - 4 files changed, 146 insertions(+), 2 deletions(-) Index: container-2.6.21-rc7-mm1/kernel/exit.c === --- container-2.6.21-rc7-mm1.orig/kernel/exit.c +++ container-2.6.21-rc7-mm1/kernel/exit.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -939,6 +940,7 @@ fastcall NORET_TYPE void do_exit(long co __exit_fs(tsk); exit_thread(); cpuset_exit(tsk); + container_exit(tsk, 1); exit_keys(tsk); if (group_dead && tsk->signal->leader) Index: container-2.6.21-rc7-mm1/kernel/fork.c === --- container-2.6.21-rc7-mm1.orig/kernel/fork.c +++ container-2.6.21-rc7-mm1/kernel/fork.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -962,6 +963,7 @@ static struct task_struct *copy_process( { int retval; struct task_struct *p = NULL; + int container_callbacks_done = 0; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1061,12 +1063,13 @@ static struct task_struct *copy_process( p->io_wait = NULL; p->audit_context = NULL; cpuset_fork(p); + container_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_copy(p->mempolicy); if (IS_ERR(p->mempolicy)) { retval = PTR_ERR(p->mempolicy); p->mempolicy = NULL; - goto bad_fork_cleanup_cpuset; + goto bad_fork_cleanup_container; } mpol_fix_fork_child_flag(p); #endif @@ -1176,6 +1179,12 @@ static struct task_struct *copy_process( /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + /* Now that the task is set up, run container callbacks if +* necessary. We need to run them before the task is visible +* on the tasklist. 
*/ + container_fork_callbacks(p); + container_callbacks_done = 1; + /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); @@ -1298,9 +1307,10 @@ bad_fork_cleanup_security: bad_fork_cleanup_policy: #ifdef CONFIG_NUMA mpol_free(p->mempolicy); -bad_fork_cleanup_cpuset: +bad_fork_cleanup_container: #endif cpuset_exit(p); + container_exit(p, container_callbacks_done); delayacct_tsk_free(p); if (p->binfmt) module_put(p->binfmt->module); Index: container-2.6.21-rc7-mm1/include/linux/container.h === --- container-2.6.21-rc7-mm1.orig/include/linux/container.h +++ container-2.6.21-rc7-mm1/include/linux/container.h @@ -18,6 +18,9 @@ extern int container_init_early(void); extern int container_init(void); extern void container_init_smp(void); +extern void container_fork(struct task_struct *p); +extern void container_fork_callbacks(struct task_struct *p); +extern void container_exit(struct task_struct *p, int run_callbacks); extern struct file_operations proc_container_operations; @@ -191,6 +194,9 @@ int container_path(const struct containe static inline int container_init_early(void) { return 0; } static inline int container_init(void) { return 0; } static inline void container_init_smp(void) {} +static inline void container_fork(struct task_struct *p) {} +static inline void container_fork_callbacks(struct task_struct *p) {} +static inline void container_exit(struct task_struct *p, int callbacks) {} static inline void container_lock(void) {} static inline void container_unlock(void) {} Index: container-2.6.21-rc7-mm1/kernel/container.c === --- container-2.6.21-rc7-mm1.orig/kernel/container.c +++ container-2.6.21-rc7-mm1/kernel/container.c @@ -132,6 +132,34 @@ list_for_each_entry(_ss, &_root->subsys_ #define for_each_root(_root) \ list_for_each_entry(_root, &roots, root_list) +/* Each task_struct has an embedded css_group, so the get/put + * operation simply takes a reference count on all the containers + * referenced by subsystems in this css_group. 
This can end up + * multiple-counting some containers, but that's OK - the ref-count is + * just a busy/not-busy indicator; ensuring that we only count each + * container once would requi
[PATCH 6/9] Containers (V9): Add procfs interface
This patch adds: /proc/containers - general system info /proc/*/container - per-task container membership info Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- fs/proc/base.c |7 ++ kernel/container.c | 128 + 2 files changed, 135 insertions(+) Index: container-2.6.21-rc7-mm1/fs/proc/base.c === --- container-2.6.21-rc7-mm1.orig/fs/proc/base.c +++ container-2.6.21-rc7-mm1/fs/proc/base.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include @@ -1980,6 +1981,9 @@ static const struct pid_entry tgid_base_ #ifdef CONFIG_CPUSETS REG("cpuset", S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj",S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL @@ -2270,6 +2274,9 @@ static const struct pid_entry tid_base_s #ifdef CONFIG_CPUSETS REG("cpuset",S_IRUGO, cpuset), #endif +#ifdef CONFIG_CONTAINERS + REG("container", S_IRUGO, container), +#endif INF("oom_score", S_IRUGO, oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), #ifdef CONFIG_AUDITSYSCALL Index: container-2.6.21-rc7-mm1/kernel/container.c === --- container-2.6.21-rc7-mm1.orig/kernel/container.c +++ container-2.6.21-rc7-mm1/kernel/container.c @@ -247,6 +247,7 @@ static int container_mkdir(struct inode static int container_rmdir(struct inode *unused_dir, struct dentry *dentry); static int container_populate_dir(struct container *cont); static struct inode_operations container_dir_inode_operations; +struct file_operations proc_containerstats_operations; static struct backing_dev_info container_backing_dev_info = { .ra_pages = 0, /* No readahead */ @@ -1507,6 +1508,7 @@ int __init container_init(void) { int err; int i; + struct proc_dir_entry *entry; for (i = 0; i < CONTAINER_SUBSYS_COUNT; i++) { struct container_subsys *ss = subsys[i]; @@ -1518,10 +1520,136 @@ int __init container_init(void) if (err < 0) goto out; + entry = create_proc_entry("containers", 0, NULL); + if (entry) + 
entry->proc_fops = &proc_containerstats_operations; + out: return err; } +/* + * proc_container_show() + * - Print task's container paths into seq_file, one line for each hierarchy + * - Used for /proc//container. + * - No need to task_lock(tsk) on this tsk->container reference, as it + *doesn't really matter if tsk->container changes after we read it, + *and we take container_mutex, keeping attach_task() from changing it + *anyway. No need to check that tsk->container != NULL, thanks to + *the_top_container_hack in container_exit(), which sets an exiting tasks + *container to top_container. + */ + +/* TODO: Use a proper seq_file iterator */ +static int proc_container_show(struct seq_file *m, void *v) +{ + struct pid *pid; + struct task_struct *tsk; + char *buf; + int retval; + struct containerfs_root *root; + + retval = -ENOMEM; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = 0; + + mutex_lock(&container_mutex); + + for_each_root(root) { + struct container_subsys *ss; + struct container *cont; + int subsys_id; + int count = 0; + /* Skip this hierarchy if it has no active subsystems */ + if (!root->subsys_bits) continue; + for_each_subsys(root, ss) { + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + } + seq_putc(m, ':'); + get_first_subsys(&root->top_container, NULL, &subsys_id); + cont = task_container(tsk, subsys_id); + retval = container_path(cont, buf, PAGE_SIZE); + if (retval < 0) + goto out_unlock; + seq_puts(m, buf); + seq_putc(m, '\n'); + } + +out_unlock: + mutex_unlock(&container_mutex); + put_task_struct(tsk); +out_free: + kfree(buf); +out: + return retval; +} + +static int container_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_container_show, pid); +}
[PATCH 2/9] Containers (V9): Example CPU accounting subsystem
This example demonstrates how to use the generic container subsystem for a simple resource tracker that counts, for the processes in a container, the total CPU time used and the %CPU used in the last complete 10 second interval. Portions contributed by Balbir Singh <[EMAIL PROTECTED]> Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container_subsys.h |6 + include/linux/cpu_acct.h | 14 ++ init/Kconfig |7 + kernel/Makefile |1 kernel/cpu_acct.c| 185 +++ kernel/sched.c | 14 ++ 6 files changed, 224 insertions(+), 3 deletions(-) Index: container-2.6.21-rc7-mm1/include/linux/container_subsys.h === --- container-2.6.21-rc7-mm1.orig/include/linux/container_subsys.h +++ container-2.6.21-rc7-mm1/include/linux/container_subsys.h @@ -7,4 +7,10 @@ /* */ +#ifdef CONFIG_CONTAINER_CPUACCT +SUBSYS(cpuacct) +#endif + +/* */ + /* */ Index: container-2.6.21-rc7-mm1/include/linux/cpu_acct.h === --- /dev/null +++ container-2.6.21-rc7-mm1/include/linux/cpu_acct.h @@ -0,0 +1,14 @@ + +#ifndef _LINUX_CPU_ACCT_H +#define _LINUX_CPU_ACCT_H + +#include +#include + +#ifdef CONFIG_CONTAINER_CPUACCT +extern void cpuacct_charge(struct task_struct *, cputime_t cputime); +#else +static void inline cpuacct_charge(struct task_struct *p, cputime_t cputime) {} +#endif + +#endif Index: container-2.6.21-rc7-mm1/init/Kconfig === --- container-2.6.21-rc7-mm1.orig/init/Kconfig +++ container-2.6.21-rc7-mm1/init/Kconfig @@ -322,6 +322,13 @@ config SYSFS_DEPRECATED If you are using a distro that was released in 2006 or later, it should be safe to say N here. 
+config CONTAINER_CPUACCT + bool "Simple CPU accounting container subsystem" + select CONTAINERS + help + Provides a simple Resource Controller for monitoring the + total CPU consumed by the tasks in a container + config RELAY bool "Kernel->user space relay support (formerly relayfs)" help Index: container-2.6.21-rc7-mm1/kernel/Makefile === --- container-2.6.21-rc7-mm1.orig/kernel/Makefile +++ container-2.6.21-rc7-mm1/kernel/Makefile @@ -38,6 +38,7 @@ obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CONTAINERS) += container.o obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_CONTAINER_CPUACCT) += cpu_acct.o obj-$(CONFIG_IKCONFIG) += configs.o obj-$(CONFIG_STOP_MACHINE) += stop_machine.o obj-$(CONFIG_AUDIT) += audit.o auditfilter.o Index: container-2.6.21-rc7-mm1/kernel/cpu_acct.c === --- /dev/null +++ container-2.6.21-rc7-mm1/kernel/cpu_acct.c @@ -0,0 +1,185 @@ +/* + * kernel/cpu_acct.c - CPU accounting container subsystem + * + * Copyright (C) Google Inc, 2006 + * + * Developed by Paul Menage ([EMAIL PROTECTED]) and Balbir Singh + * ([EMAIL PROTECTED]) + * + */ + +/* + * Container subsystem for reporting total CPU usage of tasks in a + * container, along with percentage load over a time interval + */ + +#include +#include +#include +#include + +struct cpuacct { + struct container_subsys_state css; + spinlock_t lock; + /* total time used by this class */ + cputime64_t time; + + /* time when next load calculation occurs */ + u64 next_interval_check; + + /* time used in current period */ + cputime64_t current_interval_time; + + /* time used in last period */ + cputime64_t last_interval_time; +}; + +struct container_subsys cpuacct_subsys; + +static inline struct cpuacct *container_ca(struct container *cont) +{ + return container_of(container_subsys_state(cont, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *task_ca(struct task_struct *task) +{ + return container_of(task_subsys_state(task, cpuacct_subsys_id), + 
struct cpuacct, css); +} + +#define INTERVAL (HZ * 10) + +static inline u64 next_interval_boundary(u64 now) { + /* calculate the next interval boundary beyond the +* current time */ + do_div(now, INTERVAL); + return (now + 1) * INTERVAL; +} + +static int cpuacct_create(struct container_subsys *ss, struct container *cont) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return -ENOMEM; + spin_lock_init(&ca->lock); + ca->next_interval_check = next_interval_boundary(get_jiffies_64()); + cont->subsys[cpuacct_subsys_id] = &ca->css; + return 0; +} + +static void cpuacct_destroy(struct container_subsys
[PATCH 8/9] Containers (V9): Share css_group arrays between tasks with same container memberships
This patch replaces the struct css_group embedded in task_struct with a pointer; all tasks with the same set of memberships across all hierarchies will share a css_group object. The css_group used by init isn't refcounted, since it can't ever be freed; this speeds up fork/exit for any systems that have containers compiled in but haven't actually created any containers other than the default one. With more than one registered subsystem, this reduces the number of atomic inc/dec operations required when tasks fork/exit; Assuming that many tasks share the same container assignments, this reduces overall space usage and keeps the size of the task_struct down (only one pointer added to task_struct compared to a non-containers kernel). Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/container.h | 35 ++ include/linux/sched.h | 30 - kernel/container.c| 248 +- 3 files changed, 238 insertions(+), 75 deletions(-) Index: container-2.6.21-rc7-mm1/include/linux/container.h === --- container-2.6.21-rc7-mm1.orig/include/linux/container.h +++ container-2.6.21-rc7-mm1/include/linux/container.h @@ -29,6 +29,14 @@ extern void container_unlock(void); struct containerfs_root; +/* Define the enumeration of all container subsystems */ +#define SUBSYS(_x) _x ## _subsys_id, +enum container_subsys_id { +#include + CONTAINER_SUBSYS_COUNT +}; +#undef SUBSYS + /* Per-subsystem/per-container state maintained by the system. */ struct container_subsys_state { /* The container that this subsystem is attached to. Useful @@ -87,6 +95,31 @@ struct container { struct container *top_container; }; +/* A css_group is a structure holding pointers to a set of + * container_subsys_state objects. This saves space in the task struct + * object and speeds up fork()/exit(), since a single inc/dec can bump + * the reference count on the entire container set for a task. 
+ */ + +struct css_group { + + /* Reference count */ + struct kref ref; + + /* List running through all container groups */ + struct list_head list; + + /* Set of subsystem states, one for each subsystem. NULL for +* subsystems that aren't part of this hierarchy. These +* pointers reduce the number of dereferences required to get +* from a task to its state for a given container, but result +* in increased space usage if tasks are in wildly different +* groupings across different hierarchies. This array is +* immutable after creation */ + struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT]; + +}; + /* struct cftype: * * The files in the container filesystem mostly have a very simple read/write @@ -178,7 +211,7 @@ static inline struct container_subsys_st static inline struct container_subsys_state *task_subsys_state( struct task_struct *task, int subsys_id) { - return rcu_dereference(task->containers.subsys[subsys_id]); + return rcu_dereference(task->containers->subsys[subsys_id]); } static inline struct container* task_container(struct task_struct *task, Index: container-2.6.21-rc7-mm1/include/linux/sched.h === --- container-2.6.21-rc7-mm1.orig/include/linux/sched.h +++ container-2.6.21-rc7-mm1/include/linux/sched.h @@ -818,34 +818,6 @@ struct uts_namespace; struct prio_array; -#ifdef CONFIG_CONTAINERS - -#define SUBSYS(_x) _x ## _subsys_id, -enum container_subsys_id { -#include - CONTAINER_SUBSYS_COUNT -}; -#undef SUBSYS - -/* A css_group is a structure holding pointers to a set of - * container_subsys_state objects. - */ - -struct css_group { - - /* Set of subsystem states, one for each subsystem. NULL for -* subsystems that aren't part of this hierarchy. These -* pointers reduce the number of dereferences required to get -* from a task to its state for a given container, but result -* in increased space usage if tasks are in wildly different -* groupings across different hierarchies. 
This array is -* immutable after creation */ - struct container_subsys_state *subsys[CONTAINER_SUBSYS_COUNT]; - -}; - -#endif /* CONFIG_CONTAINERS */ - struct task_struct { volatile long state;/* -1 unrunnable, 0 runnable, >0 stopped */ struct thread_info *thread_info; @@ -1098,7 +1070,7 @@ struct task_struct { int cpuset_mem_spread_rotor; #endif #ifdef CONFIG_CONTAINERS - struct css_group containers; + struct css_group *containers; #endif struct robust_list_head __user *robust_list; #ifdef CONFIG_COMPAT Index: container-2.6.21-rc7-mm1/kernel/container.c === --- container-2.6.21-rc7-mm1.orig/kernel/cont
Re: [RFC] Default child of a cgroup
On Jan 30, 2008 6:40 PM, Srivatsa Vaddagiri <[EMAIL PROTECTED]> wrote: > > Here are some questions that arise in this picture: > > 1. What is the relationship of the task-group in A/tasks with the >task-group in A/a1/tasks? In otherwords do they form siblings >of the same parent A? I'd argue the same as Balbir - tasks in A/tasks are children of A and are siblings of a1, a2, etc. > > 2. Somewhat related to the above question, how much resource should the >task-group A/a1/tasks get in relation to A/tasks? Is it 1/2 of parent >A's share or 1/(1 + N) of parent A's share (where N = number of tasks >in A/tasks)? Each process in A should have a scheduler weight that's derived from its static_prio field. Similarly each subgroup of A will have a scheduler weight that's determined by its cpu.shares value. So the cpu share of any child (be it a task or a subgroup) would be equal to its own weight divided by the sum of weights of all children. So yes, if a task in A forks lots of children, those children could end up getting a disproportionate amount of the CPU compared to tasks in A/a1 - but that's the same as the situation without cgroups. If you want to control cpu usage between different sets of processes in A, they should be in sibling cgroups, not directly in A. Is there a restriction in CFS that stops a given group from simultaneously holding tasks and sub-groups? If so, couldn't we change CFS to make it possible rather than enforcing awkward restrictions on cgroups? 
If we really can't change CFS in that way, then an alternative would be similar to Peter's suggestion - make cpu_cgroup_can_attach() fail if the cgroup has children, and make cpu_cgroup_create() fail if the cgroup has any tasks - that way you limit the restriction to just the hierarchy that has CFS attached to it, rather than generically for all cgroups BTW, I noticed this code in cpu_cgroup_create(): /* we support only 1-level deep hierarchical scheduler atm */ if (cgrp->parent->parent) return ERR_PTR(-EINVAL); Is anyone working on allowing more levels? Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC] Default child of a cgroup
On Jan 31, 2008 11:58 PM, Peter Zijlstra <[EMAIL PROTECTED]> wrote: > > Is there a restriction in CFS that stops a given group from > > simultaneously holding tasks and sub-groups? If so, couldn't we change > > CFS to make it possible rather than enforcing awkward restrictions on > > cgroups? > > I think it is possible, just way more work than the proposed hack. Seems to me like the right thing to do though. > > > If we really can't change CFS in that way, then an alternative would > > be similar to Peter's suggestion - make cpu_cgroup_can_attach() fail > > if the cgroup has children, and make cpu_cgroup_create() fail if the > > cgroup has any tasks - that way you limit the restriction to just the > > hierarchy that has CFS attached to it, rather than generically for all > > cgroups > > Agreed. > Actually, I realised later that this is impossible - since the root cgroup will have tasks initially, there'd be no way to create the first child cgroup in the CFS hierarchy. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Add MS_BIND_FLAGS mount flag
From: Paul Menage <[EMAIL PROTECTED]> Add a new mount() flag, MS_BIND_FLAGS. MS_BIND_FLAGS indicates that a bind mount should take its per-mount flags from the arguments passed to mount() rather than from the source mountpoint. This flag allows you to create a bind mount with the desired per-mount flags in a single operation, rather than having to do a bind mount followed by a remount, which is fiddly and can block for non-trivial periods of time (on sb->s_umount?). For recursive bind mounts, only the root of the tree being bound inherits the per-mount flags from the mount() arguments; sub-mounts inherit their per-mount flags from the source tree as usual. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- fs/namespace.c | 36 +--- include/linux/fs.h |2 ++ 2 files changed, 27 insertions(+), 11 deletions(-) Index: 2.6.24-mm1-bindflags/fs/namespace.c === --- 2.6.24-mm1-bindflags.orig/fs/namespace.c +++ 2.6.24-mm1-bindflags/fs/namespace.c @@ -512,13 +512,13 @@ static struct vfsmount *skip_mnt_tree(st } static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, - int flag) + int flag, int mnt_flags) { struct super_block *sb = old->mnt_sb; struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); if (mnt) { - mnt->mnt_flags = old->mnt_flags; + mnt->mnt_flags = mnt_flags; atomic_inc(&sb->s_active); mnt->mnt_sb = sb; mnt->mnt_root = dget(root); @@ -1095,8 +1095,9 @@ static int lives_below_in_same_fs(struct } } -struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, - int flag) +static struct vfsmount * +__copy_tree(struct vfsmount *mnt, struct dentry *dentry, + int flag, int mnt_flags) { struct vfsmount *res, *p, *q, *r, *s; struct nameidata nd; @@ -1104,7 +1105,7 @@ struct vfsmount *copy_tree(struct vfsmou if (!(flag & CL_COPY_ALL) && IS_MNT_UNBINDABLE(mnt)) return NULL; - res = q = clone_mnt(mnt, dentry, flag); + res = q = clone_mnt(mnt, dentry, flag, mnt_flags); if (!q) goto Enomem; q->mnt_mountpoint = mnt->mnt_mountpoint; @@ -1126,7 
+1127,7 @@ struct vfsmount *copy_tree(struct vfsmou p = s; nd.path.mnt = q; nd.path.dentry = p->mnt_mountpoint; - q = clone_mnt(p, p->mnt_root, flag); + q = clone_mnt(p, p->mnt_root, flag, p->mnt_flags); if (!q) goto Enomem; spin_lock(&vfsmount_lock); @@ -1146,6 +1147,11 @@ Enomem: } return NULL; } +struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, + int flag) +{ + return __copy_tree(mnt, dentry, flag, mnt->mnt_flags); +} struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) { @@ -1320,7 +1326,8 @@ static int do_change_type(struct nameida /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, int flags, + int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; @@ -1342,10 +1349,15 @@ static int do_loopback(struct nameidata goto out; err = -ENOMEM; - if (recurse) - mnt = copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0); + /* Use the source mount flags unless the user passed MS_BIND_FLAGS */ + if (!(flags & MS_BIND_FLAGS)) + mnt_flags = old_nd.path.mnt->mnt_flags; + if (flags & MS_REC) + mnt = __copy_tree(old_nd.path.mnt, old_nd.path.dentry, 0, + mnt_flags); else - mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0); + mnt = clone_mnt(old_nd.path.mnt, old_nd.path.dentry, 0, + mnt_flags); if (!mnt) goto out; @@ -1874,7 +1886,9 @@ long do_mount(char *dev_name, char *dir_ retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, +flags & (MS_REC | MS_BIND_FLAGS), +mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
Re: [PATCH] Add MS_BIND_FLAGS mount flag
On Thu, Feb 14, 2008 at 12:30 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote: > > For recursive bind mounts, only the root of the tree being bound > > inherits the per-mount flags from the mount() arguments; sub-mounts > > inherit their per-mount flags from the source tree as usual. > > This is rather strange behavior. I think it would be much better, if > setting mount flags would work for recursive operations as well. Also > what we really need is not resetting all the mount flags to some > predetermined values, but to be able to set or clear each flag > individually. This is certainly true, but as you observe below it's a fair bit more fiddly to specify in the API. I wasn't sure how much people use recursive bind mounts, so I figured I'd throw out this simpler version first. > > For example, with the per-mount-read-only thing the most useful > application would be to just set the read-only flag and leave the > others alone. > > And this is where we usually conclude, that a new userspace mount API > is long overdue. So for starters, how about a new syscall for bind > mounts: > > int mount_bind(const char *src, const char *dst, unsigned flags, > unsigned mnt_flags); The "flags" argument could be the same as for regular mount, and contain the mnt_flags - so the extra argument could maybe usefully be a "mnt_flags_mask", to indicate which flags we actually care about overriding. What would happen when an existing super-block flag changes to become a per-mount flag (e.g. per-mount read-only)? I think that would just fit in with the "mask" idea, as long as we complained if any bits in mnt_flags_mask weren't actually per-mount settable. Being able to mask/set mount flags might be useful on a remount too, since there's no clean way to get the existing mount flags for a mount other than by scanning /proc/mounts. 
So an alternative to a separate system call would be a new mnt_flag_mask argument to mount() (whose presence would be indicated by a flag bit being set in the main flags) which would be used to control which bits were set or cleared for remount/bind calls. Seems a bit wasteful of bits though. If we turned "flags" into an (optionally) 64-bit argument then we'd have plenty of bits to be able to specify both a "set" bit and a "mask" bit for each, without needing a new syscall. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Add MS_BIND_FLAGS mount flag
On Wed, Feb 13, 2008 at 10:02 PM, Christoph Hellwig <[EMAIL PROTECTED]> wrote: > > I think this concept is reasonable, but I don't think MS_BIND_FLAGS > is a descriptive name for this flag. MS_EXPLICIT_FLAGS might be better > but still isn't optimal. > MS_BIND_FLAGS_OVERRIDE ? Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Add MS_BIND_FLAGS mount flag
[ cc: linux-fsdevel ] On Thu, Feb 14, 2008 at 7:22 AM, Paul Menage <[EMAIL PROTECTED]> wrote: > On Wed, Feb 13, 2008 at 10:02 PM, Christoph Hellwig <[EMAIL PROTECTED]> wrote: > > > > I think this concept is reasonable, but I don't think MS_BIND_FLAGS > > is a descriptive name for this flag. MS_EXPLICIT_FLAGS might be better > > but still isn't optimal. > > > > MS_BIND_FLAGS_OVERRIDE ? > > Paul > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Add MS_BIND_FLAGS mount flag
On Thu, Feb 14, 2008 at 8:03 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote: > > The "flags" argument could be the same as for regular mount, and > > contain the mnt_flags - so the extra argument could maybe usefully be > > a "mnt_flags_mask", to indicate which flags we actually care about > > overriding. > > The way I imagined it, is that mnt_flags is a mask, and the operation > (determined by flags) is either: > > - set bits in mask > - clear bits in mask (or not in mask) > - set flags to mask > > It doesn't allow setting some bits, clearing some others, and leaving > alone the rest. But I think such flexibility isn't really needed. I think I'd suggest something like: new_mnt->mnt_flags = (old_mnt->mnt_flags & ~arg_mask) | (arg_flags & arg_mask) > Maybe instead of messing with masks, it's better to introduce a > get_flags() or a more general mount_stat() operation, and let > userspace deal with setting and clearing flags, just as we do for > stat/chmod? > > So we'd have > > mount_stat(path, stat); > mount_bind(from, to, flags); > mount_set_flags(path, flags); > mount_move(from, to); > > and perhaps > > mount_remount(path, opt_string, flags); Sounds reasonable to me. But it wouldn't directly solve the "do a recursive bind mount setting the MS_READONLY flag on all children" problem, so we'd need some of the earlier suggestions too. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] Add MS_BIND_FLAGS mount flag
On Thu, Feb 14, 2008 at 9:31 AM, Miklos Szeredi <[EMAIL PROTECTED]> wrote: > > I deliberately not used the MS_* flags, which is currently a messy mix > of things with totally different meanings. > > Does this solve all the issues? We should add a size parameter either in the mount_params or as a final argument, for future extensibility. And we might as well include MNT_READONLY in the API on the assumption that per-mount readonly will be available soon. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] Add linux-fsdevel to VFS entry in MAINTAINERS
Add linux-fsdevel to the VFS entry in MAINTAINERS Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- MAINTAINERS |1 + 1 file changed, 1 insertion(+) Index: 2.6.24-mm1-bindflags/MAINTAINERS === --- 2.6.24-mm1-bindflags.orig/MAINTAINERS +++ 2.6.24-mm1-bindflags/MAINTAINERS @@ -1616,6 +1616,7 @@ S:Maintained FILESYSTEMS (VFS and infrastructure) P: Alexander Viro M: [EMAIL PROTECTED] +L: [EMAIL PROTECTED] S: Maintained FIREWIRE SUBSYSTEM (drivers/firewire, ) -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 1/7] CGroup API: Add cgroup.api control file
Add a cgroup.api control file in every cgroup directory. This reports for each control file the type of data represented by that control file, and a user-friendly description of the contents. A secondary effect of this patch is to add the "cgroup." prefix in front of all cgroup-provided control files. This will reduce the chance of future control files clashing with user-provided names. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 21 +++ kernel/cgroup.c| 133 ++--- 2 files changed, 148 insertions(+), 6 deletions(-) Index: cgroupmap-2.6.24-mm1/include/linux/cgroup.h === --- cgroupmap-2.6.24-mm1.orig/include/linux/cgroup.h +++ cgroupmap-2.6.24-mm1/include/linux/cgroup.h @@ -179,12 +179,33 @@ struct css_set { * - the 'cftype' of the file is file->f_dentry->d_fsdata */ +/* + * The various types of control file that are reported in the + * cgroup.api file. "String" is a catch-all default, but should only + * be used for special cases. If you use the appropriate accessors + * (such as "read_uint") in your control file, then you can leave this + * as 0 (CGROUP_FILE_UNKNOWN) and let cgroup figure out the right type. 
+ */ +enum cgroup_file_type { + CGROUP_FILE_UNKNOWN = 0, + CGROUP_FILE_VOID, + CGROUP_FILE_U64, + CGROUP_FILE_STRING, +}; + #define MAX_CFTYPE_NAME 64 struct cftype { /* By convention, the name should begin with the name of the * subsystem, followed by a period */ char name[MAX_CFTYPE_NAME]; int private; + + /* The type of a file - reported in the cgroup.api file */ + enum cgroup_file_type type; + + /* Human-readable description of the file */ + const char *desc; + int (*open) (struct inode *inode, struct file *file); ssize_t (*read) (struct cgroup *cont, struct cftype *cft, struct file *file, Index: cgroupmap-2.6.24-mm1/kernel/cgroup.c === --- cgroupmap-2.6.24-mm1.orig/kernel/cgroup.c +++ cgroupmap-2.6.24-mm1/kernel/cgroup.c @@ -1301,6 +1301,7 @@ enum cgroup_filetype { FILE_NOTIFY_ON_RELEASE, FILE_RELEASABLE, FILE_RELEASE_AGENT, + FILE_API, }; static ssize_t cgroup_write_uint(struct cgroup *cgrp, struct cftype *cft, @@ -1611,17 +1612,21 @@ static int cgroup_create_dir(struct cgro } int cgroup_add_file(struct cgroup *cgrp, - struct cgroup_subsys *subsys, - const struct cftype *cft) + struct cgroup_subsys *subsys, + const struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct dentry *dentry; int error; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { - strcpy(name, subsys->name); - strcat(name, "."); + if (!test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { + if (subsys) { + strcpy(name, subsys->name); + strcat(name, "."); + } else { + strcpy(name, "cgroup."); + } } strcat(name, cft->name); BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); @@ -2126,6 +2131,110 @@ static u64 cgroup_read_releasable(struct return test_bit(CGRP_RELEASABLE, &cgrp->flags); } +static const struct file_operations cgroup_api_file_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * cgroup.api is a file in each cgroup directory that gives the types + * and 
descriptions of the various control files in that directory. + */ + +static struct dentry *cgroup_api_advance(struct dentry *d, int advance) +{ + struct dentry *parent = d->d_parent; + struct list_head *l = &d->d_u.d_child; + while (true) { + if (advance) + l = l->next; + advance = true; + /* Did we reach the end of the directory? */ + if (l == &parent->d_subdirs) + return NULL; + d = container_of(l, struct dentry, d_u.d_child); + /* Skip cgroup subdirectories */ + if (d->d_inode && S_ISREG(d->d_inode->i_mode)) + return d; + } +} + +static void *cgroup_api_start(struct seq_file *sf, loff_t *pos) +{ + struct dentry *parent = sf->private; + struct dentry *d; + loff_t l = 0; + spin_lock(&dcache_lock); + if (list_empty(&parent->d_subdirs)) +
[RFC][PATCH 5/7] CGroup API: Use read_uint in memory controller
Update the memory controller to use read_uint for its limit/usage/failcnt control files, calling the new res_counter_read_uint() function. This allows the files to show up as u64 rather than string in the cgroup.api file. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c === --- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c +++ cgroupmap-2.6.24-mm1/mm/memcontrol.c @@ -922,13 +922,10 @@ int mem_cgroup_write_strategy(char *buf, return 0; } -static ssize_t mem_cgroup_read(struct cgroup *cont, - struct cftype *cft, struct file *file, - char __user *userbuf, size_t nbytes, loff_t *ppos) +static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - return res_counter_read(&mem_cgroup_from_cont(cont)->res, - cft->private, userbuf, nbytes, ppos, - NULL); + return res_counter_read_uint(&mem_cgroup_from_cont(cont)->res, +cft->private); } static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft, @@ -1006,18 +1003,18 @@ static struct cftype mem_cgroup_files[] { .name = "usage_in_bytes", .private = RES_USAGE, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "limit_in_bytes", .private = RES_LIMIT, .write = mem_cgroup_write, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "failcnt", .private = RES_FAILCNT, - .read = mem_cgroup_read, + .read_uint = mem_cgroup_read, }, { .name = "force_empty", -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 6/7] CGroup API: Use descriptions for memory controller API files
This patch adds descriptions to the memory controller API files to indicate that the usage/limit are in bytes; the names of the control files can then be simplified to usage/limit. Also removes the unnecessary mem_force_empty_read() function Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 21 + 1 file changed, 5 insertions(+), 16 deletions(-) Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c === --- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c +++ cgroupmap-2.6.24-mm1/mm/memcontrol.c @@ -950,19 +950,6 @@ static ssize_t mem_force_empty_write(str return ret; } -/* - * Note: This should be removed if cgroup supports write-only file. - */ - -static ssize_t mem_force_empty_read(struct cgroup *cont, - struct cftype *cft, - struct file *file, char __user *userbuf, - size_t nbytes, loff_t *ppos) -{ - return -EINVAL; -} - - static const struct mem_cgroup_stat_desc { const char *msg; u64 unit; @@ -1001,15 +988,17 @@ static int mem_control_stat_show(struct static struct cftype mem_cgroup_files[] = { { - .name = "usage_in_bytes", + .name = "usage", .private = RES_USAGE, .read_uint = mem_cgroup_read, + .desc = "Memory usage in bytes", }, { - .name = "limit_in_bytes", + .name = "limit", .private = RES_LIMIT, .write = mem_cgroup_write, .read_uint = mem_cgroup_read, + .desc = "Memory limit in bytes", }, { .name = "failcnt", @@ -1019,7 +1008,7 @@ static struct cftype mem_cgroup_files[] { .name = "force_empty", .write = mem_force_empty_write, - .read = mem_force_empty_read, + .desc = "Write to this file to forget all memory charges" }, { .name = "stat", -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 0/7] CGroup API: More structured API for CGroups control files
This set of patches makes the Control Groups API more structured and self-describing. 1) Allows control files to be associated with data types such as "u64", "string", "map", etc. These types show up in a new cgroup.api file in each cgroup directory, along with a user-readable string. Files that use cgroup-provided data accessors have these file types inferred automatically. 2) Moves various files in cpusets and the memory controller from using custom-written file handlers to cgroup-defined handlers 3) Adds the "cgroup." prefix for existing cgroup-provided control files (tasks, release_agent, releasable, notify_on_release). Given that we've already had 2.6.24 go out without this prefix, I guess this could be a little contentious - but it seems like a good move to prevent name clashes in the future. (Note that this doesn't affect mounting the legacy cpuset filesystem, since the compatibility layer disables all prefixes when mounted with filesystem type "cpuset"). If people object too strongly, we could just make this the case for *new* cgroup API files, but I think this is a case where consistency would be better than compatibility - I'd be surprised if anyone has written major legacy apps yet that rely on 2.6.24 cgroup control file names. There are various motivations for this: 1) We said at Kernel Summit '07 that the cgroup API wouldn't be allowed to spiral into an arbitrary mess of ad-hoc APIs. Having simple ways to represent common data types makes this easier. (E.g. one standard way to report a map of string,u64 pairs to userspace.) 2) People were divided on the issue of binary APIs versus ASCII APIs for control groups. Compatibility with the existing cpusets system, and ease of experimentation, were two important reasons for going with the current ASCII API. 
But by having structured control files, we can open the path towards having more efficient binary APIs for simpler and more efficient programmatic access too, without any additional modifications required from the subsystems themselves. My plans for this potential binary API are a little hazy at this point, but they might go something like opening a cgroup.bin file in a cgroup directory, and writing the names of the control files that you were interested in; then a read on that file handle would return the contents of the given control files in a single read in a simple binary format. (Better suggestions are welcome). Regardless, getting a good typing/structure on the control files is an important first step if we want to go in that direction. 3) The memory controller currently has files with the "_in_bytes" suffix, on the grounds that otherwise it's not obvious to a new user what they represent. By moving the description to an auto-generated API file, we can remove this (IMO) inelegant suffix. -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 3/7] CGroup API: Use cgroup map for memcontrol stats file
Remove the seq_file boilerplate used to construct the memcontrol stats map, and instead use the new map representation for cgroup control files Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- mm/memcontrol.c | 30 ++ 1 file changed, 6 insertions(+), 24 deletions(-) Index: cgroupmap-2.6.24-mm1/mm/memcontrol.c === --- cgroupmap-2.6.24-mm1.orig/mm/memcontrol.c +++ cgroupmap-2.6.24-mm1/mm/memcontrol.c @@ -974,9 +974,9 @@ static const struct mem_cgroup_stat_desc [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, }; -static int mem_control_stat_show(struct seq_file *m, void *arg) +static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb) { - struct cgroup *cont = m->private; struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); struct mem_cgroup_stat *stat = &mem_cont->stat; int i; @@ -986,8 +986,7 @@ static int mem_control_stat_show(struct val = mem_cgroup_read_stat(stat, i); val *= mem_cgroup_stat_desc[i].unit; - seq_printf(m, "%s %lld\n", mem_cgroup_stat_desc[i].msg, - (long long)val); + cb->fill(cb, mem_cgroup_stat_desc[i].msg, val); } /* showing # of active pages */ { @@ -997,29 +996,12 @@ static int mem_control_stat_show(struct MEM_CGROUP_ZSTAT_INACTIVE); active = mem_cgroup_get_all_zonestat(mem_cont, MEM_CGROUP_ZSTAT_ACTIVE); - seq_printf(m, "active %ld\n", (active) * PAGE_SIZE); - seq_printf(m, "inactive %ld\n", (inactive) * PAGE_SIZE); + cb->fill(cb, "active", (active) * PAGE_SIZE); + cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); } return 0; } -static const struct file_operations mem_control_stat_file_operations = { - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int mem_control_stat_open(struct inode *unused, struct file *file) -{ - /* XXX __d_cont */ - struct cgroup *cont = file->f_dentry->d_parent->d_fsdata; - - file->f_op = &mem_control_stat_file_operations; - return single_open(file, mem_control_stat_show, cont); -} - - - static struct cftype mem_cgroup_files[] = { { 
.name = "usage_in_bytes", @@ -1044,7 +1026,7 @@ static struct cftype mem_cgroup_files[] }, { .name = "stat", - .open = mem_control_stat_open, + .read_map = mem_control_stat_show, }, }; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 2/7] CGroup API: Add cgroup map data type
Adds a new type of supported control file representation, a map from strings to u64 values. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/cgroup.h | 19 +++ kernel/cgroup.c| 61 - 2 files changed, 79 insertions(+), 1 deletion(-) Index: cgroupmap-2.6.24-mm1/include/linux/cgroup.h === --- cgroupmap-2.6.24-mm1.orig/include/linux/cgroup.h +++ cgroupmap-2.6.24-mm1/include/linux/cgroup.h @@ -191,6 +191,17 @@ enum cgroup_file_type { CGROUP_FILE_VOID, CGROUP_FILE_U64, CGROUP_FILE_STRING, + CGROUP_FILE_MAP, +}; + +/* + * cgroup_map_cb is an abstract callback API for reporting map-valued + * control files + */ + +struct cgroup_map_cb { + int (*fill)(struct cgroup_map_cb *cb, const char *key, u64 value); + void *state; }; #define MAX_CFTYPE_NAME 64 @@ -215,6 +226,14 @@ struct cftype { * single integer. Use it in place of read() */ u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); + /* +* read_map() is used for defining a map of key/value +* pairs. It should call cb->fill(cb, key, value) for each +* entry. +*/ + int (*read_map) (struct cgroup *cont, struct cftype *cft, +struct cgroup_map_cb *cb); + ssize_t (*write) (struct cgroup *cont, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); Index: cgroupmap-2.6.24-mm1/kernel/cgroup.c === --- cgroupmap-2.6.24-mm1.orig/kernel/cgroup.c +++ cgroupmap-2.6.24-mm1/kernel/cgroup.c @@ -1488,6 +1488,46 @@ static ssize_t cgroup_file_read(struct f return -EINVAL; } +/* + * seqfile ops/methods for returning structured data. Currently just + * supports string->u64 maps, but can be extended in future. 
+ */ + +struct cgroup_seqfile_state { + struct cftype *cft; + struct cgroup *cgroup; +}; + +static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) +{ + struct seq_file *sf = cb->state; + return seq_printf(sf, "%s: %llu\n", key, value); +} + +static int cgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct cgroup_seqfile_state *state = m->private; + struct cftype *cft = state->cft; + struct cgroup_map_cb cb = { + .fill = cgroup_map_add, + .state = m, + }; + if (cft->read_map) { + return cft->read_map(state->cgroup, cft, &cb); + } else { + BUG(); + } +} + +int cgroup_seqfile_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + kfree(seq->private); + return single_release(inode, file); +} + +static struct file_operations cgroup_seqfile_operations; + static int cgroup_file_open(struct inode *inode, struct file *file) { int err; @@ -1500,7 +1540,18 @@ static int cgroup_file_open(struct inode cft = __d_cft(file->f_dentry); if (!cft) return -ENODEV; - if (cft->open) + if (cft->read_map) { + struct cgroup_seqfile_state *state = + kzalloc(sizeof(*state), GFP_USER); + if (!state) + return -ENOMEM; + state->cft = cft; + state->cgroup = __d_cgrp(file->f_dentry->d_parent); + file->f_op = &cgroup_seqfile_operations; + err = single_open(file, cgroup_seqfile_show, state); + if (err < 0) + kfree(state); + } else if (cft->open) err = cft->open(inode, file); else err = 0; @@ -1539,6 +1590,12 @@ static struct file_operations cgroup_fil .release = cgroup_file_release, }; +static struct file_operations cgroup_seqfile_operations = { + .read = seq_read, + .llseek = seq_lseek, + .release = cgroup_seqfile_release, +}; + static struct inode_operations cgroup_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cgroup_mkdir, @@ -2206,6 +2263,8 @@ static int cgroup_api_show(struct seq_fi if (type == CGROUP_FILE_UNKNOWN) { if (cft->read_uint) type = CGROUP_FILE_U64; + else if (cft->read_map) + type = 
CGROUP_FILE_MAP; else if (cft->read) type = CGROUP_FILE_STRING; else if (!cft->open) -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API
Many of the cpusets control files are simple integer values, which don't require the overhead of memory allocations for reads and writes. Move the handlers for these control files into cpuset_read_uint() and cpuset_write_uint(). This also has the advantage that the control files show up as "u64" rather than "string" in the cgroup.api file. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- kernel/cpuset.c | 158 +--- 1 file changed, 83 insertions(+), 75 deletions(-) Index: cgroupmap-2.6.24-mm1/kernel/cpuset.c === --- cgroupmap-2.6.24-mm1.orig/kernel/cpuset.c +++ cgroupmap-2.6.24-mm1/kernel/cpuset.c @@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void } /* - * Call with cgroup_mutex held. - */ - -static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) -{ - if (simple_strtoul(buf, NULL, 10) != 0) - cpuset_memory_pressure_enabled = 1; - else - cpuset_memory_pressure_enabled = 0; - return 0; -} - -/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, * CS_SCHED_LOAD_BALANCE, @@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable * Call with cgroup_mutex held. 
*/ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) { - int turning_on; struct cpuset trialcs; int err; int cpus_nonempty, balance_flag_changed; - turning_on = (simple_strtoul(buf, NULL, 10) != 0); - trialcs = *cs; if (turning_on) set_bit(bit, &trialcs.flags); @@ -1247,44 +1232,66 @@ static ssize_t cpuset_common_file_write( case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + cgroup_unlock(); +out1: + kfree(buffer); + return retval; +} + +static int cpuset_write_uint(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + + switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: - retval = update_memory_pressure_enabled(cs, buffer); + cpuset_memory_pressure_enabled = val; break; case FILE_MEMORY_PRESSURE: retval = -EACCES; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + retval = update_flag(CS_SPREAD_PAGE, cs, val); cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + retval = 
update_flag(CS_SPREAD_SLAB, cs, val); cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; - goto out2; + break; } - - if (retval == 0) - retval = nbytes; -out2: cgroup_unlock(); -out1: - kfree(buffer); - return retval; + return -EINVAL; } /* @@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_CPU_EXCLUSIVE: - *s++ = is_cpu_exclusive(cs) ? '1' : '0'; - break; - case FILE_MEM_EXCLUSIVE: - *s++ = is_mem_exclusive(cs) ? '1' : '0'; - break; - case FILE_SCHED_LOAD_BALANCE:
[RFC][PATCH 4/7] CGroup API: Add res_counter_read_uint()
Adds a function for returning the value of a resource counter member, in a form suitable for use in a cgroup read_uint control file method. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- include/linux/res_counter.h |1 + kernel/res_counter.c|5 + 2 files changed, 6 insertions(+) Index: cgroupmap-2.6.24-mm1/include/linux/res_counter.h === --- cgroupmap-2.6.24-mm1.orig/include/linux/res_counter.h +++ cgroupmap-2.6.24-mm1/include/linux/res_counter.h @@ -54,6 +54,7 @@ struct res_counter { ssize_t res_counter_read(struct res_counter *counter, int member, const char __user *buf, size_t nbytes, loff_t *pos, int (*read_strategy)(unsigned long long val, char *s)); +u64 res_counter_read_uint(struct res_counter *counter, int member); ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *buf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *buf, unsigned long long *val)); Index: cgroupmap-2.6.24-mm1/kernel/res_counter.c === --- cgroupmap-2.6.24-mm1.orig/kernel/res_counter.c +++ cgroupmap-2.6.24-mm1/kernel/res_counter.c @@ -92,6 +92,11 @@ ssize_t res_counter_read(struct res_coun pos, buf, s - buf); } +u64 res_counter_read_uint(struct res_counter *counter, int member) +{ + return *res_counter_member(counter, member); +} + ssize_t res_counter_write(struct res_counter *counter, int member, const char __user *userbuf, size_t nbytes, loff_t *pos, int (*write_strategy)(char *st_buf, unsigned long long *val)) -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file
On Feb 16, 2008 2:07 AM, Balbir Singh <[EMAIL PROTECTED]> wrote: > Paul Menage wrote: > > Hi, Paul, > > Do we need to use a cgroup.api file? Why not keep up to date documentation and > get users to use that. I fear that, cgroup.api will not be kept up-to-date, > leading to confusion. The cgroup.api file isn't meant to give complete documentation for a control file, simply a brief indication of its usage. The aim is that most bits of the information reported in cgroup.api are auto-generated, so there shouldn't be problems with it getting out-of-date. Is it just the space used by the documentation string that you're objecting to? The other function of the file is to declare a type for each variable. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 0/7] CGroup API: More structured API for CGroups control files
On Feb 16, 2008 1:31 AM, Li Zefan <[EMAIL PROTECTED]> wrote: > > I don't quite catch what you mean. Cgoup does support write-only/read-only > files. For a write-only file, just set .write and .write_uint to be NULL, > similar for a read-only file. > > Do I miss something? > I suppose we could infer from the lack of any write handlers that we should give the file in the filesystem a mode of 444 rather than 644. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API
On Feb 16, 2008 7:29 PM, Paul Jackson <[EMAIL PROTECTED]> wrote: > > From: Paul Jackson <[EMAIL PROTECTED]> > > Strip all trailing whitespace (such as carriage returns) > when parsing integer writes to cgroup files, not just > one trailing newline if present. Sounds like a good idea to me. Thanks for this. > > Signed-off-by: Paul Jackson <[EMAIL PROTECTED]> > Cc: Paul Menage <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> > > --- > kernel/cgroup.c |5 + > 1 file changed, 1 insertion(+), 4 deletions(-) > > --- 2.6.24-mm1.orig/kernel/cgroup.c 2008-02-16 04:20:33.0 -0800 > +++ 2.6.24-mm1/kernel/cgroup.c 2008-02-16 19:00:41.207478218 -0800 > @@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_uint(struct > return -EFAULT; > > buffer[nbytes] = 0; /* nul-terminate */ > - > - /* strip newline if necessary */ > - if (nbytes && (buffer[nbytes-1] == '\n')) > - buffer[nbytes-1] = 0; > + strstrip(buffer); /* strip -just- trailing whitespace */ > val = simple_strtoull(buffer, &end, 0); > if (*end) > return -EINVAL; > > > -- > I won't rest till it's the best ... > Programmer, Linux Scalability > Paul Jackson <[EMAIL PROTECTED]> 1.940.382.4214 > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 7/7] CGroup API: Update cpusets to use cgroup structured file API
On Feb 17, 2008 9:28 AM, Paul Jackson <[EMAIL PROTECTED]> wrote: > > I'm figuring it would be easiest if you just threw this > little change into your hopper for the bigger changes > you're making OK, will do. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: Improve init/Kconfig help descriptions [PATCH 6/9]
On Feb 19, 2008 7:12 AM, Nick Andrew <[EMAIL PROTECTED]> wrote: > config CGROUPS > bool "Control Group support" > help > - This option will let you use process cgroup subsystems > - such as Cpusets > + Control Groups enables processes to be tracked and grouped > + into "cgroups". This enables you, for example, to associate > + cgroups with certain CPU sets using "cpusets". > > - Say N if unsure. > + When enabled, a new filesystem type "cgroup" is available > + and can be mounted to control cpusets. How about: ... cpusets and other resource/behaviour controllers. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file
On Feb 19, 2008 1:57 PM, Paul Jackson <[EMAIL PROTECTED]> wrote: > > Finally, it goes against the one thingie per file (at most, one scalar > vector) that has worked well for us when tried. Right, I like the idea of keeping things simple. But if you're going to accept that a vector is useful, then it seems reasonable that some other *simple* structured datatypes can be useful. An N-element key/value map (a la /proc/meminfo) is, I think, nicer than having to read values from N separate files. > > As to the motivations Paul M gives: > 1) Avoid "an arbitrary mess of ad-hoc APIs": > We can still do that, whether or not we "self-document" these > API's in this manner. We can, but this file makes it more clear what control files have a well-defined API and which are just returning some ad-hoc string. I guess it's not essential, I just figured that if we had that information, it made sense to make it available to userspace. I guess I'm happy with dropping the actual exposed cgroup.api file for now as long as we can work towards reducing the number of control files that just return strings, and make use of the structured output such as read_uint() more. > 2) binary APIs versus ASCII APIs: > Well, I have an ASCII API bias, not surprising. But I'd > suggest not doing things "in anticipation" of some future > fuzzy binary API support. Wait until that day actually arrives. I have a reasonably clear idea of how we can do the binary API. That's mostly for a separate RFC. But for example, reading a map via the binary API would be able to just return a list of values since the keys could be parsed once from the ascii map (provided that the subsystem guaranteed that the map keys and their order wouldn't change between reboots). > 3) The memory controller currently has files with the "_in_bytes": > The traditional way to handle this is Documentation and man > pages; good enough for my granddad, good enough for me ;). 
I've tried submitting patches to remove the in_bytes suffix and just rely on the documentation, and people didn't seem to like it ... Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file
On Feb 18, 2008 1:45 AM, Li Zefan <[EMAIL PROTECTED]> wrote: > > > > But we don't have /proc/proc.api or /sys/sysfs.api ... True. And /proc is a bit of a mess. Having a similar API file for sysfs sounds like a good idea to me. > > And is it better to describe the debug subsystem too? > Yes, probably, but that would be a separate patch to the debug subsystem itself, not the main cgroups code. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 3/7] cgroup: clean up cgroup.h
On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote: > - replace old name 'cont' with 'cgrp' (Paul Menage did this cleanup for > cgroup.c in commit bd89aabc6761de1c35b154fe6f914a445d301510) > - remove a duplicate declaration of cgroup_path() > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> > --- > include/linux/cgroup.h | 48 > +++- > 1 files changed, 23 insertions(+), 25 deletions(-) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index 2ebf7af..028ba3b 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -186,15 +186,15 @@ struct cftype { > char name[MAX_CFTYPE_NAME]; > int private; > int (*open) (struct inode *inode, struct file *file); > - ssize_t (*read) (struct cgroup *cont, struct cftype *cft, > + ssize_t (*read) (struct cgroup *cgrp, struct cftype *cft, > struct file *file, > char __user *buf, size_t nbytes, loff_t *ppos); > /* > * read_uint() is a shortcut for the common case of returning a > * single integer. Use it in place of read() > */ > - u64 (*read_uint) (struct cgroup *cont, struct cftype *cft); > - ssize_t (*write) (struct cgroup *cont, struct cftype *cft, > + u64 (*read_uint) (struct cgroup *cgrp, struct cftype *cft); > + ssize_t (*write) (struct cgroup *cgrp, struct cftype *cft, > struct file *file, > const char __user *buf, size_t nbytes, loff_t > *ppos); > > @@ -203,7 +203,7 @@ struct cftype { > * a single integer (as parsed by simple_strtoull) from > * userspace. Use in place of write(); return 0 or error. > */ > - int (*write_uint) (struct cgroup *cont, struct cftype *cft, u64 val); > + int (*write_uint) (struct cgroup *cgrp, struct cftype *cft, u64 val); > > int (*release) (struct inode *inode, struct file *file); > }; > @@ -218,41 +218,41 @@ struct cgroup_scanner { > > /* Add a new file to the given cgroup directory. 
Should only be > * called by subsystems from within a populate() method */ > -int cgroup_add_file(struct cgroup *cont, struct cgroup_subsys *subsys, > +int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, >const struct cftype *cft); > > /* Add a set of new files to the given cgroup directory. Should > * only be called by subsystems from within a populate() method */ > -int cgroup_add_files(struct cgroup *cont, > +int cgroup_add_files(struct cgroup *cgrp, > struct cgroup_subsys *subsys, > const struct cftype cft[], > int count); > > -int cgroup_is_removed(const struct cgroup *cont); > +int cgroup_is_removed(const struct cgroup *cgrp); > > -int cgroup_path(const struct cgroup *cont, char *buf, int buflen); > +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen); > > -int cgroup_task_count(const struct cgroup *cont); > +int cgroup_task_count(const struct cgroup *cgrp); > > /* Return true if the cgroup is a descendant of the current cgroup */ > -int cgroup_is_descendant(const struct cgroup *cont); > +int cgroup_is_descendant(const struct cgroup *cgrp); > > /* Control Group subsystem type. 
See Documentation/cgroups.txt for details */ > > struct cgroup_subsys { > struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss, > - struct cgroup *cont); > - void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cont); > - void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cont); > + struct cgroup *cgrp); > + void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); > + void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); > int (*can_attach)(struct cgroup_subsys *ss, > - struct cgroup *cont, struct task_struct *tsk); > - void (*attach)(struct cgroup_subsys *ss, struct cgroup *cont, > - struct cgroup *old_cont, struct task_struct *tsk); > + struct cgroup *cgrp, struct task_struct *tsk); > + void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, > + struct cgroup *old_cgrp, struct task_struct *tsk); > void (*fork)(struct cgroup_subsys *ss, struct task_struct *task); > void (*exit)(struct cgroup_subsys *ss, struct task_struct *task); > int (*populate)(struc
Re: [PATCH 5/7] cgroup: fix subsys bitops
On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote: > Cgroup uses unsigned long for subsys bitops, not unsigned long long. > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> > --- > kernel/cgroup.c |4 ++-- > 1 files changed, 2 insertions(+), 2 deletions(-) > > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index aa76bbd..e8c8e58 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -320,7 +320,7 @@ static struct css_set *find_existing_css_set( > /* Built the set of subsystem state objects that we want to > * see in the new css_set */ > for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > - if (root->subsys_bits & (1ull << i)) { > + if (root->subsys_bits & (1UL << i)) { > /* Subsystem is in this hierarchy. So we want > * the subsystem state from the new > * cgroup */ > @@ -696,7 +696,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, > added_bits = final_bits & ~root->actual_subsys_bits; > /* Check that any added subsystems are currently free */ > for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { > - unsigned long long bit = 1ull << i; > + unsigned long bit = 1UL << i; > struct cgroup_subsys *ss = subsys[i]; > if (!(bit & added_bits)) > continue; > -- > 1.5.4.rc3 > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 4/7] cgroup: fix memory leak in cgroup_get_sb()
On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote: > opts.release_agent is not kfree()ed in all necessary places. > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> Good catch, although hopefully something that would be extremely rare in practice. Thanks, Paul > --- > kernel/cgroup.c |5 - > 1 files changed, 4 insertions(+), 1 deletions(-) > > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index 0c35022..aa76bbd 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -961,8 +961,11 @@ static int cgroup_get_sb(struct file_system_type > *fs_type, > } > > root = kzalloc(sizeof(*root), GFP_KERNEL); > - if (!root) > + if (!root) { > + if (opts.release_agent) > + kfree(opts.release_agent); > return -ENOMEM; > + } > > init_cgroup_root(root); > root->subsys_bits = opts.subsys_bits; > -- > 1.5.4.rc3 > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 1/7] cgroup: fix and update documentation
On Feb 18, 2008 12:39 AM, Li Zefan <[EMAIL PROTECTED]> wrote: > Misc fixes and updates, make the doc consistent with current > cgroup implementation. > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> Thanks for these cleanups. Paul > --- > Documentation/cgroups.txt | 66 ++-- > 1 files changed, 33 insertions(+), 33 deletions(-) > > diff --git a/Documentation/cgroups.txt b/Documentation/cgroups.txt > index 42d7c4c..31d12e2 100644 > > --- a/Documentation/cgroups.txt > +++ b/Documentation/cgroups.txt > @@ -28,7 +28,7 @@ CONTENTS: > 4. Questions > > 1. Control Groups > -== > += > > 1.1 What are cgroups ? > -- > @@ -143,10 +143,10 @@ proliferation of such cgroups. > > Also lets say that the administrator would like to give enhanced network > access temporarily to a student's browser (since it is night and the user > -wants to do online gaming :) OR give one of the students simulation > +wants to do online gaming :)) OR give one of the students simulation > apps enhanced CPU power, > > -With ability to write pids directly to resource classes, its just a > +With ability to write pids directly to resource classes, it's just a > matter of : > > # echo pid > /mnt/network//tasks > @@ -227,10 +227,13 @@ Each cgroup is represented by a directory in the cgroup > file system > containing the following files describing that cgroup: > > - tasks: list of tasks (by pid) attached to that cgroup > - - notify_on_release flag: run /sbin/cgroup_release_agent on exit? > + - releasable flag: cgroup currently removeable? > + - notify_on_release flag: run the release agent on exit? > + - release_agent: the path to use for release notifications (this file > + exists in the top cgroup only) > > Other subsystems such as cpusets may add additional files in each > -cgroup dir > +cgroup dir. > > New cgroups are created using the mkdir system call or shell > command. The properties of a cgroup, such as its flags, are > @@ -257,7 +260,7 @@ performance. 
> To allow access from a cgroup to the css_sets (and hence tasks) > that comprise it, a set of cg_cgroup_link objects form a lattice; > each cg_cgroup_link is linked into a list of cg_cgroup_links for > -a single cgroup on its cont_link_list field, and a list of > +a single cgroup on its cgrp_link_list field, and a list of > cg_cgroup_links for a single css_set on its cg_link_list. > > Thus the set of tasks in a cgroup can be listed by iterating over > @@ -271,9 +274,6 @@ for cgroups, with a minimum of additional kernel code. > 1.4 What does notify_on_release do ? > > > -*** notify_on_release is disabled in the current patch set. It will be > -*** reactivated in a future patch in a less-intrusive manner > - > If the notify_on_release flag is enabled (1) in a cgroup, then > whenever the last task in the cgroup leaves (exits or attaches to > some other cgroup) and the last child cgroup of that cgroup > @@ -360,8 +360,8 @@ Now you want to do something with this cgroup. > > In this directory you can find several files: > # ls > -notify_on_release release_agent tasks > -(plus whatever files are added by the attached subsystems) > +notify_on_release releasable tasks > +(plus whatever files added by the attached subsystems) > > Now attach your shell to this cgroup: > # /bin/echo $$ > tasks > @@ -404,19 +404,13 @@ with a subsystem id which will be assigned by the > cgroup system. > Other fields in the cgroup_subsys object include: > > - subsys_id: a unique array index for the subsystem, indicating which > - entry in cgroup->subsys[] this subsystem should be > - managing. Initialized by cgroup_register_subsys(); prior to this > - it should be initialized to -1 > + entry in cgroup->subsys[] this subsystem should be managing. > > -- hierarchy: an index indicating which hierarchy, if any, this > - subsystem is currently attached to. 
If this is -1, then the > - subsystem is not attached to any hierarchy, and all tasks should be > - considered to be members of the subsystem's top_cgroup. It should > - be initialized to -1. > +- name: should be initialized to a unique subsystem name. Should be > + no longer than MAX_CGROUP_TYPE_NAMELEN. > > -- name: should be initialized to a unique subsystem name prior to > - calling cgroup_register_subsystem. Should be no longer than > - MAX_CGROUP_TYPE_NAMELEN > +- early_init: indicate if the subsystem needs early initialization > + at system boot. > > Each cgroup object created by the sy
Re: Improve init/Kconfig help descriptions [PATCH 6/9]
On Feb 19, 2008 6:54 PM, Nick Andrew <[EMAIL PROTECTED]> wrote: > > config CGROUPS > bool "Control Group support" > help > Control Groups enables processes to be tracked and grouped > into "cgroups". This enables you, for example, to associate > cgroups with certain CPU sets using "cpusets". > > When enabled, a new filesystem type "cgroup" is available > and can be mounted to control cpusets and other > resource/behaviour controllers. > > See for more information. > > If unsure, say N. > > > I don't think that description is as clear as it could be. From > the non-kernel-developer point of view, that is. Originally this wasn't a user-selectable config value, it was auto-selected by any subsystem that needed it. I think that was nicer from the user-experience, and it would eliminate the need for this documentation but there were concerns that this triggered unspecified brokenness in the Kbuild system. > > Re "other resource/behaviour controllers", what in particular? > I take it that our current controllers are cpusets, scheduler, > CPU accounting and Resource counters? Resource counters aren't a resource controller, they're a helper library. The others are good examples, as is the memory controller that's just been added to 2.6.25. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 6/7] cgroup: remove duplicate code in find_css_set()
On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote: > The list head res->tasks gets initialized twice in find_css_set(). > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> Acked-by: Paul Menage <[EMAIL PROTECTED]> > --- > kernel/cgroup.c |1 - > 1 files changed, 0 insertions(+), 1 deletions(-) > > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index e8c8e58..71cf961 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -473,7 +473,6 @@ static struct css_set *find_css_set( > /* Link this cgroup group into the list */ > list_add(&res->list, &init_css_set.list); > css_set_count++; > - INIT_LIST_HEAD(&res->tasks); > write_unlock(&css_set_lock); > > return res; > -- > 1.5.4.rc3 > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 2/7] cgroup: fix comments
On Feb 17, 2008 9:49 PM, Li Zefan <[EMAIL PROTECTED]> wrote: > fix: > - comments about need_forkexit_callback > - comments about release agent > - typo and comment style, etc. > > Signed-off-by: Li Zefan <[EMAIL PROTECTED]> > --- > include/linux/cgroup.h |2 +- > kernel/cgroup.c| 44 +--- > 2 files changed, 22 insertions(+), 24 deletions(-) > > diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h > index ff9055f..2ebf7af 100644 > --- a/include/linux/cgroup.h > +++ b/include/linux/cgroup.h > @@ -175,7 +175,7 @@ struct css_set { > * > * > * When reading/writing to a file: > - * - the cgroup to use in file->f_dentry->d_parent->d_fsdata > + * - the cgroup to use is file->f_dentry->d_parent->d_fsdata > * - the 'cftype' of the file is file->f_dentry->d_fsdata > */ > > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index 4766bb6..0c35022 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -113,9 +113,9 @@ static int root_count; > #define dummytop (&rootnode.top_cgroup) > > /* This flag indicates whether tasks in the fork and exit paths should > - * take callback_mutex and check for fork/exit handlers to call. This > - * avoids us having to do extra work in the fork/exit path if none of the > - * subsystems need to be called. > + * check for fork/exit handlers to call. This avoids us having to do > + * extra work in the fork/exit path if none of the subsystems need to > + * be called. > */ > static int need_forkexit_callback; > > @@ -507,8 +507,8 @@ static struct css_set *find_css_set( > * critical pieces of code here. The exception occurs on cgroup_exit(), > * when a task in a notify_on_release cgroup exits. Then cgroup_mutex > * is taken, and if the cgroup count is zero, a usermode call made > - * to /sbin/cgroup_release_agent with the name of the cgroup (path > - * relative to the root of cgroup file system) as the argument. > + * to the release agent with the name of the cgroup (path relative to > + * the root of cgroup file system) as the argument. 
> * > * A cgroup can only be deleted if both its 'count' of using tasks > * is zero, and its list of 'children' cgroups is empty. Since all > @@ -521,7 +521,7 @@ static struct css_set *find_css_set( > * > * The need for this exception arises from the action of > * cgroup_attach_task(), which overwrites one tasks cgroup pointer with > - * another. It does so using cgroup_mutexe, however there are > + * another. It does so using cgroup_mutex, however there are > * several performance critical places that need to reference > * task->cgroup without the expense of grabbing a system global > * mutex. Therefore except as noted below, when dereferencing or, as > @@ -1192,7 +1192,7 @@ static void get_first_subsys(const struct cgroup *cgrp, > * Attach task 'tsk' to cgroup 'cgrp' > * > * Call holding cgroup_mutex. May take task_lock of > - * the task 'pid' during call. > + * the task 'tsk' during call. > */ > int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) > { > @@ -1584,12 +1584,11 @@ static int cgroup_create_file(struct dentry *dentry, > int mode, > } > > /* I think that docbook-style function comments need /** at the start of the comment block. Thanks, Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 2/2] Cpusets API: Update cpusets to use cgroup structured file API
Many of the cpusets control files are simple integer values, which don't require the overhead of memory allocations for reads and writes. Move the handlers for these control files into cpuset_read_uint() and cpuset_write_uint(). This also has the advantage that the control files show up as "u64" rather than "string" in the cgroup.api file. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- kernel/cpuset.c | 156 +--- 1 file changed, 82 insertions(+), 74 deletions(-) Index: cpusets-2.6.25-rc2-mm1/kernel/cpuset.c === --- cpusets-2.6.25-rc2-mm1.orig/kernel/cpuset.c +++ cpusets-2.6.25-rc2-mm1/kernel/cpuset.c @@ -999,19 +999,6 @@ int current_cpuset_is_being_rebound(void } /* - * Call with cgroup_mutex held. - */ - -static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) -{ - if (simple_strtoul(buf, NULL, 10) != 0) - cpuset_memory_pressure_enabled = 1; - else - cpuset_memory_pressure_enabled = 0; - return 0; -} - -/* * update_flag - read a 0 or a 1 in a file and update associated flag * bit:the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, * CS_SCHED_LOAD_BALANCE, @@ -1023,15 +1010,13 @@ static int update_memory_pressure_enable * Call with cgroup_mutex held. 
*/ -static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) +static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, + int turning_on) { - int turning_on; struct cpuset trialcs; int err; int cpus_nonempty, balance_flag_changed; - turning_on = (simple_strtoul(buf, NULL, 10) != 0); - trialcs = *cs; if (turning_on) set_bit(bit, &trialcs.flags); @@ -1247,43 +1232,65 @@ static ssize_t cpuset_common_file_write( case FILE_MEMLIST: retval = update_nodemask(cs, buffer); break; + default: + retval = -EINVAL; + goto out2; + } + + if (retval == 0) + retval = nbytes; +out2: + cgroup_unlock(); +out1: + kfree(buffer); + return retval; +} + +static int cpuset_write_uint(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + cgroup_lock(); + + if (cgroup_is_removed(cgrp)) { + cgroup_unlock(); + return -ENODEV; + } + + switch (type) { case FILE_CPU_EXCLUSIVE: - retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); break; case FILE_MEM_EXCLUSIVE: - retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); break; case FILE_SCHED_LOAD_BALANCE: - retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); break; case FILE_MEMORY_MIGRATE: - retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); break; case FILE_MEMORY_PRESSURE_ENABLED: - retval = update_memory_pressure_enabled(cs, buffer); + cpuset_memory_pressure_enabled = !!val; break; case FILE_MEMORY_PRESSURE: retval = -EACCES; break; case FILE_SPREAD_PAGE: - retval = update_flag(CS_SPREAD_PAGE, cs, buffer); + retval = update_flag(CS_SPREAD_PAGE, cs, val); cs->mems_generation = cpuset_mems_generation++; break; case FILE_SPREAD_SLAB: - retval = update_flag(CS_SPREAD_SLAB, cs, buffer); + retval = 
update_flag(CS_SPREAD_SLAB, cs, val); cs->mems_generation = cpuset_mems_generation++; break; default: retval = -EINVAL; - goto out2; + break; } - - if (retval == 0) - retval = nbytes; -out2: cgroup_unlock(); -out1: - kfree(buffer); return retval; } @@ -1345,30 +1352,6 @@ static ssize_t cpuset_common_file_read(s case FILE_MEMLIST: s += cpuset_sprintf_memlist(s, cs); break; - case FILE_CPU_EXCLUSIVE: - *s++ = is_cpu_exclusive(cs) ? '1' : '0'; - break; - case FILE_MEM_EXCLUSIVE: - *s++ = is_mem_exclusive(cs) ? '1' : '0'; - break; - case FILE_SCHED_LOAD_BALANCE: -
[PATCH 1/2] Cpusets API: From: Paul Jackson <[EMAIL PROTECTED]>
Strip all trailing whitespace in cgroup_write_uint This removes the need for people to remember to pass the -n flag to echo when writing values to cgroup control files. Signed-off-by: Paul Menage <[EMAIL PROTECTED]> --- kernel/cgroup.c |5 + 1 file changed, 1 insertion(+), 4 deletions(-) Index: cpusets-2.6.25-rc2-mm1/kernel/cgroup.c === --- cpusets-2.6.25-rc2-mm1.orig/kernel/cgroup.c +++ cpusets-2.6.25-rc2-mm1/kernel/cgroup.c @@ -1321,10 +1321,7 @@ static ssize_t cgroup_write_uint(struct return -EFAULT; buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; + strstrip(buffer); val = simple_strtoull(buffer, &end, 0); if (*end) return -EINVAL; -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH 0/2] Cpusets API: Update Cpusets control files
This pair of patches simplifies the cpusets read/write path for the control files that consist of simple integers. -- -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 1/7] CGroup API: Add cgroup.api control file
On Feb 19, 2008 9:17 PM, Paul Jackson <[EMAIL PROTECTED]> wrote: > > Perhaps my primary concern with these *.api files was that I did not > understand who or what the critical use or user was; who found this > essential, not just nice to have. > Right now, no-one would find it essential. If/when a binary API is added, I guess I'll resurrect this part of the patchset. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH 0/2] cgroup map files: Add a key/value map file type to cgroups
On Feb 19, 2008 9:48 PM, YAMAMOTO Takashi <[EMAIL PROTECTED]> wrote: > > it changes the format from "%s %lld" to "%s: %llu", right? > why? > The colon for consistency with maps in /proc. I think it also makes it slightly more readable. For %lld versus %llu - I think that cgroup resource APIs are much more likely to need to report unsigned rather than signed values. In the case of the memory.stat file, that's certainly the case. But I guess there's an argument to be made that nothing's likely to need the final 64th bit of an unsigned value, whereas the ability to report negative numbers could potentially be useful for some cgroups. Paul -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/