On Wed, Jul 13, 2016 at 11:33 AM, Tejun Heo <t...@kernel.org> wrote: > On Wed, Jul 13, 2016 at 02:21:02PM -0400, Tejun Heo wrote: >> One interesting thing to try would be replacing it with a regular >> non-percpu rwsem and see how it behaves. That should easily tell us >> whether this is from actual contention or artifacts from percpu_rwsem >> implementation. > > So, something like the following. Can you please see whether this > makes any difference?
It is making a difference. We need to conduct more tests, but it looks much better. I see only one 18.5 ms delay. Most of the time it is <200 us delay. > Thanks. > > diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h > index 5b17de6..bc1e4d8 100644 > --- a/include/linux/cgroup-defs.h > +++ b/include/linux/cgroup-defs.h > @@ -14,7 +14,7 @@ > #include <linux/mutex.h> > #include <linux/rcupdate.h> > #include <linux/percpu-refcount.h> > -#include <linux/percpu-rwsem.h> > +#include <linux/rwsem.h> > #include <linux/workqueue.h> > > #ifdef CONFIG_CGROUPS > @@ -518,7 +518,7 @@ struct cgroup_subsys { > unsigned int depends_on; > }; > > -extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem; > +extern struct rw_semaphore cgroup_threadgroup_rwsem; > > /** > * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups > @@ -529,7 +529,7 @@ extern struct percpu_rw_semaphore > cgroup_threadgroup_rwsem; > */ > static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk) > { > - percpu_down_read(&cgroup_threadgroup_rwsem); > + down_read(&cgroup_threadgroup_rwsem); > } > > /** > @@ -541,7 +541,7 @@ static inline void cgroup_threadgroup_change_begin(struct > task_struct *tsk) > */ > static inline void cgroup_threadgroup_change_end(struct task_struct *tsk) > { > - percpu_up_read(&cgroup_threadgroup_rwsem); > + up_read(&cgroup_threadgroup_rwsem); > } > > #else /* CONFIG_CGROUPS */ > diff --git a/kernel/cgroup.c b/kernel/cgroup.c > index 86cb5c6..ed9c142 100644 > --- a/kernel/cgroup.c > +++ b/kernel/cgroup.c > @@ -113,7 +113,7 @@ static DEFINE_SPINLOCK(cgroup_file_kn_lock); > */ > static DEFINE_SPINLOCK(release_agent_path_lock); > > -struct percpu_rw_semaphore cgroup_threadgroup_rwsem; > +DECLARE_RWSEM(cgroup_threadgroup_rwsem); > > #define cgroup_assert_mutex_or_rcu_locked() \ > RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ > @@ -2899,7 +2899,7 @@ static ssize_t __cgroup_procs_write(struct > kernfs_open_file *of, char *buf, > if (!cgrp) > 
return -ENODEV; > > - percpu_down_write(&cgroup_threadgroup_rwsem); > + down_write(&cgroup_threadgroup_rwsem); > rcu_read_lock(); > if (pid) { > tsk = find_task_by_vpid(pid); > @@ -2937,7 +2937,7 @@ static ssize_t __cgroup_procs_write(struct > kernfs_open_file *of, char *buf, > out_unlock_rcu: > rcu_read_unlock(); > out_unlock_threadgroup: > - percpu_up_write(&cgroup_threadgroup_rwsem); > + up_write(&cgroup_threadgroup_rwsem); > for_each_subsys(ss, ssid) > if (ss->post_attach) > ss->post_attach(); > @@ -3077,7 +3077,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) > > lockdep_assert_held(&cgroup_mutex); > > - percpu_down_write(&cgroup_threadgroup_rwsem); > + down_write(&cgroup_threadgroup_rwsem); > > /* look up all csses currently attached to @cgrp's subtree */ > spin_lock_bh(&css_set_lock); > @@ -3112,7 +3112,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) > ret = cgroup_taskset_migrate(&tset, cgrp->root); > out_finish: > cgroup_migrate_finish(&preloaded_csets); > - percpu_up_write(&cgroup_threadgroup_rwsem); > + up_write(&cgroup_threadgroup_rwsem); > return ret; > } > > @@ -5601,7 +5601,7 @@ int __init cgroup_init(void) > int ssid; > > BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); > - BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); > + //BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); > BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); > BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); >