Clarify the locking rules associated with file level internal variables inside the cpuset code. There is no functional change.
Signed-off-by: Waiman Long <[email protected]> --- kernel/cgroup/cpuset.c | 105 ++++++++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 44 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index c43efef7df71..d705c5ba64a7 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -61,6 +61,58 @@ static const char * const perr_strings[] = { [PERR_REMOTE] = "Have remote partition underneath", }; +/* + * CPUSET Locking Convention + * ------------------------- + * + * Below are the three global locks guarding cpuset structures in lock + * acquisition order: + * - cpu_hotplug_lock (cpus_read_lock/cpus_write_lock) + * - cpuset_mutex + * - callback_lock (raw spinlock) + * + * A task must hold all the three locks to modify externally visible or + * used fields of cpusets, though some of the internally used cpuset fields + * and internal variables can be modified without holding callback_lock. If only + * reliable read access of the externally used fields are needed, a task can + * hold either cpuset_mutex or callback_lock which are exposed to other + * external subsystems. + * + * If a task holds cpu_hotplug_lock and cpuset_mutex, it blocks others, + * ensuring that it is the only task able to also acquire callback_lock and + * be able to modify cpusets. It can perform various checks on the cpuset + * structure first, knowing nothing will change. It can also allocate memory + * without holding callback_lock. While it is performing these checks, various + * callback routines can briefly acquire callback_lock to query cpusets. Once + * it is ready to make the changes, it takes callback_lock, blocking everyone + * else. + * + * Calls to the kernel memory allocator cannot be made while holding + * callback_lock which is a spinlock, as the memory allocator may sleep or + * call back into cpuset code and acquire callback_lock. + * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. + * + * The cpuset_common_seq_show() handlers only hold callback_lock across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + */ + +static DEFINE_MUTEX(cpuset_mutex); + +/* + * File level internal variables below follow one of the following exclusion + * rules. + * + * RWCS: Read/write-able by holding both cpus_read_lock/cpus_write_lock and + * cpuset_mutex. + * + * CSCB: Readable by holding either cpuset_mutex or callback_lock. Writable + * by holding both cpuset_mutex and callback_lock. + */ + /* * For local partitions, update to subpartitions_cpus & isolated_cpus is done * in update_parent_effective_cpumask(). For remote partitions, it is done in @@ -70,19 +122,18 @@ static const char * const perr_strings[] = { * Exclusive CPUs distributed out to local or remote sub-partitions of * top_cpuset */ -static cpumask_var_t subpartitions_cpus; +static cpumask_var_t subpartitions_cpus; /* RWCS */ /* - * Exclusive CPUs in isolated partitions + * Exclusive CPUs in isolated partitions (shown in cpuset.cpus.isolated) */ -static cpumask_var_t isolated_cpus; +static cpumask_var_t isolated_cpus; /* CSCB */ /* - * isolated_cpus updating flag (protected by cpuset_mutex) - * Set if isolated_cpus is going to be updated in the current - * cpuset_mutex crtical section. + * Set if isolated_cpus is being updated in the current cpuset_mutex + * critical section. */ -static bool isolated_cpus_updating; +static bool isolated_cpus_updating; /* RWCS */ /* * A flag to force sched domain rebuild at the end of an operation. @@ -98,7 +149,7 @@ static bool isolated_cpus_updating; * Note that update_relax_domain_level() in cpuset-v1.c can still call * rebuild_sched_domains_locked() directly without using this flag. */ -static bool force_sd_rebuild; +static bool force_sd_rebuild; /* RWCS */ /* * Partition root states: @@ -218,42 +269,6 @@ struct cpuset top_cpuset = { .partition_root_state = PRS_ROOT, }; -/* - * There are two global locks guarding cpuset structures - cpuset_mutex and - * callback_lock. The cpuset code uses only cpuset_mutex. Other kernel - * subsystems can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset - * structures. Note that cpuset_mutex needs to be a mutex as it is used in - * paths that rely on priority inheritance (e.g. scheduler - on RT) for - * correctness. - * - * A task must hold both locks to modify cpusets. If a task holds - * cpuset_mutex, it blocks others, ensuring that it is the only task able to - * also acquire callback_lock and be able to modify cpusets. It can perform - * various checks on the cpuset structure first, knowing nothing will change. - * It can also allocate memory while just holding cpuset_mutex. While it is - * performing these checks, various callback routines can briefly acquire - * callback_lock to query cpusets. Once it is ready to make the changes, it - * takes callback_lock, blocking everyone else. - * - * Calls to the kernel memory allocator can not be made while holding - * callback_lock, as that would risk double tripping on callback_lock - * from one of the callbacks into the cpuset code from within - * __alloc_pages(). - * - * If a task is only holding callback_lock, then it has read-only - * access to cpusets. - * - * Now, the task_struct fields mems_allowed and mempolicy may be changed - * by other task, we use alloc_lock in the task_struct fields to protect - * them. - * - * The cpuset_common_seq_show() handlers only hold callback_lock across - * small pieces of code, such as when reading out possibly multi-word - * cpumasks and nodemasks. - */ - -static DEFINE_MUTEX(cpuset_mutex); - /** * cpuset_lock - Acquire the global cpuset mutex * @@ -1163,6 +1178,8 @@ static void reset_partition_data(struct cpuset *cs) static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus) { WARN_ON_ONCE(old_prs == new_prs); + lockdep_assert_held(&callback_lock); + lockdep_assert_held(&cpuset_mutex); if (new_prs == PRS_ISOLATED) cpumask_or(isolated_cpus, isolated_cpus, xcpus); else -- 2.52.0

