Hello. On Fri, May 02, 2025 at 01:32:53PM +0100, Tvrtko Ursulin <tvrtko.ursu...@igalia.com> wrote: > From: Tvrtko Ursulin <tvrtko.ursu...@intel.com> > > Similar to CPU and IO scheduling, implement a concept of weight in the DRM > cgroup controller. > > Individual drivers are now able to register with the controller which will > notify them of the relative scheduling weight for each open DRM client. > > The notifications are triggered on cgroup weight changes and DRM clients > appearing and disappearing in/from cgroups. Latter is done because it is > handy to ignore the groups with no DRM clients in relative weight > calculations. > > The notifications are also consolidated by using a delayed worker. > > On the userspace API level we use the same range and defaults as the CPU > controller - CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_DFL and CGROUP_WEIGHT_MAX. > > Signed-off-by: Tvrtko Ursulin <tvrtko.ursu...@igalia.com> > Cc: Michal Koutný <mkou...@suse.com> > Cc: Tejun Heo <t...@kernel.org> > --- > Documentation/admin-guide/cgroup-v2.rst | 22 ++ > include/linux/cgroup_drm.h | 2 + > kernel/cgroup/drm.c | 313 +++++++++++++++++++++++- > 3 files changed, 331 insertions(+), 6 deletions(-) > > diff --git a/Documentation/admin-guide/cgroup-v2.rst > b/Documentation/admin-guide/cgroup-v2.rst > index 1a16ce68a4d7..095b7dee0151 100644 > --- a/Documentation/admin-guide/cgroup-v2.rst > +++ b/Documentation/admin-guide/cgroup-v2.rst > @@ -2729,6 +2729,28 @@ HugeTLB Interface Files > hugetlb pages of <hugepagesize> in this cgroup. Only active in > use hugetlb pages are included. The per-node values are in bytes. > > +DRM > +--- > + > +The controller allows for configuring of scheduling weights of cgroups > relative > +to their siblings. > + > +NOTE: This is an optional feature into which individual DRM drivers need to > + opt-in if they want to support it. > + > +NOTE: Only single GPU systems will work as expected in the current > + implementation. 
> + > +DRM Interface Files > +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > + > + drm.weight > + A read-write single value file which exists on non-root cgroups. The > + default is "100".
Should this be akin to IO controller and have subkey granularity for individual devices? (With special 'default' entry.) (Can those devices come and go (hotplug)?) > + > + The weights are in the range [1, 10000] and specify the relative > + scheduling weights for cgroups in relation to their siblings. > + > Misc > ---- > > diff --git a/include/linux/cgroup_drm.h b/include/linux/cgroup_drm.h > index d6a9a2fbdbf5..9961003958b4 100644 > --- a/include/linux/cgroup_drm.h > +++ b/include/linux/cgroup_drm.h > @@ -6,6 +6,8 @@ > > #include <drm/drm_file.h> > > +#define DRM_CGROUP_WEIGHT_SHIFT 10 > + > #if IS_ENABLED(CONFIG_CGROUP_DRM) > void drmcgroup_client_open(struct drm_file *file_priv); > void drmcgroup_client_close(struct drm_file *file_priv); > diff --git a/kernel/cgroup/drm.c b/kernel/cgroup/drm.c > index ea7655edf86a..532702604786 100644 > --- a/kernel/cgroup/drm.c > +++ b/kernel/cgroup/drm.c > @@ -13,10 +13,18 @@ struct drm_cgroup_state { > struct cgroup_subsys_state css; > > struct list_head clients; > + unsigned int num_clients; /* Whole branch */ > + > + unsigned int sum_children_weights; > + > + unsigned int weight; > + unsigned int effective_weight; IIUC, this is weight normalized across siblings (and not scaled by ancestors)? It should be then sufficient to only notify the siblings of modified cgroup after the recalculation, shouldn't it? 
> }; > > struct drm_root_cgroup_state { > struct drm_cgroup_state drmcs; > + > + struct delayed_work notify_work; > }; > > static struct drm_root_cgroup_state root_drmcs = { > @@ -31,7 +39,7 @@ css_to_drmcs(struct cgroup_subsys_state *css) > return container_of(css, struct drm_cgroup_state, css); > } > > -static void __maybe_unused > +static void > drmcs_notify_weight(struct drm_cgroup_state *drmcs) > { > struct drm_file *fpriv; > @@ -43,16 +51,152 @@ drmcs_notify_weight(struct drm_cgroup_state *drmcs) > fpriv->minor->dev->driver->cg_ops; > > if (cg_ops && cg_ops->notify_weight) > - cg_ops->notify_weight(fpriv, 0); > + cg_ops->notify_weight(fpriv, drmcs->effective_weight); > } > } > > +static void drmcg_update_weights_locked(void) > +{ > + lockdep_assert_held(&drmcg_mutex); > + > + mod_delayed_work(system_wq, > + &root_drmcs.notify_work, > + usecs_to_jiffies(1000)); This value is a little bit magic. What is this consolidation good for? (I guess it's rather because of clients joining/leaving rather than cgroup attribute modifications.) 
> +} > + > +static void drmcg_update_weights(void) > +{ > + mutex_lock(&drmcg_mutex); > + drmcg_update_weights_locked(); > + mutex_unlock(&drmcg_mutex); > +} > + > +static u64 > +drmcs_read_weight(struct cgroup_subsys_state *css, struct cftype *cft) > +{ > + struct drm_cgroup_state *drmcs = css_to_drmcs(css); > + > + return drmcs->weight; > +} > + > +static int > +drmcs_write_weight(struct cgroup_subsys_state *css, struct cftype *cftype, > + u64 weight) > +{ > + struct drm_cgroup_state *drmcs = css_to_drmcs(css); > + int ret; > + > + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) > + return -ERANGE; > + > + ret = mutex_lock_interruptible(&drmcg_mutex); > + if (ret) > + return ret; > + drmcs->weight = weight; > + drmcg_update_weights_locked(); > + mutex_unlock(&drmcg_mutex); > + > + return 0; > +} > + > +static void notify_worker(struct work_struct *work) > +{ > + struct drm_cgroup_state *root = &root_drmcs.drmcs; > + struct cgroup_subsys_state *node; > + bool updated; > + > + mutex_lock(&drmcg_mutex); > + rcu_read_lock(); > + > + /* > + * Always come back later if we race with core cgroup management. 
> + */ > + updated = false; > + if (WARN_ON_ONCE(!css_tryget_online(&root->css))) > + goto out_unlock; > + > + css_for_each_descendant_post(node, &root->css) { > + struct drm_cgroup_state *drmcs = css_to_drmcs(node); > + > + if (!css_tryget_online(node)) > + goto out_put; > + > + drmcs->sum_children_weights = 0; > + css_put(node); > + } > + > + css_for_each_descendant_post(node, &root->css) { > + struct drm_cgroup_state *drmcs = css_to_drmcs(node); > + struct drm_cgroup_state *parent; > + > + if (!css_tryget_online(node)) > + goto out_put; > + if (!node->parent || !drmcs->num_clients) { > + css_put(node); > + continue; > + } > + if (!css_tryget_online(node->parent)) { > + css_put(node); > + goto out_put; > + } > + > + parent = css_to_drmcs(node->parent); > + parent->sum_children_weights += drmcs->weight; > + css_put(node); > + css_put(&parent->css); > + } > + > + css_for_each_descendant_pre(node, &root->css) { > + struct drm_cgroup_state *drmcs = css_to_drmcs(node); > + struct cgroup_subsys_state *css; > + > + if (!css_tryget_online(node)) > + goto out_put; > + if (!drmcs->num_clients) { > + css_put(node); > + continue; > + } > + > + css_for_each_child(css, &drmcs->css) { > + struct drm_cgroup_state *sibling = css_to_drmcs(css); > + > + if (!css_tryget_online(css)) { > + css_put(node); > + goto out_put; > + } > + if (!sibling->num_clients) { > + css_put(css); > + continue; > + } > + > + sibling->effective_weight = > + DIV_ROUND_CLOSEST(sibling->weight << > + DRM_CGROUP_WEIGHT_SHIFT, > + drmcs->sum_children_weights); > + drmcs_notify_weight(sibling); > + css_put(css); > + } > + > + css_put(node); > + } > + > + updated = true; > + > +out_put: > + css_put(&root->css); > +out_unlock: > + rcu_read_unlock(); > + > + if (!updated) > + drmcg_update_weights_locked(); > + > + mutex_unlock(&drmcg_mutex); > +} > + > static void drmcs_free(struct cgroup_subsys_state *css) > { > - struct drm_cgroup_state *drmcs = css_to_drmcs(css); > - > - if (drmcs != &root_drmcs.drmcs) > - 
kfree(drmcs); > + if (css != &root_drmcs.drmcs.css) > + kfree(css_to_drmcs(css)); > } > > static struct cgroup_subsys_state * > @@ -62,6 +206,7 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css) > > if (!parent_css) { > drmcs = &root_drmcs.drmcs; > + INIT_DELAYED_WORK(&root_drmcs.notify_work, notify_worker); > } else { > drmcs = kzalloc(sizeof(*drmcs), GFP_KERNEL); > if (!drmcs) > @@ -70,9 +215,147 @@ drmcs_alloc(struct cgroup_subsys_state *parent_css) > INIT_LIST_HEAD(&drmcs->clients); > } > > + drmcs->weight = CGROUP_WEIGHT_DFL; > + drmcs->effective_weight = (1 << DRM_CGROUP_WEIGHT_SHIFT) / 2; > + > return &drmcs->css; > } > > +static int drmcs_online(struct cgroup_subsys_state *css) > +{ > + drmcg_update_weights(); > + > + return 0; > +} > + > +static void drmcs_offline(struct cgroup_subsys_state *css) > +{ > + drmcg_update_weights(); > +} > + > +static struct drm_cgroup_state *old_drmcs; > + > +static int drmcs_can_attach(struct cgroup_taskset *tset) > +{ > + struct cgroup_subsys_state *css; > + struct task_struct *task; > + > + task = cgroup_taskset_first(tset, &css); > + old_drmcs = css_to_drmcs(task_css(task, drm_cgrp_id)); This is similar to cpuset's cpuset_attach_old_cs. Beware that when the controller is disabled, the migration happens from multiple (children) csses. > + > + return 0; > +} > + > +static void __inc_clients(struct drm_cgroup_state *drmcs) > +{ > + struct cgroup_subsys_state *parent = NULL; > + > + rcu_read_lock(); > + do { > + drmcs->num_clients++; > + WARN_ON_ONCE(!drmcs->num_clients); > + > + if (parent) > + css_put(parent); > + > + parent = drmcs->css.parent; > + if (parent) { > + if (WARN_ON_ONCE(!css_tryget(parent))) This should be ensured implicitly thanks to css::online_cnt. 
> + break; > + > + drmcs = css_to_drmcs(parent); > + } > + } while (parent); > + rcu_read_unlock(); > +} > + > +static void __dec_clients(struct drm_cgroup_state *drmcs) > +{ > + struct cgroup_subsys_state *parent = NULL; > + > + rcu_read_lock(); > + do { > + WARN_ON_ONCE(!drmcs->num_clients); > + drmcs->num_clients--; > + > + if (parent) > + css_put(parent); > + > + parent = drmcs->css.parent; > + if (parent) { > + if (WARN_ON_ONCE(!css_tryget(parent))) > + break; > + > + drmcs = css_to_drmcs(parent); > + } > + } while (parent); > + rcu_read_unlock(); > +} > + > +static void drmcs_attach(struct cgroup_taskset *tset) > +{ > + struct drm_cgroup_state *old = old_drmcs; > + struct cgroup_subsys_state *css; > + struct drm_file *fpriv, *next; > + struct drm_cgroup_state *new; > + struct task_struct *task; > + bool migrated = false; > + > + if (!old) > + return; > + > + task = cgroup_taskset_first(tset, &css); > + new = css_to_drmcs(task_css(task, drm_cgrp_id)); > + if (new == old) > + return; Beware here too, it'd be better to use css from cgroup_taskset_for_each(task, css, tset) below as the new css. 
> + > + mutex_lock(&drmcg_mutex); > + > + list_for_each_entry_safe(fpriv, next, &old->clients, clink) { > + cgroup_taskset_for_each(task, css, tset) { > + struct cgroup_subsys_state *old_css; > + struct drm_cgroup_state *old_; > + > + if (task->flags & PF_KTHREAD) > + continue; > + if (!thread_group_leader(task)) > + continue; Maybe cgroup_taskset_for_each_leader() > + > + new = css_to_drmcs(task_css(task, drm_cgrp_id)); > + if (WARN_ON_ONCE(new == old)) > + continue; > + > + if (rcu_access_pointer(fpriv->pid) != task_tgid(task)) > + continue; > + > + if (WARN_ON_ONCE(fpriv->__css != &old->css)) > + continue; > + > + old_css = fpriv->__css; > + old_ = css_to_drmcs(old_css); > + fpriv->__css = &new->css; > + css_get(fpriv->__css); > + list_move_tail(&fpriv->clink, &new->clients); > + __dec_clients(old); > + __inc_clients(new); > + css_put(old_css); > + migrated = true; > + } > + } > + > + if (migrated) > + drmcg_update_weights_locked(); > + > + mutex_unlock(&drmcg_mutex); > + > + old_drmcs = NULL; > +} > + > +static void drmcs_cancel_attach(struct cgroup_taskset *tset) > +{ > + old_drmcs = NULL; > +} > + > void drmcgroup_client_open(struct drm_file *file_priv) > { > struct drm_cgroup_state *drmcs; > @@ -85,6 +368,8 @@ void drmcgroup_client_open(struct drm_file *file_priv) > mutex_lock(&drmcg_mutex); > file_priv->__css = &drmcs->css; /* Keeps the reference. 
*/ > list_add_tail(&file_priv->clink, &drmcs->clients); > + __inc_clients(drmcs); > + drmcg_update_weights_locked(); > mutex_unlock(&drmcg_mutex); > } > EXPORT_SYMBOL_GPL(drmcgroup_client_open); > @@ -100,7 +385,9 @@ void drmcgroup_client_close(struct drm_file *file_priv) > > mutex_lock(&drmcg_mutex); > list_del(&file_priv->clink); > + __dec_clients(drmcs); > file_priv->__css = NULL; > + drmcg_update_weights_locked(); > mutex_unlock(&drmcg_mutex); > > css_put(&drmcs->css); > @@ -124,6 +411,9 @@ void drmcgroup_client_migrate(struct drm_file *file_priv) > if (src != dst) { > file_priv->__css = &dst->css; /* Keeps the reference. */ > list_move_tail(&file_priv->clink, &dst->clients); > + __dec_clients(src); > + __inc_clients(dst); > + drmcg_update_weights_locked(); > } > > mutex_unlock(&drmcg_mutex); > @@ -133,12 +423,23 @@ void drmcgroup_client_migrate(struct drm_file > *file_priv) > EXPORT_SYMBOL_GPL(drmcgroup_client_migrate); > > struct cftype files[] = { > + { > + .name = "weight", > + .flags = CFTYPE_NOT_ON_ROOT, > + .read_u64 = drmcs_read_weight, > + .write_u64 = drmcs_write_weight, > + }, > { } /* Zero entry terminates. */ > }; > > struct cgroup_subsys drm_cgrp_subsys = { > .css_alloc = drmcs_alloc, > .css_free = drmcs_free, > + .css_online = drmcs_online, > + .css_offline = drmcs_offline, > + .can_attach = drmcs_can_attach, > + .attach = drmcs_attach, > + .cancel_attach = drmcs_cancel_attach, > .early_init = false, > .legacy_cftypes = files, > .dfl_cftypes = files, > -- > 2.48.0 > Regards, Michal
signature.asc
Description: PGP signature