Thanks, Felix. All issues have been fixed, and I will send out V2 soon. Thanks Lingshan
On 7/29/2025 3:48 AM, Felix Kuehling wrote: > On 2025-07-24 22:43, Zhu Lingshan wrote: >> This commit introduces a new id field for >> struct kfd process, which helps identify >> a kfd process among multiple contexts that >> all belong to a single user space program. >> >> The sysfs entry of a secondary kfd process >> is placed under the sysfs entry folder of >> its primary kfd process. >> >> The naming format of the sysfs entry of a secondary >> kfd process is "context_%u" where %u is the process id. >> >> Signed-off-by: Zhu Lingshan <lingshan....@amd.com> >> --- >> drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 6 ++ >> drivers/gpu/drm/amd/amdkfd/kfd_process.c | 82 +++++++++++++++++++++++- >> 2 files changed, 85 insertions(+), 3 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> index de701d72aa5c..a6e12c705734 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h >> @@ -995,6 +995,9 @@ struct kfd_process { >> /* Tracks debug per-vmid request for debug flags */ >> u32 dbg_flags; >> + /* kfd process id */ >> + u16 id; >> + >> atomic_t poison; >> /* Queues are in paused stated because we are in the process of >> doing a CRIU checkpoint */ >> bool queues_paused; >> @@ -1009,6 +1012,9 @@ struct kfd_process { >> /* indicating whether this is a primary kfd_process */ >> bool primary; >> + >> + /* The primary kfd_process allocating IDs for its secondary >> kfd_process, 0 for primary kfd_process */ >> + struct ida id_table; >> }; >> #define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */ >> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> index 440fde75d1e4..e1ba9015bb83 100644 >> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c >> @@ -54,6 +54,9 @@ DEFINE_MUTEX(kfd_processes_mutex); >> DEFINE_SRCU(kfd_processes_srcu); >> +#define KFD_PROCESS_ID_MIN 1 >> +#define KFD_PROCESS_ID_WIDTH 16 >> + >> /* For process termination handling */ >> static struct workqueue_struct *kfd_process_wq; >> @@ -827,6 +830,7 @@ static void >> kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd) >> int kfd_create_process_sysfs(struct kfd_process *process) >> { >> + struct kfd_process *primary_process; >> int ret; >> if (process->kobj) { >> @@ -839,9 +843,22 @@ int kfd_create_process_sysfs(struct kfd_process >> *process) >> pr_warn("Creating procfs kobject failed"); >> return -ENOMEM; >> } >> - ret = kobject_init_and_add(process->kobj, &procfs_type, >> - procfs.kobj, "%d", >> - (int)process->lead_thread->pid); >> + >> + if (process->primary) >> + ret = kobject_init_and_add(process->kobj, &procfs_type, >> + procfs.kobj, "%d", >> + (int)process->lead_thread->pid); >> + else { >> + primary_process = >> kfd_lookup_process_by_mm(process->lead_thread->mm); >> + if (!primary_process) >> + return -EFAULT; > > EFAULT means "Bad address". A better error code would be ESRCH "No > such process". > > >> + >> + ret = kobject_init_and_add(process->kobj, &procfs_type, >> + primary_process->kobj, "context_%u", >> + process->id); >> + kfd_unref_process(primary_process); >> + } >> + >> if (ret) { >> pr_warn("Creating procfs pid directory failed"); >> kobject_put(process->kobj); >> @@ -863,6 +880,51 @@ int kfd_create_process_sysfs(struct kfd_process >> *process) >> return 0; >> } >> +static int kfd_process_alloc_id(struct kfd_process *process) >> +{ >> + u16 ret; >> + struct kfd_process *primary_process; >> + >> + if (process->primary) { >> + process->id = 0; >> + >> + return 0; >> + } >> + >> + primary_process = >> kfd_lookup_process_by_mm(process->lead_thread->mm); >> + if (!primary_process) >> + return -EFAULT; > > ESRCH > > >> + >> + ret = ida_alloc_range(&primary_process->id_table, >> KFD_PROCESS_ID_MIN, >> + 1 << (KFD_PROCESS_ID_WIDTH - 1), GFP_KERNEL); > > Did you mean (1 << KFD_PROCESS_ID_WIDTH) - 1? That would give you the > range from 1 to 0xffff, which is what I'd expect with 16-bit wide ID. > > >> + if (ret < 0) >> + return ret; > > You're leaking a process reference here. > > >> + >> + process->id = ret; >> + >> + kfd_unref_process(primary_process); >> + >> + return 0; >> +} >> + >> +static void kfd_process_free_id(struct kfd_process *process) >> +{ >> + struct kfd_process *primary_process; >> + >> + if (process->primary) >> + return; >> + >> + primary_process = >> kfd_lookup_process_by_mm(process->lead_thread->mm); >> + if (!primary_process) >> + return; >> + >> + ida_free(&primary_process->id_table, process->id); >> + >> + kfd_unref_process(primary_process); >> + >> + return; > > This return statement is unnecessary. > > >> +} >> + >> struct kfd_process *kfd_create_process(struct task_struct *thread) >> { >> struct kfd_process *process; >> @@ -1193,6 +1255,11 @@ static void kfd_process_wq_release(struct >> work_struct *work) >> if (ef) >> dma_fence_signal(ef); >> + if (!p->primary) >> + kfd_process_free_id(p); >> + else >> + ida_destroy(&p->id_table); >> + >> kfd_process_remove_sysfs(p); >> kfd_debugfs_remove_process(p); >> @@ -1549,6 +1616,12 @@ static struct kfd_process >> *create_process(const struct task_struct *thread, bool >> process->queues_paused = false; >> process->primary = primary; >> + err = kfd_process_alloc_id(process); >> + if (err) { >> + pr_err("Creating kfd process: failed to alloc an id\n"); >> + goto err_alloc_process; > > That's the wrong label for cleanup. You'd end up leaking the process > structure. You need to create a new label. See below. > > >> + } >> + >> INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker); >> INIT_DELAYED_WORK(&process->restore_work, restore_process_worker); >> process->last_restore_timestamp = get_jiffies_64(); >> @@ -1599,6 +1672,8 @@ static struct kfd_process *create_process(const >> struct task_struct *thread, bool >> goto err_register_notifier; >> } >> BUG_ON(mn != &process->mmu_notifier); >> + >> + ida_init(&process->id_table); >> } >> kfd_unref_process(process); >> @@ -1619,6 +1694,7 @@ static struct kfd_process *create_process(const >> struct task_struct *thread, bool >> err_process_pqm_init: >> kfd_event_free_process(process); >> err_event_init: >> + kfd_process_free_id(process); > > You should add a new label here > > err_alloc_id: > > Regards, > Felix > > >> mutex_destroy(&process->mutex); >> kfree(process); >> err_alloc_process: