Thanks, Felix. All issues have been fixed, and I will send out V2 soon.

Thanks
Lingshan

On 7/29/2025 3:48 AM, Felix Kuehling wrote:
> On 2025-07-24 22:43, Zhu Lingshan wrote:
>> This commit introduces a new id field for
>> struct kfd process, which helps identify
>> a kfd process among multiple contexts that
>> all belong to a single user space program.
>>
>> The sysfs entry of a secondary kfd process
>> is placed under the sysfs entry folder of
>> its primary kfd process.
>>
>> The naming format of the sysfs entry of a secondary
>> kfd process is "context_%u" where %u is the process id.
>>
>> Signed-off-by: Zhu Lingshan <lingshan....@amd.com>
>> ---
>>   drivers/gpu/drm/amd/amdkfd/kfd_priv.h    |  6 ++
>>   drivers/gpu/drm/amd/amdkfd/kfd_process.c | 82 +++++++++++++++++++++++-
>>   2 files changed, 85 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> index de701d72aa5c..a6e12c705734 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
>> @@ -995,6 +995,9 @@ struct kfd_process {
>>       /* Tracks debug per-vmid request for debug flags */
>>       u32 dbg_flags;
>>   +    /* kfd process id */
>> +    u16 id;
>> +
>>       atomic_t poison;
>>       /* Queues are in paused stated because we are in the process of
>> doing a CRIU checkpoint */
>>       bool queues_paused;
>> @@ -1009,6 +1012,9 @@ struct kfd_process {
>>         /* indicating whether this is a primary kfd_process */
>>       bool primary;
>> +
>> +    /* The primary kfd_process allocating IDs for its secondary
>> kfd_process, 0 for primary kfd_process */
>> +    struct ida id_table;
>>   };
>>     #define KFD_PROCESS_TABLE_SIZE 8 /* bits: 256 entries */
>> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> index 440fde75d1e4..e1ba9015bb83 100644
>> --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
>> @@ -54,6 +54,9 @@ DEFINE_MUTEX(kfd_processes_mutex);
>>     DEFINE_SRCU(kfd_processes_srcu);
>>   +#define KFD_PROCESS_ID_MIN 1
>> +#define KFD_PROCESS_ID_WIDTH 16
>> +
>>   /* For process termination handling */
>>   static struct workqueue_struct *kfd_process_wq;
>>   @@ -827,6 +830,7 @@ static void
>> kfd_process_device_destroy_ib_mem(struct kfd_process_device *pdd)
>>     int kfd_create_process_sysfs(struct kfd_process *process)
>>   {
>> +    struct kfd_process *primary_process;
>>       int ret;
>>         if (process->kobj) {
>> @@ -839,9 +843,22 @@ int kfd_create_process_sysfs(struct kfd_process
>> *process)
>>           pr_warn("Creating procfs kobject failed");
>>           return -ENOMEM;
>>       }
>> -    ret = kobject_init_and_add(process->kobj, &procfs_type,
>> -                   procfs.kobj, "%d",
>> -                   (int)process->lead_thread->pid);
>> +
>> +    if (process->primary)
>> +        ret = kobject_init_and_add(process->kobj, &procfs_type,
>> +                       procfs.kobj, "%d",
>> +                       (int)process->lead_thread->pid);
>> +    else {
>> +        primary_process =
>> kfd_lookup_process_by_mm(process->lead_thread->mm);
>> +        if (!primary_process)
>> +            return -EFAULT;
>
> EFAULT means "Bad address". A better error code would be ESRCH "No
> such process".
>
>
>> +
>> +        ret = kobject_init_and_add(process->kobj, &procfs_type,
>> +                       primary_process->kobj, "context_%u",
>> +                       process->id);
>> +        kfd_unref_process(primary_process);
>> +    }
>> +
>>       if (ret) {
>>           pr_warn("Creating procfs pid directory failed");
>>           kobject_put(process->kobj);
>> @@ -863,6 +880,51 @@ int kfd_create_process_sysfs(struct kfd_process
>> *process)
>>       return 0;
>>   }
>>   +static int kfd_process_alloc_id(struct kfd_process *process)
>> +{
>> +    u16 ret;
>> +    struct kfd_process *primary_process;
>> +
>> +    if (process->primary) {
>> +        process->id = 0;
>> +
>> +        return 0;
>> +    }
>> +
>> +    primary_process =
>> kfd_lookup_process_by_mm(process->lead_thread->mm);
>> +    if (!primary_process)
>> +        return -EFAULT;
>
> ESRCH
>
>
>> +
>> +    ret = ida_alloc_range(&primary_process->id_table,
>> KFD_PROCESS_ID_MIN,
>> +         1 << (KFD_PROCESS_ID_WIDTH - 1), GFP_KERNEL);
>
> Did you mean (1 << KFD_PROCESS_ID_WIDTH) - 1? That would give you the
> range from 1 to 0xffff, which is what I'd expect with 16-bit wide ID.
>
>
>> +    if (ret < 0)
>> +        return ret;
>
> You're leaking a process reference here.
>
>
>> +
>> +    process->id = ret;
>> +
>> +    kfd_unref_process(primary_process);
>> +
>> +    return 0;
>> +}
>> +
>> +static void kfd_process_free_id(struct kfd_process *process)
>> +{
>> +    struct kfd_process *primary_process;
>> +
>> +    if (process->primary)
>> +        return;
>> +
>> +    primary_process =
>> kfd_lookup_process_by_mm(process->lead_thread->mm);
>> +    if (!primary_process)
>> +        return;
>> +
>> +    ida_free(&primary_process->id_table, process->id);
>> +
>> +    kfd_unref_process(primary_process);
>> +
>> +    return;
>
> This return statement is unnecessary.
>
>
>> +}
>> +
>>   struct kfd_process *kfd_create_process(struct task_struct *thread)
>>   {
>>       struct kfd_process *process;
>> @@ -1193,6 +1255,11 @@ static void kfd_process_wq_release(struct
>> work_struct *work)
>>       if (ef)
>>           dma_fence_signal(ef);
>>   +    if (!p->primary)
>> +        kfd_process_free_id(p);
>> +    else
>> +        ida_destroy(&p->id_table);
>> +
>>       kfd_process_remove_sysfs(p);
>>       kfd_debugfs_remove_process(p);
>>   @@ -1549,6 +1616,12 @@ static struct kfd_process
>> *create_process(const struct task_struct *thread, bool
>>       process->queues_paused = false;
>>       process->primary = primary;
>>   +    err = kfd_process_alloc_id(process);
>> +    if (err) {
>> +        pr_err("Creating kfd process: failed to alloc an id\n");
>> +        goto err_alloc_process;
>
> That's the wrong label for cleanup. You'd end up leaking the process
> structure. You need to create a new label. See below.
>
>
>> +    }
>> +
>>       INIT_DELAYED_WORK(&process->eviction_work, evict_process_worker);
>>       INIT_DELAYED_WORK(&process->restore_work, restore_process_worker);
>>       process->last_restore_timestamp = get_jiffies_64();
>> @@ -1599,6 +1672,8 @@ static struct kfd_process *create_process(const
>> struct task_struct *thread, bool
>>               goto err_register_notifier;
>>           }
>>           BUG_ON(mn != &process->mmu_notifier);
>> +
>> +        ida_init(&process->id_table);
>>       }
>>         kfd_unref_process(process);
>> @@ -1619,6 +1694,7 @@ static struct kfd_process *create_process(const
>> struct task_struct *thread, bool
>>   err_process_pqm_init:
>>       kfd_event_free_process(process);
>>   err_event_init:
>> +    kfd_process_free_id(process);
>
> You should add a new label here
>
> err_alloc_id:
>
> Regards,
>   Felix
>
>
>>       mutex_destroy(&process->mutex);
>>       kfree(process);
>>   err_alloc_process:

Reply via email to