Alex, please make sure to include this one in your -fixes pull request.

Thanks,
Christian.

On 3/31/26 16:38, Alex Deucher wrote:
> Applied.  Thanks!
> 
> Alex
> 
> On Tue, Mar 31, 2026 at 10:29 AM Christian König
> <[email protected]> wrote:
>>
>>
>>
>> On 3/31/26 16:21, Mikhail Gavrilov wrote:
>>> Replace the PASID IDR + spinlock with XArray as noted in the TODO
>>> left by commit dccd79bb1c7f ("drm/amdgpu: fix the idr allocation
>>> flags").
>>>
>>> The IDR conversion still has an IRQ safety issue:
>>> amdgpu_pasid_free() can be called from hardirq context via the fence
>>> signal path, but amdgpu_pasid_idr_lock is taken with plain spin_lock()
>>> in process context, creating a potential deadlock:
>>>
>>>      CPU0
>>>      ----
>>>      spin_lock(&amdgpu_pasid_idr_lock)   // process context, IRQs on
>>>      <Interrupt>
>>>        spin_lock(&amdgpu_pasid_idr_lock) // deadlock
>>>
>>>    The hardirq call chain is:
>>>
>>>      sdma_v6_0_process_trap_irq
>>>       -> amdgpu_fence_process
>>>        -> dma_fence_signal
>>>         -> drm_sched_job_done
>>>          -> dma_fence_signal
>>>           -> amdgpu_pasid_free_cb
>>>            -> amdgpu_pasid_free
>>>
>>> Use XArray with XA_FLAGS_LOCK_IRQ (all xa operations use IRQ-safe
>>> locking internally) and XA_FLAGS_ALLOC1 (zero is not a valid PASID).
>>> Both xa_alloc_cyclic() and xa_erase() then handle locking
>>> consistently, fixing the IRQ safety issue and removing the need for
>>> an explicit spinlock.
>>>
>>> Suggested-by: Lijo Lazar <[email protected]>
>>> Fixes: e6d765de3d6b ("drm/amdgpu: prevent immediate PASID reuse case")
>>> Signed-off-by: Mikhail Gavrilov <[email protected]>
>>
>> Reviewed-by: Christian König <[email protected]>
>>
>>> ---
>>>
>>> v7: Rebased on amd-staging-drm-next which already includes
>>>     dccd79bb1c7f ("drm/amdgpu: fix the idr allocation flags").
>>>     Updated commit message to reflect that sleeping-under-spinlock
>>>     is already fixed and the xarray conversion now addresses the
>>>     remaining IRQ safety issue.  Inverted error check to
>>>     if (r < 0) return r; per Christian König.
>>> v6: Use DEFINE_XARRAY_FLAGS with XA_FLAGS_LOCK_IRQ | XA_FLAGS_ALLOC1
>>>     so all xa operations use IRQ-safe locking internally.  Drop
>>>     Cc: stable since the regression was never released to any stable
>>>     kernel. (Christian König)
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>> v5: Use explicit xa_lock_irqsave/__xa_erase for amdgpu_pasid_free()
>>>     since xa_erase() only uses plain xa_lock() which is not safe from
>>>     hardirq context.
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>> v4: Use xa_alloc_cyclic/xa_erase directly instead of explicit
>>>     xa_lock_irqsave, as suggested by Lijo Lazar.
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>> v3: Replace IDR with XArray instead of fixing the spinlock, as
>>>     suggested by Lijo Lazar.
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>> v2: Added second patch fixing the {HARDIRQ-ON-W} -> {IN-HARDIRQ-W}
>>>     lock inconsistency (spin_lock -> spin_lock_irqsave).
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>> v1: Fixed sleeping-under-spinlock (idr_alloc_cyclic with GFP_KERNEL)
>>>     using idr_preload/GFP_NOWAIT.
>>>     
>>> https://lore.kernel.org/all/[email protected]/
>>>
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 34 ++++++++++---------------
>>>  1 file changed, 13 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c 
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> index e495a8fa13fd..a6ac3b4ce0df 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>> @@ -22,7 +22,7 @@
>>>   */
>>>  #include "amdgpu_ids.h"
>>>
>>> -#include <linux/idr.h>
>>> +#include <linux/xarray.h>
>>>  #include <linux/dma-fence-array.h>
>>>
>>>
>>> @@ -40,8 +40,8 @@
>>>   * VMs are looked up from the PASID per amdgpu_device.
>>>   */
>>>
>>> -static DEFINE_IDR(amdgpu_pasid_idr);
>>> -static DEFINE_SPINLOCK(amdgpu_pasid_idr_lock);
>>> +static DEFINE_XARRAY_FLAGS(amdgpu_pasid_xa, XA_FLAGS_LOCK_IRQ | 
>>> XA_FLAGS_ALLOC1);
>>> +static u32 amdgpu_pasid_xa_next;
>>>
>>>  /* Helper to free pasid from a fence callback */
>>>  struct amdgpu_pasid_cb {
>>> @@ -62,22 +62,19 @@ struct amdgpu_pasid_cb {
>>>   */
>>>  int amdgpu_pasid_alloc(unsigned int bits)
>>>  {
>>> -     int pasid;
>>> +     u32 pasid;
>>> +     int r;
>>>
>>>       if (bits == 0)
>>>               return -EINVAL;
>>>
>>> -     spin_lock(&amdgpu_pasid_idr_lock);
>>> -     /* TODO: Need to replace the idr with an xarry, and then
>>> -      * handle the internal locking with ATOMIC safe paths.
>>> -      */
>>> -     pasid = idr_alloc_cyclic(&amdgpu_pasid_idr, NULL, 1,
>>> -                              1U << bits, GFP_ATOMIC);
>>> -     spin_unlock(&amdgpu_pasid_idr_lock);
>>> -
>>> -     if (pasid >= 0)
>>> -             trace_amdgpu_pasid_allocated(pasid);
>>> +     r = xa_alloc_cyclic(&amdgpu_pasid_xa, &pasid, xa_mk_value(0),
>>> +                         XA_LIMIT(1, (1U << bits) - 1),
>>> +                         &amdgpu_pasid_xa_next, GFP_KERNEL);
>>> +     if (r < 0)
>>> +             return r;
>>>
>>> +     trace_amdgpu_pasid_allocated(pasid);
>>>       return pasid;
>>>  }
>>>
>>> @@ -88,10 +85,7 @@ int amdgpu_pasid_alloc(unsigned int bits)
>>>  void amdgpu_pasid_free(u32 pasid)
>>>  {
>>>       trace_amdgpu_pasid_freed(pasid);
>>> -
>>> -     spin_lock(&amdgpu_pasid_idr_lock);
>>> -     idr_remove(&amdgpu_pasid_idr, pasid);
>>> -     spin_unlock(&amdgpu_pasid_idr_lock);
>>> +     xa_erase(&amdgpu_pasid_xa, pasid);
>>>  }
>>>
>>>  static void amdgpu_pasid_free_cb(struct dma_fence *fence,
>>> @@ -634,7 +628,5 @@ void amdgpu_vmid_mgr_fini(struct amdgpu_device *adev)
>>>   */
>>>  void amdgpu_pasid_mgr_cleanup(void)
>>>  {
>>> -     spin_lock(&amdgpu_pasid_idr_lock);
>>> -     idr_destroy(&amdgpu_pasid_idr);
>>> -     spin_unlock(&amdgpu_pasid_idr_lock);
>>> +     xa_destroy(&amdgpu_pasid_xa);
>>>  }
>>

Reply via email to