[AMD Official Use Only - AMD Internal Distribution Only]
> -----Original Message-----
> From: amd-gfx <[email protected]> On Behalf Of Alex
> Deucher
> Sent: Thursday, October 30, 2025 2:14 AM
> To: Zhang, Jesse(Jie) <[email protected]>
> Cc: [email protected]; Deucher, Alexander
> <[email protected]>; Koenig, Christian <[email protected]>
> Subject: Re: [PATCH 2/3] drm/amdgpu: Add user queue reset mask support
>
> On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <[email protected]> wrote:
> >
> > This commit adds support for tracking and exposing the reset
> > capabilities of user mode queues across different IP blocks (GFX,
> > Compute, SDMA).
> >
> > These changes allow userspace to query the reset capabilities of user
> > mode queues and ensure reset operations are only attempted when
> > supported by the hardware and driver.
> >
> > Suggested-by: Alex Deucher <[email protected]>
> > Signed-off-by: Jesse Zhang <[email protected]>
> > ---
> > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 44
> ++++++++++++++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 21 +++++++++++
> > drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 13 +++++++
> > drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 17 +++++++++
> > drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 12 ++++++
> > drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 34 ++++++++++-------
> > drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 24 ++++++++----
> > 9 files changed, 163 insertions(+), 22 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > index d0fb4eb1d7c4..48b21863065e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> > @@ -1182,6 +1182,7 @@ struct amdgpu_device {
> > * Value: struct amdgpu_usermode_queue
> > */
> > struct xarray userq_doorbell_xa;
> > + u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
>
> I don't think we need a separate userq_supported_reset array. Just use the
> existing reset masks. We use the same functionality in both kernel and userq
> cases, so I don't see a reason to have a separate tracker.
[Zhang, Jesse(Jie)] Thanks Alex for reviewing.
I have a follow-up question about the sysfs interface for the user queue
reset mask. Should user queues share the existing kernel reset mask sysfs
files, or should we expose a separate sysfs file for the user queue reset mask?
For example:
/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_reset_mask
/sys/devices/pci0000:00/0000:00:01.1/0000:01:00.0/0000:02:00.0/0000:03:00.0/sdma_userq_reset_mask
Thanks
Jesse
>
> Alex
>
> >
> > /* df */
> > struct amdgpu_df df;
> > @@ -1612,6 +1613,8 @@ struct dma_fence
> *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
> > struct amdgpu_ring *ring,
> > struct amdgpu_job
> > *job); bool amdgpu_device_has_display_hardware(struct amdgpu_device
> > *adev);
> > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > + int ring_type);
> > ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> > ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > index 8480b72258f2..a0064c5314df 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> > @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> > if (!ring || !ring->adev)
> > return size;
> >
> > - if (amdgpu_device_should_recover_gpu(ring->adev))
> > + if (amdgpu_device_should_recover_gpu(ring->adev) &&
> > + unlikely(!ring->adev->debug_disable_gpu_ring_reset))
> > size |= AMDGPU_RESET_TYPE_FULL;
> >
> > if (unlikely(!ring->adev->debug_disable_soft_recovery) && @@
> > -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> > return size;
> > }
> >
> > +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> > +int ring_type) {
> > + ssize_t size = 0;
> > +
> > + if (!adev || !adev->userq_funcs[ring_type])
> > + return size;
> > +
> > + if (amdgpu_device_should_recover_gpu(adev) &&
> > + unlikely(!adev->debug_disable_gpu_ring_reset))
> > + size |= AMDGPU_RESET_TYPE_FULL;
> > +
> > + return size;
> > +}
> > +
> > ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
> > {
> > ssize_t size = 0;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 3d24f9cd750a..5597753ec61a 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -1826,6 +1826,32 @@ static ssize_t
> amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> > return amdgpu_show_reset_mask(buf,
> > adev->gfx.compute_supported_reset);
> > }
> >
> > +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> > + struct device_attribute
> > *attr,
> > + char *buf) {
> > + struct drm_device *ddev = dev_get_drvdata(dev);
> > + struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > + if (!adev)
> > + return -ENODEV;
> > +
> > + return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> > +}
> > +
> > +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> > + struct device_attribute
> > *attr,
> > + char *buf) {
> > + struct drm_device *ddev = dev_get_drvdata(dev);
> > + struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > + if (!adev)
> > + return -ENODEV;
> > +
> > + return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> > +}
> > +
> > static DEVICE_ATTR(run_cleaner_shader, 0200,
> > NULL, amdgpu_gfx_set_run_cleaner_shader);
> >
> > @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
> > static DEVICE_ATTR(compute_reset_mask, 0444,
> > amdgpu_gfx_get_compute_reset_mask, NULL);
> >
> > +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> > + amdgpu_userq_get_gfx_reset_mask, NULL);
> > +
> > +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> > + amdgpu_userq_get_compute_reset_mask, NULL);
> > +
> > static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev) {
> > struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1928,6
> > +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> > return r;
> > }
> >
> > + if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> > + r = device_create_file(adev->dev,
> > &dev_attr_gfx_userq_reset_mask);
> > + if (r)
> > + return r;
> > + }
> > +
> > + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> > + r = device_create_file(adev->dev,
> &dev_attr_compute_userq_reset_mask);
> > + if (r)
> > + return r;
> > + }
> > +
> > return r;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > index 8b8a04138711..2fb288b2bfc4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> > @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct
> > device *dev, static DEVICE_ATTR(sdma_reset_mask, 0444,
> > amdgpu_get_sdma_reset_mask, NULL);
> >
> > +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> > + struct device_attribute
> > *attr,
> > + char *buf) {
> > + struct drm_device *ddev = dev_get_drvdata(dev);
> > + struct amdgpu_device *adev = drm_to_adev(ddev);
> > +
> > + if (!adev)
> > + return -ENODEV;
> > +
> > + return amdgpu_show_reset_mask(buf,
> > +adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> > +}
> > +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> > + amdgpu_get_sdma_userq_reset_mask, NULL);
> > +
> > int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev) {
> > int r = 0;
> > @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> > return r;
> > }
> >
> > + if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> > + r = device_create_file(adev->dev,
> &dev_attr_sdma_userq_reset_mask);
> > + if (r)
> > + return r;
> > + }
> > +
> > return r;
> > }
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > index 188de848c229..15ae72e2d679 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> > @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct
> amdgpu_device *adev)
> > return userq_ip_mask;
> > }
> >
> > +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> > + int ring_type,
> > + int reset_type) {
> > + if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> > + return false;
> > +
> > + return (adev->userq_supported_reset[ring_type] & reset_type) !=
> > +0; }
> > +
> > static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) {
> > if (amdgpu_device_should_recover_gpu(adev)) { @@ -94,6 +104,9
> > @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr
> *uq_mgr)
> > int ring_type = queue_types[i];
> > const struct amdgpu_userq_funcs *funcs =
> > adev->userq_funcs[ring_type];
> >
> > + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
> AMDGPU_RESET_TYPE_PER_QUEUE))
> > + continue;
> > +
> > if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> > funcs && funcs->detect_and_reset) {
> > r = funcs->detect_and_reset(adev, ring_type);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > index 252517ce5d5a..82b7c365d720 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> > @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> > adev->gfx.compute_supported_reset =
> >
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > + amdgpu_userq_get_full_reset_mask(adev,
> AMDGPU_HW_IP_GFX);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > +
> > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > +
> > switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> > case IP_VERSION(11, 0, 0):
> > case IP_VERSION(11, 0, 2):
> > @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> > !amdgpu_sriov_vf(adev)) {
> > adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +
> > adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > }
> > break;
> > default:
> > if (!amdgpu_sriov_vf(adev)) {
> > adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +
> > adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > }
> > break;
> > }
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > index 35d5a7e99a7c..c5ac42a30789 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> > @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> > adev->gfx.compute_supported_reset =
> >
> > amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> > + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> > +
> > + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> > +
> > switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> > case IP_VERSION(12, 0, 0):
> > case IP_VERSION(12, 0, 1):
> > @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> > !amdgpu_sriov_vf(adev)) {
> > adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > adev->gfx.gfx_supported_reset |=
> > AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_GFX]-
> >detect_and_reset)
> > +
> > adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> > }
> > break;
> > default:
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > index db6e41967f12..8850eaf8d2c4 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> > @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> > return r;
> > }
> >
> > - adev->sdma.supported_reset =
> > -
> > amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > - switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > - case IP_VERSION(6, 0, 0):
> > - case IP_VERSION(6, 0, 2):
> > - case IP_VERSION(6, 0, 3):
> > - if ((adev->sdma.instance[0].fw_version >= 21) &&
> > - !amdgpu_sriov_vf(adev))
> > - adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > - break;
> > - default:
> > - break;
> > - }
> >
> > if (amdgpu_sdma_ras_sw_init(adev)) {
> > dev_err(adev->dev, "Failed to initialize sdma ras
> > block!\n"); @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> > break;
> > }
> >
> > + adev->sdma.supported_reset =
> > +
> > amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > + amdgpu_userq_get_full_reset_mask(adev,
> > + AMDGPU_HW_IP_DMA);
> > +
> > + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> > + case IP_VERSION(6, 0, 0):
> > + case IP_VERSION(6, 0, 2):
> > + case IP_VERSION(6, 0, 3):
> > + if ((adev->sdma.instance[0].fw_version >= 21) &&
> > + !amdgpu_sriov_vf(adev)) {
> > + adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_DMA]-
> >detect_and_reset)
> > +
> > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |=
> > + AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> > + }
> > + break;
> > + default:
> > + break;
> > + }
> > r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > if (r)
> > return r;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > index 326ecc8d37d2..9de46ac8b1db 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> > @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> > return r;
> > }
> >
> > - adev->sdma.supported_reset =
> > -
> > amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > - if (!amdgpu_sriov_vf(adev))
> > - adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > -
> > - r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > - if (r)
> > - return r;
> > /* Allocate memory for SDMA IP Dump buffer */
> > ptr = kcalloc(adev->sdma.num_instances * reg_count,
> > sizeof(uint32_t),
> GFP_KERNEL);
> > if (ptr)
> > @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct
> amdgpu_ip_block *ip_block)
> > break;
> > }
> >
> > + adev->sdma.supported_reset =
> > +
> > amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> > + amdgpu_userq_get_full_reset_mask(adev,
> > + AMDGPU_HW_IP_DMA);
> > +
> > + if (!amdgpu_sriov_vf(adev)) {
> > + adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> > + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> > + adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> > + adev->userq_supported_reset[AMDGPU_HW_IP_DMA]
> > + |= AMDGPU_RESET_TYPE_PER_QUEUE;
> > +
> > + }
> > + r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> > + if (r)
> > + return r;
> > +
> > return r;
> > }
> >
> > --
> > 2.49.0
> >