On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <[email protected]> wrote:
>
> This commit adds support for tracking and exposing the reset capabilities
> of user mode queues across different IP blocks (GFX, Compute, SDMA).
>
> These changes allow userspace to query the reset capabilities of user
> mode queues and ensure reset operations are only attempted when supported
> by the hardware and driver.
>
> Suggested-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
> drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++
> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 44 ++++++++++++++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 21 +++++++++++
> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 13 +++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 17 +++++++++
> drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 12 ++++++
> drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 34 ++++++++++-------
> drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 24 ++++++++----
> 9 files changed, 163 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d0fb4eb1d7c4..48b21863065e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1182,6 +1182,7 @@ struct amdgpu_device {
> * Value: struct amdgpu_usermode_queue
> */
> struct xarray userq_doorbell_xa;
> + u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];
I don't think we need a separate userq_supported_reset array. Just
use the existing reset masks. The same reset functionality is used in
both the kernel-queue and userq cases, so I don't see a reason to have
a separate tracker.
Alex
>
> /* df */
> struct amdgpu_df df;
> @@ -1612,6 +1613,8 @@ struct dma_fence
> *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
> struct amdgpu_ring *ring,
> struct amdgpu_job *job);
> bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> + int ring_type);
> ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
> ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8480b72258f2..a0064c5314df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> if (!ring || !ring->adev)
> return size;
>
> - if (amdgpu_device_should_recover_gpu(ring->adev))
> + if (amdgpu_device_should_recover_gpu(ring->adev) &&
> + unlikely(!ring->adev->debug_disable_gpu_ring_reset))
> size |= AMDGPU_RESET_TYPE_FULL;
>
> if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> @@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct
> amdgpu_ring *ring)
> return size;
> }
>
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int
> ring_type)
> +{
> + ssize_t size = 0;
> +
> + if (!adev || !adev->userq_funcs[ring_type])
> + return size;
> +
> + if (amdgpu_device_should_recover_gpu(adev) &&
> + unlikely(!adev->debug_disable_gpu_ring_reset))
> + size |= AMDGPU_RESET_TYPE_FULL;
> +
> + return size;
> +}
> +
> ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
> {
> ssize_t size = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 3d24f9cd750a..5597753ec61a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1826,6 +1826,32 @@ static ssize_t
> amdgpu_gfx_get_compute_reset_mask(struct device *dev,
> return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
> }
>
> +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf,
> adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> +}
> +
> +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf,
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> +}
> +
> static DEVICE_ATTR(run_cleaner_shader, 0200,
> NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
> static DEVICE_ATTR(compute_reset_mask, 0444,
> amdgpu_gfx_get_compute_reset_mask, NULL);
>
> +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> + amdgpu_userq_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> + amdgpu_userq_get_compute_reset_mask, NULL);
> +
> static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
> {
> struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> return r;
> }
>
> + if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> + r = device_create_file(adev->dev,
> &dev_attr_gfx_userq_reset_mask);
> + if (r)
> + return r;
> + }
> +
> + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> + r = device_create_file(adev->dev,
> &dev_attr_compute_userq_reset_mask);
> + if (r)
> + return r;
> + }
> +
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 8b8a04138711..2fb288b2bfc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device
> *dev,
> static DEVICE_ATTR(sdma_reset_mask, 0444,
> amdgpu_get_sdma_reset_mask, NULL);
>
> +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> + struct device_attribute *attr,
> + char *buf)
> +{
> + struct drm_device *ddev = dev_get_drvdata(dev);
> + struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> + if (!adev)
> + return -ENODEV;
> +
> + return amdgpu_show_reset_mask(buf,
> adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> +}
> +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> + amdgpu_get_sdma_userq_reset_mask, NULL);
> +
> int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
> {
> int r = 0;
> @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct
> amdgpu_device *adev)
> return r;
> }
>
> + if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> + r = device_create_file(adev->dev,
> &dev_attr_sdma_userq_reset_mask);
> + if (r)
> + return r;
> + }
> +
> return r;
> }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 188de848c229..15ae72e2d679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct
> amdgpu_device *adev)
> return userq_ip_mask;
> }
>
> +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> + int ring_type,
> + int reset_type)
> +{
> + if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> + return false;
> +
> + return (adev->userq_supported_reset[ring_type] & reset_type) != 0;
> +}
> +
> static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
> {
> if (amdgpu_device_should_recover_gpu(adev)) {
> @@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct
> amdgpu_userq_mgr *uq_mgr)
> int ring_type = queue_types[i];
> const struct amdgpu_userq_funcs *funcs =
> adev->userq_funcs[ring_type];
>
> + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
> AMDGPU_RESET_TYPE_PER_QUEUE))
> + continue;
> +
> if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
> funcs && funcs->detect_and_reset) {
> r = funcs->detect_and_reset(adev, ring_type);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 252517ce5d5a..82b7c365d720 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> adev->gfx.compute_supported_reset =
> amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> + amdgpu_userq_get_full_reset_mask(adev,
> AMDGPU_HW_IP_GFX);
> + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> +
> amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
> switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> case IP_VERSION(11, 0, 0):
> case IP_VERSION(11, 0, 2):
> @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> !amdgpu_sriov_vf(adev)) {
> adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> adev->gfx.gfx_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> + adev->userq_supported_reset[AMDGPU_HW_IP_GFX]
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> }
> break;
> default:
> if (!amdgpu_sriov_vf(adev)) {
> adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> adev->gfx.gfx_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> + adev->userq_supported_reset[AMDGPU_HW_IP_GFX]
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> }
> break;
> }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 35d5a7e99a7c..c5ac42a30789 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
> adev->gfx.compute_supported_reset =
> amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
> switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
> case IP_VERSION(12, 0, 0):
> case IP_VERSION(12, 0, 1):
> @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> !amdgpu_sriov_vf(adev)) {
> adev->gfx.compute_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> adev->gfx.gfx_supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> + adev->userq_supported_reset[AMDGPU_HW_IP_GFX]
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> }
> break;
> default:
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> index db6e41967f12..8850eaf8d2c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> return r;
> }
>
> - adev->sdma.supported_reset =
> - amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> - switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> - case IP_VERSION(6, 0, 0):
> - case IP_VERSION(6, 0, 2):
> - case IP_VERSION(6, 0, 3):
> - if ((adev->sdma.instance[0].fw_version >= 21) &&
> - !amdgpu_sriov_vf(adev))
> - adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> - break;
> - default:
> - break;
> - }
>
> if (amdgpu_sdma_ras_sw_init(adev)) {
> dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
> @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> break;
> }
>
> + adev->sdma.supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> + case IP_VERSION(6, 0, 0):
> + case IP_VERSION(6, 0, 2):
> + case IP_VERSION(6, 0, 3):
> + if ((adev->sdma.instance[0].fw_version >= 21) &&
> + !amdgpu_sriov_vf(adev)) {
> + adev->sdma.supported_reset |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> +
> adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> + adev->userq_supported_reset[AMDGPU_HW_IP_DMA]
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> + }
> + break;
> + default:
> + break;
> + }
> r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> if (r)
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> index 326ecc8d37d2..9de46ac8b1db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> return r;
> }
>
> - adev->sdma.supported_reset =
> - amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> - if (!amdgpu_sriov_vf(adev))
> - adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> -
> - r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> - if (r)
> - return r;
> /* Allocate memory for SDMA IP Dump buffer */
> ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t),
> GFP_KERNEL);
> if (ptr)
> @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block
> *ip_block)
> break;
> }
>
> + adev->sdma.supported_reset =
> + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> + if (!amdgpu_sriov_vf(adev)) {
> + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> + adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |=
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> + }
> + r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> + if (r)
> + return r;
> +
> return r;
> }
>
> --
> 2.49.0
>