Add support for tracking and exposing the reset capabilities of user mode queues across different IP blocks (GFX, Compute, SDMA).
These changes allow userspace to query the reset capabilities of user mode queues and ensure reset operations are only attempted when supported by the hardware and driver. Suggested-by: Alex Deucher <[email protected]> Signed-off-by: Jesse Zhang <[email protected]> --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 3 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 44 ++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 21 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 13 +++++++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 17 +++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 12 ++++++ drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 34 ++++++++++------- drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 24 ++++++++---- 9 files changed, 163 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index d0fb4eb1d7c4..48b21863065e 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1182,6 +1182,7 @@ struct amdgpu_device { * Value: struct amdgpu_usermode_queue */ struct xarray userq_doorbell_xa; + u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX]; /* df */ struct amdgpu_df df; @@ -1612,6 +1613,8 @@ struct dma_fence *amdgpu_device_enforce_isolation(struct amdgpu_device *adev, struct amdgpu_ring *ring, struct amdgpu_job *job); bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev); +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, + int ring_type); ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring); ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8480b72258f2..a0064c5314df 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct 
amdgpu_ring *ring) if (!ring || !ring->adev) return size; - if (amdgpu_device_should_recover_gpu(ring->adev)) + if (amdgpu_device_should_recover_gpu(ring->adev) && + unlikely(!ring->adev->debug_disable_gpu_ring_reset)) size |= AMDGPU_RESET_TYPE_FULL; if (unlikely(!ring->adev->debug_disable_soft_recovery) && @@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) return size; } +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int ring_type) +{ + ssize_t size = 0; + + if (!adev || !adev->userq_funcs[ring_type]) + return size; + + if (amdgpu_device_should_recover_gpu(adev) && + unlikely(!adev->debug_disable_gpu_ring_reset)) + size |= AMDGPU_RESET_TYPE_FULL; + + return size; +} + ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) { ssize_t size = 0; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 3d24f9cd750a..5597753ec61a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1826,6 +1826,32 @@ static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); } +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_GFX]); +} + +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]); +} + static DEVICE_ATTR(run_cleaner_shader, 0200, NULL, amdgpu_gfx_set_run_cleaner_shader); 
@@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444, static DEVICE_ATTR(compute_reset_mask, 0444, amdgpu_gfx_get_compute_reset_mask, NULL); +static DEVICE_ATTR(gfx_userq_reset_mask, 0444, + amdgpu_userq_get_gfx_reset_mask, NULL); + +static DEVICE_ATTR(compute_userq_reset_mask, 0444, + amdgpu_userq_get_compute_reset_mask, NULL); + static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev) { struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; @@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) return r; } + if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) { + r = device_create_file(adev->dev, &dev_attr_gfx_userq_reset_mask); + if (r) + return r; + } + + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) { + r = device_create_file(adev->dev, &dev_attr_compute_userq_reset_mask); + if (r) + return r; + } + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c index 8b8a04138711..2fb288b2bfc4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device *dev, static DEVICE_ATTR(sdma_reset_mask, 0444, amdgpu_get_sdma_reset_mask, NULL); +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->userq_supported_reset[AMDGPU_HW_IP_DMA]); +} +static DEVICE_ATTR(sdma_userq_reset_mask, 0444, + amdgpu_get_sdma_userq_reset_mask, NULL); + int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev) { int r = 0; @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev) return r; } + if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) { + r = device_create_file(adev->dev, &dev_attr_sdma_userq_reset_mask); + if (r) + 
return r; + } + return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c index 188de848c229..15ae72e2d679 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev) return userq_ip_mask; } +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev, + int ring_type, + int reset_type) +{ + if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX) + return false; + + return (adev->userq_supported_reset[ring_type] & reset_type) != 0; +} + static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev) { if (amdgpu_device_should_recover_gpu(adev)) { @@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr) int ring_type = queue_types[i]; const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type]; + if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE)) + continue; + if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 && funcs && funcs->detect_and_reset) { r = funcs->detect_and_reset(adev, ring_type); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 252517ce5d5a..82b7c365d720 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); adev->gfx.compute_supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] = + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX); + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] = + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 2): @@ -1824,12 +1829,24 @@ 
static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) !amdgpu_sriov_vf(adev)) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] && + adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] && + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE; } break; default: if (!amdgpu_sriov_vf(adev)) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] && + adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] && + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE; } break; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 35d5a7e99a7c..c5ac42a30789 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); adev->gfx.compute_supported_reset = amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] = + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX); + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] = + amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { case IP_VERSION(12, 0, 0): case IP_VERSION(12, 0, 1): @@ -1551,6 +1556,13 @@ static int 
gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) !amdgpu_sriov_vf(adev)) { adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_GFX] && + adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_GFX] |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] && + adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= AMDGPU_RESET_TYPE_PER_QUEUE; + } break; default: diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c index db6e41967f12..8850eaf8d2c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } - adev->sdma.supported_reset = - amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); - switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { - case IP_VERSION(6, 0, 0): - case IP_VERSION(6, 0, 2): - case IP_VERSION(6, 0, 3): - if ((adev->sdma.instance[0].fw_version >= 21) && - !amdgpu_sriov_vf(adev)) - adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; - break; - default: - break; - } if (amdgpu_sdma_ras_sw_init(adev)) { dev_err(adev->dev, "Failed to initialize sdma ras block!\n"); @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block *ip_block) break; } + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] = + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA); + + switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) { + case IP_VERSION(6, 0, 0): + case IP_VERSION(6, 0, 2): + case IP_VERSION(6, 0, 3): + if ((adev->sdma.instance[0].fw_version >= 21) && + !amdgpu_sriov_vf(adev)) { + adev->sdma.supported_reset |= 
AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] && + adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE; + + } + break; + default: + break; + } r = amdgpu_sdma_sysfs_reset_mask_init(adev); if (r) return r; diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c index 326ecc8d37d2..9de46ac8b1db 100644 --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) return r; } - adev->sdma.supported_reset = - amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); - if (!amdgpu_sriov_vf(adev)) - adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; - - r = amdgpu_sdma_sysfs_reset_mask_init(adev); - if (r) - return r; /* Allocate memory for SDMA IP Dump buffer */ ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), GFP_KERNEL); if (ptr) @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block *ip_block) break; } + adev->sdma.supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring); + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] = + amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA); + + if (!amdgpu_sriov_vf(adev)) { + adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + if (adev->userq_funcs[AMDGPU_HW_IP_DMA] && + adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset) + adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= AMDGPU_RESET_TYPE_PER_QUEUE; + + } + r = amdgpu_sdma_sysfs_reset_mask_init(adev); + if (r) + return r; + return r; } -- 2.49.0
