On Fri, May 2, 2025 at 12:17 PM Christian König <ckoenig.leichtzumer...@gmail.com> wrote: > > Apply the same changes to gfx10 as done to gfx9. > > The general idea to reset the whole kernel queue and then asking the kiq > to map it again didn't work at all. Background is that we don't use per > application kernel queues for gfx10 on Linux for performance reasons. > > So instead use the gfx9 approach here as well and only reset all > submissions from a specific VMID instead of the whole queue. > > Navi 10 seems to be stable, but Navi 2x still shows hangs during overnight > testing. This needs more investigation, but the result is clearly > better than before.
For gfx 10.3, we enable the second gfx pipe as a high priority queue. Disabling that might help. Revert: commit b7a1a0ef12b81957584fef7b61e2d5ec049c7209 Author: Arunpravin Paneer Selvam <arunpravin.paneersel...@amd.com> Date: Mon Jun 6 13:59:13 2022 +0530 drm/amd/amdgpu: add pipe1 hardware support Enable pipe1 support starting from SIENNA CICHLID asic Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2117 Reviewed-by: Alex Deucher <alexander.deuc...@amd.com> Signed-off-by: Arunpravin Paneer Selvam <arunpravin.paneersel...@amd.com> Signed-off-by: ZhenGuo Yin <zhenguo....@amd.com> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > > Signed-off-by: Christian König <christian.koe...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 57 ++++++++------------------ > 1 file changed, 16 insertions(+), 41 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 75ea071744eb..41cc0d6db15b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -8746,7 +8746,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct > amdgpu_ring *ring) > int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX); > uint32_t seq = ring->fence_drv.sync_seq; > uint64_t addr = ring->fence_drv.gpu_addr; > + struct amdgpu_device *adev = ring->adev; > > + amdgpu_ring_emit_reg_wait(ring, > + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > + 0, 0xffff); > + amdgpu_ring_emit_wreg(ring, > + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > + 0); > + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, > + ring->fence_drv.sync_seq, > + AMDGPU_FENCE_FLAG_EXEC); > gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr), > upper_32_bits(addr), seq, 0xffffffff, 4); > } > @@ -9046,21 +9056,6 @@ static void > gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring, > ref, mask); > } > > -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring, > - unsigned int vmid) 
> -{ > - struct amdgpu_device *adev = ring->adev; > - uint32_t value = 0; > - > - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); > - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01); > - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1); > - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); > - amdgpu_gfx_rlc_enter_safe_mode(adev, 0); > - WREG32_SOC15(GC, 0, mmSQ_CMD, value); > - amdgpu_gfx_rlc_exit_safe_mode(adev, 0); > -} > - > static void > gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev, > uint32_t me, uint32_t pipe, > @@ -9529,38 +9524,21 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring > *ring, unsigned int vmid) > struct amdgpu_ring *kiq_ring = &kiq->ring; > unsigned long flags; > u32 tmp; > - u64 addr; > int r; > > if (amdgpu_sriov_vf(adev)) > return -EINVAL; > > - if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) > - return -EINVAL; > - > spin_lock_irqsave(&kiq->ring_lock, flags); > > - if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + > kiq->pmf->map_queues_size)) { > + if (amdgpu_ring_alloc(kiq_ring, 5)) { > spin_unlock_irqrestore(&kiq->ring_lock, flags); > return -ENOMEM; > } > > - addr = amdgpu_bo_gpu_offset(ring->mqd_obj) + > - offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active); > tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid); > - if (ring->pipe == 0) > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 << > ring->queue); > - else > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 << > ring->queue); > - > gfx_v10_0_ring_emit_wreg(kiq_ring, > SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > tmp); > - gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0, > - lower_32_bits(addr), upper_32_bits(addr), > - 0, 1, 0x20); > - gfx_v10_0_ring_emit_reg_wait(kiq_ring, > - SOC15_REG_OFFSET(GC, 0, > mmCP_VMID_RESET), 0, 0xffffffff); > - kiq->pmf->kiq_map_queues(kiq_ring, ring); > amdgpu_ring_commit(kiq_ring); > > spin_unlock_irqrestore(&kiq->ring_lock, flags); > @@ -9569,13 +9547,12 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring 
> *ring, unsigned int vmid) > if (r) > return r; > > - r = gfx_v10_0_kgq_init_queue(ring, true); > - if (r) { > - DRM_ERROR("fail to init kgq\n"); > - return r; > - } > + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7)) > + return -ENOMEM; > + gfx_v10_0_ring_emit_pipeline_sync(ring); > + amdgpu_ring_commit(ring); > > - return amdgpu_ring_test_ring(ring); > + return gfx_v10_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT); > } > > static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring, > @@ -9882,7 +9859,6 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_gfx = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kgq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > @@ -9923,7 +9899,6 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_compute = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, Same comment as the rest of the series for compute. Alex > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kcq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > -- > 2.34.1 >