On Fri, May 2, 2025 at 12:17 PM Christian König <ckoenig.leichtzumer...@gmail.com> wrote: > > Apply the same changes to gfx10 as done to gfx9. > > The general idea to reset the whole kernel queue and then asking the kiq > to map it again didn't work at all. Background is that we don't use per > application kernel queues for gfx10 on Linux for performance reasons. > > So instead use the gfx9 approach here as well and only reset all > submissions from a specific VMID instead of the whole queue. > > Navi 10 seems to be stable, but Navi 2x still shows hangs during overnight > testing. This needs more investigation, but the result is clearly > better than before.
For gfx 10.3, we enable the second gfx pipe as a high priority queue. Disabling that might help. Revert: commit b7a1a0ef12b81957584fef7b61e2d5ec049c7209 Author: Arunpravin Paneer Selvam <arunpravin.paneersel...@amd.com> Date: Mon Jun 6 13:59:13 2022 +0530 drm/amd/amdgpu: add pipe1 hardware support Enable pipe1 support starting from SIENNA CICHLID asic Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2117 Reviewed-by: Alex Deucher <alexander.deuc...@amd.com> Signed-off-by: Arunpravin Paneer Selvam <arunpravin.paneersel...@amd.com> Signed-off-by: ZhenGuo Yin <zhenguo....@amd.com> Signed-off-by: Alex Deucher <alexander.deuc...@amd.com> > > Signed-off-by: Christian König <christian.koe...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 57 ++++++++------------------ > 1 file changed, 16 insertions(+), 41 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > index 75ea071744eb..41cc0d6db15b 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c > @@ -8746,7 +8746,17 @@ static void gfx_v10_0_ring_emit_pipeline_sync(struct > amdgpu_ring *ring) > int usepfp = (ring->funcs->type == AMDGPU_RING_TYPE_GFX); > uint32_t seq = ring->fence_drv.sync_seq; > uint64_t addr = ring->fence_drv.gpu_addr; > + struct amdgpu_device *adev = ring->adev; > > + amdgpu_ring_emit_reg_wait(ring, > + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > + 0, 0xffff); > + amdgpu_ring_emit_wreg(ring, > + SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > + 0); > + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, > + ring->fence_drv.sync_seq, > + AMDGPU_FENCE_FLAG_EXEC); > gfx_v10_0_wait_reg_mem(ring, usepfp, 1, 0, lower_32_bits(addr), > upper_32_bits(addr), seq, 0xffffffff, 4); > } > @@ -9046,21 +9056,6 @@ static void > gfx_v10_0_ring_emit_reg_write_reg_wait(struct amdgpu_ring *ring, > ref, mask); > } > > -static void gfx_v10_0_ring_soft_recovery(struct amdgpu_ring *ring, > - unsigned int vmid) 
> -{ > - struct amdgpu_device *adev = ring->adev; > - uint32_t value = 0; > - > - value = REG_SET_FIELD(value, SQ_CMD, CMD, 0x03); > - value = REG_SET_FIELD(value, SQ_CMD, MODE, 0x01); > - value = REG_SET_FIELD(value, SQ_CMD, CHECK_VMID, 1); > - value = REG_SET_FIELD(value, SQ_CMD, VM_ID, vmid); > - amdgpu_gfx_rlc_enter_safe_mode(adev, 0); > - WREG32_SOC15(GC, 0, mmSQ_CMD, value); > - amdgpu_gfx_rlc_exit_safe_mode(adev, 0); > -} > - > static void > gfx_v10_0_set_gfx_eop_interrupt_state(struct amdgpu_device *adev, > uint32_t me, uint32_t pipe, > @@ -9529,38 +9524,21 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring > *ring, unsigned int vmid) > struct amdgpu_ring *kiq_ring = &kiq->ring; > unsigned long flags; > u32 tmp; > - u64 addr; > int r; > > if (amdgpu_sriov_vf(adev)) > return -EINVAL; > > - if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) > - return -EINVAL; > - > spin_lock_irqsave(&kiq->ring_lock, flags); > > - if (amdgpu_ring_alloc(kiq_ring, 5 + 7 + 7 + > kiq->pmf->map_queues_size)) { > + if (amdgpu_ring_alloc(kiq_ring, 5)) { > spin_unlock_irqrestore(&kiq->ring_lock, flags); > return -ENOMEM; > } > > - addr = amdgpu_bo_gpu_offset(ring->mqd_obj) + > - offsetof(struct v10_gfx_mqd, cp_gfx_hqd_active); > tmp = REG_SET_FIELD(0, CP_VMID_RESET, RESET_REQUEST, 1 << vmid); > - if (ring->pipe == 0) > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE0_QUEUES, 1 << > ring->queue); > - else > - tmp = REG_SET_FIELD(tmp, CP_VMID_RESET, PIPE1_QUEUES, 1 << > ring->queue); > - > gfx_v10_0_ring_emit_wreg(kiq_ring, > SOC15_REG_OFFSET(GC, 0, mmCP_VMID_RESET), > tmp); > - gfx_v10_0_wait_reg_mem(kiq_ring, 0, 1, 0, > - lower_32_bits(addr), upper_32_bits(addr), > - 0, 1, 0x20); > - gfx_v10_0_ring_emit_reg_wait(kiq_ring, > - SOC15_REG_OFFSET(GC, 0, > mmCP_VMID_RESET), 0, 0xffffffff); > - kiq->pmf->kiq_map_queues(kiq_ring, ring); > amdgpu_ring_commit(kiq_ring); > > spin_unlock_irqrestore(&kiq->ring_lock, flags); > @@ -9569,13 +9547,12 @@ static int gfx_v10_0_reset_kgq(struct amdgpu_ring 
> *ring, unsigned int vmid) > if (r) > return r; > > - r = gfx_v10_0_kgq_init_queue(ring, true); > - if (r) { > - DRM_ERROR("fail to init kgq\n"); > - return r; > - } > + if (amdgpu_ring_alloc(ring, 7 + 7 + 5 + 7)) > + return -ENOMEM; > + gfx_v10_0_ring_emit_pipeline_sync(ring); > + amdgpu_ring_commit(ring); > > - return amdgpu_ring_test_ring(ring); > + return gfx_v10_0_ring_test_ib(ring, AMDGPU_QUEUE_RESET_TIMEOUT); > } > > static int gfx_v10_0_reset_kcq(struct amdgpu_ring *ring, > @@ -9882,7 +9859,6 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_gfx = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kgq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > @@ -9923,7 +9899,6 @@ static const struct amdgpu_ring_funcs > gfx_v10_0_ring_funcs_compute = { > .emit_wreg = gfx_v10_0_ring_emit_wreg, > .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait, > .emit_reg_write_reg_wait = gfx_v10_0_ring_emit_reg_write_reg_wait, > - .soft_recovery = gfx_v10_0_ring_soft_recovery, Same comment as the rest of the series for compute. Alex > .emit_mem_sync = gfx_v10_0_emit_mem_sync, > .reset = gfx_v10_0_reset_kcq, > .emit_cleaner_shader = gfx_v10_0_ring_emit_cleaner_shader, > -- > 2.34.1 >