gfx11: Implement the GFX11 KGQ pipe reset

Alex Deucher Fri, 21 Feb 2025 14:46:11 -0800

On Fri, Feb 21, 2025 at 8:38 AM Prike Liang <prike.li...@amd.com> wrote:
>
> Implement the kernel graphics queue pipe reset,and the driver
> will fallback to pipe reset when the queue reset fails. However,
> the ME FW hasn't fully supported pipe reset yet so disable the
> KGQ pipe reset temporarily.
>
> Signed-off-by: Prike Liang <prike.li...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  2 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c    | 71 ++++++++++++++++++++++-
>  2 files changed, 71 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> index 4eedd92f000b..06fe21e15ed6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
> @@ -25,6 +25,8 @@
>
>  #include "amdgpu_socbb.h"
>
> +#define RS64_FW_UC_START_ADDR_LO 0x3000


Will this potentially change in future firmware versions or is it
fixed?  If it will change, then let's just read it back from registers
and store it in adev->gfx.rs64_fw_use_start_addr_lo or something so
that it will be correct if the user has a mix of GPUs in the system.
Other than those comments, the series looks good to me.

Alex

> +
>  struct common_firmware_header {
>         uint32_t size_bytes; /* size of the entire header+image(s) in bytes */
>         uint32_t header_size_bytes; /* size of just the header in bytes */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 2c7f0bb242ff..7e53c0b63f88 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -6653,6 +6653,68 @@ static void gfx_v11_0_emit_mem_sync(struct amdgpu_ring 
> *ring)
>         amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
>  }
>
> +static bool gfx_v11_pipe_reset_support(struct amdgpu_device *adev)
> +{
> +       /* Disable the pipe reset until the CPFW fully support it.*/
> +       dev_warn_once(adev->dev, "The CPFW hasn't support pipe reset yet.\n");

I think we can drop this warning.  It will just confuse users.  Maybe
make it debug if you want to keep it.

> +       return false;
> +}
> +
> +
> +static int gfx_v11_reset_gfx_pipe(struct amdgpu_ring *ring)
> +{
> +       struct amdgpu_device *adev = ring->adev;
> +       uint32_t reset_pipe = 0, clean_pipe = 0;
> +       int r;
> +
> +       if (!gfx_v11_pipe_reset_support(adev))
> +               return -EOPNOTSUPP;
> +
> +       gfx_v11_0_set_safe_mode(adev, 0);
> +       mutex_lock(&adev->srbm_mutex);
> +       soc21_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
> +
> +       switch (ring->pipe) {
> +       case 0:
> +               reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +                                          PFP_PIPE0_RESET, 1);
> +               reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +                                          ME_PIPE0_RESET, 1);
> +               clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +                                          PFP_PIPE0_RESET, 0);
> +               clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +                                          ME_PIPE0_RESET, 0);
> +               break;
> +       case 1:
> +               reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +                                          PFP_PIPE1_RESET, 1);
> +               reset_pipe = REG_SET_FIELD(reset_pipe, CP_ME_CNTL,
> +                                          ME_PIPE1_RESET, 1);
> +               clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +                                          PFP_PIPE1_RESET, 0);
> +               clean_pipe = REG_SET_FIELD(clean_pipe, CP_ME_CNTL,
> +                                          ME_PIPE1_RESET, 0);
> +               break;
> +       default:
> +               break;
> +       }
> +
> +       WREG32_SOC15(GC, 0, regCP_ME_CNTL, reset_pipe);
> +       WREG32_SOC15(GC, 0, regCP_ME_CNTL, clean_pipe);
> +
> +       r = (RREG32(SOC15_REG_OFFSET(GC, 0, regCP_GFX_RS64_INSTR_PNTR1)) << 
> 2) - RS64_FW_UC_START_ADDR_LO;
> +       soc21_grbm_select(adev, 0, 0, 0, 0);
> +       mutex_unlock(&adev->srbm_mutex);
> +       gfx_v11_0_unset_safe_mode(adev, 0);
> +
> +       dev_info(adev->dev,"The ring %s pipe reset to the ME firmware start 
> PC: %s\n", ring->name,
> +                       r == 0 ? "successfuly" : "failed");
> +       /* FIXME: Sometimes driver can't cache the ME firmware start PC 
> correctly, so the pipe reset status
> +        * relies on the later gfx ring test result.
> +        */
> +       return 0;
> +}
> +
>  static int gfx_v11_0_reset_kgq(struct amdgpu_ring *ring, unsigned int vmid)
>  {
>         struct amdgpu_device *adev = ring->adev;
> @@ -6662,8 +6724,13 @@ static int gfx_v11_0_reset_kgq(struct amdgpu_ring 
> *ring, unsigned int vmid)
>                 return -EINVAL;
>
>         r = amdgpu_mes_reset_legacy_queue(ring->adev, ring, vmid, false);
> -       if (r)
> -               return r;
> +       if (r) {
> +
> +               dev_warn(adev->dev,"reset via MES failed and try pipe reset 
> %d\n", r);
> +               r = gfx_v11_reset_gfx_pipe(ring);
> +               if (r)
> +                       return r;
> +       }
>
>         r = amdgpu_bo_reserve(ring->mqd_obj, false);
>         if (unlikely(r != 0)) {
> --
> 2.34.1
>

Re: [PATCH v2 1/4] drm/amdgpu/gfx11: Implement the GFX11 KGQ pipe reset

Reply via email to