At least from the coding-style and backward-compatibility point of view this looks sane to me, so feel free to add an Acked-by.

But I absolutely can't judge if that is correct from the hardware point of view or not.

And I think that review by somebody else familiar with the hardware is mandatory before this can be committed.

Christian.

Am 28.01.19 um 18:25 schrieb Marek Olšák:
Ping

On Tue, Jan 22, 2019 at 3:05 PM Marek Olšák <mar...@gmail.com <mailto:mar...@gmail.com>> wrote:

    From: Marek Olšák <marek.ol...@amd.com <mailto:marek.ol...@amd.com>>

    I'm not increasing the DRM version because GDS isn't totally
    without bugs yet.

    v2: update emit_ib_size

    Signed-off-by: Marek Olšák <marek.ol...@amd.com
    <mailto:marek.ol...@amd.com>>
    ---
     drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h |  2 ++
     drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c   | 19 +++++++++++-
     drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   | 21 +++++++++++--
     drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 40
    +++++++++++++++++++++++--
     include/uapi/drm/amdgpu_drm.h           |  5 ++++
     5 files changed, 82 insertions(+), 5 deletions(-)

    diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
    b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
    index ecbcefe49a98..f89f5734d985 100644
    --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
    +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gds.h
    @@ -30,20 +30,22 @@ struct amdgpu_bo;
     struct amdgpu_gds_asic_info {
            uint32_t        total_size;
            uint32_t        gfx_partition_size;
            uint32_t        cs_partition_size;
     };

     struct amdgpu_gds {
            struct amdgpu_gds_asic_info     mem;
            struct amdgpu_gds_asic_info     gws;
            struct amdgpu_gds_asic_info     oa;
    +       uint32_t gds_compute_max_wave_id;
    +
            /* At present, GDS, GWS and OA resources for gfx (graphics)
             * is always pre-allocated and available for graphics
    operation.
             * Such resource is shared between all gfx clients.
             * TODO: move this operation to user space
             * */
            struct amdgpu_bo*               gds_gfx_bo;
            struct amdgpu_bo*               gws_gfx_bo;
            struct amdgpu_bo*               oa_gfx_bo;
     };

    diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
    b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
    index 7984292f9282..a59e0fdf5a97 100644
    --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
    +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
    @@ -2257,20 +2257,36 @@ static void
    gfx_v7_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
     }

     static void gfx_v7_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
                                              struct amdgpu_job *job,
                                              struct amdgpu_ib *ib,
                                              uint32_t flags)
     {
            unsigned vmid = AMDGPU_JOB_GET_VMID(job);
            u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
    (vmid << 24);

    +       /* Currently, there is a high possibility to get wave ID
    mismatch
    +        * between ME and GDS, leading to a hw deadlock, because
    ME generates
    +        * different wave IDs than the GDS expects. This situation
    happens
    +        * randomly when at least 5 compute pipes use GDS ordered
    append.
    +        * The wave IDs generated by ME are also wrong after
    suspend/resume.
    +        * Those are probably bugs somewhere else in the kernel
    driver.
    +        *
    +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
    in ME and
    +        * GDS to 0 for this ring (me/pipe).
    +        */
    +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
    +               amdgpu_ring_write(ring,
    PACKET3(PACKET3_SET_CONFIG_REG, 1));
    +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID
    - PACKET3_SET_CONFIG_REG_START);
    +               amdgpu_ring_write(ring,
    ring->adev->gds.gds_compute_max_wave_id);
    +       }
    +
            amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
            amdgpu_ring_write(ring,
     #ifdef __BIG_ENDIAN
                                              (2 << 0) |
     #endif
                                              (ib->gpu_addr &
    0xFFFFFFFC));
            amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
            amdgpu_ring_write(ring, control);
     }

    @@ -4993,21 +5009,21 @@ static const struct amdgpu_ring_funcs
    gfx_v7_0_ring_funcs_compute = {
            .get_rptr = gfx_v7_0_ring_get_rptr,
            .get_wptr = gfx_v7_0_ring_get_wptr_compute,
            .set_wptr = gfx_v7_0_ring_set_wptr_compute,
            .emit_frame_size =
                    20 + /* gfx_v7_0_ring_emit_gds_switch */
                    7 + /* gfx_v7_0_ring_emit_hdp_flush */
                    5 + /* hdp invalidate */
                    7 + /* gfx_v7_0_ring_emit_pipeline_sync */
                    CIK_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
    gfx_v7_0_ring_emit_vm_flush */
                    7 + 7 + 7, /* gfx_v7_0_ring_emit_fence_compute x3
    for user fence, vm fence */
    -       .emit_ib_size = 4, /* gfx_v7_0_ring_emit_ib_compute */
    +       .emit_ib_size = 7, /* gfx_v7_0_ring_emit_ib_compute */
            .emit_ib = gfx_v7_0_ring_emit_ib_compute,
            .emit_fence = gfx_v7_0_ring_emit_fence_compute,
            .emit_pipeline_sync = gfx_v7_0_ring_emit_pipeline_sync,
            .emit_vm_flush = gfx_v7_0_ring_emit_vm_flush,
            .emit_gds_switch = gfx_v7_0_ring_emit_gds_switch,
            .emit_hdp_flush = gfx_v7_0_ring_emit_hdp_flush,
            .test_ring = gfx_v7_0_ring_test_ring,
            .test_ib = gfx_v7_0_ring_test_ib,
            .insert_nop = amdgpu_ring_insert_nop,
            .pad_ib = amdgpu_ring_generic_pad_ib,
    @@ -5050,20 +5066,21 @@ static void gfx_v7_0_set_irq_funcs(struct
    amdgpu_device *adev)
            adev->gfx.priv_inst_irq.num_types = 1;
            adev->gfx.priv_inst_irq.funcs = &gfx_v7_0_priv_inst_irq_funcs;
     }

     static void gfx_v7_0_set_gds_init(struct amdgpu_device *adev)
     {
            /* init asci gds info */
            adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
            adev->gds.gws.total_size = 64;
            adev->gds.oa.total_size = 16;
    +       adev->gds.gds_compute_max_wave_id =
    RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);

            if (adev->gds.mem.total_size == 64 * 1024) {
                    adev->gds.mem.gfx_partition_size = 4096;
                    adev->gds.mem.cs_partition_size = 4096;

                    adev->gds.gws.gfx_partition_size = 4;
                    adev->gds.gws.cs_partition_size = 4;

                    adev->gds.oa.gfx_partition_size = 4;
                    adev->gds.oa.cs_partition_size = 1;
    diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
    b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
    index a26747681ed6..b8e50a34bdb3 100644
    --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
    +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
    @@ -6077,20 +6077,36 @@ static void
    gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
     }

     static void gfx_v8_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
                                              struct amdgpu_job *job,
                                              struct amdgpu_ib *ib,
                                              uint32_t flags)
     {
            unsigned vmid = AMDGPU_JOB_GET_VMID(job);
            u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
    (vmid << 24);

    +       /* Currently, there is a high possibility to get wave ID
    mismatch
    +        * between ME and GDS, leading to a hw deadlock, because
    ME generates
    +        * different wave IDs than the GDS expects. This situation
    happens
    +        * randomly when at least 5 compute pipes use GDS ordered
    append.
    +        * The wave IDs generated by ME are also wrong after
    suspend/resume.
    +        * Those are probably bugs somewhere else in the kernel
    driver.
    +        *
    +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
    in ME and
    +        * GDS to 0 for this ring (me/pipe).
    +        */
    +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
    +               amdgpu_ring_write(ring,
    PACKET3(PACKET3_SET_CONFIG_REG, 1));
    +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID
    - PACKET3_SET_CONFIG_REG_START);
    +               amdgpu_ring_write(ring,
    ring->adev->gds.gds_compute_max_wave_id);
    +       }
    +
            amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
            amdgpu_ring_write(ring,
     #ifdef __BIG_ENDIAN
                                    (2 << 0) |
     #endif
                                    (ib->gpu_addr & 0xFFFFFFFC));
            amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr) & 0xFFFF);
            amdgpu_ring_write(ring, control);
     }

    @@ -6883,21 +6899,21 @@ static const struct amdgpu_ring_funcs
    gfx_v8_0_ring_funcs_compute = {
            .get_rptr = gfx_v8_0_ring_get_rptr,
            .get_wptr = gfx_v8_0_ring_get_wptr_compute,
            .set_wptr = gfx_v8_0_ring_set_wptr_compute,
            .emit_frame_size =
                    20 + /* gfx_v8_0_ring_emit_gds_switch */
                    7 + /* gfx_v8_0_ring_emit_hdp_flush */
                    5 + /* hdp_invalidate */
                    7 + /* gfx_v8_0_ring_emit_pipeline_sync */
                    VI_FLUSH_GPU_TLB_NUM_WREG * 5 + 7 + /*
    gfx_v8_0_ring_emit_vm_flush */
                    7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_compute x3
    for user fence, vm fence */
    -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
    +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
            .emit_ib = gfx_v8_0_ring_emit_ib_compute,
            .emit_fence = gfx_v8_0_ring_emit_fence_compute,
            .emit_pipeline_sync = gfx_v8_0_ring_emit_pipeline_sync,
            .emit_vm_flush = gfx_v8_0_ring_emit_vm_flush,
            .emit_gds_switch = gfx_v8_0_ring_emit_gds_switch,
            .emit_hdp_flush = gfx_v8_0_ring_emit_hdp_flush,
            .test_ring = gfx_v8_0_ring_test_ring,
            .test_ib = gfx_v8_0_ring_test_ib,
            .insert_nop = amdgpu_ring_insert_nop,
            .pad_ib = amdgpu_ring_generic_pad_ib,
    @@ -6913,21 +6929,21 @@ static const struct amdgpu_ring_funcs
    gfx_v8_0_ring_funcs_kiq = {
            .get_rptr = gfx_v8_0_ring_get_rptr,
            .get_wptr = gfx_v8_0_ring_get_wptr_compute,
            .set_wptr = gfx_v8_0_ring_set_wptr_compute,
            .emit_frame_size =
                    20 + /* gfx_v8_0_ring_emit_gds_switch */
                    7 + /* gfx_v8_0_ring_emit_hdp_flush */
                    5 + /* hdp_invalidate */
                    7 + /* gfx_v8_0_ring_emit_pipeline_sync */
                    17 + /* gfx_v8_0_ring_emit_vm_flush */
                    7 + 7 + 7, /* gfx_v8_0_ring_emit_fence_kiq x3 for
    user fence, vm fence */
    -       .emit_ib_size = 4, /* gfx_v8_0_ring_emit_ib_compute */
    +       .emit_ib_size = 7, /* gfx_v8_0_ring_emit_ib_compute */
            .emit_fence = gfx_v8_0_ring_emit_fence_kiq,
            .test_ring = gfx_v8_0_ring_test_ring,
            .insert_nop = amdgpu_ring_insert_nop,
            .pad_ib = amdgpu_ring_generic_pad_ib,
            .emit_rreg = gfx_v8_0_ring_emit_rreg,
            .emit_wreg = gfx_v8_0_ring_emit_wreg,
     };

     static void gfx_v8_0_set_ring_funcs(struct amdgpu_device *adev)
     {
    @@ -6989,20 +7005,21 @@ static void gfx_v8_0_set_rlc_funcs(struct
    amdgpu_device *adev)
     {
            adev->gfx.rlc.funcs = &iceland_rlc_funcs;
     }

     static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev)
     {
            /* init asci gds info */
            adev->gds.mem.total_size = RREG32(mmGDS_VMID0_SIZE);
            adev->gds.gws.total_size = 64;
            adev->gds.oa.total_size = 16;
    +       adev->gds.gds_compute_max_wave_id =
    RREG32(mmGDS_COMPUTE_MAX_WAVE_ID);

            if (adev->gds.mem.total_size == 64 * 1024) {
                    adev->gds.mem.gfx_partition_size = 4096;
                    adev->gds.mem.cs_partition_size = 4096;

                    adev->gds.gws.gfx_partition_size = 4;
                    adev->gds.gws.cs_partition_size = 4;

                    adev->gds.oa.gfx_partition_size = 4;
                    adev->gds.oa.cs_partition_size = 1;
    diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
    b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
    index 262ee3cf6f1c..5533f6e4f4a4 100644
    --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
    +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
    @@ -4003,20 +4003,36 @@ static void
    gfx_v9_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
     }

     static void gfx_v9_0_ring_emit_ib_compute(struct amdgpu_ring *ring,
                                              struct amdgpu_job *job,
                                              struct amdgpu_ib *ib,
                                              uint32_t flags)
     {
            unsigned vmid = AMDGPU_JOB_GET_VMID(job);
            u32 control = INDIRECT_BUFFER_VALID | ib->length_dw |
    (vmid << 24);

    +       /* Currently, there is a high possibility to get wave ID
    mismatch
    +        * between ME and GDS, leading to a hw deadlock, because
    ME generates
    +        * different wave IDs than the GDS expects. This situation
    happens
    +        * randomly when at least 5 compute pipes use GDS ordered
    append.
    +        * The wave IDs generated by ME are also wrong after
    suspend/resume.
    +        * Those are probably bugs somewhere else in the kernel
    driver.
    +        *
    +        * Writing GDS_COMPUTE_MAX_WAVE_ID resets wave ID counters
    in ME and
    +        * GDS to 0 for this ring (me/pipe).
    +        */
    +       if (ib->flags & AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID) {
    +               amdgpu_ring_write(ring,
    PACKET3(PACKET3_SET_CONFIG_REG, 1));
    +               amdgpu_ring_write(ring, mmGDS_COMPUTE_MAX_WAVE_ID);
    +               amdgpu_ring_write(ring,
    ring->adev->gds.gds_compute_max_wave_id);
    +       }
    +
            amdgpu_ring_write(ring, PACKET3(PACKET3_INDIRECT_BUFFER, 2));
            BUG_ON(ib->gpu_addr & 0x3); /* Dword align */
            amdgpu_ring_write(ring,
     #ifdef __BIG_ENDIAN
                                    (2 << 0) |
     #endif
    lower_32_bits(ib->gpu_addr));
            amdgpu_ring_write(ring, upper_32_bits(ib->gpu_addr));
            amdgpu_ring_write(ring, control);
     }
    @@ -4722,21 +4738,21 @@ static const struct amdgpu_ring_funcs
    gfx_v9_0_ring_funcs_compute = {
            .set_wptr = gfx_v9_0_ring_set_wptr_compute,
            .emit_frame_size =
                    20 + /* gfx_v9_0_ring_emit_gds_switch */
                    7 + /* gfx_v9_0_ring_emit_hdp_flush */
                    5 + /* hdp invalidate */
                    7 + /* gfx_v9_0_ring_emit_pipeline_sync */
                    SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
                    SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                    2 + /* gfx_v9_0_ring_emit_vm_flush */
                    8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user
    fence, vm fence */
    -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
    +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
            .emit_ib = gfx_v9_0_ring_emit_ib_compute,
            .emit_fence = gfx_v9_0_ring_emit_fence,
            .emit_pipeline_sync = gfx_v9_0_ring_emit_pipeline_sync,
            .emit_vm_flush = gfx_v9_0_ring_emit_vm_flush,
            .emit_gds_switch = gfx_v9_0_ring_emit_gds_switch,
            .emit_hdp_flush = gfx_v9_0_ring_emit_hdp_flush,
            .test_ring = gfx_v9_0_ring_test_ring,
            .test_ib = gfx_v9_0_ring_test_ib,
            .insert_nop = amdgpu_ring_insert_nop,
            .pad_ib = amdgpu_ring_generic_pad_ib,
    @@ -4757,21 +4773,21 @@ static const struct amdgpu_ring_funcs
    gfx_v9_0_ring_funcs_kiq = {
            .set_wptr = gfx_v9_0_ring_set_wptr_compute,
            .emit_frame_size =
                    20 + /* gfx_v9_0_ring_emit_gds_switch */
                    7 + /* gfx_v9_0_ring_emit_hdp_flush */
                    5 + /* hdp invalidate */
                    7 + /* gfx_v9_0_ring_emit_pipeline_sync */
                    SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
                    SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
                    2 + /* gfx_v9_0_ring_emit_vm_flush */
                    8 + 8 + 8, /* gfx_v9_0_ring_emit_fence_kiq x3 for
    user fence, vm fence */
    -       .emit_ib_size = 4, /* gfx_v9_0_ring_emit_ib_compute */
    +       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
            .emit_fence = gfx_v9_0_ring_emit_fence_kiq,
            .test_ring = gfx_v9_0_ring_test_ring,
            .insert_nop = amdgpu_ring_insert_nop,
            .pad_ib = amdgpu_ring_generic_pad_ib,
            .emit_rreg = gfx_v9_0_ring_emit_rreg,
            .emit_wreg = gfx_v9_0_ring_emit_wreg,
            .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
            .emit_reg_write_reg_wait =
    gfx_v9_0_ring_emit_reg_write_reg_wait,
     };

    @@ -4839,20 +4855,40 @@ static void gfx_v9_0_set_gds_init(struct
    amdgpu_device *adev)
                    adev->gds.mem.total_size = 0x10000;
                    break;
            case CHIP_RAVEN:
                    adev->gds.mem.total_size = 0x1000;
                    break;
            default:
                    adev->gds.mem.total_size = 0x10000;
                    break;
            }

    +       switch (adev->asic_type) {
    +       case CHIP_VEGA10:
    +       case CHIP_VEGA20:
    +               adev->gds.gds_compute_max_wave_id = 0x7ff;
    +               break;
    +       case CHIP_VEGA12:
    +               adev->gds.gds_compute_max_wave_id = 0x27f;
    +               break;
    +       case CHIP_RAVEN:
    +               if (adev->rev_id >= 0x8)
    +                       adev->gds.gds_compute_max_wave_id = 0x77;
    /* raven2 */
    +               else
    +                       adev->gds.gds_compute_max_wave_id = 0x15f;
    /* raven1 */
    +               break;
    +       default:
    +               /* this really depends on the chip */
    +               adev->gds.gds_compute_max_wave_id = 0x7ff;
    +               break;
    +       }
    +
            adev->gds.gws.total_size = 64;
            adev->gds.oa.total_size = 16;

            if (adev->gds.mem.total_size == 64 * 1024) {
                    adev->gds.mem.gfx_partition_size = 4096;
                    adev->gds.mem.cs_partition_size = 4096;

                    adev->gds.gws.gfx_partition_size = 4;
                    adev->gds.gws.cs_partition_size = 4;

    diff --git a/include/uapi/drm/amdgpu_drm.h
    b/include/uapi/drm/amdgpu_drm.h
    index faaad04814e4..662d379ea624 100644
    --- a/include/uapi/drm/amdgpu_drm.h
    +++ b/include/uapi/drm/amdgpu_drm.h
    @@ -561,20 +561,25 @@ union drm_amdgpu_cs {
     /* Preamble flag, which means the IB could be dropped if no
    context switch */
     #define AMDGPU_IB_FLAG_PREAMBLE (1<<1)

     /* Preempt flag, IB should set Pre_enb bit if PREEMPT flag
    detected */
     #define AMDGPU_IB_FLAG_PREEMPT (1<<2)

     /* The IB fence should do the L2 writeback but not invalidate any
    shader
      * caches (L2/vL1/sL1/I$). */
     #define AMDGPU_IB_FLAG_TC_WB_NOT_INVALIDATE (1 << 3)

    +/* Set GDS_COMPUTE_MAX_WAVE_ID = DEFAULT before
    PACKET3_INDIRECT_BUFFER.
    + * This will reset wave ID counters for the IB.
    + */
    +#define AMDGPU_IB_FLAG_RESET_GDS_MAX_WAVE_ID (1 << 4)
    +
     struct drm_amdgpu_cs_chunk_ib {
            __u32 _pad;
            /** AMDGPU_IB_FLAG_* */
            __u32 flags;
            /** Virtual address to begin IB execution */
            __u64 va_start;
            /** Size of submission */
            __u32 ib_bytes;
            /** HW IP to submit to */
            __u32 ip_type;
-- 2.17.1


_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to