I have tried to write a patch based on the patch from Luben Tuikov.

Inspired by Luben, here is the patch:

From 1980d8f1ed44fb9a84a5ea1f6e2edd2bc25c629a Mon Sep 17 00:00:00 2001
From: changzhu <changfeng....@amd.com>
Date: Thu, 10 Oct 2019 11:02:33 +0800
Subject: [PATCH] drm/amdgpu: add dummy read by engines for some GCVM status
 registers

The GRBM register interface is now capable of bursting 1 cycle per
register wr->wr, wr->rd, much faster than the previous multicycle per
transaction done interface.  This has caused a problem where
status registers requiring HW to update have a 1 cycle delay, due
to the register update having to go through GRBM.

SW may operate on an incorrect value if it writes a register and
immediately checks the corresponding status register.

Registers requiring HW to clear or set fields may be delayed by 1 cycle.
For example,

1. write VM_INVALIDATE_ENG0_REQ mask = 5a
2. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
        a. HW will reset VM_INVALIDATE_ENG0_ACK = 0 until invalidation is 
complete
3. write VM_INVALIDATE_ENG0_REQ mask = 5a
4. read VM_INVALIDATE_ENG0_ACK till the ack is same as the request mask = 5a
        a. First read of VM_INVALIDATE_ENG0_ACK = 5a instead of 0
        b. Second read of VM_INVALIDATE_ENG0_ACK = 0 because the remote GRBM h/w
           register takes one extra cycle to be cleared
        c. In this case, SW will see a false ACK if it exits on the first read

Affected registers (only GC variant)  | Recommended Dummy Read
--------------------------------------+----------------------------
VM_INVALIDATE_ENG*_ACK                |  VM_INVALIDATE_ENG*_REQ
VM_L2_STATUS                          |  VM_L2_STATUS
VM_L2_PROTECTION_FAULT_STATUS         |  VM_L2_PROTECTION_FAULT_STATUS
VM_L2_PROTECTION_FAULT_ADDR_HI/LO32   |  VM_L2_PROTECTION_FAULT_ADDR_HI/LO32
VM_L2_IH_LOG_BUSY                     |  VM_L2_IH_LOG_BUSY
MC_VM_L2_PERFCOUNTER_HI/LO            |  MC_VM_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER_HI/LO              |  ATC_L2_PERFCOUNTER_HI/LO
ATC_L2_PERFCOUNTER2_HI/LO             |  ATC_L2_PERFCOUNTER2_HI/LO

It also needs a dummy read by engines for these GC registers.

Change-Id: Ie028f37eb789966d4593984bd661b248ebeb1ac3
Signed-off-by: changzhu <changfeng....@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c |  5 +++++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c    |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c   |  4 ++++
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 4b3f58dbf36f..c2fbf6087ecf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -392,6 +392,11 @@ void amdgpu_ring_emit_reg_write_reg_wait_helper(struct 
amdgpu_ring *ring,
                                                uint32_t ref, uint32_t mask)
 {
        amdgpu_ring_emit_wreg(ring, reg0, ref);
+
+       /* wait for a cycle to reset vm_inv_eng0_ack */
+       if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+               amdgpu_ring_emit_rreg(ring, reg0);
+
        amdgpu_ring_emit_reg_wait(ring, reg1, mask, mask);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index ef1975a5323a..104c47734316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -5155,6 +5155,7 @@ static const struct amdgpu_ring_funcs 
gfx_v10_0_ring_funcs_gfx = {
        .patch_cond_exec = gfx_v10_0_ring_emit_patch_cond_exec,
        .preempt_ib = gfx_v10_0_ring_preempt_ib,
        .emit_tmz = gfx_v10_0_ring_emit_tmz,
+       .emit_rreg = gfx_v10_0_ring_emit_rreg,
        .emit_wreg = gfx_v10_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
@@ -5188,6 +5189,7 @@ static const struct amdgpu_ring_funcs 
gfx_v10_0_ring_funcs_compute = {
        .test_ib = gfx_v10_0_ring_test_ib,
        .insert_nop = amdgpu_ring_insert_nop,
        .pad_ib = amdgpu_ring_generic_pad_ib,
+       .emit_rreg = gfx_v10_0_ring_emit_rreg,
        .emit_wreg = gfx_v10_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
 };
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 2f03bf533d41..d00b53de0fdc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -6253,6 +6253,7 @@ static const struct amdgpu_ring_funcs 
gfx_v9_0_ring_funcs_gfx = {
        .init_cond_exec = gfx_v9_0_ring_emit_init_cond_exec,
        .patch_cond_exec = gfx_v9_0_ring_emit_patch_cond_exec,
        .emit_tmz = gfx_v9_0_ring_emit_tmz,
+       .emit_rreg = gfx_v9_0_ring_emit_rreg,
        .emit_wreg = gfx_v9_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
        .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
@@ -6289,6 +6290,7 @@ static const struct amdgpu_ring_funcs 
gfx_v9_0_ring_funcs_compute = {
        .insert_nop = amdgpu_ring_insert_nop,
        .pad_ib = amdgpu_ring_generic_pad_ib,
        .set_priority = gfx_v9_0_ring_set_priority_compute,
+       .emit_rreg = gfx_v9_0_ring_emit_rreg,
        .emit_wreg = gfx_v9_0_ring_emit_wreg,
        .emit_reg_wait = gfx_v9_0_ring_emit_reg_wait,
        .emit_reg_write_reg_wait = gfx_v9_0_ring_emit_reg_write_reg_wait,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3b00bce14cfb..dce6b651da1f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -346,6 +346,10 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct 
amdgpu_ring *ring,
 
        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
 
+       /* wait for a cycle to reset vm_inv_eng0_ack */
+       if (ring->funcs->vmhub == AMDGPU_GFXHUB_0)
+               amdgpu_ring_emit_rreg(ring, hub->vm_inv_eng0_req + eng);
+
        /* wait for the invalidate to complete */
        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
                                  1 << vmid, 1 << vmid);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 3460c00f3eaa..baaa33467882 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -38,6 +38,7 @@
 #include "navi10_sdma_pkt_open.h"
 #include "nbio_v2_3.h"
 #include "sdma_v5_0.h"
+#include "nvd.h"
 
 MODULE_FIRMWARE("amdgpu/navi10_sdma.bin");
 MODULE_FIRMWARE("amdgpu/navi10_sdma1.bin");
@@ -1147,6 +1148,22 @@ static void sdma_v5_0_ring_emit_vm_flush(struct 
amdgpu_ring *ring,
        amdgpu_gmc_emit_flush_gpu_tlb(ring, vmid, pd_addr);
 }
 
+static void sdma_v5_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
+{
+       struct amdgpu_device *adev = ring->adev;
+
+       amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
+       amdgpu_ring_write(ring, 0 | /* src: register*/
+                               (5 << 8) |  /* dst: memory */
+                               (1 << 20)); /* write confirm */
+       amdgpu_ring_write(ring, reg);
+       amdgpu_ring_write(ring, 0);
+       amdgpu_ring_write(ring, lower_32_bits(adev->wb.gpu_addr +
+                               adev->virt.reg_val_offs * 4));
+       amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
+                               adev->virt.reg_val_offs * 4));
+}
+
 static void sdma_v5_0_ring_emit_wreg(struct amdgpu_ring *ring,
                                     uint32_t reg, uint32_t val)
 {
@@ -1597,6 +1614,7 @@ static const struct amdgpu_ring_funcs 
sdma_v5_0_ring_funcs = {
        .test_ib = sdma_v5_0_ring_test_ib,
        .insert_nop = sdma_v5_0_ring_insert_nop,
        .pad_ib = sdma_v5_0_ring_pad_ib,
+       .emit_rreg = sdma_v5_0_ring_emit_rreg,
        .emit_wreg = sdma_v5_0_ring_emit_wreg,
        .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
        .init_cond_exec = sdma_v5_0_ring_init_cond_exec,
-- 
2.17.1

Could someone give some suggestions about it?

BR,
Changfeng.



-----Original Message-----
From: amd-gfx <amd-gfx-boun...@lists.freedesktop.org> On Behalf Of Huang, Ray
Sent: Friday, October 25, 2019 5:26 PM
To: Tuikov, Luben <luben.tui...@amd.com>
Cc: Deucher, Alexander <alexander.deuc...@amd.com>; Pelloux-prayer, Pierre-eric 
<pierre-eric.pelloux-pra...@amd.com>; Koenig, Christian 
<christian.koe...@amd.com>; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

On Thu, Oct 24, 2019 at 09:16:55PM +0000, Tuikov, Luben wrote:
> The GRBM interface is now capable of bursting 1-cycle op per register, 
> a WRITE followed by another WRITE, or a WRITE followed by a READ--much 
> faster than previous muti-cycle per completed-transaction interface.
> This causes a problem, whereby status registers requiring a read/write 
> by hardware, have a 1-cycle delay, due to the register update having 
> to go through GRBM interface.
> 
> This patch adds this delay.
> 
> A one cycle read op is added after updating the invalidate request and 
> before reading the invalidate-ACK status.
> 
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
> 
> Signed-off-by: Luben Tuikov <luben.tui...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++-- 
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++ 
> drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++ 
> drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>  5 files changed, 22 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs 
> gfx_v10_0_ring_funcs_gfx = {
>               5 + /* COND_EXEC */
>               7 + /* PIPELINE_SYNC */
>               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>               2 + /* VM_FLUSH */
>               8 + /* FENCE for VM_FLUSH */
>               20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs 
> gfx_v10_0_ring_funcs_compute = {
>               5 + /* hdp invalidate */
>               7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>               2 + /* gfx_v10_0_ring_emit_vm_flush */
>               8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm 
> fence */
>       .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs 
> gfx_v9_0_ring_funcs_gfx = {
>               5 +  /* COND_EXEC */
>               7 +  /* PIPELINE_SYNC */
>               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>               2 + /* VM_FLUSH */
>               8 +  /* FENCE for VM_FLUSH */
>               20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs 
> gfx_v9_0_ring_funcs_compute = {
>               5 + /* hdp invalidate */
>               7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>               SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>               2 + /* gfx_v9_0_ring_emit_vm_flush */
>               8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm 
> fence */
>       .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t
> gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>  
>       amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>  
> +     /* Insert a dummy read to delay one cycle before the ACK
> +      * inquiry.
> +      */
> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +             amdgpu_ring_emit_reg_wait(ring,
> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
> +
>       /* wait for the invalidate to complete */
>       amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>                                 1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct 
> amdgpu_ring *ring,
>       amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>                             upper_32_bits(pd_addr));
>  
> +     /* Insert a dummy read to delay one cycle before the ACK
> +      * inquiry.
> +      */
> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +             amdgpu_ring_emit_reg_wait(ring,
> +                                       hub->vm_inv_eng0_req + eng, 0, 0);

The workaround should be to add a dummy read (one-cycle delay) after we write 
VM_INVALIDATE_ENGx_REQ and before we poll VM_INVALIDATE_ENGx_ACK.
If you add it here, it cannot resolve the issue. I think you should implement 
the dummy read in the function below: amdgpu_ring_emit_reg_write_reg_wait().

Thanks,
Ray

> +
>       amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>                                           hub->vm_inv_eng0_ack + eng,
>                                           req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs 
> sdma_v5_0_ring_funcs = {
>               6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>               /* sdma_v5_0_ring_emit_vm_flush */
>               SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>               10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, 
> vm fence */
>       .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>       .emit_ib = sdma_v5_0_ring_emit_ib,
> --
> 2.23.0.385.gbc12974a89
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Attachment: 0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch
Description: 0001-drm-amdgpu-add-dummy-read-by-engines-for-some-GCVM-s.patch

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to