> -----Original Message-----
> From: amd-gfx [mailto:amd-gfx-boun...@lists.freedesktop.org] On Behalf
> Of David Panariti
> Sent: Tuesday, June 06, 2017 4:33 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Panariti, David
> Subject: [PATCH 2/3] drm/amdgpu: Complete Carrizo EDC (Error Detection
> and Correction) workarounds.
> 
> The workarounds are unconditionally performed on CZs with EDC enabled.
> EDC detects uncorrected ECC errors and uses data poisoning to prevent
> corrupted compute results from being used (read).
> EDC enabled CZs are often used in embedded environments.
> 
> Signed-off-by: David Panariti <david.panar...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  13 ++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 136
> +++++++++++++++++++++++-----------
>  2 files changed, 107 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 0d64bbba..a6f51eb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -2060,5 +2060,18 @@ int amdgpu_dm_display_resume(struct
> amdgpu_device *adev );
>  static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev)
> { return 0; }
>  #endif
> 
> +/* Carrizo EDC support */
> +struct reg32_counter_name_map {
> +     uint32_t rnmap_addr;         /* Counter register address */
> +     const char *rnmap_name;      /* Name of the counter */
> +     size_t rnmap_num_instances;  /* Number of block instances */
> +};
> +
> +#define DEF_mmREG32_NAME_MAP_ELEMENT(reg, num_instances) {
>       \
> +             .rnmap_addr = mm##reg,                          \
> +             .rnmap_name = #reg,                             \
> +             .rnmap_num_instances = num_instances            \
> +}
> +
>  #include "amdgpu_object.h"
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 07172f8..46e766e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -1727,35 +1727,80 @@ static const u32 sgpr2_init_regs[] =
>       mmCOMPUTE_USER_DATA_9, 0xedcedc09,
>  };
> 
> -static const u32 sec_ded_counter_registers[] =
> -{
> -     mmCPC_EDC_ATC_CNT,
> -     mmCPC_EDC_SCRATCH_CNT,
> -     mmCPC_EDC_UCODE_CNT,
> -     mmCPF_EDC_ATC_CNT,
> -     mmCPF_EDC_ROQ_CNT,
> -     mmCPF_EDC_TAG_CNT,
> -     mmCPG_EDC_ATC_CNT,
> -     mmCPG_EDC_DMA_CNT,
> -     mmCPG_EDC_TAG_CNT,
> -     mmDC_EDC_CSINVOC_CNT,
> -     mmDC_EDC_RESTORE_CNT,
> -     mmDC_EDC_STATE_CNT,
> -     mmGDS_EDC_CNT,
> -     mmGDS_EDC_GRBM_CNT,
> -     mmGDS_EDC_OA_DED,
> -     mmSPI_EDC_CNT,
> -     mmSQC_ATC_EDC_GATCL1_CNT,
> -     mmSQC_EDC_CNT,
> -     mmSQ_EDC_DED_CNT,
> -     mmSQ_EDC_INFO,
> -     mmSQ_EDC_SEC_CNT,
> -     mmTCC_EDC_CNT,
> -     mmTCP_ATC_EDC_GATCL1_CNT,
> -     mmTCP_EDC_CNT,
> -     mmTD_EDC_CNT
> +/* See GRBM_GFX_INDEX, et. al. registers. */
> +static const struct reg32_counter_name_map sec_ded_counter_registers[]
> = {
> +     DEF_mmREG32_NAME_MAP_ELEMENT(SQC_EDC_CNT, 2),
> +
>       DEF_mmREG32_NAME_MAP_ELEMENT(SQC_ATC_EDC_GATCL1_CN
> T, 2),
> +
> +     DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_SEC_CNT, 8),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(SQ_EDC_DED_CNT, 8),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(TCP_EDC_CNT, 8),
> +
>       DEF_mmREG32_NAME_MAP_ELEMENT(TCP_ATC_EDC_GATCL1_CN
> T, 8),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(TD_EDC_CNT, 8),
> +
> +     DEF_mmREG32_NAME_MAP_ELEMENT(TCC_EDC_CNT, 4),
> +
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_ATC_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_SCRATCH_CNT,
> 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPC_EDC_UCODE_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ATC_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_ROQ_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPF_EDC_TAG_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_ATC_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_DMA_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(CPG_EDC_TAG_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_CSINVOC_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_STATE_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(DC_EDC_RESTORE_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(GDS_EDC_GRBM_CNT, 1),
> +     DEF_mmREG32_NAME_MAP_ELEMENT(SPI_EDC_CNT, 1),
>  };
> 
> +static int gfx_v8_0_edc_clear_counters(struct amdgpu_device *adev)
> +{
> +     int ci, se, sh, i;
> +     uint32_t count;
> +     int r = 0;
> +
> +     mutex_lock(&adev->grbm_idx_mutex);
> +
> +     for (ci = 0; ci < ARRAY_SIZE(sec_ded_counter_registers); ++ci) {
> +             const struct reg32_counter_name_map *cp =
> +                     sec_ded_counter_registers + ci;
> +             const char *name = cp->rnmap_name;
> +
> +             for (se = 0; se < adev->gfx.config.max_shader_engines;
> ++se) {
> +                     for (sh = 0; sh < adev->gfx.config.max_sh_per_se;
> ++sh) {
> +                             for (i = 0; i < cp->rnmap_num_instances; ++i)
> {
> +                                     gfx_v8_0_select_se_sh(adev, se, sh,
> i);
> +                                     count = RREG32(cp->rnmap_addr);
> +                                     count = RREG32(cp->rnmap_addr);
> +                                     if (count != 0) {
> +                                             /*
> +                                              * EDC workaround failed.
> +                                              * If people are interested
> +                                              * in EDC at all, they will
> +                                              * want to know which
> +                                              * counters had problems.
> +                                              */
> +                                             DRM_WARN("EDC counter
> %s is 0x%08x, but should be 0\n.",
> +                                                      name, count);
> +                                             r = -EINVAL;
> +                                             goto ret;
> +                                     }
> +                             }
> +                     }
> +             }
> +     }
> +
> +ret:
> +     gfx_v8_0_select_se_sh(adev, 0xffffffff, 0xffffffff, 0xffffffff);
> +     mutex_unlock(&adev->grbm_idx_mutex);
> +
> +     return r;
> +}
> +
>  static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device
> *adev)
>  {
>       struct amdgpu_ring *ring = &adev->gfx.compute_ring[0];
> @@ -1763,18 +1808,36 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>       struct dma_fence *f = NULL;
>       int r, i;
>       u32 tmp;
> +     u32 dis_bit;
>       unsigned total_size, vgpr_offset, sgpr_offset;
>       u64 gpu_addr;
> 
> -     /* only supported on CZ */
> -     if (adev->asic_type != CHIP_CARRIZO)
> +     if (adev->asic_type != CHIP_CARRIZO) {
> +             /* EDC is only supported on CZ */
> +             return 0;
> +     }

No need to add the additional parens.

> +
> +     DRM_INFO("Detected Carrizo.\n");
> +

Drop this debugging print.

> +     tmp = RREG32(mmCC_GC_EDC_CONFIG);
> +     dis_bit = REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC);
> +     if (dis_bit) {

You can just check the bit directly, e.g.,
if (REG_GET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC)) {

> +             /* On Carrizo, EDC may be permanently disabled by a fuse. */
> +             DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG:
> 0x%08x.\n",
> +                     tmp);

I would reverse the logic here and only print a message if EDC is enabled.

>               return 0;
> +     }
> 
>       /* bail if the compute ring is not ready */
>       if (!ring->ready)
>               return 0;
> 
> -     tmp = RREG32(mmGB_EDC_MODE);
> +     DRM_INFO("Applying EDC workarounds.\n");
> +

Drop the debugging print.

> +     /*
> +      * Interested parties can enable EDC using debugfs register reads and
> +      * writes.
> +      */
>       WREG32(mmGB_EDC_MODE, 0);
> 
>       total_size =
> @@ -1899,18 +1962,7 @@ static int
> gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
>               goto fail;
>       }
> 
> -     tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 2);
> -     tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 1);
> -     WREG32(mmGB_EDC_MODE, tmp);
> -
> -     tmp = RREG32(mmCC_GC_EDC_CONFIG);
> -     tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0) | 1;
> -     WREG32(mmCC_GC_EDC_CONFIG, tmp);
> -
> -
> -     /* read back registers to clear the counters */
> -     for (i = 0; i < ARRAY_SIZE(sec_ded_counter_registers); i++)
> -             RREG32(sec_ded_counter_registers[i]);
> +     gfx_v8_0_edc_clear_counters(adev);
> 
>  fail:
>       amdgpu_ib_free(adev, &ib, NULL);
> --
> 2.7.4
> 
> _______________________________________________
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to