Allow various kinds of memory integrity methods (e.g. ECC/EDC) to be enabled
or disabled.  By default, all features are disabled.

EDC is Error Detection and Correction.  It can detect ECC errors and take
zero or more of the following actions: count SEC (single error corrected) and
DED (double error detected, i.e. uncorrected ECC error) events, halt the
affected block, and interrupt the CPU.  Currently, only counting errors is
supported.

Also fix a whitespace error in gfx_v8_0_pre_soft_reset().

Signed-off-by: David Panariti <david.panar...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h      |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |  4 ++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 28 ++++++++++++++++++++++++++--
 drivers/gpu/drm/amd/include/amd_shared.h | 14 ++++++++++++++
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a6f51eb..3e930ee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -116,6 +116,7 @@ extern int amdgpu_cntl_sb_buf_per_se;
 extern int amdgpu_param_buf_per_se;
 extern int amdgpu_job_hang_limit;
 extern int amdgpu_lbpw;
+extern unsigned amdgpu_ecc_flags;
 
 #define AMDGPU_DEFAULT_GTT_SIZE_MB             3072ULL /* 3GB by default */
 #define AMDGPU_WAIT_IDLE_TIMEOUT_IN_MS         3000
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index c4825ff..972660d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -115,6 +115,7 @@ int amdgpu_cntl_sb_buf_per_se = 0;
 int amdgpu_param_buf_per_se = 0;
 int amdgpu_job_hang_limit = 0;
 int amdgpu_lbpw = -1;
+unsigned amdgpu_ecc_flags = 0;
 
 MODULE_PARM_DESC(vramlimit, "Restrict VRAM for testing, in megabytes");
 module_param_named(vramlimit, amdgpu_vram_limit, int, 0600);
@@ -795,6 +796,9 @@ static struct pci_driver amdgpu_kms_pci_driver = {
        .driver.pm = &amdgpu_pm_ops,
 };
 
+MODULE_PARM_DESC(ecc_flags, "ECC/EDC enable flags (0 = disable ECC/EDC (default))");
+module_param_named(ecc_flags, amdgpu_ecc_flags, uint, 0444);
+
 
 
 static int __init amdgpu_init(void)
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 46e766e..3b5685c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -1824,7 +1824,17 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
        if (dis_bit) {
                /* On Carrizo, EDC may be permanently disabled by a fuse. */
                 DRM_INFO("CZ EDC hardware is disabled, GC_EDC_CONFIG: 0x%08x.\n",
-                       tmp);
+                        tmp);
+               return 0;
+       }
+
+       /*
+        * Check if EDC has been requested by a kernel parameter.
+        * For Carrizo, EDC is the best/safest mode WRT error handling.
+        */
+       if (!(amdgpu_ecc_flags
+             & (AMD_ECC_SUPPORT_BEST | AMD_ECC_SUPPORT_EDC))) {
+               DRM_INFO("EDC support has not been requested.\n");
                return 0;
        }
 
@@ -1962,6 +1972,20 @@ static int gfx_v8_0_do_edc_gpr_workarounds(struct amdgpu_device *adev)
                goto fail;
        }
 
+       /* 00 - GB_EDC_DED_MODE_LOG: Count DED errors but do not halt */
+       tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, DED_MODE, 0);
+       /* Do not propagate the errors to the next block. */
+       tmp = REG_SET_FIELD(tmp, GB_EDC_MODE, PROP_FED, 0);
+       WREG32(mmGB_EDC_MODE, tmp);
+
+       tmp = RREG32(mmCC_GC_EDC_CONFIG);
+
+       /*
+        * Clear EDC_DISABLE bit so the counters are available.
+        */
+       tmp = REG_SET_FIELD(tmp, CC_GC_EDC_CONFIG, DIS_EDC, 0);
+       WREG32(mmCC_GC_EDC_CONFIG, tmp);
+
        gfx_v8_0_edc_clear_counters(adev);
 
 fail:
@@ -5548,7 +5572,7 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
                gfx_v8_0_cp_compute_enable(adev, false);
        }
 
-       return 0;
+       return 0;
 }
 
 static int gfx_v8_0_soft_reset(void *handle)
diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
index 0f58e95..1cf30aa 100644
--- a/drivers/gpu/drm/amd/include/amd_shared.h
+++ b/drivers/gpu/drm/amd/include/amd_shared.h
@@ -187,6 +187,20 @@ enum amd_fan_ctrl_mode {
 #define AMD_PG_SUPPORT_GFX_QUICK_MG            (1 << 11)
 #define AMD_PG_SUPPORT_GFX_PIPELINE            (1 << 12)
 
+/*
+ * ECC flags
+ * Allows the user to choose what kind of error detection/correction is used.
+ * Currently, EDC is supported on Carrizo.
+ *
+ * The AMD_ECC_SUPPORT_BEST bit lets a user ask the driver to select what
+ * it considers the best/safest mode.  This may not be the same as the
+ * default, depending on the GPU and the application.
+ * Using a single bit makes it easy to request the best support without
+ * needing to know all currently supported modes.
+ */
+#define AMD_ECC_SUPPORT_BEST                   (1 << 0)
+#define AMD_ECC_SUPPORT_EDC                    (1 << 1)
+
 enum amd_pm_state_type {
        /* not used for dpm */
        POWER_STATE_TYPE_DEFAULT,
-- 
2.7.4

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to