support ECC TABLE message, this table include umc ras error count
and error address

v2:
    add smu version check to query whether support ecctable
    call smu_cmn_update_table to get ecctable directly

Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
---
 drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  8 +++
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 14 ++++
 .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 70 +++++++++++++++++++
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
 4 files changed, 94 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 3557f4e7fc30..7a06021a58f0 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -324,6 +324,7 @@ enum smu_table_id
        SMU_TABLE_OVERDRIVE,
        SMU_TABLE_I2C_COMMANDS,
        SMU_TABLE_PACE,
+       SMU_TABLE_ECCINFO,
        SMU_TABLE_COUNT,
 };
 
@@ -340,6 +341,7 @@ struct smu_table_context
        void                            *max_sustainable_clocks;
        struct smu_bios_boot_up_values  boot_values;
        void                            *driver_pptable;
+       void                            *ecc_table;
        struct smu_table                tables[SMU_TABLE_COUNT];
        /*
         * The driver table is just a staging buffer for
@@ -1261,6 +1263,11 @@ struct pptable_funcs {
         *                                                                      
        of SMUBUS table.
         */
        int (*send_hbm_bad_pages_num)(struct smu_context *smu, uint32_t size);
+
+       /**
+        * @get_ecc_table:  message SMU to get ECC INFO table.
+        */
+       ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
 };
 
 typedef enum {
@@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu, bool 
enable);
 
 int smu_wait_for_event(struct amdgpu_device *adev, enum smu_event_type event,
                       uint64_t event_arg);
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
 
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index 01168b8955bf..fd3b6b460b12 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context *smu, bool 
enable)
        return ret;
 }
 
+int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc)
+{
+       int ret = -EOPNOTSUPP;
+
+       mutex_lock(&smu->mutex);
+       if (smu->ppt_funcs &&
+               smu->ppt_funcs->get_ecc_info)
+               ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
+       mutex_unlock(&smu->mutex);
+
+       return ret;
+
+}
+
 static int smu_get_prv_buffer_details(void *handle, void **addr, size_t *size)
 {
        struct smu_context *smu = handle;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f835d86cc2f5..4c21609ccea5 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -78,6 +78,12 @@
 
 #define smnPCIE_ESM_CTRL                       0x111003D0
 
+/*
+ * SMU support ECCTABLE since version 68.42.0,
+ * use this to check ECCTALE feature whether support
+ */
+#define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
+
 static const struct smu_temperature_range smu13_thermal_policy[] =
 {
        {-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000},
@@ -190,6 +196,7 @@ static const struct cmn2asic_mapping 
aldebaran_table_map[SMU_TABLE_COUNT] = {
        TAB_MAP(SMU_METRICS),
        TAB_MAP(DRIVER_SMU_CONFIG),
        TAB_MAP(I2C_COMMANDS),
+       TAB_MAP(ECCINFO),
 };
 
 static const uint8_t aldebaran_throttler_map[] = {
@@ -223,6 +230,9 @@ static int aldebaran_tables_init(struct smu_context *smu)
        SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS, sizeof(SwI2cRequest_t),
                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
 
+       SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
+                      PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
        smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t), GFP_KERNEL);
        if (!smu_table->metrics_table)
                return -ENOMEM;
@@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct smu_context *smu)
                return -ENOMEM;
        }
 
+       smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, 
GFP_KERNEL);
+       if (!smu_table->ecc_table)
+               return -ENOMEM;
+
        return 0;
 }
 
@@ -1765,6 +1779,61 @@ static ssize_t aldebaran_get_gpu_metrics(struct 
smu_context *smu,
        return sizeof(struct gpu_metrics_v1_3);
 }
 
+static int aldebaran_check_ecc_table_support(struct smu_context *smu)
+{
+       uint32_t if_version = 0xff, smu_version = 0xff;
+       int ret = 0;
+
+       ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
+       if (ret)
+               ret = -EOPNOTSUPP;      // return not support if failed get 
smu_version
+
+       if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
+               ret = -EOPNOTSUPP;
+
+       return ret;
+}
+
+static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
+                                        void *table)
+{
+       struct smu_table_context *smu_table = &smu->smu_table;
+       EccInfoTable_t *ecc_table = NULL;
+       struct ecc_info_per_ch *ecc_info_per_channel = NULL;
+       int i, ret = 0;
+       struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
+
+       ret = aldebaran_check_ecc_table_support(smu);
+       if (ret)
+               return ret;
+
+       ret = smu_cmn_update_table(smu,
+                              SMU_TABLE_ECCINFO,
+                              0,
+                              smu_table->ecc_table,
+                              false);
+       if (ret) {
+               dev_info(smu->adev->dev, "Failed to export SMU ecc table!\n");
+               return ret;
+       }
+
+       ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+               ecc_info_per_channel = &(eccinfo->ecc[i]);
+               ecc_info_per_channel->ce_count_lo_chip =
+                       ecc_table->EccInfo[i].ce_count_lo_chip;
+               ecc_info_per_channel->ce_count_hi_chip =
+                       ecc_table->EccInfo[i].ce_count_hi_chip;
+               ecc_info_per_channel->mca_umc_status =
+                       ecc_table->EccInfo[i].mca_umc_status;
+               ecc_info_per_channel->mca_umc_addr =
+                       ecc_table->EccInfo[i].mca_umc_addr;
+       }
+
+       return ret;
+}
+
 static int aldebaran_mode1_reset(struct smu_context *smu)
 {
        u32 smu_version, fatal_err, param;
@@ -1967,6 +2036,7 @@ static const struct pptable_funcs aldebaran_ppt_funcs = {
        .i2c_init = aldebaran_i2c_control_init,
        .i2c_fini = aldebaran_i2c_control_fini,
        .send_hbm_bad_pages_num = aldebaran_smu_send_hbm_bad_page_num,
+       .get_ecc_info = aldebaran_get_ecc_info,
 };
 
 void aldebaran_set_ppt_funcs(struct smu_context *smu)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 4d96099a9bb1..55421ea622fb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct smu_context *smu)
        kfree(smu_table->hardcode_pptable);
        smu_table->hardcode_pptable = NULL;
 
+       kfree(smu_table->ecc_table);
        kfree(smu_table->metrics_table);
        kfree(smu_table->watermarks_table);
+       smu_table->ecc_table = NULL;
        smu_table->metrics_table = NULL;
        smu_table->watermarks_table = NULL;
        smu_table->metrics_time = 0;
-- 
2.17.1

Reply via email to