On 11/18/2021 9:07 AM, Yang, Stanley wrote:
[AMD Official Use Only]



-----邮件原件-----
发件人: Lazar, Lijo <lijo.la...@amd.com>
发送时间: Wednesday, November 17, 2021 7:24 PM
收件人: Yang, Stanley <stanley.y...@amd.com>; amd-
g...@lists.freedesktop.org; Zhang, Hawking <hawking.zh...@amd.com>;
Clements, John <john.cleme...@amd.com>; Quan, Evan
<evan.q...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>
主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
ecc_table



On 11/17/2021 3:41 PM, Stanley.Yang wrote:
support ECC TABLE message, this table include unc ras error count and
error address

Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
---
   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  7 ++++
   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 38
+++++++++++++++++++
   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c        | 24 ++++++++++++
   drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h        |  3 ++
   5 files changed, 74 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
index 3557f4e7fc30..ea65de0160c3 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
@@ -324,6 +324,7 @@ enum smu_table_id
        SMU_TABLE_OVERDRIVE,
        SMU_TABLE_I2C_COMMANDS,
        SMU_TABLE_PACE,
+       SMU_TABLE_ECCINFO,
        SMU_TABLE_COUNT,
   };

@@ -340,6 +341,7 @@ struct smu_table_context
        void                            *max_sustainable_clocks;
        struct smu_bios_boot_up_values  boot_values;
        void                            *driver_pptable;
+       void                            *ecc_table;
        struct smu_table                tables[SMU_TABLE_COUNT];
        /*
         * The driver table is just a staging buffer for @@ -1261,6
+1263,11 @@ struct pptable_funcs {
         *
                of SMUBUS table.
         */
        int (*send_hbm_bad_pages_num)(struct smu_context *smu,
uint32_t
size);
+
+       /**
+        * @get_ecc_table:  message SMU to get ECC INFO table.
+        */
+       ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
   };

   typedef enum {
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
index f835d86cc2f5..5e4ba0e14a91 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
@@ -190,6 +190,7 @@ static const struct cmn2asic_mapping
aldebaran_table_map[SMU_TABLE_COUNT] = {
        TAB_MAP(SMU_METRICS),
        TAB_MAP(DRIVER_SMU_CONFIG),
        TAB_MAP(I2C_COMMANDS),
+       TAB_MAP(ECCINFO),
   };

   static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 +224,9
@@ static int aldebaran_tables_init(struct smu_context *smu)
        SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
sizeof(SwI2cRequest_t),
                       PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);

+       SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
sizeof(EccInfoTable_t),
+                      PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+
        smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
GFP_KERNEL);
        if (!smu_table->metrics_table)
                return -ENOMEM;
@@ -235,6 +239,10 @@ static int aldebaran_tables_init(struct smu_context
*smu)
                return -ENOMEM;
        }

+       smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
GFP_KERNEL);
+       if (!smu_table->ecc_table)
+               return -ENOMEM;
+
        return 0;
   }

@@ -1765,6 +1773,35 @@ static ssize_t aldebaran_get_gpu_metrics(struct
smu_context *smu,
        return sizeof(struct gpu_metrics_v1_3);
   }

+static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
+                                        void *table)
+{
+       struct smu_table_context *smu_table = &smu->smu_table;
+       EccInfoTable_t ecc_table;
+       struct ecc_info_per_ch *ecc_info_per_channel = NULL;
+       int i, ret = 0;
+       struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
+
+       ret = smu_cmn_get_ecc_info_table(smu,
+                                       &ecc_table);
+       if (ret)
+               return ret;
+
+       for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
+               ecc_info_per_channel = &(eccinfo->ecc[i]);
+               ecc_info_per_channel->ce_count_lo_chip =
+                       ecc_table.EccInfo[i].ce_count_lo_chip;
+               ecc_info_per_channel->ce_count_hi_chip =
+                       ecc_table.EccInfo[i].ce_count_hi_chip;
+               ecc_info_per_channel->mca_umc_status =
+                       ecc_table.EccInfo[i].mca_umc_status;
+               ecc_info_per_channel->mca_umc_addr =
+                       ecc_table.EccInfo[i].mca_umc_addr;
+       }
+
+       return ret;
+}
+
   static int aldebaran_mode1_reset(struct smu_context *smu)
   {
        u32 smu_version, fatal_err, param;
@@ -1967,6 +2004,7 @@ static const struct pptable_funcs
aldebaran_ppt_funcs = {
        .i2c_init = aldebaran_i2c_control_init,
        .i2c_fini = aldebaran_i2c_control_fini,
        .send_hbm_bad_pages_num =
aldebaran_smu_send_hbm_bad_page_num,
+       .get_ecc_info = aldebaran_get_ecc_info,
   };

   void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 4d96099a9bb1..55421ea622fb 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
smu_context *smu)
        kfree(smu_table->hardcode_pptable);
        smu_table->hardcode_pptable = NULL;

+       kfree(smu_table->ecc_table);
        kfree(smu_table->metrics_table);
        kfree(smu_table->watermarks_table);
+       smu_table->ecc_table = NULL;
        smu_table->metrics_table = NULL;
        smu_table->watermarks_table = NULL;
        smu_table->metrics_time = 0;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 843d2cbfc71d..e229c9b09d80 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -983,6 +983,30 @@ int smu_cmn_get_metrics_table(struct
smu_context *smu,
        return ret;
   }

+int smu_cmn_get_ecc_info_table(struct smu_context *smu,
+                                    void *ecc_table)
+{
+       struct smu_table_context *smu_table= &smu->smu_table;
+       uint32_t table_size =
+               smu_table->tables[SMU_TABLE_ECCINFO].size;
+       int ret = 0;
+
+       ret = smu_cmn_update_table(smu,
+                              SMU_TABLE_ECCINFO,
+                              0,
+                              smu_table->ecc_table,
+                              false);
+       if (ret) {
+               dev_info(smu->adev->dev, "Failed to export SMU ecc
table!\n");
+               return ret;
+       }
+
+       if (ecc_table)
+               memcpy(ecc_table, smu_table->ecc_table, table_size);

This copy to another buffer is redundant. You may use ecc_table directly in
the callback, then this method itself looks unnecessary. Instead of calling
smu_cmn_get_ecc_info_table(), call smu_cmn_update_table() and copy
directly from ecc_table.
[Yang, Stanley] This design consider to protect ecc_table in further if 
multi-thread call smu_cmn_get_ecc_info_table same time, it should add mutex 
lock just like metrics table handle if it is necessary, but now test case is 
simple I didn't do that.
This is not like a metrics table use case. RAS error harvesting is not a multithread case. The error registers are cleared after reading, so I thought it's always expected to be one user at a time. Besides, I don't know if there is a case where driver needs to report errors from multiple threads.

Thanks,
Lijo


Thanks,
Lijo

+
+       return 0;
+}
+
   void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev, uint8_t
crev)
   {
        struct metrics_table_header *header = (struct metrics_table_header
*)table; diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index beea03810bca..0adc5451373b 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -105,6 +105,9 @@ int smu_cmn_get_metrics_table(struct
smu_context *smu,
                              void *metrics_table,
                              bool bypass_cache);

+int smu_cmn_get_ecc_info_table(struct smu_context *smu,
+                             void *table);
+
   void smu_cmn_init_soft_gpu_metrics(void *table, uint8_t frev,
uint8_t crev);

   int smu_cmn_set_mp1_state(struct smu_context *smu,

Reply via email to