[AMD Official Use Only]


> -----邮件原件-----
> 发件人: Lazar, Lijo <lijo.la...@amd.com>
> 发送时间: Thursday, November 18, 2021 7:33 PM
> 收件人: Yang, Stanley <stanley.y...@amd.com>; amd-
> g...@lists.freedesktop.org; Zhang, Hawking <hawking.zh...@amd.com>;
> Clements, John <john.cleme...@amd.com>; Quan, Evan
> <evan.q...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>
> 主题: Re: [PATCH Review 3/4] drm/amdgpu: add message smu to get
> ecc_table v2
> 
> 
> 
> On 11/18/2021 3:03 PM, Stanley.Yang wrote:
> > support ECC TABLE message, this table include umc ras error count and
> > error address
> >
> > v2:
> >      add smu version check to query whether support ecctable
> >      call smu_cmn_update_table to get ecctable directly
> >
> > Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
> > ---
> >   drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h       |  8 +++
> >   drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 14 ++++
> >   .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 70
> +++++++++++++++++++
> >   .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c    |  2 +
> >   4 files changed, 94 insertions(+)
> >
> > diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > index 3557f4e7fc30..7a06021a58f0 100644
> > --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h
> > @@ -324,6 +324,7 @@ enum smu_table_id
> >     SMU_TABLE_OVERDRIVE,
> >     SMU_TABLE_I2C_COMMANDS,
> >     SMU_TABLE_PACE,
> > +   SMU_TABLE_ECCINFO,
> >     SMU_TABLE_COUNT,
> >   };
> >
> > @@ -340,6 +341,7 @@ struct smu_table_context
> >     void                            *max_sustainable_clocks;
> >     struct smu_bios_boot_up_values  boot_values;
> >     void                            *driver_pptable;
> > +   void                            *ecc_table;
> >     struct smu_table                tables[SMU_TABLE_COUNT];
> >     /*
> >      * The driver table is just a staging buffer for @@ -1261,6
> > +1263,11 @@ struct pptable_funcs {
> >      *
>               of SMUBUS table.
> >      */
> >     int (*send_hbm_bad_pages_num)(struct smu_context *smu,
> uint32_t
> > size);
> > +
> > +   /**
> > +    * @get_ecc_table:  message SMU to get ECC INFO table.
> > +    */
> > +   ssize_t (*get_ecc_info)(struct smu_context *smu, void *table);
> >   };
> >
> >   typedef enum {
> > @@ -1397,6 +1404,7 @@ int smu_set_light_sbr(struct smu_context *smu,
> > bool enable);
> >
> >   int smu_wait_for_event(struct amdgpu_device *adev, enum
> smu_event_type event,
> >                    uint64_t event_arg);
> > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc);
> >
> >   #endif
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > index 01168b8955bf..fd3b6b460b12 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> > @@ -3072,6 +3072,20 @@ int smu_set_light_sbr(struct smu_context *smu,
> bool enable)
> >     return ret;
> >   }
> >
> > +int smu_get_ecc_info(struct smu_context *smu, void *umc_ecc) {
> > +   int ret = -EOPNOTSUPP;
> > +
> > +   mutex_lock(&smu->mutex);
> > +   if (smu->ppt_funcs &&
> > +           smu->ppt_funcs->get_ecc_info)
> > +           ret = smu->ppt_funcs->get_ecc_info(smu, umc_ecc);
> > +   mutex_unlock(&smu->mutex);
> > +
> > +   return ret;
> > +
> > +}
> > +
> >   static int smu_get_prv_buffer_details(void *handle, void **addr, size_t
> *size)
> >   {
> >     struct smu_context *smu = handle;
> > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > index f835d86cc2f5..4c21609ccea5 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> > @@ -78,6 +78,12 @@
> >
> >   #define smnPCIE_ESM_CTRL                  0x111003D0
> >
> > +/*
> > + * SMU support ECCTABLE since version 68.42.0,
> > + * use this to check ECCTALE feature whether support  */ #define
> > +SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> > +
> >   static const struct smu_temperature_range smu13_thermal_policy[] =
> >   {
> >     {-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000,
> > 99000}, @@ -190,6 +196,7 @@ static const struct cmn2asic_mapping
> aldebaran_table_map[SMU_TABLE_COUNT] = {
> >     TAB_MAP(SMU_METRICS),
> >     TAB_MAP(DRIVER_SMU_CONFIG),
> >     TAB_MAP(I2C_COMMANDS),
> > +   TAB_MAP(ECCINFO),
> >   };
> >
> >   static const uint8_t aldebaran_throttler_map[] = { @@ -223,6 +230,9
> > @@ static int aldebaran_tables_init(struct smu_context *smu)
> >     SMU_TABLE_INIT(tables, SMU_TABLE_I2C_COMMANDS,
> sizeof(SwI2cRequest_t),
> >                    PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> >
> > +   SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO,
> sizeof(EccInfoTable_t),
> > +                  PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
> > +
> >     smu_table->metrics_table = kzalloc(sizeof(SmuMetrics_t),
> GFP_KERNEL);
> >     if (!smu_table->metrics_table)
> >             return -ENOMEM;
> > @@ -235,6 +245,10 @@ static int aldebaran_tables_init(struct smu_context
> *smu)
> >             return -ENOMEM;
> >     }
> >
> > +   smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size,
> GFP_KERNEL);
> > +   if (!smu_table->ecc_table)
> > +           return -ENOMEM;
> > +
> >     return 0;
> >   }
> >
> > @@ -1765,6 +1779,61 @@ static ssize_t aldebaran_get_gpu_metrics(struct
> smu_context *smu,
> >     return sizeof(struct gpu_metrics_v1_3);
> >   }
> >
> > +static int aldebaran_check_ecc_table_support(struct smu_context *smu)
> > +{
> > +   uint32_t if_version = 0xff, smu_version = 0xff;
> > +   int ret = 0;
> > +
> > +   ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
> > +   if (ret)
> > +           ret = -EOPNOTSUPP;      // return not support if failed get
> smu_version
> > +
> > +   if (smu_version < SUPPORT_ECCTABLE_SMU_VERSION)
> > +           ret = -EOPNOTSUPP;
> > +
> > +   return ret;
> > +}
> > +
> > +static ssize_t aldebaran_get_ecc_info(struct smu_context *smu,
> > +                                    void *table)
> > +{
> > +   struct smu_table_context *smu_table = &smu->smu_table;
> > +   EccInfoTable_t *ecc_table = NULL;
> > +   struct ecc_info_per_ch *ecc_info_per_channel = NULL;
> > +   int i, ret = 0;
> > +   struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
> > +
> 
> Missed to ask last time. Since umc_ecc_info is a common struct, do you also
> want to pass back the number of channels having data?
> 
> Now this struct can hold max of 32 channel data. Let's say if the same
> interface is going to be used on another ASIC X having only 16 channels.
> Then the callback for ASIC X fills data only for 16 channels. Or, you expect 
> that
> to be taken care at the caller side?

[Yang, Stanley] : If ASIC X have only 16 channels, the callback only fill data 
for 16 channels, and caller side also need consider its own channel number to 
handle with umc_ecc_info.

> 
> Thanks,
> Lijo
> 
> > +   ret = aldebaran_check_ecc_table_support(smu);
> > +   if (ret)
> > +           return ret;
> > +
> > +   ret = smu_cmn_update_table(smu,
> > +                          SMU_TABLE_ECCINFO,
> > +                          0,
> > +                          smu_table->ecc_table,
> > +                          false);
> > +   if (ret) {
> > +           dev_info(smu->adev->dev, "Failed to export SMU ecc
> table!\n");
> > +           return ret;
> > +   }
> > +
> > +   ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
> > +
> > +   for (i = 0; i < ALDEBARAN_UMC_CHANNEL_NUM; i++) {
> > +           ecc_info_per_channel = &(eccinfo->ecc[i]);
> > +           ecc_info_per_channel->ce_count_lo_chip =
> > +                   ecc_table->EccInfo[i].ce_count_lo_chip;
> > +           ecc_info_per_channel->ce_count_hi_chip =
> > +                   ecc_table->EccInfo[i].ce_count_hi_chip;
> > +           ecc_info_per_channel->mca_umc_status =
> > +                   ecc_table->EccInfo[i].mca_umc_status;
> > +           ecc_info_per_channel->mca_umc_addr =
> > +                   ecc_table->EccInfo[i].mca_umc_addr;
> > +   }
> > +
> > +   return ret;
> > +}
> > +
> >   static int aldebaran_mode1_reset(struct smu_context *smu)
> >   {
> >     u32 smu_version, fatal_err, param;
> > @@ -1967,6 +2036,7 @@ static const struct pptable_funcs
> aldebaran_ppt_funcs = {
> >     .i2c_init = aldebaran_i2c_control_init,
> >     .i2c_fini = aldebaran_i2c_control_fini,
> >     .send_hbm_bad_pages_num =
> aldebaran_smu_send_hbm_bad_page_num,
> > +   .get_ecc_info = aldebaran_get_ecc_info,
> >   };
> >
> >   void aldebaran_set_ppt_funcs(struct smu_context *smu) diff --git
> > a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > index 4d96099a9bb1..55421ea622fb 100644
> > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
> > @@ -428,8 +428,10 @@ int smu_v13_0_fini_smc_tables(struct
> smu_context *smu)
> >     kfree(smu_table->hardcode_pptable);
> >     smu_table->hardcode_pptable = NULL;
> >
> > +   kfree(smu_table->ecc_table);
> >     kfree(smu_table->metrics_table);
> >     kfree(smu_table->watermarks_table);
> > +   smu_table->ecc_table = NULL;
> >     smu_table->metrics_table = NULL;
> >     smu_table->watermarks_table = NULL;
> >     smu_table->metrics_time = 0;
> >

Reply via email to