[AMD Official Use Only]


> -----Original Message-----
> From: Stanley.Yang <stanley.y...@amd.com>
> Sent: Tuesday, March 1, 2022 9:30 PM
> To: amd-gfx@lists.freedesktop.org; Zhang, Hawking
> <hawking.zh...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Joo, Maria
> <maria....@amd.com>
> Cc: Yang, Stanley <stanley.y...@amd.com>
> Subject: [PATCH Review 1/1] drm/amdgpu: support send bad channel info to
> smu
> 
> Message SMU bad channel information bitmap to update OOB table
> 
> Change-Id: I49a79af64d5263c28db059ecb8b8405a471431b4
> Signed-off-by: Stanley.Yang <stanley.y...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c       |  7 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h       |  3 ++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c    | 25 ++++++++++-
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h    |  4 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c       |  5 +++
>  drivers/gpu/drm/amd/pm/amdgpu_dpm.c           | 12 ++++++
>  drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h       |  1 +
>  drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c     | 10 +++++
>  drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h |  7 +++
>  .../pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h    |  3 +-
>  drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |  3 +-
>  .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c    | 43 +++++++++++++++++++
>  12 files changed, 119 insertions(+), 4 deletions(-)

[Tao] It's better to split the patch into two parts, one for amdgpu and one for 
pm.

> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d3875618ebf5..f9104f99eb9c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2068,6 +2068,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>       mutex_init(&con->recovery_lock);
>       INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
>       atomic_set(&con->in_recovery, 0);
> +     con->eeprom_control.bad_channel_bitmap = 0;
> 
>       max_eeprom_records_count =
> amdgpu_ras_eeprom_max_record_count();
>       amdgpu_ras_validate_threshold(adev, max_eeprom_records_count);
> @@ -2092,6 +2093,11 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)
>                       goto free;
> 
>               amdgpu_dpm_send_hbm_bad_pages_num(adev, con-
> >eeprom_control.ras_num_recs);
> +
> +             if (con->update_channel_flag == true) {
[Tao] It can be simplified to "if (con->update_channel_flag)"

> +                     amdgpu_dpm_send_hbm_bad_channel_flag(adev, con-
> >eeprom_control.bad_channel_bitmap);

[Tao] Do we need to check the return status of this function and stop
recovery_init if it fails?

> +                     con->update_channel_flag = false;
> +             }
>       }
> 
>  #ifdef CONFIG_X86_MCE_AMD
> @@ -2285,6 +2291,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>               goto release_con;
>       }
> 
> +     con->update_channel_flag = false;
>       con->features = 0;
>       INIT_LIST_HEAD(&con->head);
>       /* Might need get this flag from vbios. */ diff --git
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 7cddaad90d6d..9314fde81e68 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -374,6 +374,9 @@ struct amdgpu_ras {
> 
>       /* record umc error info queried from smu */
>       struct umc_ecc_info umc_ecc;
> +
> +     /* Indicates whether updated bad channel info needs to be sent to the SMU */
> +     bool update_channel_flag;
>  };
> 
>  struct ras_fs_data {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> index 2b844a5aafdb..ad5d8667756d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
> @@ -265,6 +265,7 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)  {
>       struct amdgpu_device *adev = to_amdgpu_device(control);
>       struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       u8 csum;
>       int res;
> 
> @@ -285,6 +286,10 @@ int amdgpu_ras_eeprom_reset_table(struct
> amdgpu_ras_eeprom_control *control)
> 
>       amdgpu_dpm_send_hbm_bad_pages_num(adev, control-
> >ras_num_recs);
> 
> +     control->bad_channel_bitmap = 0;
> +     amdgpu_dpm_send_hbm_bad_channel_flag(adev, control-
> >bad_channel_bitmap);
> +     con->update_channel_flag = false;
> +
>       amdgpu_ras_debugfs_set_ret_size(control);
> 
>       mutex_unlock(&control->ras_tbl_mutex);
> @@ -418,6 +423,7 @@ amdgpu_ras_eeprom_append_table(struct
> amdgpu_ras_eeprom_control *control,
>                              struct eeprom_table_record *record,
>                              const u32 num)
>  {
> +     struct amdgpu_ras *con =
> +amdgpu_ras_get_context(to_amdgpu_device(control));
>       u32 a, b, i;
>       u8 *buf, *pp;
>       int res;
> @@ -429,9 +435,16 @@ amdgpu_ras_eeprom_append_table(struct
> amdgpu_ras_eeprom_control *control,
>       /* Encode all of them in one go.
>        */
>       pp = buf;
> -     for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
> +     for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
>               __encode_table_record_to_buf(control, &record[i], pp);
> 
> +             /* update bad channel bitmap */
> +             if (!(control->bad_channel_bitmap & (1 <<
> record[i].mem_channel))) {
> +                     control->bad_channel_bitmap |= 1 <<
> record[i].mem_channel;
> +                     con->update_channel_flag = true;
> +             }
> +     }
> +
>       /* a, first record index to write into.
>        * b, last record index to write into.
>        * a = first index to read (fri) + number of records in the table, @@ -
> 684,6 +697,7 @@ int amdgpu_ras_eeprom_read(struct
> amdgpu_ras_eeprom_control *control,
>                          const u32 num)
>  {
>       struct amdgpu_device *adev = to_amdgpu_device(control);
> +     struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>       int i, res;
>       u8 *buf, *pp;
>       u32 g0, g1;
> @@ -751,8 +765,15 @@ int amdgpu_ras_eeprom_read(struct
> amdgpu_ras_eeprom_control *control,
>       /* Read up everything? Then transform.
>        */
>       pp = buf;
> -     for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE)
> +     for (i = 0; i < num; i++, pp += RAS_TABLE_RECORD_SIZE) {
>               __decode_table_record_from_buf(control, &record[i], pp);
> +
> +             /* update bad channel bitmap */
> +             if (!(control->bad_channel_bitmap & (1 <<
> record[i].mem_channel))) {
> +                     control->bad_channel_bitmap |= 1 <<
> record[i].mem_channel;
> +                     con->update_channel_flag = true;
> +             }
> +     }
>  Out:
>       kfree(buf);
>       mutex_unlock(&control->ras_tbl_mutex);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> index 6bb00578bfbb..54d9bfe0881d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
> @@ -80,6 +80,10 @@ struct amdgpu_ras_eeprom_control {
>       /* Protect table access via this mutex.
>        */
>       struct mutex ras_tbl_mutex;
> +
> +     /* Record channel info which occurred bad pages
> +      */
> +     u32 bad_channel_bitmap;
>  };
> 
>  /*
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 85da6cbaf3b7..aad3c8b4c810 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -97,6 +97,11 @@ static int amdgpu_umc_do_page_retirement(struct
> amdgpu_device *adev,
>                       amdgpu_ras_save_bad_pages(adev);
> 
>                       amdgpu_dpm_send_hbm_bad_pages_num(adev, con-
> >eeprom_control.ras_num_recs);
> +
> +                     if (con->update_channel_flag == true) {
> +
>       amdgpu_dpm_send_hbm_bad_channel_flag(adev, con-
> >eeprom_control.bad_channel_bitmap);
> +                             con->update_channel_flag = false;
> +                     }
>               }
> 
>               if (reset)
> diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> index 1d63f1e8884c..9a892d6d1d7a 100644
> --- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
> @@ -507,6 +507,18 @@ int amdgpu_dpm_send_hbm_bad_pages_num(struct
> amdgpu_device *adev, uint32_t size)
>       return ret;
>  }
> 
> +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev,
> +uint32_t size) {
> +     struct smu_context *smu = adev->powerplay.pp_handle;
> +     int ret = 0;
> +
> +     mutex_lock(&adev->pm.mutex);
> +     ret = smu_send_hbm_bad_channel_flag(smu, size);
> +     mutex_unlock(&adev->pm.mutex);
> +
> +     return ret;
> +}
> +
>  int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
>                                 enum pp_clock_type type,
>                                 uint32_t *min,
> diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> index ddfa55b59d02..3e78b3057277 100644
> --- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> +++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
> @@ -412,6 +412,7 @@ void amdgpu_dpm_enable_jpeg(struct amdgpu_device
> *adev, bool enable);  int amdgpu_pm_load_smu_firmware(struct
> amdgpu_device *adev, uint32_t *smu_version);  int
> amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device *adev, bool
> enable);  int amdgpu_dpm_send_hbm_bad_pages_num(struct amdgpu_device
> *adev, uint32_t size);
> +int amdgpu_dpm_send_hbm_bad_channel_flag(struct amdgpu_device *adev,
> +uint32_t size);
>  int amdgpu_dpm_get_dpm_freq_range(struct amdgpu_device *adev,
>                                      enum pp_clock_type type,
>                                      uint32_t *min,
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> index 7e79a67bb8ef..f1544755d8b4 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
> @@ -3052,3 +3052,13 @@ int smu_send_hbm_bad_pages_num(struct
> smu_context *smu, uint32_t size)
> 
>       return ret;
>  }
> +
> +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t
> +size) {
> +     int ret = 0;
> +
> +     if (smu->ppt_funcs && smu->ppt_funcs->send_hbm_bad_channel_flag)
> +             ret = smu->ppt_funcs->send_hbm_bad_channel_flag(smu, size);
> +
> +     return ret;
> +}
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> index fbef3ab8d487..ef57b6089c69 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
> @@ -1292,6 +1292,12 @@ struct pptable_funcs {
>        * @set_config_table: Apply the input DriverSmuConfig table settings.
>        */
>       int (*set_config_table)(struct smu_context *smu, struct
> config_table_setting *table);
> +
> +     /**
> +      * @send_hbm_bad_channel_flag:  message SMU to update bad
> channel info
> +      *
>               of SMUBUS table.
> +      */
> +     int (*send_hbm_bad_channel_flag)(struct smu_context *smu, uint32_t
> +size);
>  };
> 
>  typedef enum {
> @@ -1428,5 +1434,6 @@ int smu_get_ecc_info(struct smu_context *smu, void
> *umc_ecc);  int smu_stb_collect_info(struct smu_context *smu, void *buff,
> uint32_t size);  void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device
> *adev);  int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t
> size);
> +int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t
> +size);
>  #endif
>  #endif
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> index ab66a4b9e438..0f498baf6838 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/aldebaran_ppsmc.h
> @@ -103,7 +103,8 @@
>  #define PPSMC_MSG_GfxDriverResetRecovery     0x42
>  #define PPSMC_MSG_BoardPowerCalibration      0x43
>  #define PPSMC_MSG_HeavySBR                      0x45
> -#define PPSMC_Message_Count                  0x46
> +#define PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel       0x46
> +#define PPSMC_Message_Count                  0x47
> 
> 
>  //PPSMC Reset Types
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> index d787c3b9fc52..9f6f306eeca0 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
> @@ -232,7 +232,8 @@
>       __SMU_DUMMY_MAP(ForceGfxVid),             \
>       __SMU_DUMMY_MAP(Spare0),                  \
>       __SMU_DUMMY_MAP(UnforceGfxVid),           \
> -     __SMU_DUMMY_MAP(HeavySBR),
> +     __SMU_DUMMY_MAP(HeavySBR),                      \
> +     __SMU_DUMMY_MAP(SetBadHBMPagesRetiredFlagsPerChannel),
> 
>  #undef __SMU_DUMMY_MAP
>  #define __SMU_DUMMY_MAP(type)        SMU_MSG_##type
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> index 890acc4e2cb8..e5e249968244 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c
> @@ -82,6 +82,12 @@
>   */
>  #define SUPPORT_ECCTABLE_SMU_VERSION 0x00442a00
> 
> +/*
> + * SMU supports the BAD CHANNEL info MSG since version 68.51.00,
> + * use this to check whether the bad channel info MSG is supported  */ #define
> +SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION 0x00443300
> +
>  static const struct smu_temperature_range smu13_thermal_policy[] =  {
>       {-273150,  99000, 99000, -273150, 99000, 99000, -273150, 99000,
> 99000}, @@ -140,6 +146,7 @@ static const struct cmn2asic_msg_mapping
> aldebaran_message_map[SMU_MSG_MAX_COUNT
>       MSG_MAP(GfxDriverResetRecovery,
> PPSMC_MSG_GfxDriverResetRecovery,             0),
>       MSG_MAP(BoardPowerCalibration,
> PPSMC_MSG_BoardPowerCalibration,              0),
>       MSG_MAP(HeavySBR,                            PPSMC_MSG_HeavySBR,
> 0),
> +     MSG_MAP(SetBadHBMPagesRetiredFlagsPerChannel,
>       PPSMC_MSG_SetBadHBMPagesRetiredFlagsPerChannel, 0),
>  };
> 
>  static const struct cmn2asic_mapping aldebaran_clk_map[SMU_CLK_COUNT] =
> { @@ -1997,6 +2004,41 @@ static int
> aldebaran_smu_send_hbm_bad_page_num(struct smu_context *smu,
>       return ret;
>  }
> 
> +static int aldebaran_check_bad_channel_info_support(struct smu_context
> +*smu) {
> +     uint32_t if_version = 0xff, smu_version = 0xff;
> +     int ret = 0;
> +
> +     ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
> +     if (ret) {
> +             /* return not support if failed get smu_version */
> +             ret = -EOPNOTSUPP;
> +     }
> +
> +     if (smu_version < SUPPORT_BAD_CHANNEL_INFO_MSG_VERSION)
> +             ret = -EOPNOTSUPP;
> +
> +     return ret;
> +}
> +
> +static int aldebaran_send_hbm_bad_channel_flag(struct smu_context *smu,
> +             uint32_t size)
> +{
> +     int ret = 0;
> +
> +     ret = aldebaran_check_bad_channel_info_support(smu);
> +     if (ret)
> +             return ret;
> +
> +     /* message SMU to update the bad channel info on SMUBUS */
> +     ret = smu_cmn_send_smc_msg_with_param(smu,
> SMU_MSG_SetBadHBMPagesRetiredFlagsPerChannel, size, NULL);
> +     if (ret)
> +             dev_err(smu->adev->dev, "[%s] failed to message SMU to
> update HBM bad channel info\n",
> +                             __func__);
> +
> +     return ret;
> +}
> +
>  static const struct pptable_funcs aldebaran_ppt_funcs = {
>       /* init dpm */
>       .get_allowed_feature_mask = aldebaran_get_allowed_feature_mask,
> @@ -2062,6 +2104,7 @@ static const struct pptable_funcs
> aldebaran_ppt_funcs = {
>       .i2c_fini = aldebaran_i2c_control_fini,
>       .send_hbm_bad_pages_num =
> aldebaran_smu_send_hbm_bad_page_num,
>       .get_ecc_info = aldebaran_get_ecc_info,
> +     .send_hbm_bad_channel_flag =
> aldebaran_send_hbm_bad_channel_flag,
>  };
> 
>  void aldebaran_set_ppt_funcs(struct smu_context *smu)
> --
> 2.17.1

Reply via email to