[AMD Official Use Only - General]
[AMD Official Use Only - General] 发件人: Lazar, Lijo <lijo.la...@amd.com> 日期: 星期三, 2022年5月25日 下午8:38 收件人: Yang, Stanley <stanley.y...@amd.com>, amd-gfx@lists.freedesktop.org <amd-gfx@lists.freedesktop.org>, Zhang, Hawking <hawking.zh...@amd.com>, Zhou1, Tao <tao.zh...@amd.com>, Quan, Evan <evan.q...@amd.com> 主题: Re: [PATCH Review v3 2/2] drm/amdgpu: print umc correctable error address On 5/25/2022 11:40 AM, Stanley.Yang wrote: > Changed from V1: > remove unnecessary same row physical address calculation > > Changed from V2: > move record_ce_addr_supported to umc_ecc_info struct > > Signed-off-by: Stanley.Yang <stanley.y...@amd.com> > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 5 ++ > drivers/gpu/drm/amd/amdgpu/umc_v6_7.c | 50 ++++++++++++++++++- > .../drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 1 + > 3 files changed, 54 insertions(+), 2 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index 28e603243b67..bf5a95104ec1 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -333,6 +333,11 @@ struct ecc_info_per_ch { > > struct umc_ecc_info { > struct ecc_info_per_ch ecc[MAX_UMC_CHANNEL_NUM]; > + > + /* Determine smu ecctable whether support > + * record correctable error address > + */ > + int record_ce_addr_supported; > }; > > struct amdgpu_ras { > diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > index 606892dbea1c..bf7524f16b66 100644 > --- a/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > +++ b/drivers/gpu/drm/amd/amdgpu/umc_v6_7.c > @@ -119,6 +119,24 @@ static void > umc_v6_7_ecc_info_query_correctable_error_count(struct amdgpu_device > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, mc_umc_status, > umc_reg_offset); > + > + if (ras->umc_ecc.record_ce_addr_supported) { > + uint64_t err_addr, soc_pa; > + uint32_t channel_index = > + adev->umc.channel_idx_tbl[umc_inst * > 
adev->umc.channel_inst_num + ch_inst]; > + > + err_addr = > ras->umc_ecc.ecc[eccinfo_table_idx].mca_ceumc_addr; > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); > + /* translate umc channel address to soc pa, 3 parts are > included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + ADDR_OF_256B_BLOCK(channel_index) | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits are not original values, they > are hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + UMC address to PA conversion is common regardless of UE/CE error addresses. You may want to pack it in a small function. Regardless, Acked-by: Lijo Lazar <lijo.la...@amd.com> Thanks, Lijo Stanley: These lines are indeed redundant. I'll make a patch to simplify it. Regards, Stanley > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -251,7 +269,9 @@ static void > umc_v6_7_ecc_info_query_ras_error_address(struct amdgpu_device *adev > > static void umc_v6_7_query_correctable_error_count(struct amdgpu_device > *adev, > uint32_t umc_reg_offset, > - unsigned long *error_count) > + unsigned long *error_count, > + uint32_t ch_inst, > + uint32_t umc_inst) > { > uint32_t ecc_err_cnt_sel, ecc_err_cnt_sel_addr; > uint32_t ecc_err_cnt, ecc_err_cnt_addr; > @@ -295,6 +315,31 @@ static void > umc_v6_7_query_correctable_error_count(struct amdgpu_device *adev, > *error_count += 1; > > umc_v6_7_query_error_status_helper(adev, mc_umc_status, > umc_reg_offset); > + > + { > + uint64_t err_addr, soc_pa; > + uint32_t mc_umc_addrt0; > + uint32_t channel_index; > + > + mc_umc_addrt0 = > + SOC15_REG_OFFSET(UMC, 0, > regMCA_UMC_UMC0_MCUMC_ADDRT0); > + > + channel_index = > + adev->umc.channel_idx_tbl[umc_inst * > adev->umc.channel_inst_num + ch_inst]; > + > + err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) > * 4); > + err_addr = REG_GET_FIELD(err_addr, > MCA_UMC_UMC0_MCUMC_ADDRT0, ErrorAddr); > + > + /* translate umc channel address to soc pa, 3 
parts are > included */ > + soc_pa = ADDR_OF_8KB_BLOCK(err_addr) | > + ADDR_OF_256B_BLOCK(channel_index) | > + OFFSET_IN_256B_BLOCK(err_addr); > + > + /* The umc channel bits are not original values, they > are hashed */ > + SET_CHANNEL_HASH(channel_index, soc_pa); > + > + dev_info(adev->dev, "Error Address(PA): 0x%llx\n", > soc_pa); > + } > } > } > > @@ -395,7 +440,8 @@ static void umc_v6_7_query_ras_error_count(struct > amdgpu_device *adev, > ch_inst); > umc_v6_7_query_correctable_error_count(adev, > umc_reg_offset, > - &(err_data->ce_count)); > + &(err_data->ce_count), > + ch_inst, umc_inst); > umc_v6_7_querry_uncorrectable_error_count(adev, > umc_reg_offset, > > &(err_data->ue_count)); > diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > index 9cdfeea58085..c7e0fec614ea 100644 > --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c > @@ -1883,6 +1883,7 @@ static ssize_t aldebaran_get_ecc_info(struct > smu_context *smu, > ecc_info_per_channel->mca_ceumc_addr = > ecc_table->EccInfo_V2[i].mca_ceumc_addr; > } > + eccinfo->record_ce_addr_supported =1; > } > > return ret; >