The RAS bad page retire flip bits can be set per vram type, vram vendor and nps mode.
Signed-off-by: Tao Zhou <tao.zh...@amd.com> --- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 1 - drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 82 +++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 2 - 3 files changed, 53 insertions(+), 32 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 3caebe7c25a5..464015fc2012 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -1504,7 +1504,6 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device *adev) adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM; adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET; - adev->umc.retire_unit = UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) adev->umc.ras = &umc_v12_0_ras; break; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c index d715cfde6aec..774d3baa62d7 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c @@ -174,6 +174,49 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev, umc_v12_0_reset_error_count(adev); } +static void umc_v12_0_get_retire_flip_bits(struct amdgpu_device *adev) +{ + enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; + uint32_t vram_type = adev->gmc.vram_type; + struct amdgpu_umc_flip_bits *flip_bits = &(adev->umc.flip_bits); + + if (adev->gmc.gmc_funcs->query_mem_partition_mode) + nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); + + /* default setting */ + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_C2_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C3_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_C4_BIT; + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R13_BIT; + flip_bits->bit_num = 4; + + switch (vram_type) { + case AMDGPU_VRAM_TYPE_HBM: + /* other nps modes are taken as nps1 */ + if (nps == AMDGPU_NPS2_PARTITION_MODE) { + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH5_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_C2_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B1_BIT; + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R12_BIT; + } + + if (nps == AMDGPU_NPS4_PARTITION_MODE) { + flip_bits->flip_bits_in_pa[0] = UMC_V12_0_PA_CH4_BIT; + flip_bits->flip_bits_in_pa[1] = UMC_V12_0_PA_CH5_BIT; + flip_bits->flip_bits_in_pa[2] = UMC_V12_0_PA_B0_BIT; + flip_bits->flip_bits_in_pa[3] = UMC_V12_0_PA_R11_BIT; + } + + break; + default: + dev_warn(adev->dev, + "Unknown HBM type, set RAS retire flip bits to the value in NPS1 mode.\n"); + break; + } + + adev->umc.retire_unit = 0x1 << flip_bits->bit_num; +} + static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, struct ras_err_data *err_data, struct ta_ras_query_address_input *addr_in, @@ -182,11 +225,10 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, { uint32_t col, col_lower, row, row_lower, row_high, bank; uint32_t channel_index = 0, umc_inst = 0; - uint32_t i, loop_bits[UMC_V12_0_RETIRE_LOOP_BITS]; + uint32_t i, bit_num, retire_unit, *flip_bits; uint64_t soc_pa, column, err_addr; struct ta_ras_query_address_output addr_out_tmp; struct ta_ras_query_address_output *paddr_out; - enum amdgpu_memory_partition nps = AMDGPU_NPS1_PARTITION_MODE; int ret = 0; if (!addr_out) @@ -211,34 +253,15 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, umc_inst = addr_in->ma.umc_inst; } - loop_bits[0] = UMC_V12_0_PA_C2_BIT; - loop_bits[1] = UMC_V12_0_PA_C3_BIT; - loop_bits[2] = UMC_V12_0_PA_C4_BIT; - loop_bits[3] = UMC_V12_0_PA_R13_BIT; - - if (adev->gmc.gmc_funcs->query_mem_partition_mode) - nps = adev->gmc.gmc_funcs->query_mem_partition_mode(adev); - - /* other nps modes are taken as nps1 */ - if (nps == AMDGPU_NPS2_PARTITION_MODE) { - loop_bits[0] = UMC_V12_0_PA_CH5_BIT; - loop_bits[1] = UMC_V12_0_PA_C2_BIT; - loop_bits[2] = UMC_V12_0_PA_B1_BIT; - loop_bits[3] = UMC_V12_0_PA_R12_BIT; - } - - if (nps == AMDGPU_NPS4_PARTITION_MODE) { - loop_bits[0] = UMC_V12_0_PA_CH4_BIT; - loop_bits[1] = UMC_V12_0_PA_CH5_BIT; - loop_bits[2] = UMC_V12_0_PA_B0_BIT; - loop_bits[3] = UMC_V12_0_PA_R11_BIT; - } + flip_bits = adev->umc.flip_bits.flip_bits_in_pa; + bit_num = adev->umc.flip_bits.bit_num; + retire_unit = adev->umc.retire_unit; soc_pa = paddr_out->pa.pa; channel_index = paddr_out->pa.channel_idx; /* clear loop bits in soc physical address */ - for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++) - soc_pa &= ~BIT_ULL(loop_bits[i]); + for (i = 0; i < bit_num; i++) + soc_pa &= ~BIT_ULL(flip_bits[i]); paddr_out->pa.pa = soc_pa; /* get column bit 0 and 1 in mca address */ @@ -259,10 +282,10 @@ static int umc_v12_0_convert_error_address(struct amdgpu_device *adev, goto out; /* loop for all possibilities of retired bits */ - for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) { + for (column = 0; column < retire_unit; column++) { soc_pa = paddr_out->pa.pa; - for (i = 0; i < UMC_V12_0_RETIRE_LOOP_BITS; i++) - soc_pa |= (((column >> i) & 0x1ULL) << loop_bits[i]); + for (i = 0; i < bit_num; i++) + soc_pa |= (((column >> i) & 0x1ULL) << flip_bits[i]); col = ((column & 0x7) << 2) | col_lower; /* add row bit 13 */ @@ -684,5 +707,6 @@ struct amdgpu_umc_ras umc_v12_0_ras = { .update_ecc_status = umc_v12_0_update_ecc_status, .convert_ras_err_addr = umc_v12_0_convert_error_address, .get_die_id_from_pa = umc_v12_0_get_die_id, + .get_retire_flip_bits = umc_v12_0_get_retire_flip_bits, }; diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h index 056bbc038312..ccdd6cd430f8 100644 --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h @@ -55,8 +55,6 @@ #define UMC_V12_0_NA_MAP_PA_NUM 8 /* R13 bit shift should be considered, double the number */ #define UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL (UMC_V12_0_NA_MAP_PA_NUM * 2) -/* C2, C3, C4, R13, four bits in MCA address are looped in retirement */ -#define UMC_V12_0_RETIRE_LOOP_BITS 4 /* column bits in SOC physical address */ #define UMC_V12_0_PA_C2_BIT 15 -- 2.34.1