Convert MCA error address to physical address and find out all pages in
one physical row.

Signed-off-by: Tao Zhou <tao.zh...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |  5 ++
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 97 ++++++++++++++++++++++++-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h  | 64 ++++++++++++++++
 3 files changed, 162 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 43321f57f557..417a6726c71b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -32,6 +32,11 @@
  * is the index of 8KB block
  */
 #define ADDR_OF_8KB_BLOCK(addr)                        (((addr) & ~0xffULL) << 5)
+/*
+ * (addr / 256) * 32768, the higher 26 bits in ErrorAddr
+ * is the index of 32KB block
+ */
+#define ADDR_OF_32KB_BLOCK(addr)                       (((addr) & ~0xffULL) << 7)
 /* channel index is the index of 256B block */
 #define ADDR_OF_256B_BLOCK(channel_index)      ((channel_index) << 8)
 /* offset in 256B block */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 292159814340..2a135fd8ec15 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -27,6 +27,14 @@
 #include "umc/umc_12_0_0_offset.h"
 #include "umc/umc_12_0_0_sh_mask.h"
 
+/* mapping of MCA error address to normalized address */
+static const uint32_t umc_v12_0_ma2na_mapping[] = {
+       0,  5,  6,  8,  9,  14, 12, 13,
+       10, 11, 15, 16, 17, 18, 19, 20,
+       21, 22, 23, 24, 25, 26, 27, 28,
+       24, 7,  29, 30,
+};
+
 static inline uint32_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
                                            uint32_t node_inst,
                                            uint32_t umc_inst,
@@ -133,12 +141,93 @@ static void umc_v12_0_query_ras_error_count(struct amdgpu_device *adev,
        umc_v12_0_reset_error_count(adev);
 }
 
-static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
-                                           struct ras_err_data *err_data, uint64_t err_addr,
-                                           uint32_t ch_inst, uint32_t umc_inst,
-                                           uint32_t node_inst, uint64_t mc_umc_status)
+static bool umc_v12_0_bit_wise_xor(uint32_t val)
 {
+       bool result = 0;
+       int i;
 
+       for (i = 0; i < 32; i++)
+               result = result ^ ((val >> i) & 0x1);
+
+       return result;
+}
+
+static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
+                                       struct ras_err_data *err_data, uint64_t err_addr,
+                                       uint32_t ch_inst, uint32_t umc_inst,
+                                       uint32_t node_inst, uint64_t mc_umc_status)
+{
+       uint32_t channel_index, i;
+       uint64_t soc_pa, na, retired_page, column;
+       uint32_t bank_hash0, bank_hash1, bank_hash2, bank_hash3, col, row;
+       uint32_t bank0, bank1, bank2, bank3, bank;
+
+       bank_hash0 = (err_addr >> UMC_V12_0_MCA_B0_BIT) & 0x1ULL;
+       bank_hash1 = (err_addr >> UMC_V12_0_MCA_B1_BIT) & 0x1ULL;
+       bank_hash2 = (err_addr >> UMC_V12_0_MCA_B2_BIT) & 0x1ULL;
+       bank_hash3 = (err_addr >> UMC_V12_0_MCA_B3_BIT) & 0x1ULL;
+       col = (err_addr >> 1) & 0x1fULL;
+       row = (err_addr >> 10) & 0x3fffULL;
+
+       /* apply bank hash algorithm */
+       bank0 =
+               bank_hash0 ^ (UMC_V12_0_XOR_EN0 &
+               (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR0) ^
+               (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR0))));
+       bank1 =
+               bank_hash1 ^ (UMC_V12_0_XOR_EN1 &
+               (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR1) ^
+               (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR1))));
+       bank2 =
+               bank_hash2 ^ (UMC_V12_0_XOR_EN2 &
+               (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR2) ^
+               (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR2))));
+       bank3 =
+               bank_hash3 ^ (UMC_V12_0_XOR_EN3 &
+               (umc_v12_0_bit_wise_xor(col & UMC_V12_0_COL_XOR3) ^
+               (umc_v12_0_bit_wise_xor(row & UMC_V12_0_ROW_XOR3))));
+
+       bank = bank0 | (bank1 << 1) | (bank2 << 2) | (bank3 << 3);
+       err_addr &= ~0x3c0ULL;
+       err_addr |= (bank << UMC_V12_0_MCA_B0_BIT);
+
+       na = 0x0;
+       /* convert mca error address to normalized address */
+       for (i = 1; i < ARRAY_SIZE(umc_v12_0_ma2na_mapping); i++)
+               na |= ((err_addr >> i) & 0x1ULL) << umc_v12_0_ma2na_mapping[i];
+
+       channel_index =
+               adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
+                       adev->umc.channel_inst_num +
+                       umc_inst * adev->umc.channel_inst_num +
+                       ch_inst];
+       /* translate umc channel address to soc pa, 3 parts are included */
+       soc_pa = ADDR_OF_32KB_BLOCK(na) |
+               ADDR_OF_256B_BLOCK(channel_index) |
+               OFFSET_IN_256B_BLOCK(na);
+
+       /* the umc channel bits are not original values, they are hashed */
+       UMC_V12_0_SET_CHANNEL_HASH(channel_index, soc_pa);
+
+       /* clear [C3 C2] in soc physical address */
+       soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+       /* clear [C4] in soc physical address */
+       soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+       /* loop for all possibilities of [C4 C3 C2] */
+       for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+               retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+               retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+               dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
+               amdgpu_umc_fill_error_record(err_data, err_addr,
+                       retired_page, channel_index, umc_inst);
+
+               /* flip R13 bit */
+               retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+               dev_info(adev->dev, "Error Address(PA): 0x%llx\n", retired_page);
+               amdgpu_umc_fill_error_record(err_data, err_addr,
+                       retired_page, channel_index, umc_inst);
+       }
 }
 
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index 3f822b6e0c99..c20b4b4cbfda 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -48,6 +48,70 @@
 #define UMC_V12_0_TOTAL_CHANNEL_NUM(adev) \
        (UMC_V12_0_CHANNEL_INSTANCE_NUM * (adev)->gmc.num_umc)
 
+/* one piece of normalized address is mapped to 8 pieces of physical address */
+#define UMC_V12_0_NA_MAP_PA_NUM        8
+/* bank bits in MCA error address */
+#define UMC_V12_0_MCA_B0_BIT 6
+#define UMC_V12_0_MCA_B1_BIT 7
+#define UMC_V12_0_MCA_B2_BIT 8
+#define UMC_V12_0_MCA_B3_BIT 9
+/* column bits in SOC physical address */
+#define UMC_V12_0_PA_C2_BIT 15
+#define UMC_V12_0_PA_C4_BIT 21
+/* row bits in SOC physical address */
+#define UMC_V12_0_PA_R13_BIT 35
+/* channel index bits in SOC physical address */
+#define UMC_V12_0_PA_CH4_BIT 12
+#define UMC_V12_0_PA_CH5_BIT 13
+#define UMC_V12_0_PA_CH6_BIT 14
+
+/* bank hash settings */
+#define UMC_V12_0_XOR_EN0 1
+#define UMC_V12_0_XOR_EN1 1
+#define UMC_V12_0_XOR_EN2 1
+#define UMC_V12_0_XOR_EN3 1
+#define UMC_V12_0_COL_XOR0 0x0
+#define UMC_V12_0_COL_XOR1 0x0
+#define UMC_V12_0_COL_XOR2 0x800
+#define UMC_V12_0_COL_XOR3 0x1000
+#define UMC_V12_0_ROW_XOR0 0x11111
+#define UMC_V12_0_ROW_XOR1 0x22222
+#define UMC_V12_0_ROW_XOR2 0x4444
+#define UMC_V12_0_ROW_XOR3 0x8888
+
+/* channel hash settings */
+#define UMC_V12_0_HASH_4K 0
+#define UMC_V12_0_HASH_64K 1
+#define UMC_V12_0_HASH_2M 1
+#define UMC_V12_0_HASH_1G 1
+#define UMC_V12_0_HASH_1T 1
+
+/* XOR some bits of PA into CH4~CH6 bits (bits 12~14 of PA),
+ * hash bit is only effective when related setting is enabled
+ */
+#define UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) ((((channel_idx) >> 5) & 0x1) ^ \
+                               (((pa)  >> 20) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+                               (((pa)  >> 27) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+                               (((pa)  >> 34) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+                               (((pa)  >> 41) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) ((((channel_idx) >> 6) & 0x1) ^ \
+                               (((pa)  >> 21) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+                               (((pa)  >> 28) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+                               (((pa)  >> 35) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+                               (((pa)  >> 42) & 0x1ULL & UMC_V12_0_HASH_1T))
+#define UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) ((((channel_idx) >> 4) & 0x1) ^ \
+                               (((pa)  >> 19) & 0x1ULL & UMC_V12_0_HASH_64K) ^ \
+                               (((pa)  >> 26) & 0x1ULL & UMC_V12_0_HASH_2M) ^ \
+                               (((pa)  >> 33) & 0x1ULL & UMC_V12_0_HASH_1G) ^ \
+                               (((pa)  >> 40) & 0x1ULL & UMC_V12_0_HASH_1T) ^ \
+                               (((pa)  >> 47) & 0x1ULL & UMC_V12_0_HASH_4K))
+#define UMC_V12_0_SET_CHANNEL_HASH(channel_idx, pa) do { \
+               (pa) &= ~(0x7ULL << UMC_V12_0_PA_CH4_BIT); \
+               (pa) |= (UMC_V12_0_CHANNEL_HASH_CH4(channel_idx, pa) << UMC_V12_0_PA_CH4_BIT); \
+               (pa) |= (UMC_V12_0_CHANNEL_HASH_CH5(channel_idx, pa) << UMC_V12_0_PA_CH5_BIT); \
+               (pa) |= (UMC_V12_0_CHANNEL_HASH_CH6(channel_idx, pa) << UMC_V12_0_PA_CH6_BIT); \
+       } while (0)
+
 #define GET_CROSS_NODE_ADDR(reg) \
        ((((reg) >> 32) & 0x3) ? ((reg) | (1ULL << 34)) : (reg))
 
-- 
2.35.1

Reply via email to