[PATCH] drm/amd/pm: Drop unsupported features on smu v14_0_2

2024-08-21 Thread Candice Li
Drop unsupported features on smu v14_0_2.

Signed-off-by: Candice Li 
Reviewed-by: Yang Wang 
---
 .../drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c  | 47 ---
 1 file changed, 47 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
index 5913f9c60fe002..391d06cc6e5816 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_2_ppt.c
@@ -2115,50 +2115,6 @@ static void smu_v14_0_2_set_smu_mailbox_registers(struct 
smu_context *smu)
smu->debug_resp_reg = SOC15_REG_OFFSET(MP1, 0, regMP1_SMN_C2PMSG_54);
 }
 
-static int smu_v14_0_2_smu_send_bad_mem_page_num(struct smu_context *smu,
-   uint32_t size)
-{
-   int ret = 0;
-
-   /* message SMU to update the bad page number on SMUBUS */
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- SMU_MSG_SetNumBadMemoryPagesRetired,
- size, NULL);
-   if (ret)
-   dev_err(smu->adev->dev,
- "[%s] failed to message SMU to update bad memory 
pages number\n",
- __func__);
-
-   return ret;
-}
-
-static int smu_v14_0_2_send_bad_mem_channel_flag(struct smu_context *smu,
-   uint32_t size)
-{
-   int ret = 0;
-
-   /* message SMU to update the bad channel info on SMUBUS */
-   ret = smu_cmn_send_smc_msg_with_param(smu,
- 
SMU_MSG_SetBadMemoryPagesRetiredFlagsPerChannel,
- size, NULL);
-   if (ret)
-   dev_err(smu->adev->dev,
- "[%s] failed to message SMU to update bad memory 
pages channel info\n",
- __func__);
-
-   return ret;
-}
-
-static ssize_t smu_v14_0_2_get_ecc_info(struct smu_context *smu,
-   void *table)
-{
-   int ret = 0;
-
-   // TODO
-
-   return ret;
-}
-
 static ssize_t smu_v14_0_2_get_gpu_metrics(struct smu_context *smu,
   void **table)
 {
@@ -2897,12 +2853,9 @@ static const struct pptable_funcs smu_v14_0_2_ppt_funcs 
= {
.enable_gfx_features = smu_v14_0_2_enable_gfx_features,
.set_mp1_state = smu_v14_0_2_set_mp1_state,
.set_df_cstate = smu_v14_0_2_set_df_cstate,
-   .send_hbm_bad_pages_num = smu_v14_0_2_smu_send_bad_mem_page_num,
-   .send_hbm_bad_channel_flag = smu_v14_0_2_send_bad_mem_channel_flag,
 #if 0
.gpo_control = smu_v14_0_gpo_control,
 #endif
-   .get_ecc_info = smu_v14_0_2_get_ecc_info,
 };
 
 void smu_v14_0_2_set_ppt_funcs(struct smu_context *smu)
-- 
2.25.1



[PATCH] drm/amd/pm: Retrieve UMC ODECC error count from aca bank

2024-02-02 Thread Candice Li
Retrieve the UMC ODECC error count from the ACA bank instead of relying on
software-managed counters.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index d6e14a5f406e63..03873d784be6d6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2552,8 +2552,12 @@ static int mca_umc_mca_get_err_count(const struct 
mca_ras_info *mca_ras, struct
 enum amdgpu_mca_error_type type, struct 
mca_bank_entry *entry, uint32_t *count)
 {
uint64_t status0;
+   uint32_t ext_error_code;
+   uint32_t odecc_err_cnt;
 
status0 = entry->regs[MCA_REG_IDX_STATUS];
+   ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status0);
+   odecc_err_cnt = MCA_REG__MISC0__ERRCNT(entry->regs[MCA_REG_IDX_MISC0]);
 
if (!REG_GET_FIELD(status0, MCMP1_STATUST0, Val)) {
*count = 0;
@@ -2563,7 +2567,7 @@ static int mca_umc_mca_get_err_count(const struct 
mca_ras_info *mca_ras, struct
if (umc_v12_0_is_deferred_error(adev, status0) ||
umc_v12_0_is_uncorrectable_error(adev, status0) ||
umc_v12_0_is_correctable_error(adev, status0))
-   *count = 1;
+   *count = (ext_error_code == 0) ? odecc_err_cnt : 1;
 
return 0;
 }
-- 
2.25.1



[PATCH] drm/amdgpu: Update setting EEPROM table version

2024-03-18 Thread Candice Li
Use helper function instead of umc callback to set
EEPROM table version.

Signed-off-by: Candice Li 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 22 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  2 --
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c|  6 -
 3 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index b12808c0c331f2..06a62a8a992e9b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -404,6 +404,22 @@ static int amdgpu_ras_eeprom_correct_header_tag(
return res;
 }
 
+static void amdgpu_ras_set_eeprom_table_version(struct 
amdgpu_ras_eeprom_control *control)
+{
+   struct amdgpu_device *adev = to_amdgpu_device(control);
+   struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+   switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+   case IP_VERSION(8, 10, 0):
+   case IP_VERSION(12, 0, 0):
+   hdr->version = RAS_TABLE_VER_V2_1;
+   return;
+   default:
+   hdr->version = RAS_TABLE_VER_V1;
+   return;
+   }
+}
+
 /**
  * amdgpu_ras_eeprom_reset_table -- Reset the RAS EEPROM table
  * @control: pointer to control structure
@@ -423,11 +439,7 @@ int amdgpu_ras_eeprom_reset_table(struct 
amdgpu_ras_eeprom_control *control)
mutex_lock(&control->ras_tbl_mutex);
 
hdr->header = RAS_TABLE_HDR_VAL;
-   if (adev->umc.ras &&
-   adev->umc.ras->set_eeprom_table_version)
-   adev->umc.ras->set_eeprom_table_version(hdr);
-   else
-   hdr->version = RAS_TABLE_VER_V1;
+   amdgpu_ras_set_eeprom_table_version(control);
 
if (hdr->version == RAS_TABLE_VER_V2_1) {
hdr->first_rec_offset = RAS_RECORD_START_V2_1;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 26d2ae498daf22..5954e839d5808d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -66,8 +66,6 @@ struct amdgpu_umc_ras {
void *ras_error_status);
bool (*check_ecc_err_status)(struct amdgpu_device *adev,
enum amdgpu_mca_error_type type, void 
*ras_error_status);
-   /* support different eeprom table version for different asic */
-   void (*set_eeprom_table_version)(struct amdgpu_ras_eeprom_table_header 
*hdr);
 };
 
 struct amdgpu_umc_funcs {
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index c4c77257710c97..a32f87992f2058 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -442,11 +442,6 @@ static void 
umc_v8_10_ecc_info_query_ras_error_address(struct amdgpu_device *ade
umc_v8_10_ecc_info_query_error_address, ras_error_status);
 }
 
-static void umc_v8_10_set_eeprom_table_version(struct 
amdgpu_ras_eeprom_table_header *hdr)
-{
-   hdr->version = RAS_TABLE_VER_V2_1;
-}
-
 const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
.query_ras_error_count = umc_v8_10_query_ras_error_count,
.query_ras_error_address = umc_v8_10_query_ras_error_address,
@@ -460,5 +455,4 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
.query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
.ecc_info_query_ras_error_count = 
umc_v8_10_ecc_info_query_ras_error_count,
.ecc_info_query_ras_error_address = 
umc_v8_10_ecc_info_query_ras_error_address,
-   .set_eeprom_table_version = umc_v8_10_set_eeprom_table_version,
 };
-- 
2.25.1



[PATCH] drm/amdgpu: Update EEPROM RAS table for mismatched table version

2024-03-26 Thread Candice Li
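If the table version stored in EEPROM does not match the version expected
for the ASIC, update the table version and restore the existing bad page
records to the EEPROM RAS table. If the records cannot be restored, force
a reset of the table.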

Signed-off-by: Candice Li 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 88 ---
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 06a62a8a992e9b..42d0ef2f512474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -1319,6 +1319,37 @@ static int __read_table_ras_info(struct 
amdgpu_ras_eeprom_control *control)
return res == RAS_TABLE_V2_1_INFO_SIZE ? 0 : res;
 }
 
+static bool amdgpu_ras_eeprom_table_version_validate(struct 
amdgpu_ras_eeprom_control *control)
+{
+   struct amdgpu_device *adev = to_amdgpu_device(control);
+   struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
+
+   switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+   case IP_VERSION(8, 10, 0):
+   case IP_VERSION(12, 0, 0):
+   return hdr->version == RAS_TABLE_VER_V2_1;
+   default:
+   return hdr->version == RAS_TABLE_VER_V1;
+   }
+}
+
+static void amdgpu_ras_update_eeprom_control(struct 
amdgpu_ras_eeprom_table_header *hdr)
+{
+   struct amdgpu_ras_eeprom_control *control =
+   container_of(hdr, struct amdgpu_ras_eeprom_control, tbl_hdr);
+
+   if (hdr->version == RAS_TABLE_VER_V2_1) {
+   control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
+   control->ras_record_offset = RAS_RECORD_START_V2_1;
+   control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
+   } else {
+   control->ras_num_recs = RAS_NUM_RECS(hdr);
+   control->ras_record_offset = RAS_RECORD_START;
+   control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+   }
+   control->ras_fri = RAS_OFFSET_TO_INDEX(control, hdr->first_rec_offset);
+}
+
 int amdgpu_ras_eeprom_init(struct amdgpu_ras_eeprom_control *control,
   bool *exceed_err_limit)
 {
@@ -1326,7 +1357,9 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
unsigned char buf[RAS_TABLE_HEADER_SIZE] = { 0 };
struct amdgpu_ras_eeprom_table_header *hdr = &control->tbl_hdr;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-   int res;
+   int res, res1;
+   struct eeprom_table_record *bps;
+   u32 num_recs;
 
*exceed_err_limit = false;
 
@@ -1355,16 +1388,51 @@ int amdgpu_ras_eeprom_init(struct 
amdgpu_ras_eeprom_control *control,
 
__decode_table_header_from_buf(hdr, buf);
 
-   if (hdr->version == RAS_TABLE_VER_V2_1) {
-   control->ras_num_recs = RAS_NUM_RECS_V2_1(hdr);
-   control->ras_record_offset = RAS_RECORD_START_V2_1;
-   control->ras_max_record_count = RAS_MAX_RECORD_COUNT_V2_1;
-   } else {
-   control->ras_num_recs = RAS_NUM_RECS(hdr);
-   control->ras_record_offset = RAS_RECORD_START;
-   control->ras_max_record_count = RAS_MAX_RECORD_COUNT;
+   amdgpu_ras_update_eeprom_control(hdr);
+
+   if (!amdgpu_ras_eeprom_table_version_validate(control)) {
+   num_recs = control->ras_num_recs;
+   if (num_recs && amdgpu_bad_page_threshold) {
+   /* Save bad page records existed in EEPROM */
+   bps = kcalloc(num_recs, sizeof(*bps), GFP_KERNEL);
+   if (!bps)
+   return -ENOMEM;
+
+   res1 = amdgpu_ras_eeprom_read(control, bps, num_recs);
+   if (res1)
+   dev_warn(adev->dev, "Fail to load EEPROM table, 
force to reset it.");
+
+   res = amdgpu_ras_eeprom_reset_table(control);
+   if (res) {
+   dev_err(adev->dev, "Failed to create a new 
EEPROM table.");
+   kfree(bps);
+   return res < 0 ? res : 0;
+   }
+
+   if (!res1) {
+   /* Update the EEPROM table with correct table 
version and
+* original bad page records
+*/
+   amdgpu_ras_update_eeprom_control(hdr);
+   res = amdgpu_ras_eeprom_append(control, bps, 
num_recs);
+
+   if (res) {
+   dev_warn(adev->dev, "Fail to update 
EEPROM table, force to reset it.");
+   res = 
amdgpu_ras_eeprom_reset_table(control);
+  

[PATCH] drm/amdgpu: Validate TA binary size

2024-08-15 Thread Candice Li
Add TA binary size validation to avoid OOB write.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 0c856005df6b95..38face981c3e38 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -166,6 +166,9 @@ static ssize_t ta_if_load_debugfs_write(struct file *fp, 
const char *buf, size_t
if (ret)
return -EFAULT;
 
+   if (ta_bin_len > PSP_1_MEG)
+   return -EINVAL;
+
copy_pos += sizeof(uint32_t);
 
ta_bin = kzalloc(ta_bin_len, GFP_KERNEL);
-- 
2.25.1



[PATCH] drm/amdgpu: Update EEPROM I2C address for smu v13_0_0

2023-11-23 Thread Candice Li
Check smu v13_0_0 SKU type to select EEPROM I2C address.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 65aa218380be1b..2fde93b00cab37 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -214,6 +214,12 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
control->i2c_address = EEPROM_I2C_MADDR_0;
return true;
case IP_VERSION(13, 0, 0):
+   if (strnstr(atom_ctx->vbios_pn, "D707",
+   sizeof(atom_ctx->vbios_pn)))
+   control->i2c_address = EEPROM_I2C_MADDR_0;
+   else
+   control->i2c_address = EEPROM_I2C_MADDR_4;
+   return true;
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
control->i2c_address = EEPROM_I2C_MADDR_4;
-- 
2.25.1



[PATCH] drm/amdgpu: Support poison error injection via ras_ctrl debugfs

2024-01-03 Thread Candice Li
Support poison error injection.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index caf00df669bf7e..5851c7a80a5a8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -305,11 +305,13 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file 
*f,
return -EINVAL;
 
data->head.block = block_id;
-   /* only ue and ce errors are supported */
+   /* only ue, ce and poison errors are supported */
if (!memcmp("ue", err, 2))
data->head.type = AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
else if (!memcmp("ce", err, 2))
data->head.type = AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
+   else if (!memcmp("poison", err, 6))
+   data->head.type = AMDGPU_RAS_ERROR__POISON;
else
return -EINVAL;
 
-- 
2.25.1



[PATCH] drm/amdgpu: Drop unnecessary sentences about CE and deferred error.

2024-01-03 Thread Candice Li
Remove "no user action is needed" from the correctable and deferred error
messages to avoid confusion.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 14 +-
 drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c  |  3 +--
 drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c  |  3 +--
 drivers/gpu/drm/amd/amdgpu/umc_v6_7.c   |  2 +-
 4 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b21eadd7c975df..caf00df669bf7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1069,8 +1069,7 @@ static void amdgpu_ras_error_print_error_data(struct 
amdgpu_device *adev,
mcm_info = &err_info->mcm_info;
if (err_info->ce_count) {
dev_info(adev->dev, "socket: %d, die: %d, "
-"%lld new correctable hardware errors 
detected in %s block, "
-"no user action is needed\n",
+"%lld new correctable hardware errors 
detected in %s block\n",
 mcm_info->socket_id,
 mcm_info->die_id,
 err_info->ce_count,
@@ -1082,8 +1081,7 @@ static void amdgpu_ras_error_print_error_data(struct 
amdgpu_device *adev,
err_info = &err_node->err_info;
mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
-"%lld correctable hardware errors detected in 
total in %s block, "
-"no user action is needed\n",
+"%lld correctable hardware errors detected in 
total in %s block\n",
 mcm_info->socket_id, mcm_info->die_id, 
err_info->ce_count, blk_name);
}
break;
@@ -1139,16 +1137,14 @@ static void amdgpu_ras_error_generate_report(struct 
amdgpu_device *adev,
   adev->smuio.funcs->get_die_id) {
dev_info(adev->dev, "socket: %d, die: %d "
 "%ld correctable hardware errors "
-"detected in %s block, no user "
-"action is needed.\n",
+"detected in %s block\n",
 adev->smuio.funcs->get_socket_id(adev),
 adev->smuio.funcs->get_die_id(adev),
 ras_mgr->err_data.ce_count,
 blk_name);
} else {
dev_info(adev->dev, "%ld correctable hardware errors "
-"detected in %s block, no user "
-"action is needed.\n",
+"detected in %s block\n",
 ras_mgr->err_data.ce_count,
 blk_name);
}
@@ -1978,7 +1974,7 @@ static void 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj
struct amdgpu_iv_entry *entry)
 {
dev_info(obj->adev->dev,
-   "Poison is created, no user action is needed.\n");
+   "Poison is created\n");
 }
 
 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
index 6d24c84924cb5d..19986ff6a48d7e 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c
@@ -401,8 +401,7 @@ static void 
nbio_v7_4_handle_ras_controller_intr_no_bifring(struct amdgpu_device
 
if (err_data.ce_count)
dev_info(adev->dev, "%ld correctable hardware "
-   "errors detected in %s block, "
-   "no user action is needed.\n",
+   "errors detected in %s block\n",
obj->err_data.ce_count,

get_ras_block_str(adev->nbio.ras_if));
 
diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c 
b/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
index 25a3da83e0fb97..e90f3378080345 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_9.c
+++ b/drivers/gpu/d

[PATCH] drm/amd/pm: Enable smu v13_0_6 eccinfo in firmware query mode

2024-01-09 Thread Candice Li
smu v13_0_6 eccinfo is supported in firmware query mode only.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 4ebc6b421c2cb4..8f78294e4a6195 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2896,8 +2896,11 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
 static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   struct amdgpu_device *adev = smu->adev;
+   unsigned int error_query_mode;
+
+   return (amdgpu_ras_get_error_query_mode(adev, &error_query_mode) &&
+   error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY);
 }
 
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
-- 
2.25.1



[PATCH] drm/amd/pm: Enable smu v13_0_6 eccinfo in firmware query mode

2024-01-09 Thread Candice Li
smu v13_0_6 eccinfo is supported in firmware query mode only.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 4ebc6b421c2cb4..29396424a99609 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2896,8 +2896,11 @@ static int smu_v13_0_6_select_xgmi_plpd_policy(struct 
smu_context *smu,
 static ssize_t smu_v13_0_6_get_ecc_info(struct smu_context *smu,
void *table)
 {
-   /* Support ecc info by default */
-   return 0;
+   struct amdgpu_device *adev = smu->adev;
+   unsigned int error_query_mode;
+
+   return (amdgpu_ras_get_error_query_mode(adev, &error_query_mode) &&
+   error_query_mode == AMDGPU_RAS_FIRMWARE_ERROR_QUERY) ? 0 : 
-EOPNOTSUPP;
 }
 
 static const struct pptable_funcs smu_v13_0_6_ppt_funcs = {
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Log deferred error separately

2024-01-10 Thread Candice Li
Separate deferred error from UE and CE and log it
individually.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c   |  11 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 116 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   |   1 +
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c|  60 -
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h|   3 +
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c  |   6 +-
 7 files changed, 142 insertions(+), 61 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
index 59fafb8392e0ba..666fd8fa39ad5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c
@@ -256,9 +256,14 @@ int amdgpu_mca_smu_log_ras_error(struct amdgpu_device 
*adev, enum amdgpu_ras_blo
if (type == AMDGPU_MCA_ERROR_TYPE_UE)
amdgpu_ras_error_statistic_ue_count(err_data,
&mcm_info, &err_addr, (uint64_t)count);
-   else
-   amdgpu_ras_error_statistic_ce_count(err_data,
-   &mcm_info, &err_addr, (uint64_t)count);
+   else {
+   if 
(!!(MCA_REG__STATUS__DEFERRED(entry->regs[MCA_REG_IDX_STATUS])))
+   amdgpu_ras_error_statistic_de_count(err_data,
+   &mcm_info, &err_addr, (uint64_t)count);
+   else
+   amdgpu_ras_error_statistic_ce_count(err_data,
+   &mcm_info, &err_addr, (uint64_t)count);
+   }
}
 
 out_mca_release:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ded19182dc792a..94ba10b4184349 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1036,7 +1036,8 @@ static void amdgpu_ras_error_print_error_data(struct 
amdgpu_device *adev,
  struct ras_manager *ras_mgr,
  struct ras_err_data *err_data,
  const char *blk_name,
- bool is_ue)
+ bool is_ue,
+ bool is_de)
 {
struct amdgpu_smuio_mcm_config_info *mcm_info;
struct ras_err_node *err_node;
@@ -1065,25 +1066,50 @@ static void amdgpu_ras_error_print_error_data(struct 
amdgpu_device *adev,
}
 
} else {
-   for_each_ras_error(err_node, err_data) {
-   err_info = &err_node->err_info;
-   mcm_info = &err_info->mcm_info;
-   if (err_info->ce_count) {
+   if (is_de) {
+   for_each_ras_error(err_node, err_data) {
+   err_info = &err_node->err_info;
+   mcm_info = &err_info->mcm_info;
+   if (err_info->de_count) {
+   dev_info(adev->dev, "socket: %d, die: 
%d, "
+   "%lld new deferred hardware 
errors detected in %s block\n",
+   mcm_info->socket_id,
+   mcm_info->die_id,
+   err_info->de_count,
+   blk_name);
+   }
+   }
+
+   for_each_ras_error(err_node, &ras_mgr->err_data) {
+   err_info = &err_node->err_info;
+   mcm_info = &err_info->mcm_info;
dev_info(adev->dev, "socket: %d, die: %d, "
-"%lld new correctable hardware errors 
detected in %s block\n",
-mcm_info->socket_id,
-mcm_info->die_id,
-err_info->ce_count,
-blk_name);
+   "%lld deferred hardware errors detected 
in total in %s block\n",
+   mcm_info->socket_id, mcm_info->die_id,
+   err_info->de_count, blk_name);
+   }
+   } else {
+   for_each_ras_error(err_node, err_data) {
+   err_info = &err_node->err_info;
+  

[PATCH 2/2] drm/amdgpu: Do bad page retirement for deferred errors

2024-01-10 Thread Candice Li
Bad page retirement also needs to be done for deferred errors.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 848df7acdd3210..df61df7e9b155f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -93,6 +93,7 @@ static int amdgpu_umc_do_page_retirement(struct amdgpu_device 
*adev,
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
+   unsigned long err_count;
 
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
@@ -147,16 +148,17 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
}
 
/* only uncorrectable error needs gpu reset */
-   if (err_data->ue_count) {
-   dev_info(adev->dev, "%ld uncorrectable hardware errors "
-   "detected in UMC block\n",
-   err_data->ue_count);
+   if (err_data->ue_count || err_data->de_count) {
+   dev_info(adev->dev, "%ld uncorrectable hardware errors and "
+   "%ld deferred hardware errors detected in UMC 
block\n",
+   err_data->ue_count, err_data->de_count);
 
+   err_count = err_data->ue_count + err_data->de_count;
if ((amdgpu_bad_page_threshold != 0) &&
err_data->err_addr_cnt) {
amdgpu_ras_add_bad_pages(adev, err_data->err_addr,
err_data->err_addr_cnt);
-   amdgpu_ras_save_bad_pages(adev, &(err_data->ue_count));
+   amdgpu_ras_save_bad_pages(adev, &err_count);
 
amdgpu_dpm_send_hbm_bad_pages_num(adev, 
con->eeprom_control.ras_num_recs);
 
-- 
2.25.1



[PATCH] drm/amd: consolidate TA shared memory structures

2021-08-17 Thread Candice Li
Change-Id: I81be5a824fced3d2244cf209444c2391f6bc6c50
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 218 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  68 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_rap.c   |   4 +-
 .../gpu/drm/amd/amdgpu/amdgpu_securedisplay.c |   4 +-
 .../amd/display/amdgpu_dm/amdgpu_dm_hdcp.c|  12 +-
 .../drm/amd/display/modules/hdcp/hdcp_psp.c   |  56 ++---
 6 files changed, 167 insertions(+), 195 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index cf40609f39d4f0..ebb827b6331b65 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -908,9 +908,9 @@ static int psp_xgmi_init_shared_buf(struct psp_context *psp)
 */
ret = amdgpu_bo_create_kernel(psp->adev, PSP_XGMI_SHARED_MEM_SIZE,
  PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
- &psp->xgmi_context.xgmi_shared_bo,
- &psp->xgmi_context.xgmi_shared_mc_addr,
- &psp->xgmi_context.xgmi_shared_buf);
+ 
&psp->xgmi_context.context.mem_context.shared_bo,
+ 
&psp->xgmi_context.context.mem_context.shared_mc_addr,
+ 
&psp->xgmi_context.context.mem_context.shared_buf);
 
return ret;
 }
@@ -957,15 +957,15 @@ static int psp_xgmi_load(struct psp_context *psp)
psp_prep_ta_load_cmd_buf(cmd,
 psp->fw_pri_mc_addr,
 psp->xgmi.size_bytes,
-psp->xgmi_context.xgmi_shared_mc_addr,
+
psp->xgmi_context.context.mem_context.shared_mc_addr,
 PSP_XGMI_SHARED_MEM_SIZE);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
 
if (!ret) {
-   psp->xgmi_context.initialized = 1;
-   psp->xgmi_context.session_id = cmd->resp.session_id;
+   psp->xgmi_context.context.initialized = true;
+   psp->xgmi_context.context.session_id = cmd->resp.session_id;
}
 
release_psp_cmd_buf(psp);
@@ -990,7 +990,7 @@ static int psp_xgmi_unload(struct psp_context *psp)
 
cmd = acquire_psp_cmd_buf(psp);
 
-   psp_prep_ta_unload_cmd_buf(cmd, psp->xgmi_context.session_id);
+   psp_prep_ta_unload_cmd_buf(cmd, psp->xgmi_context.context.session_id);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
@@ -1002,26 +1002,26 @@ static int psp_xgmi_unload(struct psp_context *psp)
 
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 {
-   return psp_ta_invoke(psp, ta_cmd_id, psp->xgmi_context.session_id);
+   return psp_ta_invoke(psp, ta_cmd_id, 
psp->xgmi_context.context.session_id);
 }
 
 int psp_xgmi_terminate(struct psp_context *psp)
 {
int ret;
 
-   if (!psp->xgmi_context.initialized)
+   if (!psp->xgmi_context.context.initialized)
return 0;
 
ret = psp_xgmi_unload(psp);
if (ret)
return ret;
 
-   psp->xgmi_context.initialized = 0;
+   psp->xgmi_context.context.initialized = false;
 
/* free xgmi shared memory */
-   amdgpu_bo_free_kernel(&psp->xgmi_context.xgmi_shared_bo,
-   &psp->xgmi_context.xgmi_shared_mc_addr,
-   &psp->xgmi_context.xgmi_shared_buf);
+   amdgpu_bo_free_kernel(&psp->xgmi_context.context.mem_context.shared_bo,
+   &psp->xgmi_context.context.mem_context.shared_mc_addr,
+   &psp->xgmi_context.context.mem_context.shared_buf);
 
return 0;
 }
@@ -1036,7 +1036,7 @@ int psp_xgmi_initialize(struct psp_context *psp)
!psp->xgmi.start_addr)
return -ENOENT;
 
-   if (!psp->xgmi_context.initialized) {
+   if (!psp->xgmi_context.context.initialized) {
ret = psp_xgmi_init_shared_buf(psp);
if (ret)
return ret;
@@ -1048,7 +1048,7 @@ int psp_xgmi_initialize(struct psp_context *psp)
return ret;
 
/* Initialize XGMI session */
-   xgmi_cmd = (struct ta_xgmi_shared_memory 
*)(psp->xgmi_context.xgmi_shared_buf);
+   xgmi_cmd = (struct ta_xgmi_shared_memory 
*)(psp->xgmi_context.context.mem_context.shared_buf);
memset(xgmi_cmd, 0, sizeof(struct ta_xgmi_shared_memory));
xgmi_cmd->cmd_id = TA_COMMAND_XGMI__INITIALIZE;
 
@@ -1062,7 +1062,7 @@ int psp_xgmi_get_hive_id(struct psp_context *psp, 
uint64_t *hive_id)
 

[PATCH] drm/amd/amdgpu: add name field back to ras_common_if

2021-08-23 Thread Candice Li
Add the name field back to ras_common_if to work around an error
injection failure with the amdgpuras tool.

Change-Id: I9d181a4153b055e22ac6adeb3b51a521c8c2793b
Signed-off-by: Candice Li 
Reviewed-by: John Clements 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index abc5710898e803..5b5163357fcb61 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -309,6 +309,7 @@ struct ras_common_if {
enum amdgpu_ras_block block;
enum amdgpu_ras_error_type type;
uint32_t sub_block_index;
+   char name[32];
 };
 
 struct amdgpu_ras {
-- 
2.17.1



[PATCH] drm/amd/amdgpu: consolidate PSP TA init shared buf functions

2021-08-23 Thread Candice Li
Change-Id: I779f4fb52ecc661c25c42ced487719f08f3d875d
Signed-off-by: Candice Li 
Reviewed-by: John Clements 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 142 +++-
 1 file changed, 43 insertions(+), 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a78a832d8fea23..23efdc67250272 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -899,23 +899,37 @@ static void psp_prep_ta_load_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_load_ta.cmd_buf_len = ta_shared_size;
 }
 
-static int psp_xgmi_init_shared_buf(struct psp_context *psp)
+static int psp_ta_init_shared_buf(struct psp_context *psp,
+ struct ta_mem_context *mem_ctx,
+ uint32_t shared_mem_size)
 {
int ret;
 
/*
-* Allocate 16k memory aligned to 4k from Frame Buffer (local
-* physical) for xgmi ta <-> Driver
-*/
-   ret = amdgpu_bo_create_kernel(psp->adev, PSP_XGMI_SHARED_MEM_SIZE,
- PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
- 
&psp->xgmi_context.context.mem_context.shared_bo,
- 
&psp->xgmi_context.context.mem_context.shared_mc_addr,
- 
&psp->xgmi_context.context.mem_context.shared_buf);
+   * Allocate 16k memory aligned to 4k from Frame Buffer (local
+   * physical) for ta to host memory
+   */
+   ret = amdgpu_bo_create_kernel(psp->adev, shared_mem_size, PAGE_SIZE,
+ AMDGPU_GEM_DOMAIN_VRAM,
+ &mem_ctx->shared_bo,
+ &mem_ctx->shared_mc_addr,
+ &mem_ctx->shared_buf);
 
return ret;
 }
 
+static void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx)
+{
+   amdgpu_bo_free_kernel(&mem_ctx->shared_bo, &mem_ctx->shared_mc_addr,
+ &mem_ctx->shared_buf);
+}
+
+static int psp_xgmi_init_shared_buf(struct psp_context *psp)
+{
+   return psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context,
+ PSP_XGMI_SHARED_MEM_SIZE);
+}
+
 static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
   uint32_t ta_cmd_id,
   uint32_t session_id)
@@ -1020,9 +1034,7 @@ int psp_xgmi_terminate(struct psp_context *psp)
psp->xgmi_context.context.initialized = false;
 
/* free xgmi shared memory */
-   amdgpu_bo_free_kernel(&psp->xgmi_context.context.mem_context.shared_bo,
-   &psp->xgmi_context.context.mem_context.shared_mc_addr,
-   &psp->xgmi_context.context.mem_context.shared_buf);
+   psp_ta_free_shared_buf(&psp->xgmi_context.context.mem_context);
 
return 0;
 }
@@ -1270,19 +1282,8 @@ int psp_xgmi_set_topology_info(struct psp_context *psp,
 // ras begin
 static int psp_ras_init_shared_buf(struct psp_context *psp)
 {
-   int ret;
-
-   /*
-* Allocate 16k memory aligned to 4k from Frame Buffer (local
-* physical) for ras ta <-> Driver
-*/
-   ret = amdgpu_bo_create_kernel(psp->adev, PSP_RAS_SHARED_MEM_SIZE,
-   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
-   &psp->ras_context.context.mem_context.shared_bo,
-   &psp->ras_context.context.mem_context.shared_mc_addr,
-   &psp->ras_context.context.mem_context.shared_buf);
-
-   return ret;
+   return psp_ta_init_shared_buf(psp, 
&psp->ras_context.context.mem_context,
+ PSP_RAS_SHARED_MEM_SIZE);
 }
 
 static int psp_ras_load(struct psp_context *psp)
@@ -1466,9 +1467,7 @@ static int psp_ras_terminate(struct psp_context *psp)
psp->ras_context.context.initialized = false;
 
/* free ras shared memory */
-   amdgpu_bo_free_kernel(&psp->ras_context.context.mem_context.shared_bo,
-   &psp->ras_context.context.mem_context.shared_mc_addr,
-   &psp->ras_context.context.mem_context.shared_buf);
+   psp_ta_free_shared_buf(&psp->ras_context.context.mem_context);
 
return 0;
 }
@@ -1576,19 +1575,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
 // HDCP start
 static int psp_hdcp_init_shared_buf(struct psp_context *psp)
 {
-   int ret;
-
-   /*
-* Allocate 16k memory aligned to 4k from Frame Buffer (local
-* physical) for hdcp ta <-> Driver
-*/
-   ret = amdgpu_bo_create_kernel(psp->

[PATCH] drm/amd/amdgpu: consolidate PSP TA unload function

2021-08-27 Thread Candice Li
Create common PSP TA unload function and replace all common TA unloading
sequences.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 160 ++--
 1 file changed, 40 insertions(+), 120 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 23efdc67250272..243adce6de3654 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -46,6 +46,9 @@ static int psp_sysfs_init(struct amdgpu_device *adev);
 static void psp_sysfs_fini(struct amdgpu_device *adev);
 
 static int psp_load_smu_fw(struct psp_context *psp);
+static int psp_ta_unload(struct psp_context *psp, uint32_t session_id);
+static int psp_rap_terminate(struct psp_context *psp);
+static int psp_securedisplay_terminate(struct psp_context *psp);
 
 /*
  * Due to DF Cstate management centralized to PMFW, the firmware
@@ -829,10 +832,28 @@ static void psp_prep_ta_unload_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_unload_ta.session_id = session_id;
 }
 
+static int psp_ta_unload(struct psp_context *psp, uint32_t session_id)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
+
+   psp_prep_ta_unload_cmd_buf(cmd, session_id);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr);
+
+   release_psp_cmd_buf(psp);
+
+   return ret;
+}
+
 static int psp_asd_unload(struct psp_context *psp)
+{
+   return psp_ta_unload(psp, psp->asd_context.session_id);
+}
+
+static int psp_asd_terminate(struct psp_context *psp)
 {
int ret;
-   struct psp_gfx_cmd_resp *cmd;
 
if (amdgpu_sriov_vf(psp->adev))
return 0;
@@ -840,17 +861,11 @@ static int psp_asd_unload(struct psp_context *psp)
if (!psp->asd_context.asd_initialized)
return 0;
 
-   cmd = acquire_psp_cmd_buf(psp);
-
-   psp_prep_ta_unload_cmd_buf(cmd, psp->asd_context.session_id);
+   ret = psp_asd_unload(psp);
 
-   ret = psp_cmd_submit_buf(psp, NULL, cmd,
-psp->fence_buf_mc_addr);
if (!ret)
psp->asd_context.asd_initialized = false;
 
-   release_psp_cmd_buf(psp);
-
return ret;
 }
 
@@ -990,29 +1005,7 @@ static int psp_xgmi_load(struct psp_context *psp)
 
 static int psp_xgmi_unload(struct psp_context *psp)
 {
-   int ret;
-   struct psp_gfx_cmd_resp *cmd;
-   struct amdgpu_device *adev = psp->adev;
-
-   /* XGMI TA unload currently is not supported on Arcturus/Aldebaran A+A 
*/
-   if (adev->asic_type == CHIP_ARCTURUS ||
-   (adev->asic_type == CHIP_ALDEBARAN && 
adev->gmc.xgmi.connected_to_cpu))
-   return 0;
-
-   /*
-* TODO: bypass the unloading in sriov for now
-*/
-
-   cmd = acquire_psp_cmd_buf(psp);
-
-   psp_prep_ta_unload_cmd_buf(cmd, psp->xgmi_context.context.session_id);
-
-   ret = psp_cmd_submit_buf(psp, NULL, cmd,
-psp->fence_buf_mc_addr);
-
-   release_psp_cmd_buf(psp);
-
-   return ret;
+   return psp_ta_unload(psp, psp->xgmi_context.context.session_id);
 }
 
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1023,6 +1016,12 @@ int psp_xgmi_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
 int psp_xgmi_terminate(struct psp_context *psp)
 {
int ret;
+   struct amdgpu_device *adev = psp->adev;
+
+   /* XGMI TA unload currently is not supported on Arcturus/Aldebaran A+A 
*/
+   if (adev->asic_type == CHIP_ARCTURUS ||
+   (adev->asic_type == CHIP_ALDEBARAN && 
adev->gmc.xgmi.connected_to_cpu))
+   return 0;
 
if (!psp->xgmi_context.context.initialized)
return 0;
@@ -1337,25 +1336,7 @@ static int psp_ras_load(struct psp_context *psp)
 
 static int psp_ras_unload(struct psp_context *psp)
 {
-   int ret;
-   struct psp_gfx_cmd_resp *cmd;
-
-   /*
-* TODO: bypass the unloading in sriov for now
-*/
-   if (amdgpu_sriov_vf(psp->adev))
-   return 0;
-
-   cmd = acquire_psp_cmd_buf(psp);
-
-   psp_prep_ta_unload_cmd_buf(cmd, psp->ras_context.context.session_id);
-
-   ret = psp_cmd_submit_buf(psp, NULL, cmd,
-   psp->fence_buf_mc_addr);
-
-   release_psp_cmd_buf(psp);
-
-   return ret;
+   return psp_ta_unload(psp, psp->ras_context.context.session_id);
 }
 
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1644,24 +1625,7 @@ static int psp_hdcp_initialize(struct psp_context *psp)
 
 static int psp_hdcp_unload(struct psp_context *psp)
 {
-   int ret;
-   struct psp_gfx_cmd_resp *cmd;
-
-   /*
-* TODO: bypass the unloading in sriov for now
-*/
-   if (amdgpu_sriov_vf(psp->adev))
-   return

[PATCH] drm/amd/amdgpu: add mpio to ras block

2021-08-29 Thread Candice Li
Add MPIO to RAS block

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 2 ++
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 1 +
 3 files changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 96a8fd0ca1df31..77140821dc1126 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -61,6 +61,7 @@ const char *ras_block_string[] = {
"mp0",
"mp1",
"fuse",
+   "mpio",
 };
 
 #define ras_err_str(i) (ras_error_string[ffs(i)])
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index eae604fd90b81a..1670467c205463 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -544,6 +544,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
return TA_RAS_BLOCK__MP1;
case AMDGPU_RAS_BLOCK__FUSE:
return TA_RAS_BLOCK__FUSE;
+   case AMDGPU_RAS_BLOCK__MPIO:
+   return TA_RAS_BLOCK__MPIO;
default:
WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
return TA_RAS_BLOCK__UMC;
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index 0f214a398dd8fd..532260fd64db14 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -73,6 +73,7 @@ enum ta_ras_block {
TA_RAS_BLOCK__MP0,
TA_RAS_BLOCK__MP1,
TA_RAS_BLOCK__FUSE,
+   TA_RAS_BLOCK__MPIO,
TA_NUM_BLOCK_MAX
 };
 
-- 
2.17.1



[PATCH] drm/amdgpu: Create common PSP TA load function

2021-09-06 Thread Candice Li
Create a common PSP TA load function and update PSP ta_mem_context
with size information.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 280 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  17 +-
 2 files changed, 93 insertions(+), 204 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 54c26432c65b3d..75eed18370eb12 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -901,22 +901,20 @@ int psp_reg_program(struct psp_context *psp, enum 
psp_reg_prog_id reg,
 static void psp_prep_ta_load_cmd_buf(struct psp_gfx_cmd_resp *cmd,
 uint64_t ta_bin_mc,
 uint32_t ta_bin_size,
-uint64_t ta_shared_mc,
-uint32_t ta_shared_size)
+struct ta_mem_context *mem_ctx)
 {
cmd->cmd_id = GFX_CMD_ID_LOAD_TA;
cmd->cmd.cmd_load_ta.app_phy_addr_lo= lower_32_bits(ta_bin_mc);
cmd->cmd.cmd_load_ta.app_phy_addr_hi= upper_32_bits(ta_bin_mc);
cmd->cmd.cmd_load_ta.app_len= ta_bin_size;
 
-   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = lower_32_bits(ta_shared_mc);
-   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = upper_32_bits(ta_shared_mc);
-   cmd->cmd.cmd_load_ta.cmd_buf_len = ta_shared_size;
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = 
lower_32_bits(mem_ctx->shared_mc_addr);
+   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = 
upper_32_bits(mem_ctx->shared_mc_addr);
+   cmd->cmd.cmd_load_ta.cmd_buf_len = mem_ctx->shared_mem_size;
 }
 
 static int psp_ta_init_shared_buf(struct psp_context *psp,
- struct ta_mem_context *mem_ctx,
- uint32_t shared_mem_size)
+ struct ta_mem_context *mem_ctx)
 {
int ret;
 
@@ -924,8 +922,8 @@ static int psp_ta_init_shared_buf(struct psp_context *psp,
* Allocate 16k memory aligned to 4k from Frame Buffer (local
* physical) for ta to host memory
*/
-   ret = amdgpu_bo_create_kernel(psp->adev, shared_mem_size, PAGE_SIZE,
- AMDGPU_GEM_DOMAIN_VRAM,
+   ret = amdgpu_bo_create_kernel(psp->adev, mem_ctx->shared_mem_size,
+ PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
  &mem_ctx->shared_bo,
  &mem_ctx->shared_mc_addr,
  &mem_ctx->shared_buf);
@@ -941,8 +939,7 @@ static void psp_ta_free_shared_buf(struct ta_mem_context 
*mem_ctx)
 
 static int psp_xgmi_init_shared_buf(struct psp_context *psp)
 {
-   return psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context,
- PSP_XGMI_SHARED_MEM_SIZE);
+   return psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context);
 }
 
 static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
@@ -971,31 +968,27 @@ static int psp_ta_invoke(struct psp_context *psp,
return ret;
 }
 
-static int psp_xgmi_load(struct psp_context *psp)
+static int psp_ta_load(struct psp_context *psp,
+  struct psp_bin_desc *bin_desc,
+  struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd;
 
-   /*
-* TODO: bypass the loading in sriov for now
-*/
-
cmd = acquire_psp_cmd_buf(psp);
 
-   psp_copy_fw(psp, psp->xgmi.start_addr, psp->xgmi.size_bytes);
+   psp_copy_fw(psp, bin_desc->start_addr, bin_desc->size_bytes);
 
psp_prep_ta_load_cmd_buf(cmd,
 psp->fw_pri_mc_addr,
-psp->xgmi.size_bytes,
-
psp->xgmi_context.context.mem_context.shared_mc_addr,
-PSP_XGMI_SHARED_MEM_SIZE);
+bin_desc->size_bytes,
+&context->mem_context);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
 
if (!ret) {
-   psp->xgmi_context.context.initialized = true;
-   psp->xgmi_context.context.session_id = cmd->resp.session_id;
+   context->session_id = cmd->resp.session_id;
}
 
release_psp_cmd_buf(psp);
@@ -1003,6 +996,11 @@ static int psp_xgmi_load(struct psp_context *psp)
return ret;
 }
 
+static int psp_xgmi_load(struct psp_context *psp)
+{
+   return psp_ta_load(psp, &psp->xgmi, &psp->xgmi_context.context);
+}
+
 static 

[PATCH] drm/amdgpu: Unify PSP TA context

2021-09-10 Thread Candice Li
Remove all TA binary structures and add the specific binary
structure in struct ta_context.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  23 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 122 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  23 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c |   6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |   9 +-
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c|  22 ++--
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c|  40 ---
 drivers/gpu/drm/amd/amdgpu/psp_v12_0.c|  14 +--
 8 files changed, 141 insertions(+), 118 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 7e45640fbee026..d2955ea4a62bf4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -341,27 +341,34 @@ static int amdgpu_firmware_info(struct 
drm_amdgpu_info_firmware *fw_info,
switch (query_fw->index) {
case TA_FW_TYPE_PSP_XGMI:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = adev->psp.xgmi.feature_version;
+   fw_info->feature = adev->psp.xgmi_context.context
+  .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_RAS:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = adev->psp.ras.feature_version;
+   fw_info->feature = adev->psp.ras_context.context
+  .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_HDCP:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = adev->psp.hdcp.feature_version;
+   fw_info->feature = adev->psp.hdcp_context.context
+  .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_DTM:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = adev->psp.dtm.feature_version;
+   fw_info->feature = adev->psp.dtm_context.context
+  .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_RAP:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = adev->psp.rap.feature_version;
+   fw_info->feature = adev->psp.rap_context.context
+  .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_SECUREDISPLAY:
fw_info->ver = adev->psp.ta_fw_version;
-   fw_info->feature = 
adev->psp.securedisplay.feature_version;
+   fw_info->feature =
+   adev->psp.securedisplay_context.context.bin_desc
+   .feature_version;
break;
default:
return -EINVAL;
@@ -378,8 +385,8 @@ static int amdgpu_firmware_info(struct 
drm_amdgpu_info_firmware *fw_info,
fw_info->feature = adev->psp.sos.feature_version;
break;
case AMDGPU_INFO_FW_ASD:
-   fw_info->ver = adev->psp.asd.fw_version;
-   fw_info->feature = adev->psp.asd.feature_version;
+   fw_info->ver = adev->psp.asd_context.bin_desc.fw_version;
+   fw_info->feature = 
adev->psp.asd_context.bin_desc.feature_version;
break;
case AMDGPU_INFO_FW_DMCU:
fw_info->ver = adev->dm.dmcu_fw_version;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 885876e2ce73b6..071dadf3a4509f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -803,15 +803,16 @@ static int psp_asd_load(struct psp_context *psp)
 * add workaround to bypass it for sriov now.
 * TODO: add version check to make it common
 */
-   if (amdgpu_sriov_vf(psp->adev) || !psp->asd.size_bytes)
+   if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
cmd = acquire_psp_cmd_buf(psp);
 
-   psp_copy_fw(psp, psp->asd.start_addr, psp->asd.size_bytes);
+   psp_copy_fw(psp, psp->asd_context.bin_desc.start_addr,
+   psp->asd_context.bin_desc.size_bytes);
 
psp_prep_asd_load_cmd_buf(cmd, psp->fw_pri_mc_addr,
-   

[PATCH] drm/amdgpu: Conform ASD header/loading to generic TA systems

2021-09-13 Thread Candice Li
Update asd_context structure and add asd_initialize function to
conform ASD header/loading to generic TA systems.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 60 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 10 ++---
 2 files changed, 26 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 071dadf3a4509f..bc861f2fe0ecf6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -47,6 +47,7 @@ static void psp_sysfs_fini(struct amdgpu_device *adev);
 
 static int psp_load_smu_fw(struct psp_context *psp);
 static int psp_ta_unload(struct psp_context *psp, uint32_t session_id);
+static int psp_ta_load(struct psp_context *psp, struct ta_context *context);
 static int psp_rap_terminate(struct psp_context *psp);
 static int psp_securedisplay_terminate(struct psp_context *psp);
 
@@ -781,23 +782,14 @@ static int psp_rl_load(struct amdgpu_device *adev)
return ret;
 }
 
-static void psp_prep_asd_load_cmd_buf(struct psp_gfx_cmd_resp *cmd,
-   uint64_t asd_mc, uint32_t size)
+static int psp_asd_load(struct psp_context *psp)
 {
-   cmd->cmd_id = GFX_CMD_ID_LOAD_ASD;
-   cmd->cmd.cmd_load_ta.app_phy_addr_lo = lower_32_bits(asd_mc);
-   cmd->cmd.cmd_load_ta.app_phy_addr_hi = upper_32_bits(asd_mc);
-   cmd->cmd.cmd_load_ta.app_len = size;
-
-   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_lo = 0;
-   cmd->cmd.cmd_load_ta.cmd_buf_phy_addr_hi = 0;
-   cmd->cmd.cmd_load_ta.cmd_buf_len = 0;
+   return psp_ta_load(psp, &psp->asd_context);
 }
 
-static int psp_asd_load(struct psp_context *psp)
+static int psp_asd_initialize(struct psp_context *psp)
 {
int ret;
-   struct psp_gfx_cmd_resp *cmd;
 
/* If PSP version doesn't match ASD version, asd loading will be failed.
 * add workaround to bypass it for sriov now.
@@ -806,22 +798,13 @@ static int psp_asd_load(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev) || !psp->asd_context.bin_desc.size_bytes)
return 0;
 
-   cmd = acquire_psp_cmd_buf(psp);
+   psp->asd_context.mem_context.shared_mc_addr  = 0;
+   psp->asd_context.mem_context.shared_mem_size = PSP_ASD_SHARED_MEM_SIZE;
+   psp->asd_context.ta_load_type= GFX_CMD_ID_LOAD_ASD;
 
-   psp_copy_fw(psp, psp->asd_context.bin_desc.start_addr,
-   psp->asd_context.bin_desc.size_bytes);
-
-   psp_prep_asd_load_cmd_buf(cmd, psp->fw_pri_mc_addr,
- psp->asd_context.bin_desc.size_bytes);
-
-   ret = psp_cmd_submit_buf(psp, NULL, cmd,
-psp->fence_buf_mc_addr);
-   if (!ret) {
-   psp->asd_context.asd_initialized = true;
-   psp->asd_context.session_id = cmd->resp.session_id;
-   }
-
-   release_psp_cmd_buf(psp);
+   ret = psp_asd_load(psp);
+   if (!ret)
+   psp->asd_context.initialized = true;
 
return ret;
 }
@@ -859,13 +842,13 @@ static int psp_asd_terminate(struct psp_context *psp)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
-   if (!psp->asd_context.asd_initialized)
+   if (!psp->asd_context.initialized)
return 0;
 
ret = psp_asd_unload(psp);
 
if (!ret)
-   psp->asd_context.asd_initialized = false;
+   psp->asd_context.initialized = false;
 
return ret;
 }
@@ -903,7 +886,7 @@ static void psp_prep_ta_load_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
 uint64_t ta_bin_mc,
 struct ta_context *context)
 {
-   cmd->cmd_id = GFX_CMD_ID_LOAD_TA;
+   cmd->cmd_id = context->ta_load_type;
cmd->cmd.cmd_load_ta.app_phy_addr_lo= lower_32_bits(ta_bin_mc);
cmd->cmd.cmd_load_ta.app_phy_addr_hi= upper_32_bits(ta_bin_mc);
cmd->cmd.cmd_load_ta.app_len= context->bin_desc.size_bytes;
@@ -970,8 +953,7 @@ static int psp_ta_invoke(struct psp_context *psp,
return ret;
 }
 
-static int psp_ta_load(struct psp_context *psp,
-  struct ta_context *context)
+static int psp_ta_load(struct psp_context *psp, struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd;
@@ -981,9 +963,7 @@ static int psp_ta_load(struct psp_context *psp,
psp_copy_fw(psp, context->bin_desc.start_addr,
context->bin_desc.size_bytes);
 
-   psp_prep_ta_load_cmd_buf(cmd,
-psp->fw_pri_mc_addr,
-context);
+   psp_prep_ta_load_cmd_buf(cmd, psp->fw_pri_mc_addr, context);
 
r

[PATCH] drm/amdgpu: Update PSP TA unload function

2021-09-13 Thread Candice Li
Update PSP TA unload function to use PSP TA context as input argument.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index bc861f2fe0ecf6..7d09b28889afef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -46,7 +46,7 @@ static int psp_sysfs_init(struct amdgpu_device *adev);
 static void psp_sysfs_fini(struct amdgpu_device *adev);
 
 static int psp_load_smu_fw(struct psp_context *psp);
-static int psp_ta_unload(struct psp_context *psp, uint32_t session_id);
+static int psp_ta_unload(struct psp_context *psp, struct ta_context *context);
 static int psp_ta_load(struct psp_context *psp, struct ta_context *context);
 static int psp_rap_terminate(struct psp_context *psp);
 static int psp_securedisplay_terminate(struct psp_context *psp);
@@ -816,12 +816,12 @@ static void psp_prep_ta_unload_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_unload_ta.session_id = session_id;
 }
 
-static int psp_ta_unload(struct psp_context *psp, uint32_t session_id)
+static int psp_ta_unload(struct psp_context *psp, struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
 
-   psp_prep_ta_unload_cmd_buf(cmd, session_id);
+   psp_prep_ta_unload_cmd_buf(cmd, context->session_id);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr);
 
@@ -832,7 +832,7 @@ static int psp_ta_unload(struct psp_context *psp, uint32_t 
session_id)
 
 static int psp_asd_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->asd_context.session_id);
+   return psp_ta_unload(psp, &psp->asd_context);
 }
 
 static int psp_asd_terminate(struct psp_context *psp)
@@ -984,7 +984,7 @@ static int psp_xgmi_load(struct psp_context *psp)
 
 static int psp_xgmi_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->xgmi_context.context.session_id);
+   return psp_ta_unload(psp, &psp->xgmi_context.context);
 }
 
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1275,7 +1275,7 @@ static int psp_ras_load(struct psp_context *psp)
 
 static int psp_ras_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->ras_context.context.session_id);
+   return psp_ta_unload(psp, &psp->ras_context.context);
 }
 
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1540,7 +1540,7 @@ static int psp_hdcp_initialize(struct psp_context *psp)
 
 static int psp_hdcp_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->hdcp_context.context.session_id);
+   return psp_ta_unload(psp, &psp->hdcp_context.context);
 }
 
 int psp_hdcp_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1632,7 +1632,7 @@ static int psp_dtm_initialize(struct psp_context *psp)
 
 static int psp_dtm_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->dtm_context.context.session_id);
+   return psp_ta_unload(psp, &psp->dtm_context.context);
 }
 
 int psp_dtm_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
@@ -1690,7 +1690,7 @@ static int psp_rap_load(struct psp_context *psp)
 
 static int psp_rap_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, psp->rap_context.context.session_id);
+   return psp_ta_unload(psp, &psp->rap_context.context);
 }
 
 static int psp_rap_initialize(struct psp_context *psp)
@@ -1805,7 +1805,7 @@ static int psp_securedisplay_load(struct psp_context *psp)
 
 static int psp_securedisplay_unload(struct psp_context *psp)
 {
-   return psp_ta_unload(psp, 
psp->securedisplay_context.context.session_id);
+   return psp_ta_unload(psp, &psp->securedisplay_context.context);
 }
 
 static int psp_securedisplay_initialize(struct psp_context *psp)
-- 
2.17.1



[PATCH] drm/amdgpu: Remove all code paths under the EAGAIN path in RAS late init

2021-09-23 Thread Candice Li
All code paths under the EAGAIN path in RAS late init are unused.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 33 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 ---
 2 files changed, 1 insertion(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index b5332db4d28730..6cf5f6e06b76ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2060,19 +2060,6 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device 
*adev)
 }
 /* recovery end */
 
-/* return 0 if ras will reset gpu and repost.*/
-int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
-   unsigned int block)
-{
-   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
-
-   if (!ras)
-   return -EINVAL;
-
-   ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
-   return 0;
-}
-
 static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
 {
return adev->asic_type == CHIP_VEGA10 ||
@@ -2310,12 +2297,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 
r = amdgpu_ras_feature_enable_on_boot(adev, ras_block, 1);
if (r) {
-   if (r == -EAGAIN) {
-   /* request gpu reset. will run again */
-   amdgpu_ras_request_reset_on_boot(adev,
-   ras_block->block);
-   return 0;
-   } else if (adev->in_suspend || amdgpu_in_reset(adev)) {
+   if (adev->in_suspend || amdgpu_in_reset(adev)) {
/* in resume phase, if fail to enable ras,
 * clean up all ras fs nodes, and disable ras */
goto cleanup;
@@ -2407,19 +2389,6 @@ void amdgpu_ras_resume(struct amdgpu_device *adev)
}
}
}
-
-   if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
-   con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
-   /* setup ras obj state as disabled.
-* for init_by_vbios case.
-* if we want to enable ras, just enable it in a normal way.
-* If we want do disable it, need setup ras obj as enabled,
-* then issue another TA disable cmd.
-* See feature_enable_on_boot
-*/
-   amdgpu_ras_disable_all_features(adev, 1);
-   amdgpu_ras_reset_gpu(adev);
-   }
 }
 
 void amdgpu_ras_suspend(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 1670467c205463..30a3eafbf6d095 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -32,7 +32,6 @@
 #include "amdgpu_ras_eeprom.h"
 
 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS  (0x1 << 0)
-#define AMDGPU_RAS_FLAG_INIT_NEED_RESET(0x1 << 1)
 
 enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
@@ -488,8 +487,6 @@ static inline int amdgpu_ras_is_supported(struct 
amdgpu_device *adev,
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
-int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
-   unsigned int block);
 
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
-- 
2.17.1



[PATCH] drm/amdgpu: Update PSP TA Invoke to use common TA context as input

2021-09-23 Thread Candice Li
Updated invoke to use the new common TA structure, similarly to load/unload.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 16 
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 55ffc3da89ced2..17d09771be3ee0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -938,12 +938,12 @@ static void psp_prep_ta_invoke_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
 
 static int psp_ta_invoke(struct psp_context *psp,
  uint32_t ta_cmd_id,
- uint32_t session_id)
+ struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
 
-   psp_prep_ta_invoke_cmd_buf(cmd, ta_cmd_id, session_id);
+   psp_prep_ta_invoke_cmd_buf(cmd, ta_cmd_id, context->session_id);
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
@@ -989,7 +989,7 @@ static int psp_xgmi_unload(struct psp_context *psp)
 
 int psp_xgmi_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 {
-   return psp_ta_invoke(psp, ta_cmd_id, 
psp->xgmi_context.context.session_id);
+   return psp_ta_invoke(psp, ta_cmd_id, &psp->xgmi_context.context);
 }
 
 int psp_xgmi_terminate(struct psp_context *psp)
@@ -1291,7 +1291,7 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
-   ret = psp_ta_invoke(psp, ta_cmd_id, 
psp->ras_context.context.session_id);
+   ret = psp_ta_invoke(psp, ta_cmd_id, &psp->ras_context.context);
 
if (amdgpu_ras_intr_triggered())
return ret;
@@ -1551,7 +1551,7 @@ int psp_hdcp_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
-   return psp_ta_invoke(psp, ta_cmd_id, 
psp->hdcp_context.context.session_id);
+   return psp_ta_invoke(psp, ta_cmd_id, &psp->hdcp_context.context);
 }
 
 static int psp_hdcp_terminate(struct psp_context *psp)
@@ -1643,7 +1643,7 @@ int psp_dtm_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id)
if (amdgpu_sriov_vf(psp->adev))
return 0;
 
-   return psp_ta_invoke(psp, ta_cmd_id, 
psp->dtm_context.context.session_id);
+   return psp_ta_invoke(psp, ta_cmd_id, &psp->dtm_context.context);
 }
 
 static int psp_dtm_terminate(struct psp_context *psp)
@@ -1777,7 +1777,7 @@ int psp_rap_invoke(struct psp_context *psp, uint32_t 
ta_cmd_id, enum ta_rap_stat
rap_cmd->cmd_id = ta_cmd_id;
rap_cmd->validation_method_id = METHOD_A;
 
-   ret = psp_ta_invoke(psp, rap_cmd->cmd_id, 
psp->rap_context.context.session_id);
+   ret = psp_ta_invoke(psp, rap_cmd->cmd_id, &psp->rap_context.context);
if (ret)
goto out_unlock;
 
@@ -1899,7 +1899,7 @@ int psp_securedisplay_invoke(struct psp_context *psp, 
uint32_t ta_cmd_id)
 
mutex_lock(&psp->securedisplay_context.mutex);
 
-   ret = psp_ta_invoke(psp, ta_cmd_id, 
psp->securedisplay_context.context.session_id);
+   ret = psp_ta_invoke(psp, ta_cmd_id, 
&psp->securedisplay_context.context);
 
mutex_unlock(&psp->securedisplay_context.mutex);
 
-- 
2.17.1



[PATCH] drm/amdgpu: Update TA version output in driver

2021-10-24 Thread Candice Li
The TA version should only be displayed in the firmware version column.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   | 12 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 14 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/psp_v10_0.c|  6 +++---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c|  8 
 drivers/gpu/drm/amd/amdgpu/psp_v12_0.c|  4 ++--
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d2955ea4a62bf4..dfe667ea8b058e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -340,32 +340,32 @@ static int amdgpu_firmware_info(struct 
drm_amdgpu_info_firmware *fw_info,
case AMDGPU_INFO_FW_TA:
switch (query_fw->index) {
case TA_FW_TYPE_PSP_XGMI:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.xgmi_context.context.bin_desc.fw_version;
fw_info->feature = adev->psp.xgmi_context.context
   .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_RAS:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.ras_context.context.bin_desc.fw_version;
fw_info->feature = adev->psp.ras_context.context
   .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_HDCP:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.hdcp_context.context.bin_desc.fw_version;
fw_info->feature = adev->psp.hdcp_context.context
   .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_DTM:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.dtm_context.context.bin_desc.fw_version;
fw_info->feature = adev->psp.dtm_context.context
   .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_RAP:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.rap_context.context.bin_desc.fw_version;
fw_info->feature = adev->psp.rap_context.context
   .bin_desc.feature_version;
break;
case TA_FW_TYPE_PSP_SECUREDISPLAY:
-   fw_info->ver = adev->psp.ta_fw_version;
+   fw_info->ver = 
adev->psp.securedisplay_context.context.bin_desc.fw_version;
fw_info->feature =
adev->psp.securedisplay_context.context.bin_desc
.feature_version;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index fd04e83031d642..c641f84649d6bd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1115,7 +1115,7 @@ int psp_xgmi_get_node_id(struct psp_context *psp, 
uint64_t *node_id)
 static bool psp_xgmi_peer_link_info_supported(struct psp_context *psp)
 {
return psp->adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 2) &&
-   psp->xgmi_context.context.bin_desc.feature_version >= 
0x200b;
+   psp->xgmi_context.context.bin_desc.fw_version >= 0x200b;
 }
 
 /*
@@ -3108,32 +3108,32 @@ static int parse_ta_bin_descriptor(struct psp_context 
*psp,
psp->asd_context.bin_desc.start_addr= ucode_start_addr;
break;
case TA_FW_TYPE_PSP_XGMI:
-   psp->xgmi_context.context.bin_desc.feature_version  = 
le32_to_cpu(desc->fw_version);
+   psp->xgmi_context.context.bin_desc.fw_version   = 
le32_to_cpu(desc->fw_version);
psp->xgmi_context.context.bin_desc.size_bytes   = 
le32_to_cpu(desc->size_bytes);
psp->xgmi_context.context.bin_desc.start_addr   = 
ucode_start_addr;
break;
case TA_FW_TYPE_PSP_RAS:
-   psp->ras_context.context.bin_desc.feature_version   = 
le32_to_cpu(desc->fw_version);
+   psp->ras_context.context.bin_desc.fw_version= 
le32_to_cpu(desc->fw_version);
psp->ras_context.context.bin_desc.

[PATCH] drm/amdgpu: Add recovery_lock to save bad pages function

2021-11-16 Thread Candice Li
Fix race condition failure during UMC UE injection.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 08133de21fdd63..711b5fb26d47d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1931,10 +1931,12 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device 
*adev)
struct ras_err_handler_data *data;
struct amdgpu_ras_eeprom_control *control;
int save_count;
+   int ret = 0;
 
if (!con || !con->eh_data)
return 0;
 
+   mutex_lock(&con->recovery_lock);
control = &con->eeprom_control;
data = con->eh_data;
save_count = data->count - control->ras_num_recs;
@@ -1944,13 +1946,16 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device 
*adev)
 &data->bps[control->ras_num_recs],
 save_count)) {
dev_err(adev->dev, "Failed to save EEPROM table data!");
-   return -EIO;
+   ret = -EIO;
+   goto out;
}
 
dev_info(adev->dev, "Saved %d pages to EEPROM table.\n", 
save_count);
}
 
-   return 0;
+out:
+   mutex_unlock(&con->recovery_lock);
+   return ret;
 }
 
 /*
-- 
2.17.1



[PATCH 1/3] drm/amdgpu: Add RREG64_PCIE_EXT/WREG64_PCIE_EXT functions

2023-09-04 Thread Candice Li
1. Add 64-bit register access support for registers whose address
is wider than 32 bits.
2. Update RREG32_PCIE_EXT/WREG32_PCIE_EXT.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  11 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 119 -
 drivers/gpu/drm/amd/amdgpu/soc15.c |   2 +
 3 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 45e5db95496906..6ff4289b255bbf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -653,6 +653,9 @@ typedef void (*amdgpu_wreg_ext_t)(struct amdgpu_device*, 
uint64_t, uint32_t);
 typedef uint64_t (*amdgpu_rreg64_t)(struct amdgpu_device*, uint32_t);
 typedef void (*amdgpu_wreg64_t)(struct amdgpu_device*, uint32_t, uint64_t);
 
+typedef uint64_t (*amdgpu_rreg64_ext_t)(struct amdgpu_device*, uint64_t);
+typedef void (*amdgpu_wreg64_ext_t)(struct amdgpu_device*, uint64_t, uint64_t);
+
 typedef uint32_t (*amdgpu_block_rreg_t)(struct amdgpu_device*, uint32_t, 
uint32_t);
 typedef void (*amdgpu_block_wreg_t)(struct amdgpu_device*, uint32_t, uint32_t, 
uint32_t);
 
@@ -867,6 +870,8 @@ struct amdgpu_device {
amdgpu_wreg_ext_t   pcie_wreg_ext;
amdgpu_rreg64_t pcie_rreg64;
amdgpu_wreg64_t pcie_wreg64;
+   amdgpu_rreg64_ext_t pcie_rreg64_ext;
+   amdgpu_wreg64_ext_t pcie_wreg64_ext;
/* protects concurrent UVD register access */
spinlock_t uvd_ctx_idx_lock;
amdgpu_rreg_t   uvd_ctx_rreg;
@@ -1178,10 +1183,14 @@ u32 amdgpu_device_indirect_rreg(struct amdgpu_device 
*adev,
u32 reg_addr);
 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
  u32 reg_addr);
+u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
+ u64 reg_addr);
 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 u32 reg_addr, u32 reg_data);
 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
   u32 reg_addr, u64 reg_data);
+void amdgpu_device_indirect_wreg64_ext(struct amdgpu_device *adev,
+  u64 reg_addr, u64 reg_data);
 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev);
 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type);
 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev);
@@ -1224,6 +1233,8 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_PCIE_EXT(reg, v) adev->pcie_wreg_ext(adev, (reg), (v))
 #define RREG64_PCIE(reg) adev->pcie_rreg64(adev, (reg))
 #define WREG64_PCIE(reg, v) adev->pcie_wreg64(adev, (reg), (v))
+#define RREG64_PCIE_EXT(reg) adev->pcie_rreg64_ext(adev, (reg))
+#define WREG64_PCIE_EXT(reg, v) adev->pcie_wreg64_ext(adev, (reg), (v))
 #define RREG32_SMC(reg) adev->smc_rreg(adev, (reg))
 #define WREG32_SMC(reg, v) adev->smc_wreg(adev, (reg), (v))
 #define RREG32_UVD_CTX(reg) adev->uvd_ctx_rreg(adev, (reg))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index de9223b5e6fd76..57b24053e1e320 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -574,7 +574,7 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
 
pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if (adev->nbio.funcs->get_pcie_index_hi_offset)
+   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
else
pcie_index_hi = 0;
@@ -641,6 +641,56 @@ u64 amdgpu_device_indirect_rreg64(struct amdgpu_device 
*adev,
return r;
 }
 
+u64 amdgpu_device_indirect_rreg64_ext(struct amdgpu_device *adev,
+ u64 reg_addr)
+{
+   unsigned long flags, pcie_index, pcie_data;
+   unsigned long pcie_index_hi = 0;
+   void __iomem *pcie_index_offset;
+   void __iomem *pcie_index_hi_offset;
+   void __iomem *pcie_data_offset;
+   u64 r;
+
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+
+   spin_lock_irqsave(&adev->pcie_idx_lock, flags);
+   pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
+   pcie_data_offset = (void __iomem *)adev

[PATCH 2/3] drm/amd: Add umc v12_0_0 ip headers

2023-09-04 Thread Candice Li
Add umc v12_0_0 ip headers.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 .../include/asic_reg/umc/umc_12_0_0_offset.h  | 33 +++
 .../include/asic_reg/umc/umc_12_0_0_sh_mask.h | 95 +++
 2 files changed, 128 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_offset.h
 create mode 100644 
drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_sh_mask.h

diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_offset.h
new file mode 100644
index 00000000000000..2913127c03d52b
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_offset.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2023  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _umc_12_0_0_OFFSET_HEADER
+#define _umc_12_0_0_OFFSET_HEADER
+
+#define regUMCCH0_OdEccCntSel  
 0x032c
+#define regUMCCH0_OdEccCntSel_BASE_IDX 
 0
+#define regUMCCH0_OdEccErrCnt  
 0x032d
+#define regUMCCH0_OdEccErrCnt_BASE_IDX 
 0
+#define regMCA_UMC_UMC0_MCUMC_STATUST0 
 0x03c2
+#define regMCA_UMC_UMC0_MCUMC_STATUST0_BASE_IDX
 0
+#define regMCA_UMC_UMC0_MCUMC_ADDRT0   
 0x03c4
+#define regMCA_UMC_UMC0_MCUMC_ADDRT0_BASE_IDX  
 0
+
+#endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_sh_mask.h
new file mode 100644
index 00000000000000..14bbee775032ab
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_12_0_0_sh_mask.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (C) 2023  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _umc_12_0_0_SH_MASK_HEADER
+#define _umc_12_0_0_SH_MASK_HEADER
+
+//UMCCH0_OdEccCntSel
+#define UMCCH0_OdEccCntSel__OdEccCntSel__SHIFT 
0x0
+#define UMCCH0_OdEccCntSel__OdEccErrInt__SHIFT 
0x4
+#define UMCCH0_OdEccCntSel__OdEccCntSel_MASK   
0x0007L
+#define UMCCH0_OdEccCntSel__OdEccErrInt_MASK   
0x0030L
+//UMCCH0_OdEccErrCnt
+#define UMCCH0_OdEccErrCnt__Cnt__SHIFT 
0x0
+#define UMCCH0_OdEccErrCnt__CntOvr__SHIFT  
0x10
+#define UMCCH0_OdEccErrCnt__OvrClr__SHIFT  
0x11
+#define UMCCH0_OdEccErrCnt__Cnt_MASK   
0x0000FFFFL
+#define UMCCH0_OdEccErrCnt__CntOvr_MASK 

[PATCH 3/3] drm/amdgpu: Add umc v12_0 ras functions

2023-09-04 Thread Candice Li
Add umc v12_0 ras error querying.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  16 +-
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 256 +
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h |  56 ++
 4 files changed, 327 insertions(+), 3 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v12_0.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index ce0188b329cdeb..adf5470aa81020 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -121,7 +121,7 @@ amdgpu-y += \
 
 # add UMC block
 amdgpu-y += \
-   umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o
+   umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o umc_v12_0.o
 
 # add IH block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 8447fcada8bb92..41e1759b5f1eaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -56,6 +56,7 @@
 #include "umc_v6_1.h"
 #include "umc_v6_0.h"
 #include "umc_v6_7.h"
+#include "umc_v12_0.h"
 #include "hdp_v4_0.h"
 #include "mca_v3_0.h"
 
@@ -737,7 +738,8 @@ static void gmc_v9_0_set_irq_funcs(struct amdgpu_device 
*adev)
adev->gmc.vm_fault.funcs = &gmc_v9_0_irq_funcs;
 
if (!amdgpu_sriov_vf(adev) &&
-   !adev->gmc.xgmi.connected_to_cpu) {
+   !adev->gmc.xgmi.connected_to_cpu &&
+   !adev->gmc.is_app_apu) {
adev->gmc.ecc_irq.num_types = 1;
adev->gmc.ecc_irq.funcs = &gmc_v9_0_ecc_funcs;
}
@@ -1487,6 +1489,15 @@ static void gmc_v9_0_set_umc_funcs(struct amdgpu_device 
*adev)
else
adev->umc.channel_idx_tbl = 
&umc_v6_7_channel_idx_tbl_second[0][0];
break;
+   case IP_VERSION(12, 0, 0):
+   adev->umc.max_ras_err_cnt_per_query = 
UMC_V12_0_TOTAL_CHANNEL_NUM(adev);
+   adev->umc.channel_inst_num = UMC_V12_0_CHANNEL_INSTANCE_NUM;
+   adev->umc.umc_inst_num = UMC_V12_0_UMC_INSTANCE_NUM;
+   adev->umc.node_inst_num /= UMC_V12_0_UMC_INSTANCE_NUM;
+   adev->umc.channel_offs = UMC_V12_0_PER_CHANNEL_OFFSET;
+   adev->umc.active_mask = adev->aid_mask;
+   if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu)
+   adev->umc.ras = &umc_v12_0_ras;
+   break;
default:
break;
}
@@ -2131,7 +2142,8 @@ static int gmc_v9_0_sw_init(void *handle)
return r;
 
if (!amdgpu_sriov_vf(adev) &&
-   !adev->gmc.xgmi.connected_to_cpu) {
+   !adev->gmc.xgmi.connected_to_cpu &&
+   !adev->gmc.is_app_apu) {
/* interrupt sent to DF. */
r = amdgpu_irq_add_id(adev, SOC15_IH_CLIENTID_DF, 0,
  &adev->gmc.ecc_irq);
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
new file mode 100644
index 00000000000000..b3d6db14b351f1
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "umc_v12_0.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
+#include "amdgpu.h"
+#include "umc/umc_12_0_0_offset.h"
+#include "umc/umc_12_0_0_sh_mask.h"
+
+static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
+  

[PATCH] drm/amdgpu: Log UE corrected by replay as correctable error

2023-10-18 Thread Candice Li
Support replay mode where UE could be converted to CE.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index b664ee3ee92d8d..025e6aeb058d43 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -105,7 +105,9 @@ static void umc_v12_0_query_correctable_error_count(struct 
amdgpu_device *adev,
RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1)
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1 ||
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 
1 &&
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 
0)))
*error_count += 1;
 }
 
@@ -125,7 +127,6 @@ static void 
umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev
 
if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 
1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1 ||
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 
1 ||
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 
||
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 
||
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 
1))
@@ -293,7 +294,7 @@ static int umc_v12_0_query_error_address(struct 
amdgpu_device *adev,
/* calculate error address if ue error is detected */
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 
1 &&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 
1) {
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1) 
{
 
mc_umc_addrt0 =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_ADDRT0);
-- 
2.25.1



[PATCH] drm/amdgpu: Identify data parity error corrected in replay mode

2023-10-25 Thread Candice Li
Use ErrorCodeExt field to identify data parity error in replay mode.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 32 ++
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 025e6aeb058d43..743d2f68b09020 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -88,6 +88,27 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)
umc_v12_0_reset_error_count_per_channel, NULL);
 }
 
+static bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
+{
+   return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) 
== 1 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) 
== 1 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) 
== 1));
+}
+
+static bool umc_v12_0_is_correctable_error(uint64_t mc_umc_status)
+{
+   return (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1 &&
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
CECC) == 1 ||
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
UECC) == 1 &&
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) 
== 0) ||
+   /* Identify data parity error in replay mode */
+   ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
ErrorCodeExt) == 0x5 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
ErrorCodeExt) == 0xb) &&
+   !(umc_v12_0_is_uncorrectable_error(mc_umc_status)))));
+}
+
 static void umc_v12_0_query_correctable_error_count(struct amdgpu_device *adev,
   uint64_t umc_reg_offset,
   unsigned long *error_count)
@@ -104,10 +125,7 @@ static void umc_v12_0_query_correctable_error_count(struct 
amdgpu_device *adev,
mc_umc_status =
RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
-   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1 ||
-   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 
1 &&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 
0)))
+   if (umc_v12_0_is_correctable_error(mc_umc_status))
*error_count += 1;
 }
 
@@ -125,11 +143,7 @@ static void 
umc_v12_0_query_uncorrectable_error_count(struct amdgpu_device *adev
mc_umc_status =
RREG64_PCIE_EXT((mc_umc_status_addr + umc_reg_offset) * 4);
 
-   if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 
1) &&
-   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1 ||
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 
||
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 
||
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 
1))
+   if (umc_v12_0_is_uncorrectable_error(mc_umc_status))
*error_count += 1;
 }
 
-- 
2.25.1



[PATCH] drm/amdgpu: Retrieve CE count from ce_count_lo_chip in EccInfo table

2023-10-25 Thread Candice Li
Retrieve correctable error count from ce_count_lo_chip instead of
mca_umc_status.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 12 +---
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index 46bfdee79bfd2a..c4c77257710c97 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -336,7 +336,7 @@ static void 
umc_v8_10_ecc_info_query_correctable_error_count(struct amdgpu_devic
  uint32_t node_inst, uint32_t umc_inst, 
uint32_t ch_inst,
  unsigned long *error_count)
 {
-   uint64_t mc_umc_status;
+   uint16_t ecc_ce_cnt;
uint32_t eccinfo_table_idx;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
@@ -345,12 +345,10 @@ static void 
umc_v8_10_ecc_info_query_correctable_error_count(struct amdgpu_devic
  umc_inst * adev->umc.channel_inst_num +
  ch_inst;
 
-   /* check the MCUMC_STATUS */
-   mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
-   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1) {
-   *error_count += 1;
-   }
+   /* Retrieve CE count */
+   ecc_ce_cnt = ras->umc_ecc.ecc[eccinfo_table_idx].ce_count_lo_chip;
+   if (ecc_ce_cnt)
+   *error_count += ecc_ce_cnt;
 }
 
 static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct 
amdgpu_device *adev,
-- 
2.25.1



[PATCH] drm/amdgpu: Drop deferred error in uncorrectable error check

2023-10-27 Thread Candice Li
Drop checking deferred error which can be handled by poison
consumption.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 743d2f68b09020..770b4b4e313838 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,8 +91,7 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)
 static bool umc_v12_0_is_uncorrectable_error(uint64_t mc_umc_status)
 {
return ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
-   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1 ||
-   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) 
== 1 ||
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) 
== 1 ||
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) 
== 1 ||
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) 
== 1));
 }
-- 
2.25.1



[PATCH] drm/amdgpu: Enable full reset when RAS is supported on gc v11_0_0

2022-09-07 Thread Candice Li
Enable full reset for RAS supported configuration on gc v11_0_0.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/soc21.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index a26c5723c46e27..81f32d77c98cd5 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -421,6 +421,10 @@ static bool soc21_need_full_reset(struct amdgpu_device 
*adev)
 {
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+   return true;
+   else
+   return false;
case IP_VERSION(11, 0, 2):
return false;
default:
-- 
2.17.1



[PATCH] drm/amdgpu: Rely on MCUMC_STATUS for umc v8_10 correctable error counter only

2022-09-07 Thread Candice Li
Only check MCUMC_STATUS for CE counter for umc v8_10.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 12 +++-
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index 36a2053f2e8b94..a8cbda81828daf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -101,22 +101,16 @@ static void 
umc_v8_10_query_correctable_error_count(struct amdgpu_device *adev,
   uint32_t umc_reg_offset,
   unsigned long *error_count)
 {
-   uint32_t ecc_err_cnt, ecc_err_cnt_addr;
uint64_t mc_umc_status;
uint32_t mc_umc_status_addr;
 
/* UMC 8_10 registers */
-   ecc_err_cnt_addr =
-   SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccErrCnt);
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
 
-   ecc_err_cnt = RREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4);
-   *error_count +=
-   (REG_GET_FIELD(ecc_err_cnt, UMCCH0_0_GeccErrCnt, GeccErrCnt) -
-UMC_V8_10_CE_CNT_INIT);
-
-   /* Check for SRAM correctable error, MCUMC_STATUS is a 64 bit register 
*/
+   /* Rely on MCUMC_STATUS for correctable error counter
+* MCUMC_STATUS is a 64 bit register
+*/
mc_umc_status = RREG64_PCIE((mc_umc_status_addr + umc_reg_offset) * 4);
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1)
-- 
2.17.1



[PATCH v2] drm/amdgpu: Enable full reset when RAS is supported on gc v11_0_0

2022-09-08 Thread Candice Li
Enable full reset for RAS supported configuration on gc v11_0_0.

v2: simplify the code.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/soc21.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc21.c 
b/drivers/gpu/drm/amd/amdgpu/soc21.c
index a26c5723c46e27..5f0d6983714add 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc21.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc21.c
@@ -421,6 +421,7 @@ static bool soc21_need_full_reset(struct amdgpu_device 
*adev)
 {
switch (adev->ip_versions[GC_HWIP][0]) {
case IP_VERSION(11, 0, 0):
+   return amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC);
case IP_VERSION(11, 0, 2):
return false;
default:
-- 
2.17.1



[PATCH] drm/amdgpu: Add EEPROM I2C address for smu v13_0_0

2022-09-09 Thread Candice Li
Set correct EEPROM I2C address for smu v13_0_0.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c4283987bb1e89..84c241b9a2a133 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -38,6 +38,7 @@
 #define EEPROM_I2C_MADDR_ARCTURUS_D342  0x0
 #define EEPROM_I2C_MADDR_SIENNA_CICHLID 0x0
 #define EEPROM_I2C_MADDR_ALDEBARAN  0x0
+#define EEPROM_I2C_MADDR_SMU_13_0_0 (0x54UL << 16)
 
 /*
  * The 2 macros bellow represent the actual size in bytes that
@@ -156,6 +157,15 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
return false;
}
 
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   control->i2c_address = EEPROM_I2C_MADDR_SMU_13_0_0;
+   break;
+
+   default:
+   break;
+   }
+
return true;
 }
 
-- 
2.17.1



[PATCH] drm/amdgpu: Skip reset error status for psp v13_0_0

2022-09-09 Thread Candice Li
No need to reset error status since only UMC RAS is supported on psp v13_0_0.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index ab9ba5a9c33dbe..e55f106621effd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1811,7 +1811,8 @@ static void amdgpu_ras_log_on_err_counter(struct 
amdgpu_device *adev)
amdgpu_ras_query_error_status(adev, &info);
 
if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
-   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
+   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
if (amdgpu_ras_reset_error_status(adev, 
info.head.block))
dev_warn(adev->dev, "Failed to reset error 
counter and error status");
}
-- 
2.17.1



[PATCH] drm/amdgpu: added support for ras driver loading

2022-09-09 Thread Candice Li
From: John Clements 

copy ras driver to psp if present

Signed-off-by: John Clements 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 15 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h   |  6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h |  1 +
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c|  7 +++
 4 files changed, 29 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index cfcaf890a6a122..218666f6203a9f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2055,6 +2055,15 @@ static int psp_hw_start(struct psp_context *psp)
}
}
 
+   if ((is_psp_fw_valid(psp->ras_drv)) &&
+   (psp->funcs->bootloader_load_ras_drv != NULL)) {
+   ret = psp_bootloader_load_ras_drv(psp);
+   if (ret) {
+   DRM_ERROR("PSP load ras_drv failed!\n");
+   return ret;
+   }
+   }
+
if ((is_psp_fw_valid(psp->sos)) &&
(psp->funcs->bootloader_load_sos != NULL)) {
ret = psp_bootloader_load_sos(psp);
@@ -3040,6 +3049,12 @@ static int parse_sos_bin_descriptor(struct psp_context 
*psp,
psp->dbg_drv.size_bytes = le32_to_cpu(desc->size_bytes);
psp->dbg_drv.start_addr = ucode_start_addr;
break;
+   case PSP_FW_TYPE_PSP_RAS_DRV:
+   psp->ras_drv.fw_version = le32_to_cpu(desc->fw_version);
+   psp->ras_drv.feature_version= le32_to_cpu(desc->fw_version);
+   psp->ras_drv.size_bytes = le32_to_cpu(desc->size_bytes);
+   psp->ras_drv.start_addr = ucode_start_addr;
+   break;
default:
dev_warn(psp->adev->dev, "Unsupported PSP FW type: %d\n", 
desc->fw_type);
break;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c32b74bd970fc5..d7a5ff4660cf9e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -71,6 +71,7 @@ enum psp_bootloader_cmd {
PSP_BL__LOAD_SOCDRV = 0xB,
PSP_BL__LOAD_DBGDRV = 0xC,
PSP_BL__LOAD_INTFDRV= 0xD,
+   PSP_BL__LOAD_RASDRV = 0xE,
PSP_BL__DRAM_LONG_TRAIN = 0x10,
PSP_BL__DRAM_SHORT_TRAIN= 0x20,
PSP_BL__LOAD_TOS_SPL_TABLE  = 0x1000,
@@ -114,6 +115,7 @@ struct psp_funcs
int (*bootloader_load_soc_drv)(struct psp_context *psp);
int (*bootloader_load_intf_drv)(struct psp_context *psp);
int (*bootloader_load_dbg_drv)(struct psp_context *psp);
+   int (*bootloader_load_ras_drv)(struct psp_context *psp);
int (*bootloader_load_sos)(struct psp_context *psp);
int (*ring_init)(struct psp_context *psp, enum psp_ring_type ring_type);
int (*ring_create)(struct psp_context *psp,
@@ -323,6 +325,7 @@ struct psp_context
struct psp_bin_desc soc_drv;
struct psp_bin_desc intf_drv;
struct psp_bin_desc dbg_drv;
+   struct psp_bin_desc ras_drv;
 
/* tmr buffer */
struct amdgpu_bo*tmr_bo;
@@ -403,6 +406,9 @@ struct amdgpu_psp_funcs {
((psp)->funcs->bootloader_load_intf_drv ? 
(psp)->funcs->bootloader_load_intf_drv((psp)) : 0)
 #define psp_bootloader_load_dbg_drv(psp) \
((psp)->funcs->bootloader_load_dbg_drv ? 
(psp)->funcs->bootloader_load_dbg_drv((psp)) : 0)
+#define psp_bootloader_load_ras_drv(psp) \
+   ((psp)->funcs->bootloader_load_ras_drv ? \
+   (psp)->funcs->bootloader_load_ras_drv((psp)) : 0)
 #define psp_bootloader_load_sos(psp) \
((psp)->funcs->bootloader_load_sos ? 
(psp)->funcs->bootloader_load_sos((psp)) : 0)
 #define psp_smu_reload_quirk(psp) \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
index 96b6cf4c4d54f8..3975bcaa2c8997 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ucode.h
@@ -124,6 +124,7 @@ enum psp_fw_type {
PSP_FW_TYPE_PSP_SOC_DRV,
PSP_FW_TYPE_PSP_INTF_DRV,
PSP_FW_TYPE_PSP_DBG_DRV,
+   PSP_FW_TYPE_PSP_RAS_DRV,
 };
 
 /* version_major=2, version_minor=0 */
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index eac33d5c93e74f..262dcd7513a6fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -223,6 +223,12 @@ static int psp_v13_0_bootloader_load_dbg_drv(struct 
psp_context *psp)
return psp_v13_0_bootloader_load_component(psp, &psp->dbg_drv, 
PSP_BL__LOAD_DBGDRV);
 }

[PATCH 1/2] drm/amdgpu: Update umc v8_10_0 headers

2022-10-09 Thread Candice Li
Add GeccCtrl offset and mask to umc v8_10_0 headers.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h  | 2 ++
 drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h
index b798cf5a2c39c8..38adde3cae5ac5 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_offset.h
@@ -29,5 +29,7 @@
 #define regMCA_UMC_UMC0_MCUMC_STATUST0_BASE_IDX  2
 #define regMCA_UMC_UMC0_MCUMC_ADDRT0 0x03c4
 #define regMCA_UMC_UMC0_MCUMC_ADDRT0_BASE_IDX2
+#define regUMCCH0_0_GeccCtrl 0x0053
+#define regUMCCH0_0_GeccCtrl_BASE_IDX2
 
 #endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h
index bd99b431247f3e..4dbec524f9434c 100644
--- a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_10_0_sh_mask.h
@@ -90,5 +90,8 @@
 #define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr__SHIFT0x0
 #define MCA_UMC_UMC0_MCUMC_ADDRT0__Reserved__SHIFT 0x38
 #define MCA_UMC_UMC0_MCUMC_ADDRT0__ErrorAddr_MASK  0x00FFL
+//UMCCH0_0_GeccCtrl
+#define UMCCH0_0_GeccCtrl__UCFatalEn__SHIFT0xd
+#define UMCCH0_0_GeccCtrl__UCFatalEn_MASK  0x2000L
 
 #endif
-- 
2.17.1



[PATCH 2/2] drm/amdgpu: Add poison mode query for umc v8_10_0

2022-10-09 Thread Candice Li
Add poison mode query support on umc v8_10_0.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index 36a2053f2e8b94..0ba10d80c02536 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -344,6 +344,31 @@ static void umc_v8_10_err_cnt_init(struct amdgpu_device 
*adev)
}
 }
 
+static uint32_t umc_v8_10_query_ras_poison_mode_per_channel(
+   struct amdgpu_device *adev,
+   uint32_t umc_reg_offset)
+{
+   uint32_t ecc_ctrl_addr, ecc_ctrl;
+
+   ecc_ctrl_addr =
+   SOC15_REG_OFFSET(UMC, 0, regUMCCH0_0_GeccCtrl);
+   ecc_ctrl = RREG32_PCIE((ecc_ctrl_addr +
+   umc_reg_offset) * 4);
+
+   return REG_GET_FIELD(ecc_ctrl, UMCCH0_0_GeccCtrl, UCFatalEn);
+}
+
+static bool umc_v8_10_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+   uint32_t umc_reg_offset  = 0;
+
+   /* Enabling fatal error in umc node0 instance0 channel0 will be
+* considered as fatal error mode
+*/
+   umc_reg_offset = get_umc_v8_10_reg_offset(adev, 0, 0, 0);
+   return !umc_v8_10_query_ras_poison_mode_per_channel(adev, 
umc_reg_offset);
+}
+
 const struct amdgpu_ras_block_hw_ops umc_v8_10_ras_hw_ops = {
.query_ras_error_count = umc_v8_10_query_ras_error_count,
.query_ras_error_address = umc_v8_10_query_ras_error_address,
@@ -354,4 +379,5 @@ struct amdgpu_umc_ras umc_v8_10_ras = {
.hw_ops = &umc_v8_10_ras_hw_ops,
},
.err_cnt_init = umc_v8_10_err_cnt_init,
+   .query_ras_poison_mode = umc_v8_10_query_ras_poison_mode,
 };
-- 
2.17.1



[PATCH] drm/amdgpu: Update ras eeprom support for smu v13_0_0 and v13_0_10

2022-10-17 Thread Candice Li
Enable RAS EEPROM support for smu v13_0_0 and v13_0_10.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 84c241b9a2a133..7dc39154822c50 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -90,6 +90,16 @@
 
 static bool __is_ras_eeprom_supported(struct amdgpu_device *adev)
 {
+   if (adev->asic_type == CHIP_IP_DISCOVERY) {
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 10):
+   return true;
+   default:
+   return false;
+   }
+   }
+
return  adev->asic_type == CHIP_VEGA20 ||
adev->asic_type == CHIP_ARCTURUS ||
adev->asic_type == CHIP_SIENNA_CICHLID ||
-- 
2.17.1



[PATCH] drm/amdgpu: Add EEPROM I2C address support for ip discovery

2022-10-17 Thread Candice Li
1. Update EEPROM_I2C_MADDR_SMU_13_0_0 to EEPROM_I2C_MADDR_54H
2. Add EEPROM I2C address support for smu v13_0_0 and v13_0_10.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 20 +--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 84c241b9a2a133..adf36d570fe65e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -38,7 +38,7 @@
 #define EEPROM_I2C_MADDR_ARCTURUS_D342  0x0
 #define EEPROM_I2C_MADDR_SIENNA_CICHLID 0x0
 #define EEPROM_I2C_MADDR_ALDEBARAN  0x0
-#define EEPROM_I2C_MADDR_SMU_13_0_0 (0x54UL << 16)
+#define EEPROM_I2C_MADDR_54H(0x54UL << 16)
 
 /*
  * The 2 macros bellow represent the actual size in bytes that
@@ -114,6 +114,19 @@ static bool __get_eeprom_i2c_addr_arct(struct 
amdgpu_device *adev,
return true;
 }
 
+static bool __get_eeprom_i2c_addr_ip_discovery(struct amdgpu_device *adev,
+  struct amdgpu_ras_eeprom_control 
*control)
+{
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 10):
+   control->i2c_address = EEPROM_I2C_MADDR_54H;
+   return true;
+   default:
+   return false;
+   }
+}
+
 static bool __get_eeprom_i2c_addr(struct amdgpu_device *adev,
  struct amdgpu_ras_eeprom_control *control)
 {
@@ -153,13 +166,16 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
control->i2c_address = EEPROM_I2C_MADDR_ALDEBARAN;
break;
 
+   case CHIP_IP_DISCOVERY:
+   return __get_eeprom_i2c_addr_ip_discovery(adev, control);
+
default:
return false;
}
 
switch (adev->ip_versions[MP1_HWIP][0]) {
case IP_VERSION(13, 0, 0):
-   control->i2c_address = EEPROM_I2C_MADDR_SMU_13_0_0;
+   control->i2c_address = EEPROM_I2C_MADDR_54H;
break;
 
default:
-- 
2.17.1



[PATCH 1/2] drm/amdgpu: Optimize RAS TA initialization and TA unload funcs

2022-10-25 Thread Candice Li
1. Save TA unload psp response status
2. Add RAS TA loading status check for initialization
3. Drop RAS context teardown to allow RAS TA to be reloaded

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index effa7df3ddbfa4..643810c4148120 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -988,6 +988,8 @@ int psp_ta_unload(struct psp_context *psp, struct 
ta_context *context)
 
ret = psp_cmd_submit_buf(psp, NULL, cmd, psp->fence_buf_mc_addr);
 
+   context->resp_status = cmd->resp.status;
+
release_psp_cmd_buf(psp);
 
return ret;
@@ -1560,6 +1562,11 @@ static int psp_ras_initialize(struct psp_context *psp)
if (amdgpu_sriov_vf(adev))
return 0;
 
+   if (psp->ras_context.context.initialized) {
+   dev_warn(adev->dev, "RAS WARN: TA has already been loaded\n");
+   return 0;
+   }
+
if (!adev->psp.ras_context.context.bin_desc.size_bytes ||
!adev->psp.ras_context.context.bin_desc.start_addr) {
dev_info(adev->dev, "RAS: optional ras ta ucode is not 
available\n");
@@ -1610,7 +1617,7 @@ static int psp_ras_initialize(struct psp_context *psp)
psp->ras_context.context.mem_context.shared_mem_size = 
PSP_RAS_SHARED_MEM_SIZE;
psp->ras_context.context.ta_load_type = GFX_CMD_ID_LOAD_TA;
 
-   if (!psp->ras_context.context.initialized) {
+   if (!psp->ras_context.context.mem_context.shared_buf) {
ret = psp_ta_init_shared_buf(psp, 
&psp->ras_context.context.mem_context);
if (ret)
return ret;
@@ -1631,7 +1638,6 @@ static int psp_ras_initialize(struct psp_context *psp)
else {
if (ras_cmd->ras_status)
dev_warn(psp->adev->dev, "RAS Init Status: 0x%X\n", 
ras_cmd->ras_status);
-   amdgpu_ras_fini(psp->adev);
}
 
return ret;
-- 
2.17.1



[PATCH 2/2] drm/amdgpu: Optimize TA load/unload/invoke debugfs interfaces

2022-10-25 Thread Candice Li
1. Add a function pointer structure ta_funcs to psp context
2. Make the interfaces generic to all TAs
3. Leverage existing TA context and remove unused functions
4. Fix return code bugs

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c|  38 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  12 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 217 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h |   4 +
 4 files changed, 167 insertions(+), 104 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 643810c4148120..cdb0605d04f7cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1071,42 +1071,6 @@ int psp_ta_init_shared_buf(struct psp_context *psp,
  &mem_ctx->shared_buf);
 }
 
-static void psp_prep_ta_invoke_indirect_cmd_buf(struct psp_gfx_cmd_resp *cmd,
-  uint32_t ta_cmd_id,
-  struct ta_context *context)
-{
-   cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD;
-   cmd->cmd.cmd_invoke_cmd.session_id  = context->session_id;
-   cmd->cmd.cmd_invoke_cmd.ta_cmd_id   = ta_cmd_id;
-
-   cmd->cmd.cmd_invoke_cmd.buf.num_desc   = 1;
-   cmd->cmd.cmd_invoke_cmd.buf.total_size = 
context->mem_context.shared_mem_size;
-   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_size = 
context->mem_context.shared_mem_size;
-   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_lo =
-
lower_32_bits(context->mem_context.shared_mc_addr);
-   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_hi =
-
upper_32_bits(context->mem_context.shared_mc_addr);
-}
-
-int psp_ta_invoke_indirect(struct psp_context *psp,
- uint32_t ta_cmd_id,
- struct ta_context *context)
-{
-   int ret;
-   struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
-
-   psp_prep_ta_invoke_indirect_cmd_buf(cmd, ta_cmd_id, context);
-
-   ret = psp_cmd_submit_buf(psp, NULL, cmd,
-psp->fence_buf_mc_addr);
-
-   context->resp_status = cmd->resp.status;
-
-   release_psp_cmd_buf(psp);
-
-   return ret;
-}
-
 static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
   uint32_t ta_cmd_id,
   uint32_t session_id)
@@ -1549,7 +1513,7 @@ int psp_ras_terminate(struct psp_context *psp)
return ret;
 }
 
-static int psp_ras_initialize(struct psp_context *psp)
+int psp_ras_initialize(struct psp_context *psp)
 {
int ret;
uint32_t boot_cfg = 0xFF;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index 58ce3ebb446cf8..edc266f65b4e2b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -136,6 +136,12 @@ struct psp_funcs
int (*vbflash_stat)(struct psp_context *psp);
 };
 
+struct ta_funcs {
+   int (*fn_ta_initialze)(struct psp_context *psp);
+   int (*fn_ta_invoke)(struct psp_context *psp, uint32_t ta_cmd_id);
+   int (*fn_ta_terminate)(struct psp_context *psp);
+};
+
 #define AMDGPU_XGMI_MAX_CONNECTED_NODES64
 struct psp_xgmi_node_info {
uint64_tnode_id;
@@ -309,6 +315,7 @@ struct psp_context
struct psp_gfx_cmd_resp *cmd;
 
const struct psp_funcs  *funcs;
+   const struct ta_funcs   *ta_funcs;
 
/* firmware buffer */
struct amdgpu_bo*fw_pri_bo;
@@ -463,9 +470,6 @@ int psp_ta_load(struct psp_context *psp, struct ta_context 
*context);
 int psp_ta_invoke(struct psp_context *psp,
uint32_t ta_cmd_id,
struct ta_context *context);
-int psp_ta_invoke_indirect(struct psp_context *psp,
- uint32_t ta_cmd_id,
- struct ta_context *context);
 
 int psp_xgmi_initialize(struct psp_context *psp, bool set_extended_data, bool 
load_ta);
 int psp_xgmi_terminate(struct psp_context *psp);
@@ -479,7 +483,7 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
 int psp_xgmi_set_topology_info(struct psp_context *psp,
   int number_devices,
   struct psp_xgmi_topology_info *topology);
-
+int psp_ras_initialize(struct psp_context *psp);
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id);
 int psp_ras_enable_features(struct psp_context *psp,
union ta_ras_cmd_input *info, bool enable);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 0988e00612e515..93e1c07861e47b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ps

[PATCH] drm/amdgpu: Enable GFX RAS feature for gfx v11_0_3

2022-10-27 Thread Candice Li
v1: Support gfx ras feature enablement for gfx v11_0_3.
v2: Update function name and error message.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 26 ++
 1 file changed, 26 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 5eefba2948a552..443ce664b6630f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4673,6 +4673,26 @@ static int gfx_v11_0_early_init(void *handle)
return 0;
 }
 
+static int gfx_v11_0_ras_late_init(void *handle)
+{
+   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   struct ras_common_if *gfx_common_if;
+   int ret;
+
+   gfx_common_if = kzalloc(sizeof(struct ras_common_if), GFP_KERNEL);
+   if (!gfx_common_if)
+   return -ENOMEM;
+
+   gfx_common_if->block = AMDGPU_RAS_BLOCK__GFX;
+
+   ret = amdgpu_ras_feature_enable(adev, gfx_common_if, true);
+   if (ret)
+   dev_err(adev->dev, "Failed to enable gfx11 ras feature\n");
+
+   kfree(gfx_common_if);
+   return ret;
+}
+
 static int gfx_v11_0_late_init(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -4686,6 +4706,12 @@ static int gfx_v11_0_late_init(void *handle)
if (r)
return r;
 
+   if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3)) {
+   r = gfx_v11_0_ras_late_init(handle);
+   if (r)
+   return r;
+   }
+
return 0;
 }
 
-- 
2.17.1



[PATCH 1/2] drm/amdgpu: Use indirect buffer and save response status for TA load/invoke

2022-04-17 Thread Candice Li
The upcoming TA debugfs interface needs to use indirect buffer
when performing TA invoke and check psp response status for TA
load and invoke.

Signed-off-by: John Clements 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 60 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 26 +++
 2 files changed, 72 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a6acec1a6155d0..cb7e081b1ef426 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -46,8 +46,6 @@ static int psp_sysfs_init(struct amdgpu_device *adev);
 static void psp_sysfs_fini(struct amdgpu_device *adev);
 
 static int psp_load_smu_fw(struct psp_context *psp);
-static int psp_ta_unload(struct psp_context *psp, struct ta_context *context);
-static int psp_ta_load(struct psp_context *psp, struct ta_context *context);
 static int psp_rap_terminate(struct psp_context *psp);
 static int psp_securedisplay_terminate(struct psp_context *psp);
 
@@ -862,7 +860,7 @@ static void psp_prep_ta_unload_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_unload_ta.session_id = session_id;
 }
 
-static int psp_ta_unload(struct psp_context *psp, struct ta_context *context)
+int psp_ta_unload(struct psp_context *psp, struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
@@ -944,7 +942,7 @@ static void psp_prep_ta_load_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_load_ta.cmd_buf_len = context->mem_context.shared_mem_size;
 }
 
-static int psp_ta_init_shared_buf(struct psp_context *psp,
+int psp_ta_init_shared_buf(struct psp_context *psp,
  struct ta_mem_context *mem_ctx)
 {
/*
@@ -958,7 +956,7 @@ static int psp_ta_init_shared_buf(struct psp_context *psp,
  &mem_ctx->shared_buf);
 }
 
-static void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx)
+void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx)
 {
amdgpu_bo_free_kernel(&mem_ctx->shared_bo, &mem_ctx->shared_mc_addr,
  &mem_ctx->shared_buf);
@@ -969,6 +967,42 @@ static int psp_xgmi_init_shared_buf(struct psp_context 
*psp)
return psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context);
 }
 
+static void psp_prep_ta_invoke_indirect_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+  uint32_t ta_cmd_id,
+  struct ta_context *context)
+{
+   cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD;
+   cmd->cmd.cmd_invoke_cmd.session_id  = context->session_id;
+   cmd->cmd.cmd_invoke_cmd.ta_cmd_id   = ta_cmd_id;
+
+   cmd->cmd.cmd_invoke_cmd.buf.num_desc   = 1;
+   cmd->cmd.cmd_invoke_cmd.buf.total_size = 
context->mem_context.shared_mem_size;
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_size = 
context->mem_context.shared_mem_size;
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_lo =
+
lower_32_bits(context->mem_context.shared_mc_addr);
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_hi =
+
upper_32_bits(context->mem_context.shared_mc_addr);
+}
+
+int psp_ta_invoke_indirect(struct psp_context *psp,
+ uint32_t ta_cmd_id,
+ struct ta_context *context)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
+
+   psp_prep_ta_invoke_indirect_cmd_buf(cmd, ta_cmd_id, context);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+psp->fence_buf_mc_addr);
+
+   context->resp_status = cmd->resp.status;
+
+   release_psp_cmd_buf(psp);
+
+   return ret;
+}
+
 static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
   uint32_t ta_cmd_id,
   uint32_t session_id)
@@ -978,7 +1012,7 @@ static void psp_prep_ta_invoke_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_invoke_cmd.ta_cmd_id   = ta_cmd_id;
 }
 
-static int psp_ta_invoke(struct psp_context *psp,
+int psp_ta_invoke(struct psp_context *psp,
  uint32_t ta_cmd_id,
  struct ta_context *context)
 {
@@ -990,12 +1024,14 @@ static int psp_ta_invoke(struct psp_context *psp,
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
 
+   context->resp_status = cmd->resp.status;
+
release_psp_cmd_buf(psp);
 
return ret;
 }
 
-static int psp_ta_load(struct psp_context *psp, struct ta_context *context)
+int psp_ta_load(struct psp_context *psp, struct ta_context *context)
 {
int ret;
  

[PATCH 2/2] drm/amdgpu: Add debugfs TA load/unload/invoke support

2022-04-17 Thread Candice Li
Add debugfs support to load/unload/invoke TA at runtime.

Signed-off-by: John Clements 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c  | 312 
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h  |  30 ++
 4 files changed, 345 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 7d7af43a258f83..b525f9be9326f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -58,7 +58,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
-   amdgpu_eeprom.o amdgpu_mca.o
+   amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 13e4d8f9b87449..eedb12f6b8a32d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -38,6 +38,7 @@
 #include "amdgpu_umr.h"
 
 #include "amdgpu_reset.h"
+#include "amdgpu_psp_ta.h"
 
 #if defined(CONFIG_DEBUG_FS)
 
@@ -1767,6 +1768,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
DRM_ERROR("registering register debugfs failed (%d).\n", r);
 
amdgpu_debugfs_firmware_init(adev);
+   amdgpu_ta_if_debugfs_init(adev);
 
 #if defined(CONFIG_DRM_AMD_DC)
if (amdgpu_device_has_dc_support(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
new file mode 100644
index 00..916bf3f6fce0d4
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -0,0 +1,312 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_psp_ta.h"
+
+static const char *TA_IF_FS_NAME = "ta_if";
+
+struct dentry *dir;
+struct dentry *ta_load_debugfs_dentry;
+struct dentry *ta_unload_debugfs_dentry;
+struct dentry *ta_invoke_debugfs_dentry;
+
+static ssize_t ta_if_load_debugfs_read(struct file *fp, char *buf, size_t len, 
loff_t *off);
+static ssize_t ta_if_unload_debugfs_read(struct file *fp, char *buf, size_t 
len, loff_t *off);
+static ssize_t ta_if_invoke_debugfs_read(struct file *fp, char *buf, size_t 
len, loff_t *off);
+
+
+static uint32_t get_bin_version(const uint8_t *bin)
+{
+   const struct common_firmware_header *hdr =
+(const struct common_firmware_header *)bin;
+
+   return hdr->ucode_version;
+}
+
+static uint32_t get_shared_buf_size(uint32_t shared_buf_len)
+{
+   return (shared_buf_len % PAGE_SIZE) ?
+(shared_buf_len/PAGE_SIZE + 1) * PAGE_SIZE :
+shared_buf_len;
+}
+
+static void prep_ta_mem_context(struct psp_context *psp,
+struct ta_context *context,
+uint8_t *shared_buf,
+uint32_t shared_buf_len)
+{
+   context->mem_context.shared_mem_size = 
get_shared_buf_size(shared_buf_len);
+   psp_ta_init_shared_buf(psp, &context->mem_context);
+
+   memcpy((void *)context->mem_context.shared_buf, shared_buf, 
shared_buf_len);
+}
+
+static bool is_ta_type_valid(enum ta_type_id ta_type)
+{
+   bool ret = false;
+
+   switch (ta_type) {
+   c

[PATCH v2 1/2] drm/amdgpu: Use indirect buffer and save response status for TA load/invoke

2022-04-20 Thread Candice Li
The upcoming TA debugfs interface needs to use indirect buffer
when performing TA invoke and check psp response status for TA
load and invoke.

Signed-off-by: John Clements 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 54 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h | 26 
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a6acec1a6155d0..f6527aa19238a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -46,8 +46,6 @@ static int psp_sysfs_init(struct amdgpu_device *adev);
 static void psp_sysfs_fini(struct amdgpu_device *adev);
 
 static int psp_load_smu_fw(struct psp_context *psp);
-static int psp_ta_unload(struct psp_context *psp, struct ta_context *context);
-static int psp_ta_load(struct psp_context *psp, struct ta_context *context);
 static int psp_rap_terminate(struct psp_context *psp);
 static int psp_securedisplay_terminate(struct psp_context *psp);
 
@@ -862,7 +860,7 @@ static void psp_prep_ta_unload_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_unload_ta.session_id = session_id;
 }
 
-static int psp_ta_unload(struct psp_context *psp, struct ta_context *context)
+int psp_ta_unload(struct psp_context *psp, struct ta_context *context)
 {
int ret;
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
@@ -944,7 +942,7 @@ static void psp_prep_ta_load_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_load_ta.cmd_buf_len = context->mem_context.shared_mem_size;
 }
 
-static int psp_ta_init_shared_buf(struct psp_context *psp,
+int psp_ta_init_shared_buf(struct psp_context *psp,
  struct ta_mem_context *mem_ctx)
 {
/*
@@ -958,7 +956,7 @@ static int psp_ta_init_shared_buf(struct psp_context *psp,
  &mem_ctx->shared_buf);
 }
 
-static void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx)
+void psp_ta_free_shared_buf(struct ta_mem_context *mem_ctx)
 {
amdgpu_bo_free_kernel(&mem_ctx->shared_bo, &mem_ctx->shared_mc_addr,
  &mem_ctx->shared_buf);
@@ -969,6 +967,42 @@ static int psp_xgmi_init_shared_buf(struct psp_context 
*psp)
return psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context);
 }
 
+static void psp_prep_ta_invoke_indirect_cmd_buf(struct psp_gfx_cmd_resp *cmd,
+  uint32_t ta_cmd_id,
+  struct ta_context *context)
+{
+   cmd->cmd_id = GFX_CMD_ID_INVOKE_CMD;
+   cmd->cmd.cmd_invoke_cmd.session_id  = context->session_id;
+   cmd->cmd.cmd_invoke_cmd.ta_cmd_id   = ta_cmd_id;
+
+   cmd->cmd.cmd_invoke_cmd.buf.num_desc   = 1;
+   cmd->cmd.cmd_invoke_cmd.buf.total_size = 
context->mem_context.shared_mem_size;
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_size = 
context->mem_context.shared_mem_size;
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_lo =
+
lower_32_bits(context->mem_context.shared_mc_addr);
+   cmd->cmd.cmd_invoke_cmd.buf.buf_desc[0].buf_phy_addr_hi =
+
upper_32_bits(context->mem_context.shared_mc_addr);
+}
+
+int psp_ta_invoke_indirect(struct psp_context *psp,
+ uint32_t ta_cmd_id,
+ struct ta_context *context)
+{
+   int ret;
+   struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
+
+   psp_prep_ta_invoke_indirect_cmd_buf(cmd, ta_cmd_id, context);
+
+   ret = psp_cmd_submit_buf(psp, NULL, cmd,
+psp->fence_buf_mc_addr);
+
+   context->resp_status = cmd->resp.status;
+
+   release_psp_cmd_buf(psp);
+
+   return ret;
+}
+
 static void psp_prep_ta_invoke_cmd_buf(struct psp_gfx_cmd_resp *cmd,
   uint32_t ta_cmd_id,
   uint32_t session_id)
@@ -978,7 +1012,7 @@ static void psp_prep_ta_invoke_cmd_buf(struct 
psp_gfx_cmd_resp *cmd,
cmd->cmd.cmd_invoke_cmd.ta_cmd_id   = ta_cmd_id;
 }
 
-static int psp_ta_invoke(struct psp_context *psp,
+int psp_ta_invoke(struct psp_context *psp,
  uint32_t ta_cmd_id,
  struct ta_context *context)
 {
@@ -990,12 +1024,14 @@ static int psp_ta_invoke(struct psp_context *psp,
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
 
+   context->resp_status = cmd->resp.status;
+
release_psp_cmd_buf(psp);
 
return ret;
 }
 
-static int psp_ta_load(struct psp_context *psp, struct ta_context *context)
+int psp_ta_load(struct psp_context *psp, struct ta_context *context)
 {
int ret;
  

[PATCH v2 2/2] drm/amdgpu: Add debugfs TA load/unload/invoke support

2022-04-20 Thread Candice Li
v1:
Add debugfs support to load/unload/invoke TA at runtime.

v2:
1. Update some variables to static.
2. Use PAGE_ALIGN to calculate shared buf size directly.
3. Remove fp check.
4. Update debugfs from read to write.

Signed-off-by: John Clements 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/Makefile |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |   2 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c  | 308 
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h  |  30 ++
 4 files changed, 341 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 7d7af43a258f83..b525f9be9326f4 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -58,7 +58,7 @@ amdgpu-y += amdgpu_device.o amdgpu_kms.o \
amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o amdgpu_nbio.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
-   amdgpu_eeprom.o amdgpu_mca.o
+   amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o
 
 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 13e4d8f9b87449..eedb12f6b8a32d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -38,6 +38,7 @@
 #include "amdgpu_umr.h"
 
 #include "amdgpu_reset.h"
+#include "amdgpu_psp_ta.h"
 
 #if defined(CONFIG_DEBUG_FS)
 
@@ -1767,6 +1768,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
DRM_ERROR("registering register debugfs failed (%d).\n", r);
 
amdgpu_debugfs_firmware_init(adev);
+   amdgpu_ta_if_debugfs_init(adev);
 
 #if defined(CONFIG_DRM_AMD_DC)
if (amdgpu_device_has_dc_support(adev))
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
new file mode 100644
index 00..247a476e63544c
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "amdgpu.h"
+#include "amdgpu_psp_ta.h"
+
+static const char *TA_IF_FS_NAME = "ta_if";
+
+struct dentry *dir;
+static struct dentry *ta_load_debugfs_dentry;
+static struct dentry *ta_unload_debugfs_dentry;
+static struct dentry *ta_invoke_debugfs_dentry;
+
+static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf,
+   size_t len, loff_t *off);
+static ssize_t ta_if_unload_debugfs_write(struct file *fp, const char *buf,
+   size_t len, loff_t *off);
+static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf,
+   size_t len, loff_t *off);
+
+
+static uint32_t get_bin_version(const uint8_t *bin)
+{
+   const struct common_firmware_header *hdr =
+(const struct common_firmware_header *)bin;
+
+   return hdr->ucode_version;
+}
+
+static void prep_ta_mem_context(struct psp_context *psp,
+struct ta_context *context,
+uint8_t *shared_buf,
+uint32_t shared_buf_len)
+{
+   context->mem_context.shared_mem_size = PAGE_ALIGN(shared_buf_len);
+   psp_ta_init_shared_buf(psp, &context->mem_context);
+
+   memcpy((void *)context->mem_context.shared_buf, shared_buf, 
shared_buf_len);
+}
+
+static bool is_ta_type_valid(e

[PATCH v2] drm/amdgpu: Fix UBSAN shift-out-of-bounds for gfx v9_0

2022-08-15 Thread Candice Li
Check the shift number to avoid doing a shift operation when the number
of bits shifted is equal to or greater than the number of bits in the operand.

v2: Only calculate shift number for non-zero data and fix build warning.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7f187558220e9a..c398c21d906069 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2495,6 +2495,7 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
 {
int i, j;
u32 data;
+   int shift_num = 0;
u32 active_rbs = 0;
u32 rb_bitmap_width_per_sh = adev->gfx.config.max_backends_per_se /
adev->gfx.config.max_sh_per_se;
@@ -2504,8 +2505,11 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
gfx_v9_0_select_se_sh(adev, i, j, 0x);
data = gfx_v9_0_get_rb_active_bitmap(adev);
-   active_rbs |= data << ((i * 
adev->gfx.config.max_sh_per_se + j) *
-  rb_bitmap_width_per_sh);
+   if (data) {
+   shift_num = MIN(((i * 
adev->gfx.config.max_sh_per_se + j) *
+ rb_bitmap_width_per_sh), 
__builtin_clz(data));
+   active_rbs |= data << shift_num;
+   }
}
}
gfx_v9_0_select_se_sh(adev, 0x, 0x, 0x);
-- 
2.17.1



[PATCH] drm/amdgpu: Check num_gfx_rings for gfx v9_0 rb setup.

2022-08-17 Thread Candice Li
No need to set up rb when there are no gfx rings.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7f187558220e9a..1d6d3a852a0b3d 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2597,7 +2597,8 @@ static void gfx_v9_0_constants_init(struct amdgpu_device 
*adev)
 
gfx_v9_0_tiling_mode_table_init(adev);
 
-   gfx_v9_0_setup_rb(adev);
+   if (adev->gfx.num_gfx_rings)
+   gfx_v9_0_setup_rb(adev);
gfx_v9_0_get_cu_info(adev, &adev->gfx.cu_info);
adev->gfx.config.db_debug2 = RREG32_SOC15(GC, 0, mmDB_DEBUG2);
 
-- 
2.17.1



[PATCH] drm/amdgpu: Fix UBSAN shift-out-of-bounds for gfx v9_0

2022-08-24 Thread Candice Li
Check the shift number to avoid doing a shift operation when the number
of bits shifted is equal to or greater than the number of bits in the operand.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7f187558220e9a..0b9215b6e4b316 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -2494,7 +2494,7 @@ static u32 gfx_v9_0_get_rb_active_bitmap(struct 
amdgpu_device *adev)
 static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
 {
int i, j;
-   u32 data;
+   u32 data, shift_num;
u32 active_rbs = 0;
u32 rb_bitmap_width_per_sh = adev->gfx.config.max_backends_per_se /
adev->gfx.config.max_sh_per_se;
@@ -2504,8 +2504,10 @@ static void gfx_v9_0_setup_rb(struct amdgpu_device *adev)
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
gfx_v9_0_select_se_sh(adev, i, j, 0x);
data = gfx_v9_0_get_rb_active_bitmap(adev);
-   active_rbs |= data << ((i * 
adev->gfx.config.max_sh_per_se + j) *
-  rb_bitmap_width_per_sh);
+   shift_num = min(((i * adev->gfx.config.max_sh_per_se + 
j) *
+ rb_bitmap_width_per_sh), 
__builtin_clz(data));
+   if (data)
+   active_rbs |= data << shift_num;
}
}
gfx_v9_0_select_se_sh(adev, 0x, 0x, 0x);
-- 
2.17.1



[PATCH 1/2] drm/amdgpu: Add convert_error_address function for umc v8_10

2023-02-21 Thread Candice Li
Add convert_error_address for umc v8_10.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 73 +++---
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index da394bc06bbaaf..293ba39c8a2fda 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -209,6 +209,45 @@ static int umc_v8_10_swizzle_mode_na_to_pa(struct 
amdgpu_device *adev,
return 0;
 }
 
+void umc_v8_10_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, uint64_t 
err_addr,
+   uint32_t ch_inst, uint32_t umc_inst,
+   uint32_t node_inst, uint64_t mc_umc_status)
+{
+   uint64_t na_err_addr_base;
+   uint64_t na_err_addr, retired_page_addr;
+   uint32_t channel_index, addr_lsb, col = 0;
+   int ret = 0;
+
+   channel_index =
+   adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
+   adev->umc.channel_inst_num +
+   umc_inst * adev->umc.channel_inst_num +
+   ch_inst];
+
+   /* the lowest lsb bits should be ignored */
+   addr_lsb = REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
AddrLsb);
+   err_addr &= ~((0x1ULL << addr_lsb) - 1);
+   na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT);
+
+   /* loop for all possibilities of [C6 C5] in normal address. */
+   for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; col++) {
+   na_err_addr = na_err_addr_base | (col << UMC_V8_10_NA_C5_BIT);
+
+   /* Mapping normal error address to retired soc physical 
address. */
+   ret = umc_v8_10_swizzle_mode_na_to_pa(adev, channel_index,
+   na_err_addr, 
&retired_page_addr);
+   if (ret) {
+   dev_err(adev->dev, "Failed to map pa from umc na.\n");
+   break;
+   }
+   dev_info(adev->dev, "Error Address(PA): 0x%llx\n",
+   retired_page_addr);
+   amdgpu_umc_fill_error_record(err_data, na_err_addr,
+   retired_page_addr, channel_index, umc_inst);
+   }
+}
+
 static void umc_v8_10_query_error_address(struct amdgpu_device *adev,
 struct ras_err_data *err_data,
 uint32_t umc_reg_offset,
@@ -218,10 +257,7 @@ static void umc_v8_10_query_error_address(struct 
amdgpu_device *adev,
 {
uint64_t mc_umc_status_addr;
uint64_t mc_umc_status, err_addr;
-   uint64_t mc_umc_addrt0, na_err_addr_base;
-   uint64_t na_err_addr, retired_page_addr;
-   uint32_t channel_index, addr_lsb, col = 0;
-   int ret = 0;
+   uint64_t mc_umc_addrt0;
 
mc_umc_status_addr =
SOC15_REG_OFFSET(UMC, 0, regMCA_UMC_UMC0_MCUMC_STATUST0);
@@ -236,12 +272,6 @@ static void umc_v8_10_query_error_address(struct 
amdgpu_device *adev,
return;
}
 
-   channel_index =
-   adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
-   adev->umc.channel_inst_num +
-   umc_inst * adev->umc.channel_inst_num +
-   ch_inst];
-
/* calculate error address if ue error is detected */
if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, AddrV) == 
1 &&
@@ -251,27 +281,8 @@ static void umc_v8_10_query_error_address(struct 
amdgpu_device *adev,
err_addr = RREG64_PCIE((mc_umc_addrt0 + umc_reg_offset) * 4);
err_addr = REG_GET_FIELD(err_addr, MCA_UMC_UMC0_MCUMC_ADDRT0, 
ErrorAddr);
 
-   /* the lowest lsb bits should be ignored */
-   addr_lsb = REG_GET_FIELD(mc_umc_status, 
MCA_UMC_UMC0_MCUMC_STATUST0, AddrLsb);
-   err_addr &= ~((0x1ULL << addr_lsb) - 1);
-   na_err_addr_base = err_addr & ~(0x3ULL << UMC_V8_10_NA_C5_BIT);
-
-   /* loop for all possibilities of [C6 C5] in normal address. */
-   for (col = 0; col < UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM; 
col++) {
-   na_err_addr = na_err_addr_base | (col << 
UMC_V8_10_NA_C5_BIT);
-
-   /* Mapping normal error address to retired soc physical 
address. */
-   ret = umc_v8_10_swizzle_mode_na_to_pa(adev, 
channel_i

[PATCH 2/2] drm/amdgpu: Add ecc info query interface for umc v8_10

2023-02-21 Thread Candice Li
Support ecc info query for umc v8_10.

v2: Simplified by using convert_error_address.
v3: Remove unused variable and invalid checking.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
Reviewed-by: Stanley.Yang 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 134 +
 1 file changed, 134 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index 293ba39c8a2fda..66158219f791cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -360,6 +360,138 @@ static bool umc_v8_10_query_ras_poison_mode(struct 
amdgpu_device *adev)
return true;
 }
 
+static void umc_v8_10_ecc_info_query_correctable_error_count(struct 
amdgpu_device *adev,
+ uint32_t node_inst, uint32_t umc_inst, 
uint32_t ch_inst,
+ unsigned long *error_count)
+{
+   uint64_t mc_umc_status;
+   uint32_t eccinfo_table_idx;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
+ adev->umc.channel_inst_num +
+ umc_inst * adev->umc.channel_inst_num +
+ ch_inst;
+
+   /* check the MCUMC_STATUS */
+   mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
+   if (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 1 
&&
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, CECC) == 
1) {
+   *error_count += 1;
+   }
+}
+
+static void umc_v8_10_ecc_info_query_uncorrectable_error_count(struct 
amdgpu_device *adev,
+ uint32_t node_inst, uint32_t umc_inst, 
uint32_t ch_inst,
+ unsigned long *error_count)
+{
+   uint64_t mc_umc_status;
+   uint32_t eccinfo_table_idx;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
+ adev->umc.channel_inst_num +
+ umc_inst * adev->umc.channel_inst_num +
+ ch_inst;
+
+   /* check the MCUMC_STATUS */
+   mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
+   if ((REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) == 
1) &&
+   (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UECC) == 
1 ||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC) == 1 
||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC) == 1 
||
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC) == 
1)) {
+   *error_count += 1;
+   }
+}
+
+static void umc_v8_10_ecc_info_query_ras_error_count(struct amdgpu_device 
*adev,
+   void *ras_error_status)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+
+   uint32_t node_inst   = 0;
+   uint32_t umc_inst= 0;
+   uint32_t ch_inst = 0;
+
+   /* TODO: driver needs to toggle DF Cstate to ensure
+* safe access of UMC registers. Will add the protection
+*/
+   LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) {
+   umc_v8_10_ecc_info_query_correctable_error_count(adev,
+   node_inst, umc_inst, 
ch_inst,
+   &(err_data->ce_count));
+   umc_v8_10_ecc_info_query_uncorrectable_error_count(adev,
+   node_inst, umc_inst, 
ch_inst,
+   &(err_data->ue_count));
+   }
+}
+
+static void umc_v8_10_ecc_info_query_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data,
+   uint32_t ch_inst,
+   uint32_t umc_inst,
+   uint32_t node_inst)
+{
+   uint32_t eccinfo_table_idx, channel_index;
+   uint64_t mc_umc_status, err_addr;
+
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   eccinfo_table_idx = node_inst * adev->umc.umc_inst_num *
+ adev->umc.channel_inst_num +
+ umc_inst * adev->umc.channel_inst_num +
+ ch_inst;
+   channel_index =
+   adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
+ adev->umc.channel_inst_

[PATCH] drm/amdgpu: Make umc_v8_10_convert_error_address static and remove unused variable

2023-02-23 Thread Candice Li
Fixes following warnings:
warning: no previous prototype for 'umc_v8_10_convert_error_address'
warning: variable 'channel_index' set but not used

Reported-by: kernel test robot 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.c | 15 +--
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
index 66158219f791cb..fb55e8cb9967ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
@@ -209,10 +209,10 @@ static int umc_v8_10_swizzle_mode_na_to_pa(struct 
amdgpu_device *adev,
return 0;
 }
 
-void umc_v8_10_convert_error_address(struct amdgpu_device *adev,
-   struct ras_err_data *err_data, uint64_t 
err_addr,
-   uint32_t ch_inst, uint32_t umc_inst,
-   uint32_t node_inst, uint64_t mc_umc_status)
+static void umc_v8_10_convert_error_address(struct amdgpu_device *adev,
+   struct ras_err_data *err_data, 
uint64_t err_addr,
+   uint32_t ch_inst, uint32_t umc_inst,
+   uint32_t node_inst, uint64_t 
mc_umc_status)
 {
uint64_t na_err_addr_base;
uint64_t na_err_addr, retired_page_addr;
@@ -434,7 +434,7 @@ static void umc_v8_10_ecc_info_query_error_address(struct 
amdgpu_device *adev,
uint32_t umc_inst,
uint32_t node_inst)
 {
-   uint32_t eccinfo_table_idx, channel_index;
+   uint32_t eccinfo_table_idx;
uint64_t mc_umc_status, err_addr;
 
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
@@ -443,11 +443,6 @@ static void umc_v8_10_ecc_info_query_error_address(struct 
amdgpu_device *adev,
  adev->umc.channel_inst_num +
  umc_inst * adev->umc.channel_inst_num +
  ch_inst;
-   channel_index =
-   adev->umc.channel_idx_tbl[node_inst * adev->umc.umc_inst_num *
- adev->umc.channel_inst_num +
- umc_inst * 
adev->umc.channel_inst_num +
- ch_inst];
 
mc_umc_status = ras->umc_ecc.ecc[eccinfo_table_idx].mca_umc_status;
 
-- 
2.17.1



[PATCH] drm/amdgpu: Support umc node harvest config on umc v8_10

2023-02-28 Thread Candice Li
There is no need to query error count and error address on harvested umc nodes.
v2: Fix code bug, use active_mask instead of harvest_config
and remove unnecessary argument in LOOP macro.
v3: Leave adev->gmc.num_umc unchanged.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  7 +--
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c|  1 -
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.h|  4 ++--
 4 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index ea040adb1f150f..aebf3542481ead 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -543,6 +543,7 @@ static void amdgpu_discovery_read_from_harvest_table(struct 
amdgpu_device *adev,
struct harvest_table *harvest_info;
u16 offset;
int i;
+   uint32_t umc_harvest_config = 0;
 
bhdr = (struct binary_header *)adev->mman.discovery_bin;
offset = le16_to_cpu(bhdr->table_list[HARVEST_INFO].offset);
@@ -570,12 +571,17 @@ static void 
amdgpu_discovery_read_from_harvest_table(struct amdgpu_device *adev,
adev->harvest_ip_mask |= AMD_HARVEST_IP_DMU_MASK;
break;
case UMC_HWID:
+   umc_harvest_config |=
+   1 << 
(le16_to_cpu(harvest_info->list[i].number_instance));
(*umc_harvest_count)++;
break;
default:
break;
}
}
+
+   adev->umc.active_mask = ((1 << adev->umc.node_inst_num) - 1) &
+   ~umc_harvest_config;
 }
 
 /* == */
@@ -1156,8 +1162,10 @@ static int amdgpu_discovery_reg_base_init(struct 
amdgpu_device *adev)
AMDGPU_MAX_SDMA_INSTANCES);
}
 
-   if (le16_to_cpu(ip->hw_id) == UMC_HWID)
+   if (le16_to_cpu(ip->hw_id) == UMC_HWID) {
adev->gmc.num_umc++;
+   adev->umc.node_inst_num++;
+   }
 
for (k = 0; k < num_base_address; k++) {
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index f2bf979af58835..36e19336f3b34e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -42,7 +42,7 @@
 #define LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) LOOP_UMC_INST((umc_inst)) 
LOOP_UMC_CH_INST((ch_inst))
 
 #define LOOP_UMC_NODE_INST(node_inst) \
-   for ((node_inst) = 0; (node_inst) < adev->umc.node_inst_num; 
(node_inst)++)
+   for_each_set_bit((node_inst), &(adev->umc.active_mask), 
adev->umc.node_inst_num)
 
 #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
LOOP_UMC_NODE_INST((node_inst)) 
LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
@@ -69,7 +69,7 @@ struct amdgpu_umc {
/* number of umc instance with memory map register access */
uint32_t umc_inst_num;
 
-   /*number of umc node instance with memory map register access*/
+   /* Total number of umc node instance including harvest one */
uint32_t node_inst_num;
 
/* UMC regiser per channel offset */
@@ -82,6 +82,9 @@ struct amdgpu_umc {
 
const struct amdgpu_umc_funcs *funcs;
struct amdgpu_umc_ras *ras;
+
+   /* active mask for umc node instance */
+   unsigned long active_mask;
 };
 
 int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block);
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 85e0afc3d4f7f3..af7b3ba1ca0002 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -567,7 +567,6 @@ static void gmc_v11_0_set_umc_funcs(struct amdgpu_device 
*adev)
case IP_VERSION(8, 10, 0):
adev->umc.channel_inst_num = UMC_V8_10_CHANNEL_INSTANCE_NUM;
adev->umc.umc_inst_num = UMC_V8_10_UMC_INSTANCE_NUM;
-   adev->umc.node_inst_num = adev->gmc.num_umc;
adev->umc.max_ras_err_cnt_per_query = 
UMC_V8_10_TOTAL_CHANNEL_NUM(adev);
adev->umc.channel_offs = UMC_V8_10_PER_CHANNEL_OFFSET;
adev->umc.retire_unit = UMC_V8_10_NA_COL_2BITS_POWER_OF_2_NUM;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
index 25eaf4af5fcf4b..c6dfd433fec7bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
+++ b/drivers/gpu/drm/amd/a

[PATCH] drm/amd/pm: Enable ecc_info table support for smu v13_0_10

2023-02-28 Thread Candice Li
Support EccInfoTable which includes umc ras error count and
error address.

Signed-off-by: Candice Li 
Reviewed-by: Evan Quan 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 75 +++
 1 file changed, 75 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 923a9fb3c8873c..27448ffe60a439 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -46,6 +46,7 @@
 #include "asic_reg/mp/mp_13_0_0_sh_mask.h"
 #include "smu_cmn.h"
 #include "amdgpu_ras.h"
+#include "umc_v8_10.h"
 
 /*
  * DO NOT use these for err/warn/info/debug messages.
@@ -90,6 +91,12 @@
 
 #define DEBUGSMC_MSG_Mode1Reset2
 
+/*
+ * SMU_v13_0_10 supports ECCTABLE since version 80.34.0,
+ * use this to check ECCTABLE feature whether support
+ */
+#define SUPPORT_ECCTABLE_SMU_13_0_10_VERSION 0x00502200
+
 static struct cmn2asic_msg_mapping smu_v13_0_0_message_map[SMU_MSG_MAX_COUNT] 
= {
MSG_MAP(TestMessage,PPSMC_MSG_TestMessage,  
   1),
MSG_MAP(GetSmuVersion,  PPSMC_MSG_GetSmuVersion,
   1),
@@ -229,6 +236,7 @@ static struct cmn2asic_mapping 
smu_v13_0_0_table_map[SMU_TABLE_COUNT] = {
TAB_MAP(ACTIVITY_MONITOR_COEFF),
[SMU_TABLE_COMBO_PPTABLE] = {1, TABLE_COMBO_PPTABLE},
TAB_MAP(I2C_COMMANDS),
+   TAB_MAP(ECCINFO),
 };
 
 static struct cmn2asic_mapping smu_v13_0_0_pwr_src_map[SMU_POWER_SOURCE_COUNT] 
= {
@@ -462,6 +470,8 @@ static int smu_v13_0_0_tables_init(struct smu_context *smu)
   AMDGPU_GEM_DOMAIN_VRAM);
SMU_TABLE_INIT(tables, SMU_TABLE_COMBO_PPTABLE, 
MP0_MP1_DATA_REGION_SIZE_COMBOPPTABLE,
PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
+   SMU_TABLE_INIT(tables, SMU_TABLE_ECCINFO, sizeof(EccInfoTable_t),
+   PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM);
 
smu_table->metrics_table = kzalloc(sizeof(SmuMetricsExternal_t), 
GFP_KERNEL);
if (!smu_table->metrics_table)
@@ -477,8 +487,14 @@ static int smu_v13_0_0_tables_init(struct smu_context *smu)
if (!smu_table->watermarks_table)
goto err2_out;
 
+   smu_table->ecc_table = kzalloc(tables[SMU_TABLE_ECCINFO].size, 
GFP_KERNEL);
+   if (!smu_table->ecc_table)
+   goto err3_out;
+
return 0;
 
+err3_out:
+   kfree(smu_table->watermarks_table);
 err2_out:
kfree(smu_table->gpu_metrics_table);
 err1_out:
@@ -2036,6 +2052,64 @@ static int smu_v13_0_0_send_bad_mem_channel_flag(struct 
smu_context *smu,
return ret;
 }
 
+static int smu_v13_0_0_check_ecc_table_support(struct smu_context *smu)
+{
+   struct amdgpu_device *adev = smu->adev;
+   uint32_t if_version = 0xff, smu_version = 0xff;
+   int ret = 0;
+
+   ret = smu_cmn_get_smc_version(smu, &if_version, &smu_version);
+   if (ret)
+   return -EOPNOTSUPP;
+
+   if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10)) &&
+   (smu_version >= SUPPORT_ECCTABLE_SMU_13_0_10_VERSION))
+   return ret;
+   else
+   return -EOPNOTSUPP;
+}
+
+static ssize_t smu_v13_0_0_get_ecc_info(struct smu_context *smu,
+   void 
*table)
+{
+   struct smu_table_context *smu_table = &smu->smu_table;
+   struct amdgpu_device *adev = smu->adev;
+   EccInfoTable_t *ecc_table = NULL;
+   struct ecc_info_per_ch *ecc_info_per_channel = NULL;
+   int i, ret = 0;
+   struct umc_ecc_info *eccinfo = (struct umc_ecc_info *)table;
+
+   ret = smu_v13_0_0_check_ecc_table_support(smu);
+   if (ret)
+   return ret;
+
+   ret = smu_cmn_update_table(smu,
+   SMU_TABLE_ECCINFO,
+   0,
+   smu_table->ecc_table,
+   false);
+   if (ret) {
+   dev_info(adev->dev, "Failed to export SMU ecc table!\n");
+   return ret;
+   }
+
+   ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
+
+   for (i = 0; i < UMC_V8_10_TOTAL_CHANNEL_NUM(adev); i++) {
+   ecc_info_per_channel = &(eccinfo->ecc[i]);
+   ecc_info_per_channel->ce_count_lo_chip =
+   ecc_table->EccInfo[i].ce_count_lo_chip;
+   ecc_info_per_channel->ce_count_hi_chip =
+   ecc_table->EccInfo[i].ce_count_hi_chip;
+   ecc_info_per_channel->mca_umc_status =
+   ecc_table->EccInfo[i].mca_umc_status;
+   ecc_info_per_channel->mca_umc_addr =
+  

[PATCH] drm/amdgpu: Drop pcie_bif ras check from fatal error handler

2023-04-19 Thread Candice Li
Some ASICs support fatal error event but do not
support pcie_bif ras.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 3ab8a88789c8fe..22f401fd1901cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1597,8 +1597,7 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
 {
/* Fatal error events are handled on host side */
-   if (amdgpu_sriov_vf(adev) ||
-   !amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
+   if (amdgpu_sriov_vf(adev))
return;
 
if (adev->nbio.ras &&
-- 
2.17.1



[PATCH] Align eccinfo table structure with smu v13_0_0 interface

2023-06-08 Thread Candice Li
Update eccinfo table structure according to smu v13_0_0 interface.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.h   | 3 +++
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
index c6dfd433fec7bc..2cdaf746e8cd4b 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
@@ -31,6 +31,9 @@
 /* number of umc instance with memory map register access */
 #define UMC_V8_10_UMC_INSTANCE_NUM 2
 
+/* Max number of umc channel instances */
+#define UMC_V8_10_MAX_CHANNEL_NUM  24
+
 /* Total channel instances for all available umc nodes */
 #define UMC_V8_10_TOTAL_CHANNEL_NUM(adev) \
(UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * 
(adev)->gmc.num_umc)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 413e592f0ed611..90ea15496879c4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2580,7 +2580,7 @@ static ssize_t smu_v13_0_0_get_ecc_info(struct 
smu_context *smu,
 
ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
 
-   for (i = 0; i < UMC_V8_10_TOTAL_CHANNEL_NUM(adev); i++) {
+   for (i = 0; i < UMC_V8_10_MAX_CHANNEL_NUM; i++) {
ecc_info_per_channel = &(eccinfo->ecc[i]);
ecc_info_per_channel->ce_count_lo_chip =
ecc_table->EccInfo[i].ce_count_lo_chip;
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Update total channel number for umc v8_10

2023-06-13 Thread Candice Li
Update total channel number for umc v8_10.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h   | 2 ++
 drivers/gpu/drm/amd/amdgpu/umc_v8_10.h| 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 859882109f55d6..16cf7b199457e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1515,6 +1515,7 @@ static int amdgpu_discovery_get_mall_info(struct 
amdgpu_device *adev)
mall_size += mall_size_per_umc;
}
adev->gmc.mall_size = mall_size;
+   adev->gmc.m_half_use = half_use;
break;
default:
dev_err(adev->dev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 6794edd1d2d2ae..56d73fade56850 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -301,6 +301,8 @@ struct amdgpu_gmc {
 
/* MALL size */
u64 mall_size;
+   uint32_t m_half_use;
+
/* number of UMC instances */
int num_umc;
/* mode2 save restore */
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
index c6dfd433fec7bc..dc12e0af5451e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.h
@@ -33,7 +33,8 @@
 
 /* Total channel instances for all available umc nodes */
 #define UMC_V8_10_TOTAL_CHANNEL_NUM(adev) \
-   (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * 
(adev)->gmc.num_umc)
+   (UMC_V8_10_CHANNEL_INSTANCE_NUM * UMC_V8_10_UMC_INSTANCE_NUM * \
+   (adev)->gmc.num_umc - hweight32((adev)->gmc.m_half_use) * 2)
 
 /* UMC regiser per channel offset */
 #define UMC_V8_10_PER_CHANNEL_OFFSET   0x400
-- 
2.25.1



[PATCH 2/2] drm/amdgpu: Add channel_dis_num to ras init flags

2023-06-13 Thread Candice Li
Add disabled channel number to ras init flags.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 1 +
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index dd865beb39a8c4..6070c91f0b8293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1657,6 +1657,7 @@ int psp_ras_initialize(struct psp_context *psp)
ras_cmd->ras_in_message.init_flags.dgpu_mode = 1;
ras_cmd->ras_in_message.init_flags.xcc_mask =
adev->gfx.xcc_mask;
+   ras_cmd->ras_in_message.init_flags.channel_dis_num = 
hweight32(adev->gmc.m_half_use) * 2;
 
ret = psp_ta_load(psp, &psp->ras_context.context);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index be2984ac00a56d..879bb7af297c7b 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -130,6 +130,7 @@ struct ta_ras_init_flags {
uint8_t poison_mode_en;
uint8_t dgpu_mode;
uint16_t xcc_mask;
+   uint8_t channel_dis_num;
 };
 
 struct ta_ras_output_flags {
-- 
2.25.1



[PATCH v2] drm/amd/pm: Align eccinfo table structure with smu v13_0_0 interface

2023-06-13 Thread Candice Li
Update eccinfo table structure according to smu v13_0_0 interface.

v2: Calculate array size instead of using macro definition.

Signed-off-by: Candice Li 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 413e592f0ed611..cbf0b2d738c1a6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -46,7 +46,6 @@
 #include "asic_reg/mp/mp_13_0_0_sh_mask.h"
 #include "smu_cmn.h"
 #include "amdgpu_ras.h"
-#include "umc_v8_10.h"
 
 /*
  * DO NOT use these for err/warn/info/debug messages.
@@ -2580,7 +2579,7 @@ static ssize_t smu_v13_0_0_get_ecc_info(struct 
smu_context *smu,
 
ecc_table = (EccInfoTable_t *)smu_table->ecc_table;
 
-   for (i = 0; i < UMC_V8_10_TOTAL_CHANNEL_NUM(adev); i++) {
+   for (i = 0; i < ARRAY_SIZE(ecc_table->EccInfo); i++) {
ecc_info_per_channel = &(eccinfo->ecc[i]);
ecc_info_per_channel->ce_count_lo_chip =
ecc_table->EccInfo[i].ce_count_lo_chip;
-- 
2.25.1



[PATCH] drm/amdgpu: Add psp_13_0_10_ta firmware to modinfo

2022-11-12 Thread Candice Li
TA firmware is loaded on psp v13_0_10, but it is missing from modinfo.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index b21cb38ab4d750..86d7038f5dbde2 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -44,6 +44,7 @@ MODULE_FIRMWARE("amdgpu/psp_13_0_0_ta.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_7_sos.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_7_ta.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_10_sos.bin");
+MODULE_FIRMWARE("amdgpu/psp_13_0_10_ta.bin");
 
 /* For large FW files the time to complete can be very long */
 #define USBC_PD_POLLING_LIMIT_S 240
-- 
2.25.1



[PATCH] drm/amd/pm: Enable bad memory page/channel recording support for smu v13_0_0

2022-11-18 Thread Candice Li
Send message to SMU to update bad memory page and bad channel info.

Signed-off-by: Candice Li 
Reviewed-by: Evan Quan 
---
 .../pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h  |  8 +++-
 drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h  |  4 +-
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 39 +++
 3 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h
index 9ebb8f39732a0e..8b8266890a1002 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h
@@ -131,7 +131,13 @@
 #define PPSMC_MSG_EnableAudioStutterWA   0x44
 #define PPSMC_MSG_PowerUpUmsch   0x45
 #define PPSMC_MSG_PowerDownUmsch 0x46
-#define PPSMC_Message_Count  0x47
+#define PPSMC_MSG_SetDcsArch 0x47
+#define PPSMC_MSG_TriggerVFFLR   0x48
+#define PPSMC_MSG_SetNumBadMemoryPagesRetired 0x49
+#define PPSMC_MSG_SetBadMemoryPagesRetiredFlagsPerChannel 0x4A
+#define PPSMC_MSG_SetPriorityDeltaGain   0x4B
+#define PPSMC_MSG_AllowIHHostInterrupt   0x4C
+#define PPSMC_Message_Count  0x4D
 
 //Debug Dump Message
 #define DEBUGSMC_MSG_TestMessage0x1
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
index 58098b82df660c..a4e3425b1027c2 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_types.h
@@ -239,7 +239,9 @@
__SMU_DUMMY_MAP(DriverMode2Reset), \
__SMU_DUMMY_MAP(GetGfxOffStatus),\
__SMU_DUMMY_MAP(GetGfxOffEntryCount),\
-   __SMU_DUMMY_MAP(LogGfxOffResidency),
+   __SMU_DUMMY_MAP(LogGfxOffResidency),\
+   __SMU_DUMMY_MAP(SetNumBadMemoryPagesRetired),   \
+   __SMU_DUMMY_MAP(SetBadMemoryPagesRetiredFlagsPerChannel),
 
 #undef __SMU_DUMMY_MAP
 #define __SMU_DUMMY_MAP(type)  SMU_MSG_##type
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 5bcb61f77e4193..87d7c66e49ef28 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -141,6 +141,9 @@ static struct cmn2asic_msg_mapping 
smu_v13_0_0_message_map[SMU_MSG_MAX_COUNT] =
MSG_MAP(PrepareMp1ForUnload,PPSMC_MSG_PrepareMp1ForUnload,  
   0),
MSG_MAP(DFCstateControl,
PPSMC_MSG_SetExternalClientDfCstateAllow, 0),
MSG_MAP(ArmD3,  PPSMC_MSG_ArmD3,
   0),
+   MSG_MAP(SetNumBadMemoryPagesRetired,
PPSMC_MSG_SetNumBadMemoryPagesRetired,   0),
+   MSG_MAP(SetBadMemoryPagesRetiredFlagsPerChannel,
+   PPSMC_MSG_SetBadMemoryPagesRetiredFlagsPerChannel,  
 0),
 };
 
 static struct cmn2asic_mapping smu_v13_0_0_clk_map[SMU_CLK_COUNT] = {
@@ -1838,6 +1841,40 @@ static void smu_v13_0_0_set_smu_mailbox_registers(struct 
smu_context *smu)
smu->debug_resp_reg = SOC15_REG_OFFSET(MP1, 0, mmMP1_SMN_C2PMSG_54);
 }
 
+static int smu_v13_0_0_smu_send_bad_mem_page_num(struct smu_context *smu,
+   uint32_t size)
+{
+   int ret = 0;
+
+   /* message SMU to update the bad page number on SMUBUS */
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ SMU_MSG_SetNumBadMemoryPagesRetired,
+ size, NULL);
+   if (ret)
+   dev_err(smu->adev->dev,
+ "[%s] failed to message SMU to update bad memory pages number\n",
+ __func__);
+
+   return ret;
+}
+
+static int smu_v13_0_0_send_bad_mem_channel_flag(struct smu_context *smu,
+   uint32_t size)
+{
+   int ret = 0;
+
+   /* message SMU to update the bad channel info on SMUBUS */
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+ 
SMU_MSG_SetBadMemoryPagesRetiredFlagsPerChannel,
+ size, NULL);
+   if (ret)
+   dev_err(smu->adev->dev,
+ "[%s] failed to message SMU to update bad memory pages channel info\n",
+ __func__);
+
+   return ret;
+}
+
 static const struct pptable_funcs smu_v13_0_0_ppt_funcs = {
.get_allowed_feature_mask = smu_v13_0_0_get_allowed_feature_mask,
.set_default_dpm_table = smu_v13_0_0_set_default_dpm_table,
@@ -1908,6 +1945,8 @@ static const struct pptable_funcs smu_v13_0_0_ppt_funcs = 
{
.mode1_reset = smu_v13_0_0_mode1_reset,
.set_mp1_state = smu_v13_0_0_set_mp1_state,
.set_df_cstate = smu_v13_0_0_set_df_csta

[PATCH 1/2] drm/amdgpu: Add df v4_3 headers

2022-12-14 Thread Candice Li
Add df v4_3 header files.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 .../amd/include/asic_reg/df/df_4_3_offset.h   |  30 
 .../amd/include/asic_reg/df/df_4_3_sh_mask.h  | 157 ++
 2 files changed, 187 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_offset.h
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_sh_mask.h

diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_offset.h
new file mode 100644
index 00..fbb18e44ec5226
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_offset.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (C) 2022  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _df_4_3_OFFSET_HEADER
+#define _df_4_3_OFFSET_HEADER
+
+#define regDF_CS_UMC_AON0_HardwareAssertMaskLow 0x0e3e
+#define regDF_CS_UMC_AON0_HardwareAssertMaskLow_BASE_IDX 4
+#define regDF_NCS_PG0_HardwareAssertMaskHigh 0x0e3f
+#define regDF_NCS_PG0_HardwareAssertMaskHigh_BASE_IDX   4
+
+#endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_sh_mask.h
new file mode 100644
index 00..9c8f19ded4ebd6
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/df/df_4_3_sh_mask.h
@@ -0,0 +1,157 @@
+/*
+ * Copyright (C) 2022  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _df_4_3_SH_MASK_HEADER
+#define _df_4_3_SH_MASK_HEADER
+
+//DF_CS_UMC_AON0_HardwareAssertMaskLow
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk0__SHIFT  
0x0
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk1__SHIFT  
0x1
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk2__SHIFT  
0x2
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk3__SHIFT  
0x3
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk4__SHIFT  
0x4
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk5__SHIFT  
0x5
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk6__SHIFT  
0x6
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk7__SHIFT  
0x7
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk8__SHIFT  
0x8
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk9__SHIFT  
0x9
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk10__SHIFT 
0xa
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk11__SHIFT 
0xb
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk12__SHIFT 
0xc
+#define DF_CS_UMC_AON0_HardwareAssertMaskLow__HWAssertMsk13__SHIFT 
0xd
+#define DF_CS_UMC_AON0_Har

[PATCH 2/2] drm/amdgpu: Add poison mode query for df v4_3

2022-12-14 Thread Candice Li
Add poison mode query support on df v4_3.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c |  4 ++
 drivers/gpu/drm/amd/amdgpu/df_v4_3.c  | 61 +++
 drivers/gpu/drm/amd/amdgpu/df_v4_3.h  | 31 ++
 4 files changed, 98 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_3.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/df_v4_3.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 798d0e9a60b7dc..332cf8bda7a2da 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -81,7 +81,8 @@ amdgpu-y += \
 # add DF block
 amdgpu-y += \
df_v1_7.o \
-   df_v3_6.o
+   df_v3_6.o \
+   df_v4_3.o
 
 # add GMC block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 1bbd56029a4f90..b719852daa071a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -33,6 +33,7 @@
 #include "gmc_v9_0.h"
 #include "df_v1_7.h"
 #include "df_v3_6.h"
+#include "df_v4_3.h"
 #include "nbio_v6_1.h"
 #include "nbio_v7_0.h"
 #include "nbio_v7_4.h"
@@ -2329,6 +2330,9 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
case IP_VERSION(3, 5, 2):
adev->df.funcs = &df_v1_7_funcs;
break;
+   case IP_VERSION(4, 3, 0):
+   adev->df.funcs = &df_v4_3_funcs;
+   break;
default:
break;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_3.c 
b/drivers/gpu/drm/amd/amdgpu/df_v4_3.c
new file mode 100644
index 00..e8b9e19ede2e11
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_3.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "amdgpu.h"
+#include "df_v4_3.h"
+
+#include "df/df_4_3_offset.h"
+#include "df/df_4_3_sh_mask.h"
+
+static bool df_v4_3_query_ras_poison_mode(struct amdgpu_device *adev)
+{
+   uint32_t hw_assert_msklo, hw_assert_mskhi;
+   uint32_t v0, v1, v28, v31;
+
+   hw_assert_msklo = RREG32_SOC15(DF, 0,
+   regDF_CS_UMC_AON0_HardwareAssertMaskLow);
+   hw_assert_mskhi = RREG32_SOC15(DF, 0,
+   regDF_NCS_PG0_HardwareAssertMaskHigh);
+
+   v0 = REG_GET_FIELD(hw_assert_msklo,
+   DF_CS_UMC_AON0_HardwareAssertMaskLow, HWAssertMsk0);
+   v1 = REG_GET_FIELD(hw_assert_msklo,
+   DF_CS_UMC_AON0_HardwareAssertMaskLow, HWAssertMsk1);
+   v28 = REG_GET_FIELD(hw_assert_mskhi,
+   DF_NCS_PG0_HardwareAssertMaskHigh, HWAssertMsk28);
+   v31 = REG_GET_FIELD(hw_assert_mskhi,
+   DF_NCS_PG0_HardwareAssertMaskHigh, HWAssertMsk31);
+
+   if (v0 && v1 && v28 && v31)
+   return true;
+   else if (!v0 && !v1 && !v28 && !v31)
+   return false;
+   else {
+   dev_warn(adev->dev, "DF poison setting is inconsistent(%d:%d:%d:%d)!\n",
+   v0, v1, v28, v31);
+   return false;
+   }
+}
+
+const struct amdgpu_df_funcs df_v4_3_funcs = {
+   .query_ras_poison_mode = df_v4_3_query_ras_poison_mode,
+};
diff --git a/drivers/gpu/drm/amd/amdgpu/df_v4_3.h 
b/drivers/gpu/drm/amd/amdgpu/df_v4_3.h
new file mode 100644
index 00..06ef0724edd3d7
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/df_v4_3.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby gr

[PATCH] drm/amd/pm: Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10

2023-01-12 Thread Candice Li
Support RAS fatal error mode1 reset on smu v13_0_0 and v13_0_10.

Signed-off-by: Candice Li 
Reviewed-by: Evan Quan 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 42 +--
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c|  6 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h|  3 ++
 3 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index 969e5f96554015..d0cdc578344d8d 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -1904,15 +1904,51 @@ static int smu_v13_0_0_set_df_cstate(struct smu_context 
*smu,
   NULL);
 }
 
+static void smu_v13_0_0_set_mode1_reset_param(struct smu_context *smu,
+   uint32_t supported_version,
+   uint32_t *param)
+{
+   uint32_t smu_version;
+   struct amdgpu_device *adev = smu->adev;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   smu_cmn_get_smc_version(smu, NULL, &smu_version);
+
+   if ((smu_version >= supported_version) &&
+   ras && atomic_read(&ras->in_recovery))
+   /* Set RAS fatal error reset flag */
+   *param = 1 << 16;
+   else
+   *param = 0;
+}
+
 static int smu_v13_0_0_mode1_reset(struct smu_context *smu)
 {
int ret;
+   uint32_t param;
struct amdgpu_device *adev = smu->adev;
 
-   if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 10))
-   ret = smu_cmn_send_debug_smc_msg(smu, DEBUGSMC_MSG_Mode1Reset);
-   else
+   switch (adev->ip_versions[MP1_HWIP][0]) {
+   case IP_VERSION(13, 0, 0):
+   /* SMU 13_0_0 PMFW supports RAS fatal error reset from 78.77 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x004e4d00, &param);
+
+   ret = smu_cmn_send_smc_msg_with_param(smu,
+   SMU_MSG_Mode1Reset, param, 
NULL);
+   break;
+
+   case IP_VERSION(13, 0, 10):
+   /* SMU 13_0_10 PMFW supports RAS fatal error reset from 80.28 */
+   smu_v13_0_0_set_mode1_reset_param(smu, 0x00501c00, &param);
+
+   ret = smu_cmn_send_debug_smc_msg_with_param(smu,
+   DEBUGSMC_MSG_Mode1Reset, param);
+   break;
+
+   default:
ret = smu_cmn_send_smc_msg(smu, SMU_MSG_Mode1Reset, NULL);
+   break;
+   }
 
if (!ret)
msleep(SMU13_MODE1_RESET_WAIT_TIME_IN_MS);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 768b6e7dbd7719..d5abafc5a68201 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -404,6 +404,12 @@ int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
return __smu_cmn_send_debug_msg(smu, msg, 0);
 }
 
+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param)
+{
+   return __smu_cmn_send_debug_msg(smu, msg, param);
+}
+
 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
   enum smu_cmn2asic_mapping_type type,
   uint32_t index)
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
index f82cf76dd3a474..d7cd358a53bdcd 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.h
@@ -45,6 +45,9 @@ int smu_cmn_send_smc_msg(struct smu_context *smu,
 int smu_cmn_send_debug_smc_msg(struct smu_context *smu,
 uint32_t msg);
 
+int smu_cmn_send_debug_smc_msg_with_param(struct smu_context *smu,
+uint32_t msg, uint32_t param);
+
 int smu_cmn_wait_for_response(struct smu_context *smu);
 
 int smu_cmn_to_asic_specific_index(struct smu_context *smu,
-- 
2.17.1



[PATCH] drm/amdgpu: Fix build warning for TA debugfs interface

2022-04-27 Thread Candice Li
Remove the redundant conditional group to fix build warning
when CONFIG_DEBUG_FS is disabled.

Reported-by: Randy Dunlap 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 6806deb098d3f7..ccda96c924dc99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -305,9 +305,7 @@ static struct dentry *amdgpu_ta_if_debugfs_create(struct 
amdgpu_device *adev)
 
 void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev)
 {
-#if defined(CONFIG_DEBUG_FS)
dir = amdgpu_ta_if_debugfs_create(adev);
-#endif
 }
 
 void amdgpu_ta_if_debugfs_remove(void)
-- 
2.17.1



[PATCH v2] drm/amdgpu: Fix build warning for TA debugfs interface

2022-04-27 Thread Candice Li
Remove the redundant code to fix the build warning
when CONFIG_DEBUG_FS is disabled.

Reported-by: Randy Dunlap 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 43 ++
 1 file changed, 12 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 6806deb098d3f7..97ea2246bc1ddb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -24,13 +24,6 @@
 #include "amdgpu.h"
 #include "amdgpu_psp_ta.h"
 
-static const char *TA_IF_FS_NAME = "ta_if";
-
-struct dentry *dir;
-static struct dentry *ta_load_debugfs_dentry;
-static struct dentry *ta_unload_debugfs_dentry;
-static struct dentry *ta_invoke_debugfs_dentry;
-
 static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf,
size_t len, loff_t *off);
 static ssize_t ta_if_unload_debugfs_write(struct file *fp, const char *buf,
@@ -38,7 +31,6 @@ static ssize_t ta_if_unload_debugfs_write(struct file *fp, 
const char *buf,
 static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf,
size_t len, loff_t *off);
 
-
 static uint32_t get_bin_version(const uint8_t *bin)
 {
const struct common_firmware_header *hdr =
@@ -74,19 +66,19 @@ static bool is_ta_type_valid(enum ta_type_id ta_type)
 }
 
 static const struct file_operations ta_load_debugfs_fops = {
-   .write   = ta_if_load_debugfs_write,
+   .write  = ta_if_load_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
 
 static const struct file_operations ta_unload_debugfs_fops = {
-   .write   = ta_if_unload_debugfs_write,
+   .write  = ta_if_unload_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
 
 static const struct file_operations ta_invoke_debugfs_fops = {
-   .write   = ta_if_invoke_debugfs_write,
+   .write  = ta_if_invoke_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
@@ -286,31 +278,20 @@ static ssize_t ta_if_invoke_debugfs_write(struct file 
*fp, const char *buf, size
return ret;
 }
 
-static struct dentry *amdgpu_ta_if_debugfs_create(struct amdgpu_device *adev)
+void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev)
 {
+#if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
 
-   dir = debugfs_create_dir(TA_IF_FS_NAME, minor->debugfs_root);
-
-   ta_load_debugfs_dentry = debugfs_create_file("ta_load", 0200, dir, adev,
-&ta_load_debugfs_fops);
+   struct dentry *dir = debugfs_create_dir("ta_if", minor->debugfs_root);
 
-   ta_unload_debugfs_dentry = debugfs_create_file("ta_unload", 0200, dir,
-adev, 
&ta_unload_debugfs_fops);
+   debugfs_create_file("ta_load", 0200, dir, adev,
+&ta_load_debugfs_fops);
 
-   ta_invoke_debugfs_dentry = debugfs_create_file("ta_invoke", 0200, dir,
-adev, 
&ta_invoke_debugfs_fops);
-   return dir;
-}
+   debugfs_create_file("ta_unload", 0200, dir,
+adev, &ta_unload_debugfs_fops);
 
-void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev)
-{
-#if defined(CONFIG_DEBUG_FS)
-   dir = amdgpu_ta_if_debugfs_create(adev);
+   debugfs_create_file("ta_invoke", 0200, dir,
+adev, &ta_invoke_debugfs_fops);
 #endif
 }
-
-void amdgpu_ta_if_debugfs_remove(void)
-{
-   debugfs_remove_recursive(dir);
-}
-- 
2.17.1



[PATCH v3] drm/amdgpu: Fix build warning for TA debugfs interface

2022-04-28 Thread Candice Li
Remove the redundant codes to fix build warning
when CONFIG_DEBUG_FS is disabled.

Reported-by: Randy Dunlap 
Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c | 40 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h |  1 -
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
index 6806deb098d3f7..0988e00612e515 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
@@ -24,12 +24,7 @@
 #include "amdgpu.h"
 #include "amdgpu_psp_ta.h"
 
-static const char *TA_IF_FS_NAME = "ta_if";
-
-struct dentry *dir;
-static struct dentry *ta_load_debugfs_dentry;
-static struct dentry *ta_unload_debugfs_dentry;
-static struct dentry *ta_invoke_debugfs_dentry;
+#if defined(CONFIG_DEBUG_FS)
 
 static ssize_t ta_if_load_debugfs_write(struct file *fp, const char *buf,
size_t len, loff_t *off);
@@ -38,7 +33,6 @@ static ssize_t ta_if_unload_debugfs_write(struct file *fp, 
const char *buf,
 static ssize_t ta_if_invoke_debugfs_write(struct file *fp, const char *buf,
size_t len, loff_t *off);
 
-
 static uint32_t get_bin_version(const uint8_t *bin)
 {
const struct common_firmware_header *hdr =
@@ -74,19 +68,19 @@ static bool is_ta_type_valid(enum ta_type_id ta_type)
 }
 
 static const struct file_operations ta_load_debugfs_fops = {
-   .write   = ta_if_load_debugfs_write,
+   .write  = ta_if_load_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
 
 static const struct file_operations ta_unload_debugfs_fops = {
-   .write   = ta_if_unload_debugfs_write,
+   .write  = ta_if_unload_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
 
 static const struct file_operations ta_invoke_debugfs_fops = {
-   .write   = ta_if_invoke_debugfs_write,
+   .write  = ta_if_invoke_debugfs_write,
.llseek = default_llseek,
.owner  = THIS_MODULE
 };
@@ -286,31 +280,25 @@ static ssize_t ta_if_invoke_debugfs_write(struct file 
*fp, const char *buf, size
return ret;
 }
 
-static struct dentry *amdgpu_ta_if_debugfs_create(struct amdgpu_device *adev)
+void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev)
 {
struct drm_minor *minor = adev_to_drm(adev)->primary;
 
-   dir = debugfs_create_dir(TA_IF_FS_NAME, minor->debugfs_root);
+   struct dentry *dir = debugfs_create_dir("ta_if", minor->debugfs_root);
 
-   ta_load_debugfs_dentry = debugfs_create_file("ta_load", 0200, dir, adev,
-&ta_load_debugfs_fops);
+   debugfs_create_file("ta_load", 0200, dir, adev,
+&ta_load_debugfs_fops);
 
-   ta_unload_debugfs_dentry = debugfs_create_file("ta_unload", 0200, dir,
-adev, 
&ta_unload_debugfs_fops);
+   debugfs_create_file("ta_unload", 0200, dir,
+adev, &ta_unload_debugfs_fops);
 
-   ta_invoke_debugfs_dentry = debugfs_create_file("ta_invoke", 0200, dir,
-adev, 
&ta_invoke_debugfs_fops);
-   return dir;
+   debugfs_create_file("ta_invoke", 0200, dir,
+adev, &ta_invoke_debugfs_fops);
 }
 
+#else
 void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev)
 {
-#if defined(CONFIG_DEBUG_FS)
-   dir = amdgpu_ta_if_debugfs_create(adev);
-#endif
-}
 
-void amdgpu_ta_if_debugfs_remove(void)
-{
-   debugfs_remove_recursive(dir);
 }
+#endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h
index 883f89d57616d0..cfc1542f63ef94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.h
@@ -25,6 +25,5 @@
 #define __AMDGPU_PSP_TA_H__
 
 void amdgpu_ta_if_debugfs_init(struct amdgpu_device *adev);
-void amdgpu_ta_if_debugfs_remove(void);
 
 #endif
-- 
2.17.1



[PATCH] drm/amdgpu: Resolve pcie_bif RAS recovery bug

2022-05-20 Thread Candice Li
Check shared buf instead of init flag for xgmi ta shared buf init
during xgmi ta initialization.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 214e4e89a02876..e9411c28d88ba8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1177,7 +1177,7 @@ int psp_xgmi_initialize(struct psp_context *psp, bool 
set_extended_data, bool lo
psp->xgmi_context.context.mem_context.shared_mem_size = 
PSP_XGMI_SHARED_MEM_SIZE;
psp->xgmi_context.context.ta_load_type = GFX_CMD_ID_LOAD_TA;
 
-   if (!psp->xgmi_context.context.initialized) {
+   if (!psp->xgmi_context.context.mem_context.shared_buf) {
ret = psp_ta_init_shared_buf(psp, 
&psp->xgmi_context.context.mem_context);
if (ret)
return ret;
-- 
2.17.1



[PATCH] drm/amdgpu: Resolve RAS GFX error count issue after cold boot on Arcturus

2022-06-01 Thread Candice Li
Adjust the sequence for ras late init and separate ras reset error status
from query status.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 27 -
 2 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index ede2fa56f6c90d..99c1a2d3dae84d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -594,14 +594,15 @@ int amdgpu_get_gfx_off_status(struct amdgpu_device *adev, 
uint32_t *value)
 int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if 
*ras_block)
 {
int r;
-   r = amdgpu_ras_block_late_init(adev, ras_block);
-   if (r)
-   return r;
 
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, 
AMDGPU_RAS_BLOCK__GFX);
 
+   r = amdgpu_ras_block_late_init(adev, ras_block);
+   if (r)
+   return r;
+
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 31207f7eec0291..9c5e05ef8beb0c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -197,6 +197,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, 
char __user *buf,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
 
+   /* Hardware counter will be reset automatically after the query on 
Vega20 and Arcturus */
+   if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+   obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+   if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+   dev_warn(obj->adev->dev, "Failed to reset error counter 
and error status");
+   }
+
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
"ue", info.ue_count,
"ce", info.ce_count);
@@ -550,9 +557,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
 
-   if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+   if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+   obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
-   DRM_WARN("Failed to reset error counter and error 
status");
+   dev_warn(obj->adev->dev, "Failed to reset error counter 
and error status");
}
 
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -1027,9 +1035,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev,
}
}
 
-   if (!amdgpu_persistent_edc_harvesting_supported(adev))
-   amdgpu_ras_reset_error_status(adev, info->head.block);
-
return 0;
 }
 
@@ -1149,6 +1154,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device 
*adev,
if (res)
return res;
 
+   if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+   if (amdgpu_ras_reset_error_status(adev, 
info.head.block))
+   dev_warn(adev->dev, "Failed to reset error 
counter and error status");
+   }
+
ce += info.ce_count;
ue += info.ue_count;
}
@@ -1792,6 +1803,12 @@ static void amdgpu_ras_log_on_err_counter(struct 
amdgpu_device *adev)
continue;
 
amdgpu_ras_query_error_status(adev, &info);
+
+   if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+   adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+   if (amdgpu_ras_reset_error_status(adev, 
info.head.block))
+   dev_warn(adev->dev, "Failed to reset error 
counter and error status");
+   }
}
 }
 
-- 
2.17.1



[PATCH] drm/amdgpu: Resolve RAS GFX error count issue v2

2022-06-01 Thread Candice Li
Fix misleading indentation

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 99c1a2d3dae84d..424990e1bec10c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -599,14 +599,15 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *r
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, 
AMDGPU_RAS_BLOCK__GFX);
 
-   r = amdgpu_ras_block_late_init(adev, ras_block);
-   if (r)
-   return r;
+   r = amdgpu_ras_block_late_init(adev, ras_block);
+   if (r)
+   return r;
 
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
-   }
+   } else
+   amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
 
return 0;
 late_fini:
-- 
2.17.1



[PATCH v2] drm/amdgpu: Resolve RAS GFX error count issue v2

2022-06-01 Thread Candice Li
Fix misleading indentation and add ras unsupported checking
for gfx ras late init.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 99c1a2d3dae84d..16699158e00d8c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -599,13 +599,15 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, 
struct ras_common_if *r
if (!amdgpu_persistent_edc_harvesting_supported(adev))
amdgpu_ras_reset_error_status(adev, 
AMDGPU_RAS_BLOCK__GFX);
 
-   r = amdgpu_ras_block_late_init(adev, ras_block);
-   if (r)
-   return r;
+   r = amdgpu_ras_block_late_init(adev, ras_block);
+   if (r)
+   return r;
 
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
if (r)
goto late_fini;
+   } else {
+   amdgpu_ras_feature_enable_on_boot(adev, ras_block, 0);
}
 
return 0;
-- 
2.17.1



[PATCH] drm/amdgpu: Allow the initramfs generator to include psp_13_0_6_ta

2023-07-13 Thread Candice Li
Allow the initramfs generator to automatically include psp_13_0_6_ta
firmware to initramfs.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index f9cb0d2c89d15b..e1a392bcea70d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -49,6 +49,7 @@ MODULE_FIRMWARE("amdgpu/psp_13_0_10_ta.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_11_toc.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_11_ta.bin");
 MODULE_FIRMWARE("amdgpu/psp_13_0_6_sos.bin");
+MODULE_FIRMWARE("amdgpu/psp_13_0_6_ta.bin");
 
 /* For large FW files the time to complete can be very long */
 #define USBC_PD_POLLING_LIMIT_S 240
-- 
2.25.1



[PATCH] drm/amdgpu: Extend poison mode check to SDMA/VCN/JPEG

2023-08-08 Thread Candice Li
Treat SDMA/VCN/JPEG as RAS capable IP blocks in poison mode.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bb29cb57add5a0..8e9cd05dee245a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3143,7 +3143,10 @@ int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 * that the ras block supports ras function.
 */
if (!ret &&
-   block == AMDGPU_RAS_BLOCK__GFX &&
+   (block == AMDGPU_RAS_BLOCK__GFX ||
+block == AMDGPU_RAS_BLOCK__SDMA ||
+block == AMDGPU_RAS_BLOCK__VCN ||
+block == AMDGPU_RAS_BLOCK__JPEG) &&
amdgpu_ras_is_poison_mode_supported(adev) &&
amdgpu_ras_get_ras_block(adev, block, 0))
ret = 1;
-- 
2.25.1



[PATCH] drm/amdgpu: Add I2C EEPROM support on smu v13_0_6

2023-08-10 Thread Candice Li
Support I2C EEPROM on smu v13_0_6.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 4287743e121245..27fb9b640011c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -159,6 +159,7 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device 
*adev)
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 2): /* Aldebaran */
case IP_VERSION(13, 0, 10):
+   case IP_VERSION(13, 0, 6):
return true;
default:
return false;
@@ -213,6 +214,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
return true;
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 10):
+   case IP_VERSION(13, 0, 6):
control->i2c_address = EEPROM_I2C_MADDR_4;
return true;
default:
-- 
2.25.1



[PATCH v2] drm/amdgpu: Add I2C EEPROM support on smu v13_0_6

2023-08-10 Thread Candice Li
Support I2C EEPROM on smu v13_0_6.

v2: Move IP_VERSION(13, 0, 6) ahead of IP_VERSION(13, 0, 10).

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 4287743e121245..4764d2171f92e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -158,6 +158,7 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device 
*adev)
case IP_VERSION(11, 0, 7): /* Sienna cichlid */
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 2): /* Aldebaran */
+   case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
return true;
default:
@@ -212,6 +213,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
control->i2c_address = EEPROM_I2C_MADDR_0;
return true;
case IP_VERSION(13, 0, 0):
+   case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
control->i2c_address = EEPROM_I2C_MADDR_4;
return true;
-- 
2.25.1



[PATCH] drm/amdgpu: Update RAS EEPROM support on smu v13_0_6.

2023-08-16 Thread Candice Li
RAS EEPROM device is only supported on dGPU platform
for smu v13_0_6.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 4764d2171f92e9..595d5e535aca63 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -158,9 +158,10 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device 
*adev)
case IP_VERSION(11, 0, 7): /* Sienna cichlid */
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 2): /* Aldebaran */
-   case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
return true;
+   case IP_VERSION(13, 0, 6):
+   return (adev->gmc.is_app_apu) ? false : true;
default:
return false;
}
-- 
2.25.1



[PATCH] drm/amdgpu: Add nps_mode in RAS init_flag

2024-10-18 Thread Candice Li
Add nps_mode in RAS init_flag.

Signed-off-by: Candice Li 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 9 +
 2 files changed, 12 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index cad153e333d824..a545892e51802e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1834,6 +1834,9 @@ int psp_ras_initialize(struct psp_context *psp)
ras_cmd->ras_in_message.init_flags.xcc_mask =
adev->gfx.xcc_mask;
ras_cmd->ras_in_message.init_flags.channel_dis_num = 
hweight32(adev->gmc.m_half_use) * 2;
+   if (adev->gmc.gmc_funcs->query_mem_partition_mode)
+   ras_cmd->ras_in_message.init_flags.nps_mode =
+   adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
 
ret = psp_ta_load(psp, &psp->ras_context.context);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index 3ac56a9645ebdf..21b71a427b1fdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -113,6 +113,14 @@ enum ta_ras_address_type {
TA_RAS_PA_TO_MCA,
 };
 
+enum ta_ras_nps_mode {
+   TA_RAS_UNKNOWN_MODE = 0,
+   TA_RAS_NPS1_MODE = 1,
+   TA_RAS_NPS2_MODE = 2,
+   TA_RAS_NPS4_MODE = 4,
+   TA_RAS_NPS8_MODE = 8,
+};
+
 /* Input/output structures for RAS commands */
 /**/
 
@@ -139,6 +147,7 @@ struct ta_ras_init_flags {
uint8_t dgpu_mode;
uint16_t xcc_mask;
uint8_t channel_dis_num;
+   uint8_t nps_mode;
 };
 
 struct ta_ras_mca_addr {
-- 
2.25.1



[PATCH 2/2] drm/amdgpu: Check umc ras_if init to disable gmc v12_0 ecc_irq

2024-12-04 Thread Candice Li
Check whether the umc ras_if is initialized, instead of checking whether
umc ras is supported, before disabling the gmc v12_0 ecc_irq.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 621769255ffac2..37c4644f5ebc36 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -939,8 +939,7 @@ static int gmc_v12_0_hw_fini(struct amdgpu_ip_block 
*ip_block)
 
amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
 
-   if (adev->gmc.ecc_irq.funcs &&
-   amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
+   if (adev->gmc.ecc_irq.funcs && adev->umc.ras_if)
amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
 
gmc_v12_0_gart_disable(adev);
-- 
2.25.1



[PATCH 1/2] drm/amdgpu: Add psp v14_0_3 ras support

2024-12-04 Thread Candice Li
Add psp v14_0_3 ras support.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1de934cd5764fa..02662eec6776f2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3581,6 +3581,7 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
case IP_VERSION(13, 0, 14):
+   case IP_VERSION(14, 0, 3):
return true;
default:
return false;
-- 
2.25.1



[PATCH 4/4] drm/amdgpu: Support nbif v6_3_1 fatal error handling

2024-12-06 Thread Candice Li
Add nbif v6_3_1 fatal error handling support.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 12 
 drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c | 81 
 drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.h |  1 +
 drivers/gpu/drm/amd/amdgpu/soc24.c   | 19 +-
 4 files changed, 111 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 623ae9b3880037..db081618e85c3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -36,6 +36,7 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include "nbio_v4_3.h"
+#include "nbif_v6_3_1.h"
 #include "nbio_v7_9.h"
 #include "atom.h"
 #include "amdgpu_reset.h"
@@ -3911,6 +3912,17 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 * check DF RAS */
adev->nbio.ras = &nbio_v4_3_ras;
break;
+   case IP_VERSION(6, 3, 1):
+   if (adev->ras_hw_enabled & (1 << AMDGPU_RAS_BLOCK__DF))
+   /* unlike other generations of nbio ras,
+* nbif v6_3_1 only supports a fatal error interrupt
+* to inform software that DF is frozen due to a
+* system fatal error event. driver should not
+* enable nbio ras in such case. Instead,
+* check DF RAS
+*/
+   adev->nbio.ras = &nbif_v6_3_1_ras;
+   break;
case IP_VERSION(7, 9, 0):
case IP_VERSION(7, 9, 1):
if (!adev->gmc.is_app_apu)
diff --git a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c 
b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
index 39919e0892c148..c92875ceb31f45 100644
--- a/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/nbif_v6_3_1.c
@@ -28,6 +28,7 @@
 #include "nbif/nbif_6_3_1_sh_mask.h"
 #include "pcie/pcie_6_1_0_offset.h"
 #include "pcie/pcie_6_1_0_sh_mask.h"
+#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 #include 
 
 static void nbif_v6_3_1_remap_hdp_registers(struct amdgpu_device *adev)
@@ -518,3 +519,83 @@ const struct amdgpu_nbio_funcs nbif_v6_3_1_sriov_funcs = {
.get_rom_offset = nbif_v6_3_1_get_rom_offset,
.set_reg_remap = nbif_v6_3_1_set_reg_remap,
 };
+
+static int nbif_v6_3_1_set_ras_err_event_athub_irq_state(struct amdgpu_device 
*adev,
+  struct amdgpu_irq_src 
*src,
+  unsigned type,
+  enum 
amdgpu_interrupt_state state)
+{
+   /* The ras_controller_irq enablement should be done in psp bl when it
+* tries to enable ras feature. Driver only need to set the correct 
interrupt
+* vector for bare-metal and sriov use case respectively
+*/
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ 
RAS_ATHUB_ERR_EVENT_INTERRUPT_DISABLE,
+ (state == 
AMDGPU_IRQ_STATE_ENABLE) ? 0 : 1);
+   WREG32_SOC15(NBIO, 0, regBIF_BX0_BIF_DOORBELL_INT_CNTL, 
bif_doorbell_int_cntl);
+
+   return 0;
+}
+
+static int nbif_v6_3_1_process_err_event_athub_irq(struct amdgpu_device *adev,
+struct amdgpu_irq_src *source,
+struct amdgpu_iv_entry *entry)
+{
+   /* By design, the ih cookie for err_event_athub_irq should be written
+* to bif ring. since bif ring is not enabled, just leave process 
callback
+* as a dummy one.
+*/
+   return 0;
+}
+
+static const struct amdgpu_irq_src_funcs 
nbif_v6_3_1_ras_err_event_athub_irq_funcs = {
+   .set = nbif_v6_3_1_set_ras_err_event_athub_irq_state,
+   .process = nbif_v6_3_1_process_err_event_athub_irq,
+};
+
+static void nbif_v6_3_1_handle_ras_err_event_athub_intr_no_bifring(struct 
amdgpu_device *adev)
+{
+   uint32_t bif_doorbell_int_cntl;
+
+   bif_doorbell_int_cntl = RREG32_SOC15(NBIO, 0, 
regBIF_BX0_BIF_DOORBELL_INT_CNTL);
+   if (REG_GET_FIELD(bif_doorbell_int_cntl,
+ BIF_BX0_BIF_DOORBELL_INT_CNTL,
+ RAS_ATHUB_ERR_EVENT_INTERRUPT_STATUS)) {
+   /* driver has to clear the interrupt status when bif ring is 
disabled */
+   bif_doorbell_int_cntl = REG_SET_FIELD(bif_doorbell_int_cntl,
+   BIF_BX0_B

[PATCH 2/4] drm/amdgpu: Add umc v8_14_0 ip headers

2024-12-06 Thread Candice Li
Add umc v8_14_0 ip headers.

Signed-off-by: Candice Li 
---
 .../include/asic_reg/umc/umc_8_14_0_offset.h  | 29 +++
 .../include/asic_reg/umc/umc_8_14_0_sh_mask.h | 37 +++
 2 files changed, 66 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_offset.h
 create mode 100644 
drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_sh_mask.h

diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_offset.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_offset.h
new file mode 100644
index 00000000000000..0e8f12728d5f48
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_offset.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2024  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _umc_8_14_0_OFFSET_HEADER
+#define _umc_8_14_0_OFFSET_HEADER
+
+#define regUMCCH0_GeccErrCntSel 0x0328
+#define regUMCCH0_GeccErrCntSel_BASE_IDX0
+#define regUMCCH0_GeccErrCnt0x0329
+#define regUMCCH0_GeccErrCnt_BASE_IDX   0
+
+#endif
diff --git a/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_sh_mask.h 
b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_sh_mask.h
new file mode 100644
index 00000000000000..5d723b5d9b87b8
--- /dev/null
+++ b/drivers/gpu/drm/amd/include/asic_reg/umc/umc_8_14_0_sh_mask.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2024  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _umc_8_14_0_SH_MASK_HEADER
+#define _umc_8_14_0_SH_MASK_HEADER
+
+//UMCCH0_GeccErrCntSel
+#define UMCCH0_GeccErrCntSel__GeccErrInt__SHIFT  0xc
+#define UMCCH0_GeccErrCntSel__GeccErrCntEn__SHIFT0xf
+#define UMCCH0_GeccErrCntSel__PoisonCntEn__SHIFT 0x10
+#define UMCCH0_GeccErrCntSel__GeccErrInt_MASK0x3000L
+#define UMCCH0_GeccErrCntSel__GeccErrCntEn_MASK  0x8000L
+#define UMCCH0_GeccErrCntSel__PoisonCntEn_MASK   0x0003L
+//UMCCH0_GeccErrCnt
+#define UMCCH0_GeccErrCnt__GeccErrCnt__SHIFT 0x0
+#define UMCCH0_GeccErrCnt__GeccUnCorrErrCnt__SHIFT   0x10
+#define UMCCH0_GeccErrCnt__GeccErrCnt_MASK   0xL
+#define UMCCH0_GeccErrCnt__GeccUnCorrErrCnt_MASK 0xL
+
+#endif
-- 
2.25.1



[PATCH 1/4] drm/amdgpu: Add psp v14_0_3 ras support

2024-12-06 Thread Candice Li
Add psp v14_0_3 ras support.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 9961ff6c29f668..623ae9b3880037 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3571,6 +3571,7 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+   case IP_VERSION(14, 0, 3):
return true;
default:
return false;
-- 
2.25.1



[PATCH 3/4] drm/amdgpu: Add umc v8_14 ras functions

2024-12-06 Thread Candice Li
Add umc v8_14 ras functions.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c |  18 ++-
 drivers/gpu/drm/amd/amdgpu/umc_v8_14.c | 160 +
 drivers/gpu/drm/amd/amdgpu/umc_v8_14.h |  51 
 4 files changed, 229 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v8_14.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/umc_v8_14.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 502adcda7b65cd..88caac49e46e0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -106,7 +106,7 @@ amdgpu-y += \
 
 # add UMC block
 amdgpu-y += \
-   umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o umc_v12_0.o
+   umc_v6_0.o umc_v6_1.o umc_v6_7.o umc_v8_7.o umc_v8_10.o umc_v12_0.o 
umc_v8_14.o
 
 # add IH block
 amdgpu-y += \
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
index 621769255ffac2..b749f1c3f6a9af 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c
@@ -40,7 +40,7 @@
 #include "gfxhub_v12_0.h"
 #include "mmhub_v4_1_0.h"
 #include "athub_v4_1_0.h"
-
+#include "umc_v8_14.h"
 
 static int gmc_v12_0_ecc_interrupt_state(struct amdgpu_device *adev,
 struct amdgpu_irq_src *src,
@@ -581,6 +581,18 @@ static void gmc_v12_0_set_gmc_funcs(struct amdgpu_device 
*adev)
 
 static void gmc_v12_0_set_umc_funcs(struct amdgpu_device *adev)
 {
+   switch (amdgpu_ip_version(adev, UMC_HWIP, 0)) {
+   case IP_VERSION(8, 14, 0):
+   adev->umc.channel_inst_num = UMC_V8_14_CHANNEL_INSTANCE_NUM;
+   adev->umc.umc_inst_num = UMC_V8_14_UMC_INSTANCE_NUM(adev);
+   adev->umc.node_inst_num = 0;
+   adev->umc.max_ras_err_cnt_per_query = 
UMC_V8_14_TOTAL_CHANNEL_NUM(adev);
+   adev->umc.channel_offs = UMC_V8_14_PER_CHANNEL_OFFSET;
+   adev->umc.ras = &umc_v8_14_ras;
+   break;
+   default:
+   break;
+   }
 }
 
 
@@ -829,6 +841,10 @@ static int gmc_v12_0_sw_init(struct amdgpu_ip_block 
*ip_block)
 
amdgpu_vm_manager_init(adev);
 
+   r = amdgpu_gmc_ras_sw_init(adev);
+   if (r)
+   return r;
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_14.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v8_14.c
new file mode 100644
index 00000000000000..eaca10a3c4a9df
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_14.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright 2024 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+#include "umc_v8_14.h"
+#include "amdgpu_ras.h"
+#include "amdgpu_umc.h"
+#include "amdgpu.h"
+#include "umc/umc_8_14_0_offset.h"
+#include "umc/umc_8_14_0_sh_mask.h"
+
+static inline uint32_t get_umc_v8_14_reg_offset(struct amdgpu_device *adev,
+   uint32_t umc_inst,
+   uint32_t ch_inst)
+{
+   return adev->umc.channel_offs * ch_inst + UMC_V8_14_INST_DIST * 
umc_inst;
+}
+
+static int umc_v8_14_clear_error_count_per_channel(struct amdgpu_device *adev,
+   uint32_t node_inst, uint32_t umc_inst,
+   uint32_t ch_inst, void *data)
+{
+   uint32_t ecc_err_cnt_addr;
+   uint32_t umc_reg_offset =
+   get_umc_v8_14_reg_offset(adev, umc_inst, ch_inst);
+
+   ecc_err_cnt_addr =
+   SOC15_REG_OFFSET(UMC, 0, regUMCCH0_GeccErrCnt);
+
+   /* clear error count */
+   WREG32_PCIE((ecc_err_cnt_addr + umc_reg_offset) * 4,
+   UMC_V8_14_CE_

[PATCH] drm/amdgpu: Enable psp v14_0_3 RAS support for non-SRIOV configurations.

2024-12-16 Thread Candice Li
Enable psp v14_0_3 RAS support for non-SRIOV configurations.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index db081618e85c3b..01c947066a2eb0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3572,7 +3572,6 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
-   case IP_VERSION(14, 0, 3):
return true;
default:
return false;
@@ -3586,6 +3585,7 @@ static bool amdgpu_ras_asic_supported(struct 
amdgpu_device *adev)
case IP_VERSION(13, 0, 10):
case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
+   case IP_VERSION(14, 0, 3):
return true;
default:
return false;
-- 
2.25.1



[PATCH] drm/amdgpu: Enable ACA by default for psp v13_0_12

2025-02-11 Thread Candice Li
Enable ACA by default for psp v13_0_12.

Signed-off-by: Candice Li 
Reviewed-by: Hawking Zhang 
Reviewed-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 44d13a60588df7..3c3312bbfee8d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3760,8 +3760,9 @@ static void amdgpu_ras_check_supported(struct 
amdgpu_device *adev)
adev->ras_enabled = amdgpu_ras_enable == 0 ? 0 :
adev->ras_hw_enabled & amdgpu_ras_mask;
 
-   /* aca is disabled by default */
-   adev->aca.is_enabled = false;
+   /* aca is disabled by default except for psp v13_0_12 */
+   adev->aca.is_enabled =
+   (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 12));
 
/* bad page feature is not applicable to specific app platform */
if (adev->gmc.is_app_apu &&
-- 
2.25.1



[PATCH] drm/amdgpu: Optimize the enablement of GECC

2025-02-11 Thread Candice Li
Enable GECC only when the default memory ECC mode or
the module parameter amdgpu_ras_enable is activated.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h   |  1 +
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  | 18 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c   | 31 ++-
 3 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index c03a586eb5a26f..7f84cc66a19b34 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1167,6 +1167,7 @@ struct amdgpu_device {
struct ratelimit_state  throttling_logging_rs;
uint32_tras_hw_enabled;
uint32_tras_enabled;
+   boolras_default_ecc_enabled;
 
boolno_hw_access;
struct pci_saved_state  *pci_state;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index f873dd3cae1606..eb015bdda8a749 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -549,9 +549,10 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
u16 data_offset, size;
union umc_info *umc_info;
u8 frev, crev;
-   bool ecc_default_enabled = false;
+   bool mem_ecc_enabled = false;
u8 umc_config;
u32 umc_config1;
+   adev->ras_default_ecc_enabled = false;
 
index = 
get_index_into_master_table(atom_master_list_of_data_tables_v2_1,
umc_info);
@@ -563,20 +564,22 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
switch (crev) {
case 1:
umc_config = 
le32_to_cpu(umc_info->v31.umc_config);
-   ecc_default_enabled =
+   mem_ecc_enabled =
(umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
break;
case 2:
umc_config = 
le32_to_cpu(umc_info->v32.umc_config);
-   ecc_default_enabled =
+   mem_ecc_enabled =
(umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
break;
case 3:
umc_config = 
le32_to_cpu(umc_info->v33.umc_config);
umc_config1 = 
le32_to_cpu(umc_info->v33.umc_config1);
-   ecc_default_enabled =
+   mem_ecc_enabled =
((umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ||
 (umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE)) ? true : false;
+   adev->ras_default_ecc_enabled =
+   (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
break;
default:
/* unsupported crev */
@@ -585,9 +588,12 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
} else if (frev == 4) {
switch (crev) {
case 0:
+   umc_config = 
le32_to_cpu(umc_info->v40.umc_config);
umc_config1 = 
le32_to_cpu(umc_info->v40.umc_config1);
-   ecc_default_enabled =
+   mem_ecc_enabled =
(umc_config1 & 
UMC_CONFIG1__ENABLE_ECC_CAPABLE) ? true : false;
+   adev->ras_default_ecc_enabled =
+   (umc_config & 
UMC_CONFIG__DEFAULT_MEM_ECC_ENABLE) ? true : false;
break;
default:
/* unsupported crev */
@@ -599,7 +605,7 @@ bool amdgpu_atomfirmware_mem_ecc_supported(struct 
amdgpu_device *adev)
}
}
 
-   return ecc_default_enabled;
+   return mem_ecc_enabled;
 }
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 196b8dbffc2e28..06f6bbdc7f5e9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1794,7 +1794,22 @@ int psp_ras_initialize(struct psp_context *psp)
if (ret)
dev_warn(adev->dev, "PSP get boot config failed\n");
 
-   if (!amdgpu_ras_i

[PATCH] Remove unnecessary firmware version check for gc v9_4_2

2025-03-25 Thread Candice Li
GC v9_4_2 uses a new versioning scheme for CP firmware, making
the warning ("CP firmware version too old, please update!") irrelevant.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 5bf9d27d1ead9a..1a072362855315 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -1269,6 +1269,7 @@ static void gfx_v9_0_check_fw_write_wait(struct 
amdgpu_device *adev)
adev->gfx.mec_fw_write_wait = false;
 
if ((amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 1)) &&
+   (amdgpu_ip_version(adev, GC_HWIP, 0) != IP_VERSION(9, 4, 2)) &&
((adev->gfx.mec_fw_version < 0x01a5) ||
 (adev->gfx.mec_feature_version < 46) ||
 (adev->gfx.pfp_fw_version < 0x00b7) ||
-- 
2.25.1



[PATCH] drm/amdgpu: Add EEPROM I2C address support for smu v13_0_12

2025-03-14 Thread Candice Li
Add EEPROM I2C address support for smu v13_0_12.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index 3597ecd9baca34..3de89e95a636c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -161,6 +161,7 @@ static bool __is_ras_eeprom_supported(struct amdgpu_device 
*adev)
case IP_VERSION(13, 0, 10):
return true;
case IP_VERSION(13, 0, 6):
+   case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
return (adev->gmc.is_app_apu) ? false : true;
default:
@@ -223,6 +224,7 @@ static bool __get_eeprom_i2c_addr(struct amdgpu_device 
*adev,
return true;
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
+   case IP_VERSION(13, 0, 12):
case IP_VERSION(13, 0, 14):
control->i2c_address = EEPROM_I2C_MADDR_4;
return true;
-- 
2.25.1



[PATCH] drm/amdgpu: Add active_umc_mask to ras init_flags

2025-03-14 Thread Candice Li
Add active_umc_mask to ras init_flags.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 1 +
 drivers/gpu/drm/amd/amdgpu/ta_ras_if.h  | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 79dad75bd0e79f..d3b05b7020c84d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1861,6 +1861,7 @@ int psp_ras_initialize(struct psp_context *psp)
if (adev->gmc.gmc_funcs->query_mem_partition_mode)
ras_cmd->ras_in_message.init_flags.nps_mode =
adev->gmc.gmc_funcs->query_mem_partition_mode(adev);
+   ras_cmd->ras_in_message.init_flags.active_umc_mask = 
adev->umc.active_mask;
 
ret = psp_ta_load(psp, &psp->ras_context.context);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h 
b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
index 64891f0993666e..a3b5fda224328b 100644
--- a/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
+++ b/drivers/gpu/drm/amd/amdgpu/ta_ras_if.h
@@ -151,6 +151,7 @@ struct ta_ras_init_flags {
uint16_t xcc_mask;
uint8_t channel_dis_num;
uint8_t nps_mode;
+   uint32_t active_umc_mask;
 };
 
 struct ta_ras_mca_addr {
-- 
2.25.1



[PATCH] drm/amdgpu: Set RAS EEPROM table version to v3 for umc v12_5

2025-04-10 Thread Candice Li
Set RAS EEPROM table version to v3 for umc v12_5.

Signed-off-by: Candice Li 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c985d58fdd7ddc..2c58e09e56f95d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -418,6 +418,7 @@ static void amdgpu_ras_set_eeprom_table_version(struct 
amdgpu_ras_eeprom_control
hdr->version = RAS_TABLE_VER_V2_1;
return;
case IP_VERSION(12, 0, 0):
+   case IP_VERSION(12, 5, 0):
hdr->version = RAS_TABLE_VER_V3;
return;
default:
-- 
2.25.1