On 4/28/2024 12:38 PM, YiPeng Chai wrote:
> Add mutex to protect ras shared memory.
> 
> Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c    | 121 ++++++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h    |   1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
>  3 files changed, 84 insertions(+), 40 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index 5583e2d1b12f..fa4fea00f6b4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context 
> *psp)
>       }
>  }
>  
> +static int psp_ras_send_cmd(struct psp_context *psp,
> +             enum ras_command cmd_id, void *in, void *out)
> +{
> +     struct ta_ras_shared_memory *ras_cmd;
> +     uint32_t cmd = cmd_id;
> +     int ret = 0;
> +
> +     mutex_lock(&psp->ras_context.mutex);
> +     ras_cmd = (struct ta_ras_shared_memory 
> *)psp->ras_context.context.mem_context.shared_buf;
> +     memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> +
> +     switch (cmd) {
> +     case TA_RAS_COMMAND__ENABLE_FEATURES:
> +     case TA_RAS_COMMAND__DISABLE_FEATURES:
> +             memcpy(&ras_cmd->ras_in_message,
> +                     in, sizeof(ras_cmd->ras_in_message));
> +             break;
> +     case TA_RAS_COMMAND__TRIGGER_ERROR:
> +             memcpy(&ras_cmd->ras_in_message.trigger_error,
> +                     in, sizeof(ras_cmd->ras_in_message.trigger_error));
> +             break;
> +     case TA_RAS_COMMAND__QUERY_ADDRESS:
> +             memcpy(&ras_cmd->ras_in_message.address,
> +                     in, sizeof(ras_cmd->ras_in_message.address));
> +             break;
> +     default:
> +             dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
> +             ret = -EINVAL;
> +             goto err_out;
> +     }
> +
> +     ras_cmd->cmd_id = cmd;
> +     ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +
> +     switch (cmd) {
> +     case TA_RAS_COMMAND__TRIGGER_ERROR:
> +             if (out) {

As NULL check for 'out' is done before copying, better to do the same
for 'in' also.
> +                     uint32_t *ras_status = (uint32_t *)out;
> +
> +                     *ras_status = ras_cmd->ras_status;
> +             }
> +             break;
> +     case TA_RAS_COMMAND__QUERY_ADDRESS:
> +             if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +                     ret = -EINVAL;
> +             else if (out)
> +                     memcpy(out,
> +                             &ras_cmd->ras_out_message.address,
> +                             sizeof(ras_cmd->ras_out_message.address));
> +             break;
> +     default:
> +             break;
> +     }
> +
> +err_out:
> +     mutex_unlock(&psp->ras_context.mutex);
> +
> +     return ret;
> +}
> +
>  int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
>  {
>       struct ta_ras_shared_memory *ras_cmd;
> @@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t 
> ta_cmd_id)
>  int psp_ras_enable_features(struct psp_context *psp,
>               union ta_ras_cmd_input *info, bool enable)
>  {
> -     struct ta_ras_shared_memory *ras_cmd;
> +     enum ras_command cmd_id;
>       int ret;
>  
> -     if (!psp->ras_context.context.initialized)
> +     if (!psp->ras_context.context.initialized || !info)
>               return -EINVAL;
>  
> -     ras_cmd = (struct ta_ras_shared_memory 
> *)psp->ras_context.context.mem_context.shared_buf;
> -     memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -     if (enable)
> -             ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
> -     else
> -             ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
> -
> -     ras_cmd->ras_in_message = *info;
> -
> -     ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +     cmd_id = enable ?
> +             TA_RAS_COMMAND__ENABLE_FEATURES : 
> TA_RAS_COMMAND__DISABLE_FEATURES;
> +     ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
>       if (ret)
>               return -EINVAL;
>  
> @@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)
>  
>       psp->ras_context.context.initialized = false;
>  
> +     mutex_destroy(&psp->ras_context.mutex);
> +
>       return ret;
>  }
>  
> @@ -1729,9 +1783,10 @@ int psp_ras_initialize(struct psp_context *psp)
>  
>       ret = psp_ta_load(psp, &psp->ras_context.context);
>  
> -     if (!ret && !ras_cmd->ras_status)
> +     if (!ret && !ras_cmd->ras_status) {
>               psp->ras_context.context.initialized = true;
> -     else {
> +             mutex_init(&psp->ras_context.mutex);
> +     } else {
>               if (ras_cmd->ras_status)
>                       dev_warn(adev->dev, "RAS Init Status: 0x%X\n", 
> ras_cmd->ras_status);
>  
> @@ -1745,12 +1800,12 @@ int psp_ras_initialize(struct psp_context *psp)
>  int psp_ras_trigger_error(struct psp_context *psp,
>                         struct ta_ras_trigger_error_input *info, uint32_t 
> instance_mask)
>  {
> -     struct ta_ras_shared_memory *ras_cmd;
>       struct amdgpu_device *adev = psp->adev;
>       int ret;
>       uint32_t dev_mask;
> +     uint32_t ras_status;
>  
> -     if (!psp->ras_context.context.initialized)
> +     if (!psp->ras_context.context.initialized || !info)
>               return -EINVAL;
>  
>       switch (info->block_id) {
> @@ -1774,13 +1829,8 @@ int psp_ras_trigger_error(struct psp_context *psp,
>       dev_mask &= AMDGPU_RAS_INST_MASK;
>       info->sub_block_index |= dev_mask;
>  
> -     ras_cmd = (struct ta_ras_shared_memory 
> *)psp->ras_context.context.mem_context.shared_buf;
> -     memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -     ras_cmd->cmd_id = TA_RAS_COMMAND__TRIGGER_ERROR;
> -     ras_cmd->ras_in_message.trigger_error = *info;
> -
> -     ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> +     ret = psp_ras_send_cmd(psp,
> +                     TA_RAS_COMMAND__TRIGGER_ERROR, info, &ras_status);
>       if (ret)
>               return -EINVAL;
>  
> @@ -1790,9 +1840,9 @@ int psp_ras_trigger_error(struct psp_context *psp,
>       if (amdgpu_ras_intr_triggered())
>               return 0;
>  
> -     if (ras_cmd->ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
> +     if (ras_status == TA_RAS_STATUS__TEE_ERROR_ACCESS_DENIED)
>               return -EACCES;
> -     else if (ras_cmd->ras_status)
> +     else if (ras_status)
>               return -EINVAL;
>  
>       return 0;
> @@ -1802,25 +1852,16 @@ int psp_ras_query_address(struct psp_context *psp,
>                         struct ta_ras_query_address_input *addr_in,
>                         struct ta_ras_query_address_output *addr_out)
>  {
> -     struct ta_ras_shared_memory *ras_cmd;
>       int ret;
>  
> -     if (!psp->ras_context.context.initialized)
> -             return -EINVAL;
> -
> -     ras_cmd = (struct ta_ras_shared_memory 
> *)psp->ras_context.context.mem_context.shared_buf;
> -     memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
> -
> -     ras_cmd->cmd_id = TA_RAS_COMMAND__QUERY_ADDRESS;
> -     ras_cmd->ras_in_message.address = *addr_in;
> -
> -     ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
> -     if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
> +     if (!psp->ras_context.context.initialized ||
> +             !addr_in || !addr_out)
>               return -EINVAL;
>  
> -     *addr_out = ras_cmd->ras_out_message.address;
> +     ret = psp_ras_send_cmd(psp,
> +                     TA_RAS_COMMAND__QUERY_ADDRESS, addr_in, addr_out);

return psp_ras_send_cmd() will do.
>  
> -     return 0;
> +     return ret;
>  }
>  // ras end
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> index ee16f134ae92..686023918ce3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
> @@ -197,6 +197,7 @@ struct psp_xgmi_context {
>  struct psp_ras_context {
>       struct ta_context               context;
>       struct amdgpu_ras               *ras;
> +     struct mutex                    mutex;
>  };
>  
>  #define MEM_TRAIN_SYSTEM_SIGNATURE           0x54534942
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> index ca5c86e5f7cd..87f213f92d83 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c
> @@ -348,6 +348,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file 
> *fp, const char *buf, size
>  
>       context->session_id = ta_id;
>  
> +     mutex_lock(&psp->ras_context.mutex);
>       ret = prep_ta_mem_context(&context->mem_context, shared_buf, 
> shared_buf_len);
>       if (ret)
>               goto err_free_shared_buf;
> @@ -366,6 +367,7 @@ static ssize_t ta_if_invoke_debugfs_write(struct file 
> *fp, const char *buf, size
>               ret = -EFAULT;
>  
>  err_free_shared_buf:

This error label is used at other places as well and that happens even
before acquiring the mutex.

Thanks,
Lijo
> +     mutex_unlock(&psp->ras_context.mutex);
>       kfree(shared_buf);
>  
>       return ret;

Reply via email to