On 2/12/2025 9:27 PM, Jonathan Kim wrote:
> Deprecate the KFD XGMI peer info calls in favour of calling the
> simplified XGMI peer info functions directly.
> 
> Signed-off-by: Jonathan Kim <jonathan....@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 42 ------------------
>  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h |  5 ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c   | 51 +++++++++++++++++-----
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  6 +--
>  drivers/gpu/drm/amd/amdkfd/kfd_crat.c      | 11 +++--
>  5 files changed, 48 insertions(+), 67 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> index 0312231b703e..4cec3a873995 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
> @@ -555,48 +555,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>       return r;
>  }
>  
> -uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
> -                                       struct amdgpu_device *src)
> -{
> -     struct amdgpu_device *peer_adev = src;
> -     struct amdgpu_device *adev = dst;
> -     int ret = amdgpu_xgmi_get_hops_count(adev, peer_adev);
> -
> -     if (ret < 0) {
> -             DRM_ERROR("amdgpu: failed to get  xgmi hops count between node %d and %d. ret = %d\n",
> -                     adev->gmc.xgmi.physical_node_id,
> -                     peer_adev->gmc.xgmi.physical_node_id, ret);
> -             ret = 0;
> -     }
> -     return  (uint8_t)ret;
> -}
> -
> -int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
> -                                         struct amdgpu_device *src,
> -                                         bool is_min)
> -{
> -     struct amdgpu_device *adev = dst, *peer_adev;
> -     int num_links;
> -
> -     if (amdgpu_ip_version(adev, GC_HWIP, 0) < IP_VERSION(9, 4, 2))
> -             return 0;
> -
> -     if (src)
> -             peer_adev = src;
> -
> -     /* num links returns 0 for indirect peers since indirect route is unknown. */
> -     num_links = is_min ? 1 : amdgpu_xgmi_get_num_links(adev, peer_adev);
> -     if (num_links < 0) {
> -             DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d. ret = %d\n",
> -                     adev->gmc.xgmi.physical_node_id,
> -                     peer_adev->gmc.xgmi.physical_node_id, num_links);
> -             num_links = 0;
> -     }
> -
> -     /* Aldebaran xGMI DPM is defeatured so assume x16 x 25Gbps for bandwidth. */
> -     return (num_links * 16 * 25000)/BITS_PER_BYTE;
> -}
> -
>  int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min)
>  {
>       int num_lanes_shift = (is_min ? ffs(adev->pm.pcie_mlw_mask) :
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> index 092dbd8bec97..28eb1cd0eb5a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
> @@ -255,11 +255,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device *adev, int dma_buf_fd,
>                                 uint64_t *bo_size, void *metadata_buffer,
>                                 size_t buffer_size, uint32_t *metadata_size,
>                                 uint32_t *flags, int8_t *xcp_id);
> -uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
> -                                       struct amdgpu_device *src);
> -int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
> -                                         struct amdgpu_device *src,
> -                                         bool is_min);
>  int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_min);
>  int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
>                                       uint32_t *payload);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 74b4349e345a..d18d2a26cc91 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -818,28 +818,59 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>   * num_hops[2:0] = number of hops
>   */
>  int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> -             struct amdgpu_device *peer_adev)
> +                            struct amdgpu_device *peer_adev)
>  {
>       struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
>       uint8_t num_hops_mask = 0x7;
>       int i;
>  
> +     if (!adev->gmc.xgmi.supported)
> +             return 0;
> +
>       for (i = 0 ; i < top->num_nodes; ++i)
>               if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
>                       return top->nodes[i].num_hops & num_hops_mask;
> -     return  -EINVAL;
> +
> +     DRM_ERROR("amdgpu: failed to get  xgmi hops count between node %d and %d.\n",

Suggest using a dev_* function so the message identifies the device by
its PCI number. Since the function still succeeds, maybe info level is
good enough.
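
Something along these lines, for example (untested sketch, only to
illustrate the dev_* / info-level idea; the message wording is just an
example):

        dev_info(adev->dev,
                 "failed to get xgmi hops count between node %d and %d\n",
                 adev->gmc.xgmi.physical_node_id,
                 peer_adev->gmc.xgmi.physical_node_id);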

> +               adev->gmc.xgmi.physical_node_id,
> +               peer_adev->gmc.xgmi.physical_node_id);
> +
> +     return 0;
>  }
>  
> -int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> -             struct amdgpu_device *peer_adev)
> +int amdgpu_xgmi_get_bandwidth_mbytes(struct amdgpu_device *adev,
> +                                  struct amdgpu_device *peer_adev)
>  {
> -     struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> -     int i;
> +     int num_links = !peer_adev ? 1 : 0;
>  
> -     for (i = 0 ; i < top->num_nodes; ++i)
> -             if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id)
> -                     return top->nodes[i].num_links;
> -     return  -EINVAL;
> +     if (!adev->gmc.xgmi.supported)
> +             return 0;
> +
> +     if (peer_adev) {
> +             struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
> +             int i;
> +
> +             for (i = 0 ; i < top->num_nodes; ++i) {
> +                     if (top->nodes[i].node_id != peer_adev->gmc.xgmi.node_id)
> +                             continue;
> +
> +                     num_links =  top->nodes[i].num_links;
> +                     break;
> +             }
> +     }
> +
> +     /* num links returns 0 for indirect peers since indirect route is unknown. */
> +     if (!num_links) {

This looks problematic. I think it is better to keep the old way of
passing min/max. Otherwise, there is a chance that the min query
reports some value while the max query hits this error path.
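
For example, keeping an is_min flag on the new helper would preserve
the old min/max split. Rough, untested sketch only (all fields used
here are the ones already referenced in this patch):

        int amdgpu_xgmi_get_bandwidth_mbytes(struct amdgpu_device *adev,
                                             struct amdgpu_device *peer_adev,
                                             bool is_min)
        {
                struct psp_xgmi_topology_info *top = &adev->psp.xgmi_context.top_info;
                int num_links = 1;      /* min query: assume one direct link, as before */
                int i;

                if (!adev->gmc.xgmi.supported)
                        return 0;

                /* only the max query consults the PSP topology info */
                if (!is_min && peer_adev) {
                        num_links = 0;
                        for (i = 0; i < top->num_nodes; ++i) {
                                if (top->nodes[i].node_id == peer_adev->gmc.xgmi.node_id) {
                                        num_links = top->nodes[i].num_links;
                                        break;
                                }
                        }
                }

                /* x16 lanes x 25 Gbps assumed, as in the patch */
                return (num_links * 16 * 25000) / BITS_PER_BYTE;
        }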

Thanks,
Lijo

> +             DRM_ERROR("amdgpu: failed to get xgmi num links between node %d and %d.\n",
> +                       adev->gmc.xgmi.physical_node_id,
> +                       peer_adev->gmc.xgmi.physical_node_id);
> +     }
> +
> +     /*
> +      * TBD - will update IP based bandwidth later.
> +      * Bandwidth currently assumed to be x16 lanes x 25Gbps.
> +      */
> +     return (num_links * 16 * 25000)/BITS_PER_BYTE;
>  }
>  
>  bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> index d1282b4c6348..325e7972469d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h
> @@ -62,10 +62,8 @@ int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_dev
>  int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
>  int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
>  int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);
> -int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
> -             struct amdgpu_device *peer_adev);
> -int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
> -             struct amdgpu_device *peer_adev);
> +int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev, struct amdgpu_device *peer_adev);
> +int amdgpu_xgmi_get_bandwidth_mbytes(struct amdgpu_device *adev, struct amdgpu_device *peer_adev);
>  bool amdgpu_xgmi_get_is_sharing_enabled(struct amdgpu_device *adev,
>                                       struct amdgpu_device *peer_adev);
>  uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> index 70b3ae0b74fe..a787d192390c 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c
> @@ -2133,8 +2133,8 @@ static int kfd_fill_gpu_direct_io_link_to_cpu(int *avail_size,
>               bool ext_cpu = KFD_GC_VERSION(kdev) != IP_VERSION(9, 4, 3);
>               int mem_bw = 819200, weight = ext_cpu ? KFD_CRAT_XGMI_WEIGHT :
>                                                       KFD_CRAT_INTRA_SOCKET_WEIGHT;
> -             uint32_t bandwidth = ext_cpu ? amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(
> -                                                     kdev->adev, NULL, true) : mem_bw;
> +             uint32_t bandwidth = ext_cpu ? amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, NULL) :
> +                                            mem_bw;
>  
>               /*
>                * with host gpu xgmi link, host can access gpu memory whether
> @@ -2198,12 +2198,11 @@ static int kfd_fill_gpu_xgmi_link_to_gpu(int *avail_size,
>  
>       if (use_ta_info) {
>               sub_type_hdr->weight_xgmi = KFD_CRAT_XGMI_WEIGHT *
> -                     amdgpu_amdkfd_get_xgmi_hops_count(kdev->adev, peer_kdev->adev);
> +                     amdgpu_xgmi_get_hops_count(kdev->adev, peer_kdev->adev);
>               sub_type_hdr->maximum_bandwidth_mbs =
> -                     amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev,
> -                                                     peer_kdev->adev, false);
> +                     amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, peer_kdev->adev);
>               sub_type_hdr->minimum_bandwidth_mbs = sub_type_hdr->maximum_bandwidth_mbs ?
> -                     amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(kdev->adev, NULL, true) : 0;
> +                     amdgpu_xgmi_get_bandwidth_mbytes(kdev->adev, NULL) : 0;
>       } else {
>               bool is_single_hop = kdev->kfd == peer_kdev->kfd;
>               int weight = is_single_hop ? KFD_CRAT_INTRA_SOCKET_WEIGHT :
