On 5/22/2025 9:13 AM, Samuel Zhang wrote:
> For SRIOV VM env with XGMI enabled systems, XGMI physical node id may
> change when hibernate and resume with different VF.
> 
> Update XGMI info and vram_base_offset on resume for gfx444 SRIOV env.
> Add amdgpu_virt_xgmi_migrate_enabled() as the feature flag.
> 
> Signed-off-by: Jiang Liu <ge...@linux.alibaba.com>
> Signed-off-by: Samuel Zhang <guoqing.zh...@amd.com>
> Reviewed-by: Christian König <christian.koe...@amd.com>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 32 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h   |  7 +++++
>  2 files changed, 39 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index d477a901af84..e5bb46effb6c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2732,6 +2732,12 @@ static int amdgpu_device_ip_early_init(struct 
> amdgpu_device *adev)
>       if (!amdgpu_device_pcie_dynamic_switching_supported(adev))
>               adev->pm.pp_feature &= ~PP_PCIE_DPM_MASK;
>  
> +     adev->virt.is_xgmi_node_migrate_enabled = false;
> +     if (amdgpu_sriov_vf(adev)) {
> +             adev->virt.is_xgmi_node_migrate_enabled =
> +                     amdgpu_ip_version((adev), GC_HWIP, 0) == IP_VERSION(9, 
> 4, 4);
> +     }
> +
>       total = true;
>       for (i = 0; i < adev->num_ip_blocks; i++) {
>               ip_block = &adev->ip_blocks[i];
> @@ -5040,6 +5046,28 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
> notify_clients)
>       return 0;
>  }
>  
> +static inline int amdgpu_virt_resume(struct amdgpu_device *adev)
> +{
> +     int r;
> +     unsigned int prev_physical_node_id = adev->gmc.xgmi.physical_node_id;
> +
> +     if (!amdgpu_virt_xgmi_migrate_enabled(adev))
> +             return 0;
> +
> +     r = adev->gfxhub.funcs->get_xgmi_info(adev);
> +     if (r)
> +             return r;
> +
> +     dev_info(adev->dev, "xgmi node, old id %d, new id %d\n",
> +             prev_physical_node_id, adev->gmc.xgmi.physical_node_id);
> +
> +     adev->vm_manager.vram_base_offset = 
> adev->gfxhub.funcs->get_mc_fb_offset(adev);
> +     adev->vm_manager.vram_base_offset +=
> +             adev->gmc.xgmi.physical_node_id * 
> adev->gmc.xgmi.node_segment_size;
> +
> +     return 0;
> +}
> +
>  /**
>   * amdgpu_device_resume - initiate device resume
>   *
> @@ -5061,6 +5089,10 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
> notify_clients)
>                       return r;
>       }
>  
> +     r = amdgpu_virt_resume(adev);

You may restrict this to VF only for now -
        if (amdgpu_sriov_vf(adev)) {
                r = amdgpu_virt_resume(adev);
                ...
        }

Thanks,
Lijo

> +     if (r)
> +             goto exit;
> +
>       if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
>               return 0;
>  
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> index df03dba67ab8..532b92628171 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h
> @@ -295,6 +295,9 @@ struct amdgpu_virt {
>       union amd_sriov_ras_caps ras_telemetry_en_caps;
>       struct amdgpu_virt_ras ras;
>       struct amd_sriov_ras_telemetry_error_count count_cache;
> +
> +     /* hibernate and resume with different VF feature for xgmi enabled 
> system */
> +     bool is_xgmi_node_migrate_enabled;
>  };
>  
>  struct amdgpu_video_codec_info;
> @@ -376,6 +379,10 @@ static inline bool is_virtual_machine(void)
>       ((adev)->virt.gim_feature & AMDGIM_FEATURE_VCN_RB_DECOUPLE)
>  #define amdgpu_sriov_is_mes_info_enable(adev) \
>       ((adev)->virt.gim_feature & AMDGIM_FEATURE_MES_INFO_ENABLE)
> +
> +#define amdgpu_virt_xgmi_migrate_enabled(adev) \
> +     ((adev)->virt.is_xgmi_node_migrate_enabled)
> +
>  bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev);
>  void amdgpu_virt_init_setting(struct amdgpu_device *adev);
>  int amdgpu_virt_request_full_gpu(struct amdgpu_device *adev, bool init);

Reply via email to