> 2025年1月6日 14:51,Lazar, Lijo <lijo.la...@amd.com> 写道: > > > > On 1/5/2025 8:15 AM, Jiang Liu wrote: >> Introduce new interface amdgpu_xcp_drm_dev_free() to free a specific >> drm_device crreated by amdgpu_xcp_drm_dev_alloc(), which will be used >> to do error recovery. >> >> Signed-off-by: Jiang Liu <ge...@linux.alibaba.com> >> --- >> drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c | 11 +++- >> drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h | 1 + >> drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c | 70 +++++++++++++++++---- >> drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h | 1 + >> 4 files changed, 70 insertions(+), 13 deletions(-) >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c >> index e209b5e101df..401fbaa0b6b8 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c >> @@ -359,6 +359,8 @@ int amdgpu_xcp_dev_register(struct amdgpu_device *adev, >> ret = drm_dev_register(adev->xcp_mgr->xcp[i].ddev, >> ent->driver_data); >> if (ret) >> return ret; >> + >> + adev->xcp_mgr->xcp[i].registered = true; >> } >> >> return 0; >> @@ -376,12 +378,19 @@ void amdgpu_xcp_dev_unplug(struct amdgpu_device *adev) >> if (!adev->xcp_mgr->xcp[i].ddev) >> break; >> >> + // Restore and free the original drm_device. >> p_ddev = adev->xcp_mgr->xcp[i].ddev; >> - drm_dev_unplug(p_ddev); >> + if (adev->xcp_mgr->xcp[i].registered) { >> + drm_dev_unplug(p_ddev); >> + adev->xcp_mgr->xcp[i].registered = false; >> + } >> p_ddev->render->dev = adev->xcp_mgr->xcp[i].rdev; >> p_ddev->primary->dev = adev->xcp_mgr->xcp[i].pdev; >> p_ddev->driver = adev->xcp_mgr->xcp[i].driver; >> p_ddev->vma_offset_manager = >> adev->xcp_mgr->xcp[i].vma_offset_manager; >> + amdgpu_xcp_drm_dev_free(p_ddev); >> + >> + adev->xcp_mgr->xcp[i].ddev = NULL; >> } >> } >> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h >> index b63f53242c57..cd06a4a232be 100644 >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h >> @@ -101,6 +101,7 @@ struct amdgpu_xcp { >> uint8_t id; >> uint8_t mem_id; >> bool valid; >> + bool registered; >> atomic_t ref_cnt; >> struct drm_device *ddev; >> struct drm_device *rdev; >> diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c >> b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c >> index faed84172dd4..9058d71b4756 100644 >> --- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c >> +++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.c >> @@ -45,18 +45,26 @@ static const struct drm_driver amdgpu_xcp_driver = { >> >> static int8_t pdev_num; >> static struct xcp_device *xcp_dev[MAX_XCP_PLATFORM_DEVICE]; >> +static struct mutex xcp_mutex; > > I think this needs to be static DEFINE_MUTEX(xcp_mutex). > >> >> int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev) >> { >> struct platform_device *pdev; >> struct xcp_device *pxcp_dev; >> char dev_name[20]; >> - int ret; >> + int ret, index; >> >> + mutex_lock(&xcp_mutex); >> + ret = -ENODEV; > > Preference would be do this inside the below if() to associate the error > with the condition. > >> if (pdev_num >= MAX_XCP_PLATFORM_DEVICE) >> - return -ENODEV; >> + goto out_unlock; >> >> - snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", pdev_num); >> + for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) { >> + if (!xcp_dev[index]) >> + break; >> + } >> + >> + snprintf(dev_name, sizeof(dev_name), "amdgpu_xcp_%d", index); >> pdev = platform_device_register_simple(dev_name, -1, NULL, 0); >> if (IS_ERR(pdev)) >> return PTR_ERR(pdev); > > Seems mutex is left locked here. Will fixed in next version:)
> >> @@ -72,10 +80,11 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev) >> goto out_devres; >> } >> >> - xcp_dev[pdev_num] = pxcp_dev; >> - xcp_dev[pdev_num]->pdev = pdev; >> + xcp_dev[index] = pxcp_dev; >> + xcp_dev[index]->pdev = pdev; >> *ddev = &pxcp_dev->drm; >> pdev_num++; >> + mutex_unlock(&xcp_mutex); >> >> return 0; >> >> @@ -83,21 +92,58 @@ int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev) >> devres_release_group(&pdev->dev, NULL); >> out_unregister: >> platform_device_unregister(pdev); >> +out_unlock: >> + mutex_unlock(&xcp_mutex); >> >> return ret; >> } >> EXPORT_SYMBOL(amdgpu_xcp_drm_dev_alloc); >> >> -void amdgpu_xcp_drv_release(void) >> +static void amdgpu_xcp_drm_dev_destroy(int index) >> +{ >> + struct platform_device *pdev; >> + >> + pdev = xcp_dev[index]->pdev; >> + devres_release_group(&pdev->dev, NULL); >> + platform_device_unregister(pdev); >> + xcp_dev[index] = NULL; >> + pdev_num--; >> +} >> + >> +void amdgpu_xcp_drm_dev_free(struct drm_device *ddev) >> { >> - for (--pdev_num; pdev_num >= 0; --pdev_num) { >> - struct platform_device *pdev = xcp_dev[pdev_num]->pdev; >> + int index; >> + struct xcp_device *pxcp_dev; >> + >> + if (ddev == NULL) >> + return; >> >> - devres_release_group(&pdev->dev, NULL); >> - platform_device_unregister(pdev); >> - xcp_dev[pdev_num] = NULL; >> + pxcp_dev = container_of(ddev, struct xcp_device, drm); >> + >> + mutex_lock(&xcp_mutex); >> + for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) { >> + if (xcp_dev[index] == pxcp_dev) { >> + amdgpu_xcp_drm_dev_destroy(index); >> + break; >> + } >> + } >> + mutex_unlock(&xcp_mutex); >> +} >> +EXPORT_SYMBOL(amdgpu_xcp_drm_dev_free); >> + >> +void amdgpu_xcp_drv_release(void) >> +{ >> + int index; >> + >> + mutex_lock(&xcp_mutex); >> + for (index = 0; index < MAX_XCP_PLATFORM_DEVICE; index++) { >> + if (xcp_dev[index]) { >> + WARN_ON(xcp_dev[index]); > > Why is this WARN check needed? There is already a if() check for valid > index. > > Also, would suggest to separate out amdgpu_xcp.c from xcp_drv.c. xcp_drv > introducing a new interface may be kept in a separate patch. With new implementation, all xcp devices should have already be removed when amdgpu_xcp_drv_release() gets called, So hope to verify whether it works as expected. Thanks! > > Thanks, > Lijo > >> + amdgpu_xcp_drm_dev_destroy(index); >> + } >> } >> - pdev_num = 0; >> + WARN_ON(pdev_num != 0); >> + mutex_unlock(&xcp_mutex); >> } >> EXPORT_SYMBOL(amdgpu_xcp_drv_release); >> >> diff --git a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h >> b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h >> index c1c4b679bf95..580a1602c8e3 100644 >> --- a/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h >> +++ b/drivers/gpu/drm/amd/amdxcp/amdgpu_xcp_drv.h >> @@ -25,5 +25,6 @@ >> #define _AMDGPU_XCP_DRV_H_ >> >> int amdgpu_xcp_drm_dev_alloc(struct drm_device **ddev); >> +void amdgpu_xcp_drm_dev_free(struct drm_device *ddev); >> void amdgpu_xcp_drv_release(void); >> #endif /* _AMDGPU_XCP_DRV_H_ */