On 2025-06-19 14:37, Francis, David wrote: > > [AMD Official Use Only - AMD Internal Distribution Only] > > > > > + spin_lock(&filp->table_lock); > > > + idr_for_each_entry(&filp->object_idr, gobj, id) > > > + num_bos += 1; > > > + spin_unlock(&filp->table_lock); > > > + > > > + if (args->num_bos < num_bos) { > > > + args->num_bos = num_bos; > > > + goto exit; > > > > Since this path doesn't require any cleanup, you could just return. But > > maybe this should return an error code to let user mode know that it should > > retry with a bigger bucket allocation. -ENOSPC? > > > > I wasn't sure if sending back information from an ioctl in the structs while > also returning an error value was acceptable. If it is, I'll make that change. > > > > + idr_for_each_entry(&filp->object_idr, gobj, id) { > > > + struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); > > > + struct drm_amdgpu_criu_bo_bucket *bo_bucket; > > > + > > > + bo_bucket = &bo_buckets[bo_index]; > > > + > > > + bo_bucket->size = amdgpu_bo_size(bo); > > > + bo_bucket->alloc_flags = bo->flags & > > > (~AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE); > > > > I'm not sure why you're removing this flag. I think we always set it > > implicitly when creating a new BO (and presumably during restore), but > > there should be no harm in leaving this flag set anyway. > > > > The driver doesn't like this flag being set on requests to create bo. Since > this is meant to be sending to the user the flags userspace will need to > recreate the bo, I thought to leave it off.
OK. > > > > > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > > > > > +#include "amdgpu_criu.h" > > > + > > > > Why is this header file needed here? None of the new ioctls will be > > implemented in kfd_chardev.c. > > amdgpu_drv.c needs the header with the ioctl declarations for its ioctl > definitions. Not sure if there's another way I should be including them. Sorry. I put my comment in the wrong place. Yes, you need this header in amdgpu_drv.c. You were also including it in kfd_chardev.c. I don't think you need it there. Regards, Felix > ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ > *From:* Kuehling, Felix <felix.kuehl...@amd.com> > *Sent:* Wednesday, June 18, 2025 5:05 PM > *To:* Francis, David <david.fran...@amd.com>; dri-devel@lists.freedesktop.org > <dri-devel@lists.freedesktop.org> > *Cc:* tvrtko.ursu...@igalia.com <tvrtko.ursu...@igalia.com>; Yat Sin, David > <david.yat...@amd.com>; Freehill, Chris <chris.freeh...@amd.com>; Koenig, > Christian <christian.koe...@amd.com>; dcostant...@meta.com > <dcostant...@meta.com>; sruff...@meta.com <sruff...@meta.com>; > sim...@ffwll.ch <sim...@ffwll.ch>; mrip...@kernel.org <mrip...@kernel.org>; > tzimmerm...@suse.de <tzimmerm...@suse.de> > *Subject:* Re: [PATCH 2/4] drm/amdgpu: Add CRIU ioctl to get bo info > > > On 2025-06-17 15:45, David Francis wrote: > > Add new ioctl DRM_IOCTL_AMDGPU_CRIU_BO_INFO. > > > > This ioctl returns a list of bos with their handles, sizes, > > and flags and domains. > > > > This ioctl is meant to be used during CRIU checkpoint and > > provide information needed to reconstruct the bos > > in CRIU restore. > > > > Signed-off-by: David Francis <david.fran...@amd.com> > > --- > > drivers/gpu/drm/amd/amdgpu/Makefile | 2 +- > > drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c | 144 +++++++++++++++++++++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h | 32 +++++ > > drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 + > > drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 2 + > > include/uapi/drm/amdgpu_drm.h | 28 +++++ > > 6 files changed, 209 insertions(+), 1 deletion(-) > > create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > > create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile > > b/drivers/gpu/drm/amd/amdgpu/Makefile > > index 87080c06e5fc..0863edcdd03f 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/Makefile > > +++ b/drivers/gpu/drm/amd/amdgpu/Makefile > > @@ -63,7 +63,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o > > amdgpu_kms.o \ > > amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \ > > amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o > >amdgpu_nbio.o \ > > amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \ > > - amdgpu_fw_attestation.o amdgpu_securedisplay.o \ > > + amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_criu.o \ > > amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \ > > amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o > >amdgpu_dev_coredump.o \ > > amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > > new file mode 100644 > > index 000000000000..34a0358946b6 > > --- /dev/null > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c > > @@ -0,0 +1,144 @@ > > +/* SPDX-License-Identifier: MIT */ > > +/* > > +* Copyright 2025 Advanced Micro Devices, Inc. > > +* > > +* Permission is hereby granted, free of charge, to any person obtaining a > > +* copy of this software and associated documentation files (the > > "Software"), > > +* to deal in the Software without restriction, including without limitation > > +* the rights to use, copy, modify, merge, publish, distribute, sublicense, > > +* and/or sell copies of the Software, and to permit persons to whom the > > +* Software is furnished to do so, subject to the following conditions: > > +* > > +* The above copyright notice and this permission notice shall be included > > in > > +* all copies or substantial portions of the Software. > > +* > > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS > > OR > > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > > +* OTHER DEALINGS IN THE SOFTWARE. > > +*/ > > + > > +#include <linux/dma-buf.h> > > +#include <linux/hashtable.h> > > +#include <linux/mutex.h> > > +#include <linux/random.h> > > + > > +#include <drm/amdgpu_drm.h> > > +#include <drm/drm_device.h> > > +#include <drm/drm_file.h> > > + > > +#include "amdgpu_criu.h" > > + > > +#include <drm/amdgpu_drm.h> > > +#include <drm/drm_drv.h> > > +#include <drm/drm_exec.h> > > +#include <drm/drm_gem_ttm_helper.h> > > +#include <drm/ttm/ttm_tt.h> > > +#include <linux/interval_tree_generic.h> > > + > > +#include "amdgpu.h" > > +#include "amdgpu_display.h" > > +#include "amdgpu_dma_buf.h" > > +#include "amdgpu_hmm.h" > > +#include "amdgpu_xgmi.h" > > That's a lot of header files. Do you really need them all? > > > > + > > +static uint32_t hardware_flags_to_uapi_flags(struct amdgpu_device *adev, > > uint64_t pte_flags) > > This function is never called. > > > > +{ > > + uint32_t gem_flags = 0; > > + > > + //This function will be replaced by the mapping flags rework > > + > > + if (pte_flags & AMDGPU_PTE_EXECUTABLE) > > + gem_flags |= AMDGPU_VM_PAGE_EXECUTABLE; > > + if (pte_flags & AMDGPU_PTE_READABLE) > > + gem_flags |= AMDGPU_VM_PAGE_READABLE; > > + if (pte_flags & AMDGPU_PTE_WRITEABLE) > > + gem_flags |= AMDGPU_VM_PAGE_WRITEABLE; > > + if (pte_flags & AMDGPU_PTE_PRT_FLAG(adev)) > > + gem_flags |= AMDGPU_VM_PAGE_PRT; > > + if (pte_flags & AMDGPU_PTE_NOALLOC) > > + gem_flags |= AMDGPU_VM_PAGE_NOALLOC; > > + > > + return gem_flags; > > +} > > + > > + > > +/** > > + * amdgpu_criu_bo_info_ioctl - get information about a process' buffer > > objects > > + * > > + * @dev: drm device pointer > > + * @data: drm_amdgpu_criu_bo_info_args > > + * @filp: drm file pointer > > + * > > + * num_bos is set as an input to the size of the bo_buckets array. > > + * num_bos is sent back as output as the number of bos in the process. > > + * If that number is larger than the size of the array, the ioctl must > > + * be retried. > > + * > > + * Returns: > > + * 0 for success, -errno for errors. > > + */ > > +int amdgpu_criu_bo_info_ioctl(struct drm_device *dev, void *data, > > + struct drm_file *filp) > > +{ > > + struct drm_amdgpu_criu_bo_info_args *args = data; > > + struct drm_amdgpu_criu_bo_bucket *bo_buckets; > > + struct drm_gem_object *gobj; > > + int id, ret = 0; > > + int bo_index = 0; > > + int num_bos = 0; > > + > > + spin_lock(&filp->table_lock); > > + idr_for_each_entry(&filp->object_idr, gobj, id) > > + num_bos += 1; > > + spin_unlock(&filp->table_lock); > > + > > + if (args->num_bos < num_bos) { > > + args->num_bos = num_bos; > > + goto exit; > > Since this path doesn't require any cleanup, you could just return. But maybe > this should return an error code to let user mode know that it should retry > with a bigger bucket allocation. -ENOSPC? > > > > + } > > + args->num_bos = num_bos; > > + if (num_bos == 0) { > > + goto exit; > > Just return. 0 (success) seems fine here. > > > > + } > > + > > + bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL); > > + if (!bo_buckets) { > > + ret = -ENOMEM; > > + goto free_buckets; > > Just return -ENOMEM. > > > > + } > > + > > + spin_lock(&filp->table_lock); > > + idr_for_each_entry(&filp->object_idr, gobj, id) { > > + struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj); > > + struct drm_amdgpu_criu_bo_bucket *bo_bucket; > > + > > + bo_bucket = &bo_buckets[bo_index]; > > + > > + bo_bucket->size = amdgpu_bo_size(bo); > > + bo_bucket->alloc_flags = bo->flags & > > (~AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE); > > I'm not sure why you're removing this flag. I think we always set it > implicitly when creating a new BO (and presumably during restore), but there > should be no harm in leaving this flag set anyway. > > > > + bo_bucket->preferred_domains = bo->preferred_domains; > > + bo_bucket->gem_handle = id; > > + > > + if (bo->tbo.base.import_attach) > > + bo_bucket->flags |= AMDGPU_CRIU_BO_FLAG_IS_IMPORT; > > + > > + bo_index += 1; > > + } > > + spin_unlock(&filp->table_lock); > > + > > + ret = copy_to_user((void __user *)args->bo_buckets, bo_buckets, > > num_bos * sizeof(*bo_buckets)); > > + if (ret) { > > + pr_debug("Failed to copy BO information to user\n"); > > + ret = -EFAULT; > > + } > > + > > +free_buckets: > > + kvfree(bo_buckets); > > +exit: > > + > > + return ret; > > +} > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > > new file mode 100644 > > index 000000000000..1b18ffee6587 > > --- /dev/null > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h > > @@ -0,0 +1,32 @@ > > +/* SPDX-License-Identifier: MIT */ > > +/* > > +* Copyright 2025 Advanced Micro Devices, Inc. > > +* > > +* Permission is hereby granted, free of charge, to any person obtaining a > > +* copy of this software and associated documentation files (the > > "Software"), > > +* to deal in the Software without restriction, including without limitation > > +* the rights to use, copy, modify, merge, publish, distribute, sublicense, > > +* and/or sell copies of the Software, and to permit persons to whom the > > +* Software is furnished to do so, subject to the following conditions: > > +* > > +* The above copyright notice and this permission notice shall be included > > in > > +* all copies or substantial portions of the Software. > > +* > > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS > > OR > > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, > > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL > > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR > > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, > > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR > > +* OTHER DEALINGS IN THE SOFTWARE. > > +*/ > > + > > +#ifndef __AMDGPU_CRIU_H__ > > +#define __AMDGPU_CRIU_H__ > > + > > +#include <drm/amdgpu_drm.h> > > Why is this needed here? You're not using any uapi definitions below. > > > > + > > +int amdgpu_criu_bo_info_ioctl(struct drm_device *dev, void *data, > > + struct drm_file *filp); > > + > > +#endif > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > index 4db92e0a60da..bf33567bb166 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c > > @@ -53,6 +53,7 @@ > > #include "amdgpu_xgmi.h" > > #include "amdgpu_userq.h" > > #include "amdgpu_userq_fence.h" > > +#include "amdgpu_criu.h" > > #include "../amdxcp/amdgpu_xcp_drv.h" > > > > /* > > @@ -3021,6 +3022,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = { > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, > >DRM_AUTH|DRM_RENDER_ALLOW), > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, > >DRM_AUTH|DRM_RENDER_ALLOW), > > DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, > >DRM_AUTH|DRM_RENDER_ALLOW), > > + DRM_IOCTL_DEF_DRV(AMDGPU_CRIU_BO_INFO, amdgpu_criu_bo_info_ioctl, > > DRM_AUTH|DRM_RENDER_ALLOW), > > }; > > > > static const struct drm_driver amdgpu_kms_driver = { > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > index a2149afa5803..a8cf2d4580cc 100644 > > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c > > @@ -45,6 +45,8 @@ > > #include "amdgpu_dma_buf.h" > > #include "kfd_debug.h" > > > > +#include "amdgpu_criu.h" > > + > > Why is this header file needed here? None of the new ioctls will be > implemented in kfd_chardev.c. > > Regards, > Felix > > > > static long kfd_ioctl(struct file *, unsigned int, unsigned long); > > static int kfd_open(struct inode *, struct file *); > > static int kfd_release(struct inode *, struct file *); > > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h > > index 45c4fa13499c..1508c55ff92a 100644 > > --- a/include/uapi/drm/amdgpu_drm.h > > +++ b/include/uapi/drm/amdgpu_drm.h > > @@ -57,6 +57,7 @@ extern "C" { > > #define DRM_AMDGPU_USERQ 0x16 > > #define DRM_AMDGPU_USERQ_SIGNAL 0x17 > > #define DRM_AMDGPU_USERQ_WAIT 0x18 > > +#define DRM_AMDGPU_CRIU_BO_INFO 0x19 > > > > #define DRM_IOCTL_AMDGPU_GEM_CREATE DRM_IOWR(DRM_COMMAND_BASE + > >DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create) > > #define DRM_IOCTL_AMDGPU_GEM_MMAP DRM_IOWR(DRM_COMMAND_BASE + > >DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap) > > @@ -77,6 +78,7 @@ extern "C" { > > #define DRM_IOCTL_AMDGPU_USERQ DRM_IOWR(DRM_COMMAND_BASE + > >DRM_AMDGPU_USERQ, union drm_amdgpu_userq) > > #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL DRM_IOWR(DRM_COMMAND_BASE + > >DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal) > > #define DRM_IOCTL_AMDGPU_USERQ_WAIT DRM_IOWR(DRM_COMMAND_BASE + > >DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait) > > +#define DRM_IOCTL_AMDGPU_CRIU_BO_INFO DRM_IOWR(DRM_COMMAND_BASE + > > DRM_AMDGPU_CRIU_BO_INFO, struct drm_amdgpu_criu_bo_info_args) > > > > /** > > * DOC: memory domains > > @@ -1626,6 +1628,32 @@ struct drm_color_ctm_3x4 { > > __u64 matrix[12]; > > }; > > > > +#define AMDGPU_CRIU_BO_FLAG_IS_IMPORT (1 << 0) > > + > > +struct drm_amdgpu_criu_bo_info_args { > > + /* IN: Size of bo_buckets buffer. OUT: Number of bos in process (if > > larger than size of buffer, must retry) */ > > + __u32 num_bos; > > + > > + /* User pointer to array of drm_amdgpu_criu_bo_bucket */ > > + __u64 bo_buckets; > > +}; > > + > > +struct drm_amdgpu_criu_bo_bucket { > > + /* Size of bo */ > > + __u64 size; > > + > > + /* GEM_CREATE flags for re-creation of buffer */ > > + __u64 alloc_flags; > > + > > + /* Pending how to handle this; provides information needed to remake > > the buffer on restore */ > > + __u32 preferred_domains; > > + > > + /* Currently just one flag: IS_IMPORT */ > > + __u32 flags; > > + > > + __u32 gem_handle; > > +}; > > + > > #if defined(__cplusplus) > > } > > #endif