On 2025-06-19 14:37, Francis, David wrote:
>
> [AMD Official Use Only - AMD Internal Distribution Only]
>
>
> > > +     spin_lock(&filp->table_lock);
> > > +     idr_for_each_entry(&filp->object_idr, gobj, id)
> > > +             num_bos += 1;
> > > +     spin_unlock(&filp->table_lock);
> > > +
> > > +     if (args->num_bos < num_bos) {
> > > +             args->num_bos = num_bos;
> > > +             goto exit;
> > 
> > Since this path doesn't require any cleanup, you could just return. But 
> > maybe this should return an error code to let user mode know that it should 
> > retry with a bigger bucket allocation. -ENOSPC?
> > 
>
> I wasn't sure if sending back information from an ioctl in the structs while 
> also returning an error value was acceptable. If it is, I'll make that change.
>
> > > +     idr_for_each_entry(&filp->object_idr, gobj, id) {
> > > +             struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj);
> > > +             struct drm_amdgpu_criu_bo_bucket *bo_bucket;
> > > +
> > > +             bo_bucket = &bo_buckets[bo_index];
> > > +
> > > +             bo_bucket->size = amdgpu_bo_size(bo);
> > > +             bo_bucket->alloc_flags = bo->flags & 
> > > (~AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE);
> > 
> > I'm not sure why you're removing this flag. I think we always set it 
> > implicitly when creating a new BO (and presumably during restore), but 
> > there should be no harm in leaving this flag set anyway.
> > 
>
> The driver doesn't like this flag being set on requests to create bo. Since 
> this is meant to be sending to the user the flags userspace will need to 
> recreate the bo, I thought to leave it off.


OK.


>
> >
> > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > >
> > > +#include "amdgpu_criu.h"
> > > +
> > 
> > Why is this header file needed here? None of the new ioctls will be 
> > implemented in kfd_chardev.c.
>
> amdgpu_drv.c needs the header with the ioctl declarations for its ioctl 
> definitions. Not sure if there's another way I should be including them.

Sorry. I put my comment in the wrong place. Yes, you need this header in 
amdgpu_drv.c. You were also including it in kfd_chardev.c. I don't think you 
need it there.


Regards,
  Felix


> ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
> *From:* Kuehling, Felix <felix.kuehl...@amd.com>
> *Sent:* Wednesday, June 18, 2025 5:05 PM
> *To:* Francis, David <david.fran...@amd.com>; dri-devel@lists.freedesktop.org 
> <dri-devel@lists.freedesktop.org>
> *Cc:* tvrtko.ursu...@igalia.com <tvrtko.ursu...@igalia.com>; Yat Sin, David 
> <david.yat...@amd.com>; Freehill, Chris <chris.freeh...@amd.com>; Koenig, 
> Christian <christian.koe...@amd.com>; dcostant...@meta.com 
> <dcostant...@meta.com>; sruff...@meta.com <sruff...@meta.com>; 
> sim...@ffwll.ch <sim...@ffwll.ch>; mrip...@kernel.org <mrip...@kernel.org>; 
> tzimmerm...@suse.de <tzimmerm...@suse.de>
> *Subject:* Re: [PATCH 2/4] drm/amdgpu: Add CRIU ioctl to get bo info
>  
>
> On 2025-06-17 15:45, David Francis wrote:
> > Add new ioctl DRM_IOCTL_AMDGPU_CRIU_BO_INFO.
> >
> > This ioctl returns a list of bos with their handles, sizes,
> > and flags and domains.
> >
> > This ioctl is meant to be used during CRIU checkpoint and
> > provide information needed to reconstruct the bos
> > in CRIU restore.
> >
> > Signed-off-by: David Francis <david.fran...@amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/Makefile      |   2 +-
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c | 144 +++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h |  32 +++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c  |   2 +
> >  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |   2 +
> >  include/uapi/drm/amdgpu_drm.h            |  28 +++++
> >  6 files changed, 209 insertions(+), 1 deletion(-)
> >  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> >  create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
> > b/drivers/gpu/drm/amd/amdgpu/Makefile
> > index 87080c06e5fc..0863edcdd03f 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/Makefile
> > +++ b/drivers/gpu/drm/amd/amdgpu/Makefile
> > @@ -63,7 +63,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
> > amdgpu_kms.o \
> >        amdgpu_xgmi.o amdgpu_csa.o amdgpu_ras.o amdgpu_vm_cpu.o \
> >        amdgpu_vm_sdma.o amdgpu_discovery.o amdgpu_ras_eeprom.o 
> >amdgpu_nbio.o \
> >        amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
> > -     amdgpu_fw_attestation.o amdgpu_securedisplay.o \
> > +     amdgpu_fw_attestation.o amdgpu_securedisplay.o amdgpu_criu.o \
> >        amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
> >        amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o 
> >amdgpu_dev_coredump.o \
> >        amdgpu_cper.o amdgpu_userq_fence.o amdgpu_eviction_fence.o
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> > new file mode 100644
> > index 000000000000..34a0358946b6
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.c
> > @@ -0,0 +1,144 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > +* Copyright 2025 Advanced Micro Devices, Inc.
> > +*
> > +* Permission is hereby granted, free of charge, to any person obtaining a
> > +* copy of this software and associated documentation files (the 
> > "Software"),
> > +* to deal in the Software without restriction, including without limitation
> > +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > +* and/or sell copies of the Software, and to permit persons to whom the
> > +* Software is furnished to do so, subject to the following conditions:
> > +*
> > +* The above copyright notice and this permission notice shall be included 
> > in
> > +* all copies or substantial portions of the Software.
> > +*
> > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
> > OR
> > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > +* OTHER DEALINGS IN THE SOFTWARE.
> > +*/
> > +
> > +#include <linux/dma-buf.h>
> > +#include <linux/hashtable.h>
> > +#include <linux/mutex.h>
> > +#include <linux/random.h>
> > +
> > +#include <drm/amdgpu_drm.h>
> > +#include <drm/drm_device.h>
> > +#include <drm/drm_file.h>
> > +
> > +#include "amdgpu_criu.h"
> > +
> > +#include <drm/amdgpu_drm.h>
> > +#include <drm/drm_drv.h>
> > +#include <drm/drm_exec.h>
> > +#include <drm/drm_gem_ttm_helper.h>
> > +#include <drm/ttm/ttm_tt.h>
> > +#include <linux/interval_tree_generic.h>
> > +
> > +#include "amdgpu.h"
> > +#include "amdgpu_display.h"
> > +#include "amdgpu_dma_buf.h"
> > +#include "amdgpu_hmm.h"
> > +#include "amdgpu_xgmi.h"
>
> That's a lot of header files. Do you really need them all?
>
>
> > +
> > +static uint32_t hardware_flags_to_uapi_flags(struct amdgpu_device *adev, 
> > uint64_t pte_flags)
>
> This function is never called.
>
>
> > +{
> > +     uint32_t gem_flags = 0;
> > +
> > +     //This function will be replaced by the mapping flags rework
> > +
> > +     if (pte_flags & AMDGPU_PTE_EXECUTABLE)
> > +             gem_flags |= AMDGPU_VM_PAGE_EXECUTABLE;
> > +     if (pte_flags & AMDGPU_PTE_READABLE)
> > +             gem_flags |= AMDGPU_VM_PAGE_READABLE;
> > +     if (pte_flags & AMDGPU_PTE_WRITEABLE)
> > +             gem_flags |= AMDGPU_VM_PAGE_WRITEABLE;
> > +     if (pte_flags & AMDGPU_PTE_PRT_FLAG(adev))
> > +             gem_flags |= AMDGPU_VM_PAGE_PRT;
> > +     if (pte_flags & AMDGPU_PTE_NOALLOC)
> > +             gem_flags |= AMDGPU_VM_PAGE_NOALLOC;
> > +
> > +     return gem_flags;
> > +}
> > +
> > +
> > +/**
> > + * amdgpu_criu_bo_info_ioctl - get information about a process' buffer 
> > objects
> > + *
> > + * @dev: drm device pointer
> > + * @data: drm_amdgpu_criu_bo_info_args
> > + * @filp: drm file pointer
> > + *
> > + * num_bos is set as an input to the size of the bo_buckets array.
> > + * num_bos is sent back as output as the number of bos in the process.
> > + * If that number is larger than the size of the array, the ioctl must
> > + * be retried.
> > + *
> > + * Returns:
> > + * 0 for success, -errno for errors.
> > + */
> > +int amdgpu_criu_bo_info_ioctl(struct drm_device *dev, void *data,
> > +                         struct drm_file *filp)
> > +{
> > +     struct drm_amdgpu_criu_bo_info_args *args = data;
> > +     struct drm_amdgpu_criu_bo_bucket *bo_buckets;
> > +     struct drm_gem_object *gobj;
> > +     int id, ret = 0;
> > +     int bo_index = 0;
> > +     int num_bos = 0;
> > +
> > +     spin_lock(&filp->table_lock);
> > +     idr_for_each_entry(&filp->object_idr, gobj, id)
> > +             num_bos += 1;
> > +     spin_unlock(&filp->table_lock);
> > +
> > +     if (args->num_bos < num_bos) {
> > +             args->num_bos = num_bos;
> > +             goto exit;
>
> Since this path doesn't require any cleanup, you could just return. But maybe 
> this should return an error code to let user mode know that it should retry 
> with a bigger bucket allocation. -ENOSPC?
>
>
> > +     }
> > +     args->num_bos = num_bos;
> > +     if (num_bos == 0) {
> > +             goto exit;
>
> Just return. 0 (success) seems fine here.
>
>
> > +     }
> > +
> > +     bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
> > +     if (!bo_buckets) {
> > +             ret = -ENOMEM;
> > +             goto free_buckets;
>
> Just return -ENOMEM.
>
>
> > +     }
> > +
> > +     spin_lock(&filp->table_lock);
> > +     idr_for_each_entry(&filp->object_idr, gobj, id) {
> > +             struct amdgpu_bo *bo = gem_to_amdgpu_bo(gobj);
> > +             struct drm_amdgpu_criu_bo_bucket *bo_bucket;
> > +
> > +             bo_bucket = &bo_buckets[bo_index];
> > +
> > +             bo_bucket->size = amdgpu_bo_size(bo);
> > +             bo_bucket->alloc_flags = bo->flags & 
> > (~AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE);
>
> I'm not sure why you're removing this flag. I think we always set it 
> implicitly when creating a new BO (and presumably during restore), but there 
> should be no harm in leaving this flag set anyway.
>
>
> > +             bo_bucket->preferred_domains = bo->preferred_domains;
> > +             bo_bucket->gem_handle = id;
> > +
> > +             if (bo->tbo.base.import_attach)
> > +                     bo_bucket->flags |= AMDGPU_CRIU_BO_FLAG_IS_IMPORT;
> > +
> > +             bo_index += 1;
> > +     }
> > +     spin_unlock(&filp->table_lock);
> > +
> > +     ret = copy_to_user((void __user *)args->bo_buckets, bo_buckets, 
> > num_bos * sizeof(*bo_buckets));
> > +     if (ret) {
> > +             pr_debug("Failed to copy BO information to user\n");
> > +             ret = -EFAULT;
> > +     }
> > +
> > +free_buckets:
> > +     kvfree(bo_buckets);
> > +exit:
> > +
> > +     return ret;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> > new file mode 100644
> > index 000000000000..1b18ffee6587
> > --- /dev/null
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_criu.h
> > @@ -0,0 +1,32 @@
> > +/* SPDX-License-Identifier: MIT */
> > +/*
> > +* Copyright 2025 Advanced Micro Devices, Inc.
> > +*
> > +* Permission is hereby granted, free of charge, to any person obtaining a
> > +* copy of this software and associated documentation files (the 
> > "Software"),
> > +* to deal in the Software without restriction, including without limitation
> > +* the rights to use, copy, modify, merge, publish, distribute, sublicense,
> > +* and/or sell copies of the Software, and to permit persons to whom the
> > +* Software is furnished to do so, subject to the following conditions:
> > +*
> > +* The above copyright notice and this permission notice shall be included 
> > in
> > +* all copies or substantial portions of the Software.
> > +*
> > +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 
> > OR
> > +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> > +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
> > +* THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
> > +* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
> > +* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
> > +* OTHER DEALINGS IN THE SOFTWARE.
> > +*/
> > +
> > +#ifndef __AMDGPU_CRIU_H__
> > +#define __AMDGPU_CRIU_H__
> > +
> > +#include <drm/amdgpu_drm.h>
>
> Why is this needed here? You're not using any uapi definitions below.
>
>
> > +
> > +int amdgpu_criu_bo_info_ioctl(struct drm_device *dev, void *data,
> > +                         struct drm_file *filp);
> > +
> > +#endif
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > index 4db92e0a60da..bf33567bb166 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
> > @@ -53,6 +53,7 @@
> >  #include "amdgpu_xgmi.h"
> >  #include "amdgpu_userq.h"
> >  #include "amdgpu_userq_fence.h"
> > +#include "amdgpu_criu.h"
> >  #include "../amdxcp/amdgpu_xcp_drv.h"
> > 
> >  /*
> > @@ -3021,6 +3022,7 @@ const struct drm_ioctl_desc amdgpu_ioctls_kms[] = {
> >        DRM_IOCTL_DEF_DRV(AMDGPU_USERQ, amdgpu_userq_ioctl, 
> >DRM_AUTH|DRM_RENDER_ALLOW),
> >        DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_SIGNAL, amdgpu_userq_signal_ioctl, 
> >DRM_AUTH|DRM_RENDER_ALLOW),
> >        DRM_IOCTL_DEF_DRV(AMDGPU_USERQ_WAIT, amdgpu_userq_wait_ioctl, 
> >DRM_AUTH|DRM_RENDER_ALLOW),
> > +     DRM_IOCTL_DEF_DRV(AMDGPU_CRIU_BO_INFO, amdgpu_criu_bo_info_ioctl, 
> > DRM_AUTH|DRM_RENDER_ALLOW),
> >  };
> > 
> >  static const struct drm_driver amdgpu_kms_driver = {
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
> > b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > index a2149afa5803..a8cf2d4580cc 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
> > @@ -45,6 +45,8 @@
> >  #include "amdgpu_dma_buf.h"
> >  #include "kfd_debug.h"
> > 
> > +#include "amdgpu_criu.h"
> > +
>
> Why is this header file needed here? None of the new ioctls will be 
> implemented in kfd_chardev.c.
>
> Regards,
>   Felix
>
>
> >  static long kfd_ioctl(struct file *, unsigned int, unsigned long);
> >  static int kfd_open(struct inode *, struct file *);
> >  static int kfd_release(struct inode *, struct file *);
> > diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
> > index 45c4fa13499c..1508c55ff92a 100644
> > --- a/include/uapi/drm/amdgpu_drm.h
> > +++ b/include/uapi/drm/amdgpu_drm.h
> > @@ -57,6 +57,7 @@ extern "C" {
> >  #define DRM_AMDGPU_USERQ             0x16
> >  #define DRM_AMDGPU_USERQ_SIGNAL              0x17
> >  #define DRM_AMDGPU_USERQ_WAIT                0x18
> > +#define DRM_AMDGPU_CRIU_BO_INFO      0x19
> > 
> >  #define DRM_IOCTL_AMDGPU_GEM_CREATE  DRM_IOWR(DRM_COMMAND_BASE + 
> >DRM_AMDGPU_GEM_CREATE, union drm_amdgpu_gem_create)
> >  #define DRM_IOCTL_AMDGPU_GEM_MMAP    DRM_IOWR(DRM_COMMAND_BASE + 
> >DRM_AMDGPU_GEM_MMAP, union drm_amdgpu_gem_mmap)
> > @@ -77,6 +78,7 @@ extern "C" {
> >  #define DRM_IOCTL_AMDGPU_USERQ               DRM_IOWR(DRM_COMMAND_BASE + 
> >DRM_AMDGPU_USERQ, union drm_amdgpu_userq)
> >  #define DRM_IOCTL_AMDGPU_USERQ_SIGNAL        DRM_IOWR(DRM_COMMAND_BASE + 
> >DRM_AMDGPU_USERQ_SIGNAL, struct drm_amdgpu_userq_signal)
> >  #define DRM_IOCTL_AMDGPU_USERQ_WAIT  DRM_IOWR(DRM_COMMAND_BASE + 
> >DRM_AMDGPU_USERQ_WAIT, struct drm_amdgpu_userq_wait)
> > +#define DRM_IOCTL_AMDGPU_CRIU_BO_INFO        DRM_IOWR(DRM_COMMAND_BASE + 
> > DRM_AMDGPU_CRIU_BO_INFO, struct drm_amdgpu_criu_bo_info_args)
> > 
> >  /**
> >   * DOC: memory domains
> > @@ -1626,6 +1628,32 @@ struct drm_color_ctm_3x4 {
> >        __u64 matrix[12];
> >  };
> > 
> > +#define AMDGPU_CRIU_BO_FLAG_IS_IMPORT        (1 << 0)
> > +
> > +struct drm_amdgpu_criu_bo_info_args {
> > +    /* IN: Size of bo_buckets buffer. OUT: Number of bos in process (if 
> > larger than size of buffer, must retry) */
> > +    __u32   num_bos;
> > +
> > +    /* User pointer to array of drm_amdgpu_criu_bo_bucket */
> > +    __u64   bo_buckets;
> > +};
> > +
> > +struct drm_amdgpu_criu_bo_bucket {
> > +    /* Size of bo */
> > +    __u64 size;
> > +
> > +    /* GEM_CREATE flags for re-creation of buffer */
> > +    __u64 alloc_flags;
> > +
> > +    /* Pending how to handle this; provides information needed to remake 
> > the buffer on restore */
> > +    __u32 preferred_domains;
> > +
> > +    /* Currently just one flag: IS_IMPORT */
> > +    __u32 flags;
> > +
> > +    __u32 gem_handle;
> > +};
> > +
> >  #if defined(__cplusplus)
> >  }
> >  #endif

Reply via email to