On Tue, May 12, 2026 at 11:10:44AM +0200, Albert Esteve wrote:
> On embedded platforms a central process often allocates dma-buf
> memory on behalf of client applications. Without a way to
> attribute the charge to the requesting client's cgroup, the
> cost lands on the allocator, making per-cgroup memory limits
> ineffective for the actual consumers.
>
> Add charge_pid_fd to struct dma_heap_allocation_data. When set to
Please be aware that pidfds come in two flavors:
thread-group pidfds and thread-specific pidfds. Make sure that your API
doesn't implicitly depend on this distinction not existing.
> a valid pidfd, DMA_HEAP_IOCTL_ALLOC resolves the target task's
> memcg and charges the buffer there via mem_cgroup_charge_dmabuf()
> inside dma_heap_buffer_alloc(). Without charge_pid_fd, and with
> the mem_accounting module parameter enabled, the buffer is charged
> to the allocator's own cgroup.
>
> Additionally, commit 3c227be90659 ("dma-buf: system_heap: account for
> system heap allocation in memcg") adds __GFP_ACCOUNT to system-heap
> page allocations. Keeping __GFP_ACCOUNT would charge the same pages
> twice (once to kmem, once to MEMCG_DMABUF), so remove it and route
> all accounting through a single MEMCG_DMABUF path.
>
> Usage examples:
>
> 1. Central allocator charging to a client at allocation time.
> The allocator knows the client's PID (e.g., from binder's
> sender_pid) and uses pidfd to attribute the charge:
>
> pid_t client_pid = txn->sender_pid;
> int pidfd = pidfd_open(client_pid, 0);
>
> struct dma_heap_allocation_data alloc = {
> .len = buffer_size,
> .fd_flags = O_RDWR | O_CLOEXEC,
> .charge_pid_fd = pidfd,
> };
> ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc);
> close(pidfd);
> /* alloc.fd is now charged to client's cgroup */
>
> 2. Default allocation (no pidfd, mem_accounting=1).
> When charge_pid_fd is not set and the mem_accounting module
> parameter is enabled, the buffer is charged to the allocator's
> own cgroup:
>
> struct dma_heap_allocation_data alloc = {
> .len = buffer_size,
> .fd_flags = O_RDWR | O_CLOEXEC,
> };
> ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc);
> /* charged to current process's cgroup */
>
> Current limitations:
>
> - Single-owner model: a dma-buf carries one memcg charge regardless of
>   how many processes share it. This means only the first owner (and
>   exporter) of the shared buffer bears the charge.
> - Only memcg accounting is supported. While this makes sense for system
>   heap buffers, other heaps (e.g., CMA heaps) will require selectively
>   charging also for the dmem controller.
>
> Signed-off-by: Albert Esteve <[email protected]>
> ---
> Documentation/admin-guide/cgroup-v2.rst | 5 ++--
> drivers/dma-buf/dma-buf.c | 16 ++++---------
> drivers/dma-buf/dma-heap.c | 42
> ++++++++++++++++++++++++++++++---
> drivers/dma-buf/heaps/system_heap.c | 2 --
> include/uapi/linux/dma-heap.h | 6 +++++
> 5 files changed, 53 insertions(+), 18 deletions(-)
>
> diff --git a/Documentation/admin-guide/cgroup-v2.rst
> b/Documentation/admin-guide/cgroup-v2.rst
> index 8bdbc2e866430..824d269531eb1 100644
> --- a/Documentation/admin-guide/cgroup-v2.rst
> +++ b/Documentation/admin-guide/cgroup-v2.rst
> @@ -1636,8 +1636,9 @@ The following nested keys are defined.
> structures.
>
> dmabuf (npn)
> - Amount of memory used for exported DMA buffers allocated by the
> cgroup.
> - Stays with the allocating cgroup regardless of how the buffer
> is shared.
> + Amount of memory used for exported DMA buffers allocated by or
> on
> + behalf of the cgroup. Stays with the allocating cgroup
> regardless
> + of how the buffer is shared.
>
> workingset_refault_anon
> Number of refaults of previously evicted anonymous pages.
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index ce02377f48908..23fb758b78297 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -181,8 +181,11 @@ static void dma_buf_release(struct dentry *dentry)
> */
> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
>
> - mem_cgroup_uncharge_dmabuf(dmabuf->memcg, PAGE_ALIGN(dmabuf->size) /
> PAGE_SIZE);
> - mem_cgroup_put(dmabuf->memcg);
> + if (dmabuf->memcg) {
> + mem_cgroup_uncharge_dmabuf(dmabuf->memcg,
> + PAGE_ALIGN(dmabuf->size) / PAGE_SIZE);
> + mem_cgroup_put(dmabuf->memcg);
> + }
>
> dmabuf->ops->release(dmabuf);
>
> @@ -764,13 +767,6 @@ struct dma_buf *dma_buf_export(const struct
> dma_buf_export_info *exp_info)
> dmabuf->resv = resv;
> }
>
> - dmabuf->memcg = get_mem_cgroup_from_mm(current->mm);
> - if (!mem_cgroup_charge_dmabuf(dmabuf->memcg, PAGE_ALIGN(dmabuf->size) /
> PAGE_SIZE,
> - GFP_KERNEL)) {
> - ret = -ENOMEM;
> - goto err_memcg;
> - }
> -
> file->private_data = dmabuf;
> file->f_path.dentry->d_fsdata = dmabuf;
> dmabuf->file = file;
> @@ -781,8 +777,6 @@ struct dma_buf *dma_buf_export(const struct
> dma_buf_export_info *exp_info)
>
> return dmabuf;
>
> -err_memcg:
> - mem_cgroup_put(dmabuf->memcg);
> err_file:
> fput(file);
> err_module:
> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
> index ac5f8685a6494..ff6e259afcdc0 100644
> --- a/drivers/dma-buf/dma-heap.c
> +++ b/drivers/dma-buf/dma-heap.c
> @@ -7,13 +7,17 @@
> */
>
> #include <linux/cdev.h>
> +#include <linux/cgroup.h>
> #include <linux/device.h>
> #include <linux/dma-buf.h>
> #include <linux/dma-heap.h>
> +#include <linux/memcontrol.h>
> +#include <linux/sched/mm.h>
> #include <linux/err.h>
> #include <linux/export.h>
> #include <linux/list.h>
> #include <linux/nospec.h>
> +#include <linux/pidfd.h>
> #include <linux/syscalls.h>
> #include <linux/uaccess.h>
> #include <linux/xarray.h>
> @@ -55,10 +59,12 @@ MODULE_PARM_DESC(mem_accounting,
> "Enable cgroup-based memory accounting for dma-buf heap
> allocations (default=false).");
>
> static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
> - u32 fd_flags,
> - u64 heap_flags)
> + u32 fd_flags, u64 heap_flags,
> + struct mem_cgroup *charge_to)
> {
> struct dma_buf *dmabuf;
> + unsigned int nr_pages;
> + struct mem_cgroup *memcg = charge_to;
> int fd;
>
> /*
> @@ -73,6 +79,22 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap,
> size_t len,
> if (IS_ERR(dmabuf))
> return PTR_ERR(dmabuf);
>
> + nr_pages = len / PAGE_SIZE;
> +
> + if (memcg)
> + css_get(&memcg->css);
> + else if (mem_accounting)
> + memcg = get_mem_cgroup_from_mm(current->mm);
> +
> + if (memcg) {
> + if (!mem_cgroup_charge_dmabuf(memcg, nr_pages, GFP_KERNEL)) {
> + mem_cgroup_put(memcg);
> + dma_buf_put(dmabuf);
> + return -ENOMEM;
> + }
> + dmabuf->memcg = memcg;
> + }
> +
> fd = dma_buf_fd(dmabuf, fd_flags);
> if (fd < 0) {
> dma_buf_put(dmabuf);
> @@ -102,6 +124,9 @@ static long dma_heap_ioctl_allocate(struct file *file,
> void *data)
> {
> struct dma_heap_allocation_data *heap_allocation = data;
> struct dma_heap *heap = file->private_data;
> + struct mem_cgroup *memcg = NULL;
> + struct task_struct *task;
> + unsigned int pidfd_flags;
> int fd;
>
> if (heap_allocation->fd)
> @@ -113,9 +138,20 @@ static long dma_heap_ioctl_allocate(struct file *file,
> void *data)
> if (heap_allocation->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
> return -EINVAL;
>
> + if (heap_allocation->charge_pid_fd) {
> + task = pidfd_get_task(heap_allocation->charge_pid_fd,
> &pidfd_flags);
This will always resolve to the thread-group leader's task for a
thread-group pidfd, and will behave differently for a thread-specific
pidfd. pidfd_open(1234, PIDFD_THREAD) can be used to open a
thread-specific pidfd; make sure both flavors are handled.
> + if (IS_ERR(task))
> + return PTR_ERR(task);
> +
> + memcg = get_mem_cgroup_from_mm(task->mm);
> + put_task_struct(task);
> + }
> +
> fd = dma_heap_buffer_alloc(heap, heap_allocation->len,
> heap_allocation->fd_flags,
> - heap_allocation->heap_flags);
> + heap_allocation->heap_flags,
> + memcg);
> + mem_cgroup_put(memcg);
> if (fd < 0)
> return fd;
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c
> b/drivers/dma-buf/heaps/system_heap.c
> index 03c2b87cb1112..95d7688167b93 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -385,8 +385,6 @@ static struct page *alloc_largest_available(unsigned long
> size,
> if (max_order < orders[i])
> continue;
> flags = order_flags[i];
> - if (mem_accounting)
> - flags |= __GFP_ACCOUNT;
> page = alloc_pages(flags, orders[i]);
> if (!page)
> continue;
> diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
> index a4cf716a49fa6..e02b0f8cbc6a1 100644
> --- a/include/uapi/linux/dma-heap.h
> +++ b/include/uapi/linux/dma-heap.h
> @@ -29,6 +29,10 @@
> * handle to the allocated dma-buf
> * @fd_flags: file descriptor flags used when allocating
> * @heap_flags: flags passed to heap
> + * @charge_pid_fd: optional pidfd of the process whose cgroup should be
> + * charged for this allocation; 0 means charge the calling
> + * process's cgroup
> + * @__padding: reserved, must be zero
> *
> * Provided by userspace as an argument to the ioctl
> */
> @@ -37,6 +41,8 @@ struct dma_heap_allocation_data {
> __u32 fd;
> __u32 fd_flags;
> __u64 heap_flags;
> + __u32 charge_pid_fd;
> + __u32 __padding;
> };
>
> #define DMA_HEAP_IOC_MAGIC 'H'
>
> --
> 2.53.0
>