dma-buf is a Linux kernel framework for sharing DMA buffers between
device drivers. Common use cases are streaming video devices and
NIC-to-GPU transfers. Prior to dma-buf, users had to load proprietary
drivers to expose the DMA mappings. With dma-buf, the proprietary drivers
are no longer required.

A new API function, rte_extmem_register_dmabuf(), is introduced to
register external memory described by a dma-buf file descriptor. A dma-buf
region is identified by a file descriptor, previously opened with the
kernel, and an offset. The kernel uses the file descriptor to map the
buffer to a VA pointer. To avoid ABI changes, the file descriptor and
offset are kept in a static table inside eal_common_memory.c, and lookups
are done on this table rather than on rte_memseg_list.

Ideally we would like to add both the dmabuf file descriptor and offset
to rte_memseg_list, but it's not clear if we can reuse existing fields
when using the dmabuf API.

We could rename the "external" flag to a more generic "properties" flag
where "external" is the lowest bit, and then use the second bit to indicate
the presence of dma-buf. When the dma-buf bit is set, we could reuse the
base_va address field for the dma-buf offset and the socket_id field for
the file descriptor.

Signed-off-by: Cliff Burdick <[email protected]>
---
 .mailmap                               |   1 +
 doc/guides/rel_notes/release_26_03.rst |   6 +
 lib/eal/common/eal_common_memory.c     | 165 ++++++++++++++++++++++++-
 lib/eal/common/eal_memalloc.h          |  21 ++++
 lib/eal/common/malloc_heap.c           |  27 ++++
 lib/eal/common/malloc_heap.h           |   5 +
 lib/eal/include/rte_memory.h           | 145 ++++++++++++++++++++++
 7 files changed, 364 insertions(+), 6 deletions(-)

diff --git a/.mailmap b/.mailmap
index 2f089326ff..4c2b2f921d 100644
--- a/.mailmap
+++ b/.mailmap
@@ -291,6 +291,7 @@ Cian Ferriter <[email protected]>
 Ciara Loftus <[email protected]>
 Ciara Power <[email protected]>
 Claire Murphy <[email protected]>
+Cliff Burdick <[email protected]>
 Clemens Famulla-Conrad <[email protected]>
 Cody Doucette <[email protected]>
 Congwen Zhang <[email protected]>
diff --git a/doc/guides/rel_notes/release_26_03.rst b/doc/guides/rel_notes/release_26_03.rst
index 15dabee7a1..56457d0382 100644
--- a/doc/guides/rel_notes/release_26_03.rst
+++ b/doc/guides/rel_notes/release_26_03.rst
@@ -55,6 +55,12 @@ New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added dma-buf-backed external memory support.**
+
+  Added EAL support for registering dma-buf-backed external memory with
+  ``rte_extmem_register_dmabuf``, and enabled mlx5 common code to consume
+  dma-buf mappings for device access.
+
 
 Removed Items
 -------------
diff --git a/lib/eal/common/eal_common_memory.c b/lib/eal/common/eal_common_memory.c
index c62edf5e55..34ebbdc202 100644
--- a/lib/eal/common/eal_common_memory.c
+++ b/lib/eal/common/eal_common_memory.c
@@ -45,6 +45,15 @@
 static void *next_baseaddr;
 static uint64_t system_page_sz;
 
+/* Internal storage for dma-buf info, indexed by memseg list index.
+ * This keeps dma-buf metadata out of the public rte_memseg_list structure
+ * to preserve ABI compatibility.
+ */
+static struct {
+       int fd;          /**< dma-buf fd, -1 if not dma-buf backed */
+       uint64_t offset; /**< offset within dma-buf */
+} dmabuf_info[RTE_MAX_MEMSEG_LISTS];
+
 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
 void *
 eal_get_virtual_area(void *requested_addr, size_t *size,
@@ -232,6 +241,10 @@ eal_memseg_list_init(struct rte_memseg_list *msl, uint64_t page_sz,
 {
        char name[RTE_FBARRAY_NAME_LEN];
 
+       /* Initialize dma-buf info to "not dma-buf backed" */
+       dmabuf_info[type_msl_idx].fd = -1;
+       dmabuf_info[type_msl_idx].offset = 0;
+
        snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id,
                 type_msl_idx);
 
@@ -930,10 +943,113 @@ rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
        return ret;
 }
 
-RTE_EXPORT_SYMBOL(rte_extmem_register)
+/* Internal dma-buf side-table accessors; all reject out-of-range indexes */
 int
-rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
-               unsigned int n_pages, size_t page_sz)
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset)
+{
+       if (list_idx >= 0 && list_idx < RTE_MAX_MEMSEG_LISTS) {
+               dmabuf_info[list_idx].fd = fd;
+               dmabuf_info[list_idx].offset = offset;
+               return 0;
+       }
+       return -EINVAL;
+}
+
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx)
+{
+       /* stored fd (-1 when not dma-buf backed), or -EINVAL on bad index */
+       const int in_range = list_idx >= 0 && list_idx < RTE_MAX_MEMSEG_LISTS;
+
+       return in_range ? dmabuf_info[list_idx].fd : -EINVAL;
+}
+
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset)
+{
+       /* copy the stored offset out; a NULL output or bad index is -EINVAL */
+       if (offset == NULL || list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS)
+               return -EINVAL;
+       *offset = dmabuf_info[list_idx].offset;
+       return 0;
+}
+
+/* Public dma-buf info API functions */
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_memseg_list_get_dmabuf_fd_unsafe, 26.03)
+int
+rte_memseg_list_get_dmabuf_fd_unsafe(const struct rte_memseg_list *msl)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+       /*
+        * Check that msl lies inside the memseg list array before turning
+        * it into an index: a pointer difference narrowed to int could
+        * otherwise land in range by accident. No locking is done here.
+        */
+       if (msl == NULL || msl < mcfg->memsegs ||
+                       msl >= mcfg->memsegs + RTE_MAX_MEMSEG_LISTS) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       /* -1 without rte_errno set means the list is not dma-buf backed */
+       return dmabuf_info[msl - mcfg->memsegs].fd;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_memseg_list_get_dmabuf_fd, 26.03)
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl)
+{
+       int fd;
+
+       /* run the unlocked lookup under the memory hotplug read lock */
+       rte_mcfg_mem_read_lock();
+       fd = rte_memseg_list_get_dmabuf_fd_unsafe(msl);
+       rte_mcfg_mem_read_unlock();
+       return fd;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_memseg_list_get_dmabuf_offset_unsafe, 26.03)
+int
+rte_memseg_list_get_dmabuf_offset_unsafe(const struct rte_memseg_list *msl,
+               uint64_t *offset)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+
+       /*
+        * Reject NULL arguments and any msl that does not lie inside the
+        * memseg list array before the pointer is turned into an index;
+        * a difference narrowed to int could otherwise alias into range.
+        */
+       if (msl == NULL || offset == NULL || msl < mcfg->memsegs ||
+                       msl >= mcfg->memsegs + RTE_MAX_MEMSEG_LISTS) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       /* plain copy-out of the stored value; no locking is done here */
+       *offset = dmabuf_info[msl - mcfg->memsegs].offset;
+       return 0;
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_memseg_list_get_dmabuf_offset, 26.03)
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+               uint64_t *offset)
+{
+       int rc;
+
+       /* run the unlocked lookup under the memory hotplug read lock */
+       rte_mcfg_mem_read_lock();
+       rc = rte_memseg_list_get_dmabuf_offset_unsafe(msl, offset);
+       rte_mcfg_mem_read_unlock();
+       return rc;
+}
+
+static int
+extmem_register(void *va_addr, size_t len,
+       int dmabuf_fd, uint64_t dmabuf_offset,
+       rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
 {
        struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
        unsigned int socket_id, n;
@@ -967,10 +1083,19 @@ rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
 
        /* we can create a new memseg */
        n = len / page_sz;
-       if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
+       if (dmabuf_fd < 0) {
+               if (malloc_heap_create_external_seg(va_addr, iova_addrs, n,
                        page_sz, "extmem", socket_id) == NULL) {
-               ret = -1;
-               goto unlock;
+                       ret = -1;
+                       goto unlock;
+               }
+       } else {
+               if (malloc_heap_create_external_seg_dmabuf(va_addr, iova_addrs, n,
+                       page_sz, "extmem_dmabuf", socket_id,
+                       dmabuf_fd, dmabuf_offset) == NULL) {
+                       ret = -1;
+                       goto unlock;
+               }
        }
 
        /* memseg list successfully created - increment next socket ID */
@@ -980,6 +1105,34 @@ rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
        return ret;
 }
 
+RTE_EXPORT_SYMBOL(rte_extmem_register)
+int
+rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
+               unsigned int n_pages, size_t page_sz)
+{
+       /* fd -1 selects the non-dma-buf path of the shared helper; the
+        * public dma-buf entry point would reject it with EINVAL.
+        */
+       return extmem_register(va_addr, len, -1, 0, iova_addrs, n_pages,
+                       page_sz);
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_extmem_register_dmabuf, 26.03)
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+               int dmabuf_fd, uint64_t dmabuf_offset,
+               rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+       /* a dma-buf registration needs a real (non-negative) descriptor */
+       if (dmabuf_fd < 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return extmem_register(va_addr, len, dmabuf_fd, dmabuf_offset,
+                       iova_addrs, n_pages, page_sz);
+}
+
 RTE_EXPORT_SYMBOL(rte_extmem_unregister)
 int
 rte_extmem_unregister(void *va_addr, size_t len)
diff --git a/lib/eal/common/eal_memalloc.h b/lib/eal/common/eal_memalloc.h
index 0c267066d9..e7e807ddcb 100644
--- a/lib/eal/common/eal_memalloc.h
+++ b/lib/eal/common/eal_memalloc.h
@@ -90,6 +90,27 @@ eal_memalloc_set_seg_list_fd(int list_idx, int fd);
 int
 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset);
 
+/*
+ * Set dma-buf info for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset);
+
+/*
+ * Get dma-buf fd for a memseg list.
+ * Returns fd (>= 0) on success, -1 if not dma-buf backed, -EINVAL on bad index.
+ */
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx);
+
+/*
+ * Get dma-buf offset for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset);
+
 int
 eal_memalloc_init(void)
        __rte_requires_shared_capability(rte_mcfg_mem_get_lock());
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 39240c261c..bf986fe654 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -1232,6 +1232,33 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
        msl->version = 0;
        msl->external = 1;
 
+       /* initialize dma-buf info to "not dma-buf backed" */
+       eal_memseg_list_set_dmabuf_info(i, -1, 0);
+
+       return msl;
+}
+
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+               unsigned int n_pages, size_t page_sz, const char *seg_name,
+               unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset)
+{
+       struct rte_mem_config *cfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *msl;
+
+       /*
+        * Build an ordinary external segment first; the dma-buf variant only
+        * differs by the fd/offset recorded in the side-table afterwards.
+        */
+       msl = malloc_heap_create_external_seg(va_addr, iova_addrs, n_pages,
+                       page_sz, seg_name, socket_id);
+       if (msl == NULL)
+               return NULL;
+
+       /* the side-table is indexed by position within the memsegs array */
+       eal_memseg_list_set_dmabuf_info(msl - cfg->memsegs,
+                       dmabuf_fd, dmabuf_offset);
+
        return msl;
 }
 
diff --git a/lib/eal/common/malloc_heap.h b/lib/eal/common/malloc_heap.h
index dfc56d4ae3..87525d1a68 100644
--- a/lib/eal/common/malloc_heap.h
+++ b/lib/eal/common/malloc_heap.h
@@ -51,6 +51,11 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
                unsigned int n_pages, size_t page_sz, const char *seg_name,
                unsigned int socket_id);
 
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+               unsigned int n_pages, size_t page_sz, const char *seg_name,
+               unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset);
+
 struct rte_memseg_list *
 malloc_heap_find_external_seg(void *va_addr, size_t len);
 
diff --git a/lib/eal/include/rte_memory.h b/lib/eal/include/rte_memory.h
index b6e97ad695..4e92897dd9 100644
--- a/lib/eal/include/rte_memory.h
+++ b/lib/eal/include/rte_memory.h
@@ -405,6 +405,98 @@ int
 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
                size_t *offset);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ *       be used within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ *   Valid dma-buf file descriptor (>= 0) in case of success.
+ *   -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+__rte_experimental
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ *       from within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ *   Valid dma-buf file descriptor (>= 0) in case of success.
+ *   -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+__rte_experimental
+int
+rte_memseg_list_get_dmabuf_fd_unsafe(const struct rte_memseg_list *msl);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ *       be used within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ *   A pointer to offset value where the result will be stored.
+ *
+ * @return
+ *   0 on success.
+ *   -1 in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ *     - EINVAL  - ``offset`` pointer was NULL
+ */
+__rte_experimental
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+               uint64_t *offset);
+
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ *       from within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ *   A pointer to offset value where the result will be stored.
+ *
+ * @return
+ *   0 on success.
+ *   -1 in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ *     - EINVAL  - ``offset`` pointer was NULL
+ */
+__rte_experimental
+int
+rte_memseg_list_get_dmabuf_offset_unsafe(const struct rte_memseg_list *msl,
+               uint64_t *offset);
+
 /**
  * Register external memory chunk with DPDK.
  *
@@ -443,6 +535,59 @@ int
 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
                unsigned int n_pages, size_t page_sz);
 
+/**
+ * @warning
+ * @b EXPERIMENTAL: this API may change without prior notice.
+ *
+ * Register external memory chunk backed by a dma-buf file descriptor and offset.
+ *
+ * This is similar to rte_extmem_register() but additionally stores dma-buf
+ * file descriptor information, allowing drivers to use dma-buf based
+ * memory registration (e.g., ibv_reg_dmabuf_mr for RDMA devices).
+ *
+ * @note Using this API is mutually exclusive with ``rte_malloc`` family of
+ *   APIs.
+ *
+ * @note This API will not perform any DMA mapping. It is expected that user
+ *   will do that themselves via rte_dev_dma_map().
+ *
+ * @note Before accessing this memory in other processes, it needs to be
+ *   attached in each of those processes by calling ``rte_extmem_attach`` in
+ *   each other process.
+ *
+ * @param va_addr
+ *   Start of virtual area to register (mmap'd address of the dma-buf).
+ *   Must be aligned by ``page_sz``.
+ * @param len
+ *   Length of virtual area to register. Must be aligned by ``page_sz``.
+ *   This is independent of dma-buf offset.
+ * @param dmabuf_fd
+ *   File descriptor of the dma-buf. Must be non-negative.
+ * @param dmabuf_offset
+ *   Offset within the dma-buf where the registered region starts.
+ * @param iova_addrs
+ *   Array of page IOVA addresses corresponding to each page in this memory
+ *   area. Can be NULL, in which case page IOVA addresses will be set to
+ *   RTE_BAD_IOVA.
+ * @param n_pages
+ *   Number of elements in the iova_addrs array. Ignored if ``iova_addrs``
+ *   is NULL.
+ * @param page_sz
+ *   Page size of the underlying memory
+ *
+ * @return
+ *   - 0 on success
+ *   - -1 in case of error, with rte_errno set to one of the following:
+ *     EINVAL - one of the parameters was invalid
+ *     EEXIST - memory chunk is already registered
+ *     ENOSPC - no more space in internal config to store a new memory chunk
+ */
+__rte_experimental
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+               int dmabuf_fd, uint64_t dmabuf_offset,
+               rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
 /**
  * Unregister external memory chunk with DPDK.
  *
-- 
2.52.0

Reply via email to