This changes no-shared-files mode to use memfd-based hugetlbfs
allocation instead of hugetlbfs mounts. Because memfd is only
supported on kernels 4.14+ and glibc 2.27+, a compile-time check is
performed along with runtime checks.

Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
 .../linuxapp/eal/eal_hugepage_info.c          | 136 ++++++++++++++----
 lib/librte_eal/linuxapp/eal/eal_memalloc.c    | 105 +++++++++++++-
 lib/librte_eal/linuxapp/eal/eal_memfd.h       |  28 ++++
 lib/librte_eal/linuxapp/eal/eal_memory.c      |   4 +-
 4 files changed, 234 insertions(+), 39 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_memfd.h

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 02b1c4ff1..1a80ee0ee 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -30,6 +30,7 @@
 #include "eal_internal_cfg.h"
 #include "eal_hugepages.h"
 #include "eal_filesystem.h"
+#include "eal_memfd.h"
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
 static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
@@ -313,11 +314,85 @@ compare_hpi(const void *a, const void *b)
        return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+       uint64_t total_pages = 0;
+       unsigned int i;
+
+       /*
+        * first, try to put all hugepages into relevant sockets, but
+        * if the first attempt fails, fall back to collecting all pages
+        * in one socket and sorting them later
+        */
+       total_pages = 0;
+       /* we also don't want to do this for legacy init */
+       if (!internal_config.legacy_mem)
+               for (i = 0; i < rte_socket_count(); i++) {
+                       int socket = rte_socket_id_by_idx(i);
+                       unsigned int num_pages =
+                                       get_num_hugepages_on_node(
+                                               dirent->d_name, socket);
+                       hpi->num_pages[socket] = num_pages;
+                       total_pages += num_pages;
+               }
+       /*
+        * we failed to sort memory from the get go, so fall
+        * back to old way
+        */
+       if (total_pages == 0)
+               hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+       /* for 32-bit systems, limit number of hugepages to
+        * 1GB per page size (applied unconditionally, as in the
+        * code this function replaces)
+        */
+       hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+                       RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+}
+
+static int
+check_memfd_pagesize_supported(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+       int sz_flag, fd;
+
+       /* first, check if this particular pagesize is supported */
+       sz_flag = eal_memalloc_get_memfd_pagesize_flag(page_sz);
+       if (sz_flag == 0) {
+               RTE_LOG(ERR, EAL, "Unexpected memfd hugepage size: %"
+                       PRIu64" bytes\n", page_sz);
+               return 0;
+       }
+
+       /* does currently running kernel support it? */
+       fd = memfd_create("memfd_test", sz_flag | MFD_HUGETLB);
+       if (fd >= 0) {
+               /* success */
+               close(fd);
+               return 1;
+       }
+       /* creating the memfd failed, but if the error wasn't EINVAL, reserving
+        * hugepages via memfd is still supported by the kernel
+        */
+       if (errno != EINVAL) {
+               return 1;
+       }
+       RTE_LOG(DEBUG, EAL, "Kernel does not support memfd hugepages of size %"
+               PRIu64" bytes\n", page_sz);
+#else
+       RTE_LOG(DEBUG, EAL, "Memfd hugepage support not enabled at compile time\n");
+       RTE_SET_USED(page_sz);
+#endif
+       return 0;
+}
+
 static int
 hugepage_info_init(void)
 {      const char dirent_start_text[] = "hugepages-";
        const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-       unsigned int i, total_pages, num_sizes = 0;
+       unsigned int i, num_sizes = 0;
        DIR *dir;
        struct dirent *dirent;
 
@@ -343,6 +418,10 @@ hugepage_info_init(void)
                hpi->hugepage_sz =
                        rte_str_to_size(&dirent->d_name[dirent_start_len]);
 
+               /* by default, memfd_hugepage_supported is 1 */
+               memfd_hugepage_supported &=
+                       check_memfd_pagesize_supported(hpi->hugepage_sz);
+
                /* first, check if we have a mountpoint */
                if (get_hugepage_dir(hpi->hugepage_sz,
                        hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
@@ -355,6 +434,23 @@ hugepage_info_init(void)
                                        "%" PRIu64 " reserved, but no mounted "
                                        "hugetlbfs found for that size\n",
                                        num_pages, hpi->hugepage_sz);
+
+                       /* no shared files mode may still be able to allocate
+                        * without a valid mountpoint via memfd, but we cannot
+                        * use memfd in legacy mode, because we cannot sort
+                        * pages, so only allow empty mountpoints in non-legacy
+                        * mode.
+                        */
+                       if (internal_config.no_shared_files &&
+                                       !internal_config.legacy_mem &&
+                                       memfd_hugepage_supported) {
+                               RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+                                       "hugepages of size %" PRIu64 " bytes "
+                                       "will be allocated anonymously\n",
+                                       hpi->hugepage_sz);
+                               calc_num_pages(hpi, dirent);
+                               num_sizes++;
+                       }
                        continue;
                }
 
@@ -371,35 +467,14 @@ hugepage_info_init(void)
                if (clear_hugedir(hpi->hugedir) == -1)
                        break;
 
-               /*
-                * first, try to put all hugepages into relevant sockets, but
-                * if first attempts fails, fall back to collecting all pages
-                * in one socket and sorting them later
-                */
-               total_pages = 0;
-               /* we also don't want to do this for legacy init */
-               if (!internal_config.legacy_mem)
-                       for (i = 0; i < rte_socket_count(); i++) {
-                               int socket = rte_socket_id_by_idx(i);
-                               unsigned int num_pages =
-                                               get_num_hugepages_on_node(
-                                                       dirent->d_name, socket);
-                               hpi->num_pages[socket] = num_pages;
-                               total_pages += num_pages;
-                       }
-               /*
-                * we failed to sort memory from the get go, so fall
-                * back to old way
-                */
-               if (total_pages == 0)
-                       hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+               calc_num_pages(hpi, dirent);
 
-#ifndef RTE_ARCH_64
-               /* for 32-bit systems, limit number of hugepages to
-                * 1GB per page size */
-               hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-                                           RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+               if (internal_config.no_shared_files &&
+                               !internal_config.legacy_mem &&
+                               memfd_hugepage_supported)
+                       RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+                               "hugepages of size %" PRIu64 " bytes will be "
+                               "allocated anonymously\n", hpi->hugepage_sz);
 
                num_sizes++;
        }
@@ -423,8 +498,7 @@ hugepage_info_init(void)
 
                for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
                        num_pages += hpi->num_pages[j];
-               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
-                               num_pages > 0)
+               if (num_pages > 0)
                        return 0;
        }
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index f57d307dd..c4d57c349 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -39,6 +39,7 @@
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
+#include "eal_memfd.h"
 
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
@@ -46,6 +47,11 @@
  */
 static int fallocate_supported = -1; /* unknown */
 
+/* not all kernel versions support memfd hugepages. assume supported unless
+ * shown otherwise.
+ */
+int memfd_hugepage_supported = 1;
+
 /* for single-file segments, we need some kind of mechanism to keep track of
  * which hugepages can be freed back to the system, and which cannot. we cannot
  * use flock() because they don't allow locking parts of a file, and we cannot
@@ -293,6 +299,49 @@ static int unlock_segment(int list_idx, int seg_idx)
        return 0;
 }
 
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+       switch (page_sz) {
+       case RTE_PGSIZE_1G:
+               return MFD_HUGE_1GB;
+       case RTE_PGSIZE_2M:
+               return MFD_HUGE_2MB;
+       default:
+               return 0;
+       }
+#endif
+       return 0;
+}
+
+static int
+get_memfd_seg_fd(unsigned int list_idx,
+               unsigned int seg_idx, int sz_flag)
+{
+#ifdef MEMFD_SUPPORTED
+       int flags = MFD_HUGETLB | sz_flag;
+       char name[64];
+       int fd;
+
+       snprintf(name, sizeof(name), "memseg-%u-%u", list_idx,
+                       seg_idx);
+
+       fd = memfd_create(name, flags);
+       if (fd < 0) {
+               RTE_LOG(ERR, EAL, "Couldn't create memfd hugepage: %s\n",
+                       strerror(errno));
+               return -1;
+       }
+       return fd;
+#else
+       RTE_SET_USED(list_idx);
+       RTE_SET_USED(seg_idx);
+       RTE_SET_USED(sz_flag);
+       return -1;
+#endif
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
                unsigned int list_idx, unsigned int seg_idx)
@@ -342,6 +391,27 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
        return fd;
 }
 
+static int
+get_seg_fd_no_shared(char *path, int buflen, struct hugepage_info *hi,
+               unsigned int list_idx, unsigned int seg_idx)
+{
+       int sz_flag;
+
+       /* if memfd hugepages are not supported, create regular files */
+       if (memfd_hugepage_supported == 0)
+               return get_seg_fd(path, buflen, hi, list_idx, seg_idx);
+
+       /* pick correct page size flags */
+       sz_flag = eal_memalloc_get_memfd_pagesize_flag(hi->hugepage_sz);
+       if (sz_flag == 0) {
+               RTE_LOG(ERR, EAL, "Unexpected page size: %"
+                       PRIu64 "\n", hi->hugepage_sz);
+               return -1;
+       }
+
+       return get_memfd_seg_fd(list_idx, seg_idx, sz_flag);
+}
+
 static int
 resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
                uint64_t fa_offset, uint64_t page_sz, bool grow)
@@ -491,8 +561,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
        int fd;
        size_t alloc_sz;
 
-       /* takes out a read lock on segment or segment list */
-       fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+       if (internal_config.no_shared_files) {
+               /* if allocating memfd hugepages is supported, do that,
+                * otherwise fallback to regular allocation
+                */
+               fd = get_seg_fd_no_shared(path, sizeof(path), hi, list_idx,
+                               seg_idx);
+       } else {
+               /* takes out a read lock on segment or segment list */
+               fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+       }
        if (fd < 0) {
                RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
                return -1;
@@ -512,7 +590,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
                                __func__, strerror(errno));
                        goto resized;
                }
-               if (internal_config.no_shared_files) {
+               if (internal_config.no_shared_files &&
+                               memfd_hugepage_supported == 0) {
                        if (unlink(path)) {
                                RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
                                        __func__, strerror(errno));
@@ -616,7 +695,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 {
        uint64_t map_offset;
        char path[PATH_MAX];
-       int fd, ret;
+       int fd, ret = 0;
 
        /* erase page data */
        memset(ms->addr, 0, ms->len);
@@ -685,6 +764,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
        size_t page_sz;
        int cur_idx, start_idx, j, dir_fd = -1;
        unsigned int msl_idx, need, i;
+       bool mountpoint_is_empty;
 
        if (msl->page_sz != wa->page_sz)
                return 0;
@@ -704,6 +784,12 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
                return 0;
        start_idx = cur_idx;
 
+       /* if we're in no-shared-files mode and memfd is supported, we will
+        * allow empty mountpoints because memfd doesn't require a mountpoint.
+        */
+       mountpoint_is_empty =
+                       strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
        /* do not allow any page allocations during the time we're allocating,
         * because file creation and locking operations are not atomic,
         * and we might be the first or the last ones to use a particular page,
@@ -712,7 +798,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
         * during init, we already hold a write lock, so don't try to take out
         * another one.
         */
-       if (wa->hi->lock_descriptor == -1) {
+       if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
                dir_fd = open(wa->hi->hugedir, O_RDONLY);
                if (dir_fd < 0) {
                        RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -794,6 +880,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
        struct free_walk_param *wa = arg;
        uintptr_t start_addr, end_addr;
        int msl_idx, seg_idx, ret, dir_fd = -1;
+       bool mountpoint_is_empty;
 
        start_addr = (uintptr_t) msl->base_va;
        end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
@@ -802,6 +889,12 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
                        (uintptr_t)wa->ms->addr >= end_addr)
                return 0;
 
+       /* if we're in no shared files mode and memfd is supported, we will
+        * allow empty mountpoints because memfd doesn't require a mountpoint.
+        */
+       mountpoint_is_empty =
+                       strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
        msl_idx = msl - mcfg->memsegs;
        seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
 
@@ -816,7 +909,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
         * during init, we already hold a write lock, so don't try to take out
         * another one.
         */
-       if (wa->hi->lock_descriptor == -1) {
+       if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
                dir_fd = open(wa->hi->hugedir, O_RDONLY);
                if (dir_fd < 0) {
                        RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
diff --git a/lib/librte_eal/linuxapp/eal/eal_memfd.h b/lib/librte_eal/linuxapp/eal/eal_memfd.h
new file mode 100644
index 000000000..55e6dbb2c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memfd.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMFD_H
+#define EAL_MEMFD_H
+
+#include <stdint.h>
+
+/*
+ * For memfd hugepages, both kernel and glibc version must support them. So,
+ * check for both.
+ */
+#include <features.h> /* glibc version */
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 27)
+#include <linux/version.h> /* linux kernel version */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
+#define MEMFD_SUPPORTED
+#include <linux/memfd.h>
+#endif /* linux version check */
+#endif /* glibc version check */
+
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz);
+
+extern int memfd_hugepage_supported;
+
+#endif /* EAL_MEMFD_H */
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index d7b43b5c1..b26e21be8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -44,6 +44,7 @@
 #include "eal_internal_cfg.h"
 #include "eal_filesystem.h"
 #include "eal_hugepages.h"
+#include "eal_memfd.h"
 
 #define PFN_MASK_SIZE  8
 
@@ -1060,8 +1061,7 @@ get_socket_mem_size(int socket)
 
        for (i = 0; i < internal_config.num_hugepage_sizes; i++){
                struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-               if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
-                       size += hpi->hugepage_sz * hpi->num_pages[socket];
+               size += hpi->hugepage_sz * hpi->num_pages[socket];
        }
 
        return size;
-- 
2.17.0

Reply via email to