On 18/11/2015 02:42, Jianfeng Tan wrote:
> Currently DPDK does not respect the quota of a hugetlbfs mount.
> It will fail to init the EAL because it tries to map the number of
> free hugepages in the system rather than using the number specified
> in the quota for that mount.
>
> To solve this issue, we take the quota into consideration when
> calculating the number of hugepages to map. We use either the number
> specified in the quota, or the number of available hugepages, whichever
> is lower.
>
> Race conditions are still possible when multiple applications
> allocate hugepages from different hugetlbfs mounts of the same page
> size, so the recommended setup is a system hugepage pool large enough
> to satisfy the quotas of all hugetlbfs mounts.
>
> There is, however, still an open issue with
> CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. When this option is enabled
> (the IVSHMEM target does this by default), remapping hugepages fails
> on hugetlbfs mounts that have a quota, because the remap relies on
> having mapped all free hugepages in the system.
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
> ---
> v3 changes:
> - commit msg rework
> - add hpi->quota to record quota of each hugetlbfs
> - get_hugepage_dir -> get_hugetlbfs_mnt_info to fill hugedir and quota
> - add info in release note
>
> v2 changes:
> - reword title
> - fix compiler error of v1
>
> doc/guides/rel_notes/release_2_2.rst | 5 +
> lib/librte_eal/common/eal_internal_cfg.h | 1 +
> lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 145
> +++++++++++++++---------
> 3 files changed, 98 insertions(+), 53 deletions(-)
>
> diff --git a/doc/guides/rel_notes/release_2_2.rst
> b/doc/guides/rel_notes/release_2_2.rst
> index 0781ae6..5b8777a 100644
> --- a/doc/guides/rel_notes/release_2_2.rst
> +++ b/doc/guides/rel_notes/release_2_2.rst
> @@ -102,6 +102,11 @@ New Features
>
> * **Added port hotplug support to xenvirt.**
>
> +* **Added support of taking mount quota into account.**
> +
> + Take the quota into consideration when calculating the number of hugepages
> + to map. We use either the number specified in the quota, or number of
> + available hugepages, whichever is lower.
>
> Resolved Issues
> ---------------
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h
> b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..38ca410 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -50,6 +50,7 @@
> */
> struct hugepage_info {
> uint64_t hugepage_sz; /**< size of a huge page */
> + uint64_t quota; /**< quota of a hugetlbfs */
> const char *hugedir; /**< dir where hugetlbfs is mounted */
> uint32_t num_pages[RTE_MAX_NUMA_NODES];
> /**< number of hugepages of that size on each
> socket */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> index 18858e2..612d87d 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> @@ -44,6 +44,8 @@
> #include <unistd.h>
> #include <errno.h>
> #include <sys/queue.h>
> +#include <sys/vfs.h>
> +#include <mntent.h>
>
> #include <rte_memory.h>
> #include <rte_memzone.h>
> @@ -124,71 +126,90 @@ get_default_hp_size(void)
> return size;
> }
>
> -static const char *
> -get_hugepage_dir(uint64_t hugepage_sz)
> +static void
> +get_hugetlbfs_mnt_info(struct hugepage_info *hpi)
> {
> - enum proc_mount_fieldnames {
> - DEVICE = 0,
> - MOUNTPT,
> - FSTYPE,
> - OPTIONS,
> - _FIELDNAME_MAX
> - };
> + FILE *f;
> + struct mntent *ent;
> + char *str_size;
> + char *str_pagesz;
> + uint64_t pagesz;
> +
> + static const char *proc_mounts = "/proc/mounts";
> + static const char *hugetlbfs_str = "hugetlbfs";
> + static const char *opt_pagesize = "pagesize";
> + static const size_t opt_pagesize_len = sizeof("pagesize") - 1;
> + static const char *opt_size = "size";
> + static const size_t opt_size_len = sizeof("size") - 1;
> static uint64_t default_size = 0;
> - const char proc_mounts[] = "/proc/mounts";
> - const char hugetlbfs_str[] = "hugetlbfs";
> - const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
> - const char pagesize_opt[] = "pagesize=";
> - const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
> - const char split_tok = ' ';
> - char *splitstr[_FIELDNAME_MAX];
> - char buf[BUFSIZ];
> - char *retval = NULL;
> -
> - FILE *fd = fopen(proc_mounts, "r");
> - if (fd == NULL)
> - rte_panic("Cannot open %s\n", proc_mounts);
>
> if (default_size == 0)
> default_size = get_default_hp_size();
>
> - while (fgets(buf, sizeof(buf), fd)){
> - if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
> - split_tok) != _FIELDNAME_MAX) {
> - RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
> - break; /* return NULL */
> - }
> + f = setmntent(proc_mounts, "r");
> + if (f == NULL)
> + rte_panic("Cannot open %s\n", proc_mounts);
> +
> + while (NULL != (ent = getmntent(f))) {
> +
> + if (strcmp(ent->mnt_type, hugetlbfs_str) != 0)
> + continue;
>
> /* we have a specified --huge-dir option, only examine that dir
> */
> if (internal_config.hugepage_dir != NULL &&
> - strcmp(splitstr[MOUNTPT],
> internal_config.hugepage_dir) != 0)
> + strcmp(ent->mnt_dir,
> internal_config.hugepage_dir) != 0)
> continue;
>
> - if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) ==
> 0){
> - const char *pagesz_str = strstr(splitstr[OPTIONS],
> pagesize_opt);
> -
> - /* if no explicit page size, the default page size is
> compared */
> - if (pagesz_str == NULL){
> - if (hugepage_sz == default_size){
> - retval = strdup(splitstr[MOUNTPT]);
> - break;
> - }
> - }
> - /* there is an explicit page size, so check it */
> - else {
> - uint64_t pagesz =
> rte_str_to_size(&pagesz_str[pagesize_opt_len]);
> - if (pagesz == hugepage_sz) {
> - retval = strdup(splitstr[MOUNTPT]);
> - break;
> - }
> - }
> - } /* end if strncmp hugetlbfs */
> - } /* end while fgets */
> + str_pagesz = hasmntopt(ent, opt_pagesize);
> + /* if no explicit page size, the default page size is compared
> */
> + if (!str_pagesz)
> + pagesz = default_size;
> + /* there is an explicit page size, so check it */
> + else
> + pagesz = rte_str_to_size(&str_pagesz[opt_pagesize_len +
> 1]);
>
> - fclose(fd);
> - return retval;
> + if (pagesz == hpi->hugepage_sz)
> + break;
> + }
> +
> + if (ent == NULL) {
> + hpi->hugedir = NULL;
> + goto end;
> + }
> +
> + hpi->hugedir = strdup(ent->mnt_dir);
> +
> + str_size = hasmntopt(ent, opt_size);
> + if (str_size == NULL) {
> + RTE_LOG(DEBUG, EAL, "size not specified for %s\n",
> + hpi->hugedir);
> + hpi->quota = 0;
> + goto end;
> + }
> + hpi->quota = rte_str_to_size(&str_size[opt_size_len + 1]);
> +
> +end:
> + endmntent(f);
> }
>
> +/* The caller must make sure this mount has the size option set,
> + * so that the statistics returned by statfs are valid.
> + */
> +static uint32_t
> +get_hugetlbfs_free_pages(const char *mnt_dir)
> +{
> + int r;
> + struct statfs stats;
> +
> + r = statfs(mnt_dir, &stats);
> + if (r != 0)
> + rte_panic("statfs() %s error: %s\n",
> + mnt_dir, strerror(errno));
> +
> + return (uint32_t)stats.f_bfree;
> +}
> +
> +
> /*
> * Clear the hugepage directory of whatever hugepage files
> * there are. Checks if the file is locked (i.e.
> @@ -300,7 +321,8 @@ eal_hugepage_info_init(void)
> hpi = &internal_config.hugepage_info[num_sizes];
> hpi->hugepage_sz =
> rte_str_to_size(&dirent->d_name[dirent_start_len]);
> - hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
> +
> + get_hugetlbfs_mnt_info(hpi);
>
> /* first, check if we have a mountpoint */
> if (hpi->hugedir == NULL) {
> @@ -329,9 +351,26 @@ eal_hugepage_info_init(void)
> if (clear_hugedir(hpi->hugedir) == -1)
> break;
>
> + uint32_t num_left, num_statfs;
> + num_left = get_num_hugepages(dirent->d_name);
> + if (hpi->quota) {
> + /* when option size is specified, calculate free
> + * pages left in this hugetlbfs using statfs.
> + */
> + num_statfs = get_hugetlbfs_free_pages(hpi->hugedir);
> + RTE_LOG(DEBUG, EAL,
> + "%u free hugepages from a quota of 0x%"
> PRIx64
> + ", of size 0x%" PRIx64 " mounted at
> %s\n",
> + num_statfs,
> + hpi->quota,
> + hpi->hugepage_sz,
> + hpi->hugedir);
> + num_left = RTE_MIN(num_left, num_statfs);
> + }
> +
> /* for now, put all pages into socket 0,
> * later they will be sorted */
> - hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
> + hpi->num_pages[0] = num_left;
>
> #ifndef RTE_ARCH_64
> /* for 32-bit systems, limit number of hugepages to
Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy at intel.com>