On 18/11/2015 02:42, Jianfeng Tan wrote:
> Currently DPDK does not respect the quota of a hugetlbfs mount.
> It will fail to init the EAL because it tries to map the number of
> free hugepages in the system rather than using the number specified
> in the quota for that mount.
>
> To solve this issue, we take the quota into consideration when
> calculating the number of hugepages to map.  We use either the number
> specified in the quota or the number of available hugepages, whichever
> is lower.
>
> There are possible race conditions when multiple applications
> allocate hugepages in different hugetlbfs mounts of the same size,
> so the recommended setup is a system-wide pool with enough hugepages
> to satisfy the quotas of all hugetlbfs mounts.
>
> There is, however, still an open issue with
> CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS. When this option is enabled
> (IVSHMEM target does this by default), having hugetlbfs mounts with
> quota will fail to remap hugepages because it relies on having
> mapped all free hugepages in the system.
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
> ---
> v3 changes:
>   - commit msg rework
>   - add hpi->quota to record quota of each hugetlbfs
>   - get_hugepage_dir -> get_hugepage_mnt_info to fill hugedir and quota
>   - add info in release note
>
> v2 changes:
>   - reword title
>   - fix compiler error of v1
>
>   doc/guides/rel_notes/release_2_2.rst            |   5 +
>   lib/librte_eal/common/eal_internal_cfg.h        |   1 +
>   lib/librte_eal/linuxapp/eal/eal_hugepage_info.c | 145 
> +++++++++++++++---------
>   3 files changed, 98 insertions(+), 53 deletions(-)
>
> diff --git a/doc/guides/rel_notes/release_2_2.rst 
> b/doc/guides/rel_notes/release_2_2.rst
> index 0781ae6..5b8777a 100644
> --- a/doc/guides/rel_notes/release_2_2.rst
> +++ b/doc/guides/rel_notes/release_2_2.rst
> @@ -102,6 +102,11 @@ New Features
>   
>   * **Added port hotplug support to xenvirt.**
>   
> +* **Added support of taking mount quota into account.**
> +
> +  Take the quota into consideration when calculating the number of hugepages
> +  to map. We use either the number specified in the quota, or number of
> +  available hugepages, whichever is lower.
>   
>   Resolved Issues
>   ---------------
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h 
> b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..38ca410 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -50,6 +50,7 @@
>    */
>   struct hugepage_info {
>       uint64_t hugepage_sz;   /**< size of a huge page */
> +     uint64_t quota;   /**< quota of a hugetlbfs */
>       const char *hugedir;    /**< dir where hugetlbfs is mounted */
>       uint32_t num_pages[RTE_MAX_NUMA_NODES];
>                               /**< number of hugepages of that size on each 
> socket */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c 
> b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> index 18858e2..612d87d 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
> @@ -44,6 +44,8 @@
>   #include <unistd.h>
>   #include <errno.h>
>   #include <sys/queue.h>
> +#include <sys/vfs.h>
> +#include <mntent.h>
>   
>   #include <rte_memory.h>
>   #include <rte_memzone.h>
> @@ -124,71 +126,90 @@ get_default_hp_size(void)
>       return size;
>   }
>   
> -static const char *
> -get_hugepage_dir(uint64_t hugepage_sz)
> +static void
> +get_hugetlbfs_mnt_info(struct hugepage_info *hpi)
>   {
> -     enum proc_mount_fieldnames {
> -             DEVICE = 0,
> -             MOUNTPT,
> -             FSTYPE,
> -             OPTIONS,
> -             _FIELDNAME_MAX
> -     };
> +     FILE *f;
> +     struct mntent *ent;
> +     char *str_size;
> +     char *str_pagesz;
> +     uint64_t pagesz;
> +
> +     static const char *proc_mounts = "/proc/mounts";
> +     static const char *hugetlbfs_str = "hugetlbfs";
> +     static const char *opt_pagesize = "pagesize";
> +     static const size_t opt_pagesize_len = sizeof("pagesize") - 1;
> +     static const char *opt_size = "size";
> +     static const size_t opt_size_len = sizeof("size") - 1;
>       static uint64_t default_size = 0;
> -     const char proc_mounts[] = "/proc/mounts";
> -     const char hugetlbfs_str[] = "hugetlbfs";
> -     const size_t htlbfs_str_len = sizeof(hugetlbfs_str) - 1;
> -     const char pagesize_opt[] = "pagesize=";
> -     const size_t pagesize_opt_len = sizeof(pagesize_opt) - 1;
> -     const char split_tok = ' ';
> -     char *splitstr[_FIELDNAME_MAX];
> -     char buf[BUFSIZ];
> -     char *retval = NULL;
> -
> -     FILE *fd = fopen(proc_mounts, "r");
> -     if (fd == NULL)
> -             rte_panic("Cannot open %s\n", proc_mounts);
>   
>       if (default_size == 0)
>               default_size = get_default_hp_size();
>   
> -     while (fgets(buf, sizeof(buf), fd)){
> -             if (rte_strsplit(buf, sizeof(buf), splitstr, _FIELDNAME_MAX,
> -                             split_tok) != _FIELDNAME_MAX) {
> -                     RTE_LOG(ERR, EAL, "Error parsing %s\n", proc_mounts);
> -                     break; /* return NULL */
> -             }
> +     f = setmntent(proc_mounts, "r");
> +     if (f == NULL)
> +             rte_panic("Cannot open %s\n", proc_mounts);
> +
> +     while (NULL != (ent = getmntent(f))) {
> +
> +             if (strcmp(ent->mnt_type, hugetlbfs_str) != 0)
> +                     continue;
>   
>               /* we have a specified --huge-dir option, only examine that dir 
> */
>               if (internal_config.hugepage_dir != NULL &&
> -                             strcmp(splitstr[MOUNTPT], 
> internal_config.hugepage_dir) != 0)
> +                             strcmp(ent->mnt_dir, 
> internal_config.hugepage_dir) != 0)
>                       continue;
>   
> -             if (strncmp(splitstr[FSTYPE], hugetlbfs_str, htlbfs_str_len) == 
> 0){
> -                     const char *pagesz_str = strstr(splitstr[OPTIONS], 
> pagesize_opt);
> -
> -                     /* if no explicit page size, the default page size is 
> compared */
> -                     if (pagesz_str == NULL){
> -                             if (hugepage_sz == default_size){
> -                                     retval = strdup(splitstr[MOUNTPT]);
> -                                     break;
> -                             }
> -                     }
> -                     /* there is an explicit page size, so check it */
> -                     else {
> -                             uint64_t pagesz = 
> rte_str_to_size(&pagesz_str[pagesize_opt_len]);
> -                             if (pagesz == hugepage_sz) {
> -                                     retval = strdup(splitstr[MOUNTPT]);
> -                                     break;
> -                             }
> -                     }
> -             } /* end if strncmp hugetlbfs */
> -     } /* end while fgets */
> +             str_pagesz = hasmntopt(ent, opt_pagesize);
> +             /* if no explicit page size, the default page size is compared 
> */
> +             if (!str_pagesz)
> +                     pagesz = default_size;
> +             /* there is an explicit page size, so check it */
> +             else
> +                     pagesz = rte_str_to_size(&str_pagesz[opt_pagesize_len + 
> 1]);
>   
> -     fclose(fd);
> -     return retval;
> +             if (pagesz == hpi->hugepage_sz)
> +                     break;
> +     }
> +
> +     if (ent == NULL) {
> +             hpi->hugedir = NULL;
> +             goto end;
> +     }
> +
> +     hpi->hugedir = strdup(ent->mnt_dir);
> +
> +     str_size = hasmntopt(ent, opt_size);
> +     if (str_size == NULL) {
> +             RTE_LOG(DEBUG, EAL, "size not specified for %s\n",
> +                     hpi->hugedir);
> +             hpi->quota = 0;
> +             goto end;
> +     }
> +     hpi->quota = rte_str_to_size(&str_size[opt_size_len + 1]);
> +
> +end:
> +     endmntent(f);
>   }
>   
> +/* Caller to make sure this mount has option size
> + * so that statistics from statfs is valid.
> + */
> +static uint32_t
> +get_hugetlbfs_free_pages(const char *mnt_dir)
> +{
> +     int r;
> +     struct statfs stats;
> +
> +     r = statfs(mnt_dir, &stats);
> +     if (r != 0)
> +             rte_panic("statfs() %s error: %s\n",
> +                             mnt_dir, strerror(errno));
> +
> +     return (uint32_t)stats.f_bfree;
> +}
> +
> +
>   /*
>    * Clear the hugepage directory of whatever hugepage files
>    * there are. Checks if the file is locked (i.e.
> @@ -300,7 +321,8 @@ eal_hugepage_info_init(void)
>               hpi = &internal_config.hugepage_info[num_sizes];
>               hpi->hugepage_sz =
>                       rte_str_to_size(&dirent->d_name[dirent_start_len]);
> -             hpi->hugedir = get_hugepage_dir(hpi->hugepage_sz);
> +
> +             get_hugetlbfs_mnt_info(hpi);
>   
>               /* first, check if we have a mountpoint */
>               if (hpi->hugedir == NULL) {
> @@ -329,9 +351,26 @@ eal_hugepage_info_init(void)
>               if (clear_hugedir(hpi->hugedir) == -1)
>                       break;
>   
> +             uint32_t num_left, num_statfs;
> +             num_left = get_num_hugepages(dirent->d_name);
> +             if (hpi->quota) {
> +                     /* when option size is specified, calculate free
> +                      * pages left in this hugetlbfs using statfs.
> +                      */
> +                     num_statfs = get_hugetlbfs_free_pages(hpi->hugedir);
> +                     RTE_LOG(DEBUG, EAL,
> +                                     "%u free hugepages from a quota of 0x%" 
> PRIx64
> +                                     ", of size 0x%" PRIx64 " mounted at 
> %s\n",
> +                                     num_statfs,
> +                                     hpi->quota,
> +                                     hpi->hugepage_sz,
> +                                     hpi->hugedir);
> +                     num_left = RTE_MIN(num_left, num_statfs);
> +             }
> +
>               /* for now, put all pages into socket 0,
>                * later they will be sorted */
> -             hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
> +             hpi->num_pages[0] = num_left;
>   
>   #ifndef RTE_ARCH_64
>               /* for 32-bit systems, limit number of hugepages to
Acked-by: Sergio Gonzalez Monroy <sergio.gonzalez.monroy at intel.com>

Reply via email to