On 3/8/2016 9:42 AM, Jianfeng Tan wrote:
> This patch adds an option, --huge-trybest, to provide a recovery mechanism
> for the case where there are not as many usable hugepages as declared in
> sysfs. It relies on a memory access to fault-in hugepages, and if that
> fails with SIGBUS, it recovers to the previously saved stack environment
> with siglongjmp().
>
> Test example:
>    a. cgcreate -g hugetlb:/test-subgroup
>    b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
>    c. cgexec -g hugetlb:test-subgroup \
>         ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>

Sorry, forgot to add ack from Neil.
Acked-by: Neil Horman <nhorman at tuxdriver.com>

> ---
> v2:
>   - Address the compile error by moving setjmp into a wrapper method.
>
>   lib/librte_eal/common/eal_common_options.c |   4 ++
>   lib/librte_eal/common/eal_internal_cfg.h   |   1 +
>   lib/librte_eal/common/eal_options.h        |   2 +
>   lib/librte_eal/linuxapp/eal/eal.c          |   1 +
>   lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 
> ++++++++++++++++++++++++++---
>   5 files changed, 104 insertions(+), 8 deletions(-)
>
> diff --git a/lib/librte_eal/common/eal_common_options.c 
> b/lib/librte_eal/common/eal_common_options.c
> index 29942ea..8ff6a2e 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>       {OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>       {OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>       {OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +     {OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
>       {0,                     0, NULL, 0                        }
>   };
>   
> @@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
>                       return -1;
>               }
>               break;
> +     case OPT_HUGE_TRYBEST_NUM:
> +             internal_config.huge_trybest = 1;
> +             break;
>   
>       /* don't know what to do, leave this to caller */
>       default:
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h 
> b/lib/librte_eal/common/eal_internal_cfg.h
> index 5f1367e..90a3533 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -64,6 +64,7 @@ struct internal_config {
>       volatile unsigned force_nchannel; /**< force number of channels */
>       volatile unsigned force_nrank;    /**< force number of ranks */
>       volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
> +     volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
>       unsigned hugepage_unlink;         /**< true to unlink backing files */
>       volatile unsigned xen_dom0_support; /**< support app running on Xen 
> Dom0*/
>       volatile unsigned no_pci;         /**< true to disable PCI */
> diff --git a/lib/librte_eal/common/eal_options.h 
> b/lib/librte_eal/common/eal_options.h
> index a881c62..02397c5 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>       OPT_VMWARE_TSC_MAP_NUM,
>   #define OPT_XEN_DOM0          "xen-dom0"
>       OPT_XEN_DOM0_NUM,
> +#define OPT_HUGE_TRYBEST      "huge-trybest"
> +     OPT_HUGE_TRYBEST_NUM,
>       OPT_LONG_MAX_NUM
>   };
>   
> diff --git a/lib/librte_eal/linuxapp/eal/eal.c 
> b/lib/librte_eal/linuxapp/eal/eal.c
> index ceac435..3e23877 100644
> --- a/lib/librte_eal/linuxapp/eal/eal.c
> +++ b/lib/librte_eal/linuxapp/eal/eal.c
> @@ -343,6 +343,7 @@ eal_usage(const char *prgname)
>              "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by 
> hotplug)\n"
>              "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO 
> (legacy|msi|msix)\n"
>              "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without 
> hugetlbfs\n"
> +            "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
>              "\n");
>       /* Allow the application to print its usage message too if hook is set 
> */
>       if ( rte_application_usage_hook ) {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index 5b9132c..e4e1f3b 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -80,6 +80,8 @@
>   #include <errno.h>
>   #include <sys/ioctl.h>
>   #include <sys/time.h>
> +#include <signal.h>
> +#include <setjmp.h>
>   
>   #include <rte_log.h>
>   #include <rte_memory.h>
> @@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>       return addr;
>   }
>   
> +static sigjmp_buf jmpenv;
> +
> +static void sigbus_handler(int signo __rte_unused)
> +{
> +     siglongjmp(jmpenv, 1);
> +}
> +
> +/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
> + * non-static local variable in the stack frame calling setjmp might be
> + * clobbered by a call to longjmp.
> + */
> +static int wrap_setjmp(void)
> +{
> +     return setjmp(jmpenv);
> +}
>   /*
>    * Mmap all hugepages of hugepage table: it first open a file in
>    * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>               if (fd < 0) {
>                       RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
>                                       strerror(errno));
> -                     return -1;
> +                     return i;
>               }
>   
>               /* map the segment, and populate page tables,
> @@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>                       RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
>                                       strerror(errno));
>                       close(fd);
> -                     return -1;
> +                     return i;
>               }
>   
>               if (orig) {
> @@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>                       hugepg_tbl[i].final_va = virtaddr;
>               }
>   
> +             if (orig && internal_config.huge_trybest) {
> +                     /* In linux, hugetlb limitations, like cgroup, are
> +                      * enforced at fault time instead of mmap(), even
> +                      * with the option of MAP_POPULATE. Kernel will send
> +                      * a SIGBUS signal. To avoid to be killed, save stack
> +                      * environment here, if SIGBUS happens, we can jump
> +                      * back here.
> +                      */
> +                     if (wrap_setjmp()) {
> +                             RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
> +                                     "hugepages of size %u MB\n",
> +                                     (unsigned)(hugepage_sz / 0x100000));
> +                             munmap(virtaddr, hugepage_sz);
> +                             close(fd);
> +                             unlink(hugepg_tbl[i].filepath);
> +                             return i;
> +                     }
> +                     *(int *)virtaddr = 0;
> +             }
> +
> +
>               /* set shared flock on the file. */
>               if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
>                       RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
>                               __func__, strerror(errno));
>                       close(fd);
> -                     return -1;
> +                     return i;
>               }
>   
>               close(fd);
> @@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>               vma_addr = (char *)vma_addr + hugepage_sz;
>               vma_len -= hugepage_sz;
>       }
> -     return 0;
> +     return i;
>   }
>   
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
> @@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
>       return total_num_pages;
>   }
>   
> +static struct sigaction action_old;
> +static int need_recover;
> +
> +static void
> +register_sigbus(void)
> +{
> +     sigset_t mask;
> +     struct sigaction action;
> +
> +     sigemptyset(&mask);
> +     sigaddset(&mask, SIGBUS);
> +     action.sa_flags = 0;
> +     action.sa_mask = mask;
> +     action.sa_handler = sigbus_handler;
> +
> +     need_recover = !sigaction(SIGBUS, &action, &action_old);
> +}
> +
> +static void
> +recover_sigbus(void)
> +{
> +     if (need_recover) {
> +             sigaction(SIGBUS, &action_old, NULL);
> +             need_recover = 0;
> +     }
> +}
> +
>   /*
>    * Prepare physical memory mapping: fill configuration structure with
>    * these infos, return 0 on success.
> @@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)
>   
>       hp_offset = 0; /* where we start the current page size entries */
>   
> +     if (internal_config.huge_trybest)
> +             register_sigbus();
> +
>       /* map all hugepages and sort them */
>       for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
> +             int pages_old, pages_new;
>               struct hugepage_info *hpi;
>   
>               /*
> @@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
>                       continue;
>   
>               /* map all hugepages available */
> -             if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
> -                     RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
> -                                     (unsigned)(hpi->hugepage_sz / 
> 0x100000));
> -                     goto fail;
> +             pages_old = hpi->num_pages[0];
> +             pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +             if (pages_new < pages_old) {
> +                     RTE_LOG(DEBUG, EAL,
> +                             "%d not %d hugepages of size %u MB allocated\n",
> +                             pages_new, pages_old,
> +                             (unsigned)(hpi->hugepage_sz / 0x100000));
> +                     if (internal_config.huge_trybest) {
> +                             int pages = pages_old - pages_new;
> +
> +                             internal_config.memory -=
> +                                     hpi->hugepage_sz * pages;
> +                             nr_hugepages -= pages;
> +                             hpi->num_pages[0] = pages_new;
> +                             if (pages_new == 0)
> +                                     continue;
> +                     } else
> +                             goto fail;
>               }
>   
>               /* find physical addresses and sockets for each hugepage */
> @@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
>   #endif
>       }
>   
> +     if (internal_config.huge_trybest)
> +             recover_sigbus();
> +
>   #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
>       nr_hugefiles = 0;
>       for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
> @@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
>       return 0;
>   
>   fail:
> +     if (internal_config.huge_trybest)
> +             recover_sigbus();
>       free(tmp_hp);
>       return -1;
>   }

Reply via email to