Hi,

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maxim...@samsung.com]
> Sent: Thursday, February 16, 2017 9:01 PM
> To: dev@dpdk.org; David Marchand; Gonzalez Monroy, Sergio
> Cc: Heetae Ahn; Yuanhan Liu; Tan, Jianfeng; Neil Horman; Pei, Yulong; Ilya
> Maximets; sta...@dpdk.org
> Subject: [PATCH] mem: balanced allocation of hugepages
> 
> Currently EAL allocates hugepages one by one, not paying
> attention to which NUMA node each allocation came from.
> 
> Such behaviour leads to allocation failure if the number of
> hugepages available to the application is limited by cgroups
> or hugetlbfs and memory is requested not only from the first
> socket.
> 
> Example:
>       # 90 x 1GB hugepages available in a system
> 
>       cgcreate -g hugetlb:/test
>       # Limit to 32GB of hugepages
>       cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
>       # Request 4GB from each of 2 sockets
>       cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
> 
>       EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
>       EAL: 32 not 90 hugepages of size 1024 MB allocated
>       EAL: Not enough memory available on socket 1!
>            Requested: 4096MB, available: 0MB
>       PANIC in rte_eal_init():
>       Cannot init memory
> 
>       This happens because all allocated pages are
>       on socket 0.

For such a use case, why not just use "numactl --interleave=0,1 <DPDK app>
xxx"?

Do you see a use case like --socket-mem 2048,1024 where only three 1GB hugepages
are allowed?

Thanks,
Jianfeng

> 
> Fix this issue by setting mempolicy MPOL_PREFERRED for each
> hugepage to one of requested nodes in a round-robin fashion.
> In this case all allocated pages will be fairly distributed
> between all requested nodes.
> 
> New config option RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES is
> introduced and disabled by default because of the external
> dependency on libnuma.
> 
> Cc: <sta...@dpdk.org>
> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
> 
> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com>
> ---
>  config/common_base                       |  1 +
>  lib/librte_eal/Makefile                  |  4 ++
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 66
> ++++++++++++++++++++++++++++++++
>  mk/rte.app.mk                            |  3 ++
>  4 files changed, 74 insertions(+)
> 
> diff --git a/config/common_base b/config/common_base
> index 71a4fcb..fbcebbd 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -97,6 +97,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>  CONFIG_RTE_EAL_IGB_UIO=n
>  CONFIG_RTE_EAL_VFIO=n
>  CONFIG_RTE_MALLOC_DEBUG=n
> +CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES=n
> 
>  # Default driver path (or "" to disable)
>  CONFIG_RTE_EAL_PMD_PATH=""
> diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile
> index cf11a09..5ae3846 100644
> --- a/lib/librte_eal/Makefile
> +++ b/lib/librte_eal/Makefile
> @@ -35,4 +35,8 @@ DIRS-y += common
>  DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp
>  DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += bsdapp
> 
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +LDLIBS += -lnuma
> +endif
> +
>  include $(RTE_SDK)/mk/rte.subdir.mk
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index a956bb2..8536a36 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -82,6 +82,9 @@
>  #include <sys/time.h>
>  #include <signal.h>
>  #include <setjmp.h>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#include <numaif.h>
> +#endif
> 
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -359,6 +362,21 @@ static int huge_wrap_sigsetjmp(void)
>       return sigsetjmp(huge_jmpenv, 1);
>  }
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#ifndef ULONG_SIZE
> +#define ULONG_SIZE sizeof(unsigned long)
> +#endif
> +#ifndef ULONG_BITS
> +#define ULONG_BITS (ULONG_SIZE * CHAR_BIT)
> +#endif
> +#ifndef DIV_ROUND_UP
> +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
> +#endif
> +#ifndef BITS_TO_LONGS
> +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, ULONG_SIZE)
> +#endif
> +#endif
> +
>  /*
>   * Mmap all hugepages of hugepage table: it first open a file in
>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -375,10 +393,48 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
>       void *virtaddr;
>       void *vma_addr = NULL;
>       size_t vma_len = 0;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +     unsigned long
> nodemask[BITS_TO_LONGS(RTE_MAX_NUMA_NODES)] = {0UL};
> +     unsigned long maxnode = 0;
> +     int node_id = -1;
> +
> +     for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> +             if (internal_config.socket_mem[i])
> +                     maxnode = i + 1;
> +#endif
> 
>       for (i = 0; i < hpi->num_pages[0]; i++) {
>               uint64_t hugepage_sz = hpi->hugepage_sz;
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +             if (maxnode) {
> +                     node_id = (node_id + 1) % RTE_MAX_NUMA_NODES;
> +                     while (!internal_config.socket_mem[node_id])
> +                             node_id = (node_id + 1) %
> RTE_MAX_NUMA_NODES;
> +
> +                     nodemask[node_id / ULONG_BITS] =
> +                                             1UL << (node_id %
> ULONG_BITS);
> +
> +                     RTE_LOG(DEBUG, EAL,
> +                             "Setting policy MPOL_PREFERRED for
> socket %d\n",
> +                             node_id);
> +                     /*
> +                      * Due to old linux kernel bug (feature?) we have to
> +                      * increase maxnode by 1. It will be unconditionally
> +                      * decreased back to normal value inside the syscall
> +                      * handler.
> +                      */
> +                     if (set_mempolicy(MPOL_PREFERRED,
> +                                       nodemask, maxnode + 1) < 0) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "Failed to set policy
> MPOL_PREFERRED: "
> +                                     "%s\n", strerror(errno));
> +                             return i;
> +                     }
> +
> +                     nodemask[node_id / ULONG_BITS] = 0UL;
> +             }
> +#endif
>               if (orig) {
>                       hugepg_tbl[i].file_id = i;
>                       hugepg_tbl[i].size = hugepage_sz;
> @@ -489,6 +545,10 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
>               vma_len -= hugepage_sz;
>       }
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +     if (maxnode && set_mempolicy(MPOL_DEFAULT, NULL, 0) < 0)
> +             RTE_LOG(ERR, EAL, "Failed to set mempolicy
> MPOL_DEFAULT\n");
> +#endif
>       return i;
>  }
> 
> @@ -573,6 +634,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl,
> struct hugepage_info *hpi)
>                       if (hugepg_tbl[i].orig_va == va) {
>                               hugepg_tbl[i].socket_id = socket_id;
>                               hp_count++;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +                             RTE_LOG(DEBUG, EAL,
> +                                     "Hugepage %s is on socket %d\n",
> +                                     hugepg_tbl[i].filepath, socket_id);
> +#endif
>                       }
>               }
>       }
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 92f3635..c2153b9 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -159,6 +159,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>  # The static libraries do not know their dependencies.
>  # So linking with static library requires explicit dependencies.
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrt
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lnuma
> +endif
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lm
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrt
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lm
> --
> 2.7.4

Reply via email to