Hi, > -----Original Message----- > From: Ilya Maximets [mailto:i.maxim...@samsung.com] > Sent: Thursday, February 16, 2017 9:01 PM > To: dev@dpdk.org; David Marchand; Gonzalez Monroy, Sergio > Cc: Heetae Ahn; Yuanhan Liu; Tan, Jianfeng; Neil Horman; Pei, Yulong; Ilya > Maximets; sta...@dpdk.org > Subject: [PATCH] mem: balanced allocation of hugepages > > Currently EAL allocates hugepages one by one not paying > attention from which NUMA node allocation was done. > > Such behaviour leads to allocation failure if number of > available hugepages for application limited by cgroups > or hugetlbfs and memory requested not only from the first > socket. > > Example: > # 90 x 1GB hugepages available in a system > > cgcreate -g hugetlb:/test > # Limit to 32GB of hugepages > cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test > # Request 4GB from each of 2 sockets > cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ... > > EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB > EAL: 32 not 90 hugepages of size 1024 MB allocated > EAL: Not enough memory available on socket 1! > Requested: 4096MB, available: 0MB > PANIC in rte_eal_init(): > Cannot init memory > > This happens because all allocated pages are > on socket 0.
For such a use case, why not just use "numactl --interleave=0,1 <DPDK app> xxx"? Do you see a use case like --socket-mem 2048,1024 with only three 1GB hugepages allowed? Thanks, Jianfeng > > Fix this issue by setting mempolicy MPOL_PREFERRED for each > hugepage to one of requested nodes in a round-robin fashion. > In this case all allocated pages will be fairly distributed > between all requested nodes. > > New config option RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > introduced and disabled by default because of external > dependency from libnuma. > > Cc: <sta...@dpdk.org> > Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages") > > Signed-off-by: Ilya Maximets <i.maxim...@samsung.com> > --- > config/common_base | 1 + > lib/librte_eal/Makefile | 4 ++ > lib/librte_eal/linuxapp/eal/eal_memory.c | 66 > ++++++++++++++++++++++++++++++++ > mk/rte.app.mk | 3 ++ > 4 files changed, 74 insertions(+) > > diff --git a/config/common_base b/config/common_base > index 71a4fcb..fbcebbd 100644 > --- a/config/common_base > +++ b/config/common_base > @@ -97,6 +97,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n > CONFIG_RTE_EAL_IGB_UIO=n > CONFIG_RTE_EAL_VFIO=n > CONFIG_RTE_MALLOC_DEBUG=n > +CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES=n > > # Default driver path (or "" to disable) > CONFIG_RTE_EAL_PMD_PATH="" > diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile > index cf11a09..5ae3846 100644 > --- a/lib/librte_eal/Makefile > +++ b/lib/librte_eal/Makefile > @@ -35,4 +35,8 @@ DIRS-y += common > DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp > DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += bsdapp > > +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y) > +LDLIBS += -lnuma > +endif > + > include $(RTE_SDK)/mk/rte.subdir.mk > diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c > b/lib/librte_eal/linuxapp/eal/eal_memory.c > index a956bb2..8536a36 100644 > --- a/lib/librte_eal/linuxapp/eal/eal_memory.c > +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c > @@ -82,6 +82,9 @@ > #include 
<sys/time.h> > #include <signal.h> > #include <setjmp.h> > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > +#include <numaif.h> > +#endif > > #include <rte_log.h> > #include <rte_memory.h> > @@ -359,6 +362,21 @@ static int huge_wrap_sigsetjmp(void) > return sigsetjmp(huge_jmpenv, 1); > } > > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > +#ifndef ULONG_SIZE > +#define ULONG_SIZE sizeof(unsigned long) > +#endif > +#ifndef ULONG_BITS > +#define ULONG_BITS (ULONG_SIZE * CHAR_BIT) > +#endif > +#ifndef DIV_ROUND_UP > +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) > +#endif > +#ifndef BITS_TO_LONGS > +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, ULONG_SIZE) > +#endif > +#endif > + > /* > * Mmap all hugepages of hugepage table: it first open a file in > * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the > @@ -375,10 +393,48 @@ map_all_hugepages(struct hugepage_file > *hugepg_tbl, > void *virtaddr; > void *vma_addr = NULL; > size_t vma_len = 0; > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > + unsigned long > nodemask[BITS_TO_LONGS(RTE_MAX_NUMA_NODES)] = {0UL}; > + unsigned long maxnode = 0; > + int node_id = -1; > + > + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) > + if (internal_config.socket_mem[i]) > + maxnode = i + 1; > +#endif > > for (i = 0; i < hpi->num_pages[0]; i++) { > uint64_t hugepage_sz = hpi->hugepage_sz; > > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > + if (maxnode) { > + node_id = (node_id + 1) % RTE_MAX_NUMA_NODES; > + while (!internal_config.socket_mem[node_id]) > + node_id = (node_id + 1) % > RTE_MAX_NUMA_NODES; > + > + nodemask[node_id / ULONG_BITS] = > + 1UL << (node_id % > ULONG_BITS); > + > + RTE_LOG(DEBUG, EAL, > + "Setting policy MPOL_PREFERRED for > socket %d\n", > + node_id); > + /* > + * Due to old linux kernel bug (feature?) we have to > + * increase maxnode by 1. It will be unconditionally > + * decreased back to normal value inside the syscall > + * handler. 
> + */ > + if (set_mempolicy(MPOL_PREFERRED, > + nodemask, maxnode + 1) < 0) { > + RTE_LOG(ERR, EAL, > + "Failed to set policy > MPOL_PREFERRED: " > + "%s\n", strerror(errno)); > + return i; > + } > + > + nodemask[node_id / ULONG_BITS] = 0UL; > + } > +#endif > if (orig) { > hugepg_tbl[i].file_id = i; > hugepg_tbl[i].size = hugepage_sz; > @@ -489,6 +545,10 @@ map_all_hugepages(struct hugepage_file > *hugepg_tbl, > vma_len -= hugepage_sz; > } > > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > + if (maxnode && set_mempolicy(MPOL_DEFAULT, NULL, 0) < 0) > + RTE_LOG(ERR, EAL, "Failed to set mempolicy > MPOL_DEFAULT\n"); > +#endif > return i; > } > > @@ -573,6 +634,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, > struct hugepage_info *hpi) > if (hugepg_tbl[i].orig_va == va) { > hugepg_tbl[i].socket_id = socket_id; > hp_count++; > +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES > + RTE_LOG(DEBUG, EAL, > + "Hugepage %s is on socket %d\n", > + hugepg_tbl[i].filepath, socket_id); > +#endif > } > } > } > diff --git a/mk/rte.app.mk b/mk/rte.app.mk > index 92f3635..c2153b9 100644 > --- a/mk/rte.app.mk > +++ b/mk/rte.app.mk > @@ -159,6 +159,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) > # The static libraries do not know their dependencies. > # So linking with static library requires explicit dependencies. > _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt > +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y) > +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma > +endif > _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm > _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt > _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm > -- > 2.7.4