On 29.06.2017 08:48, Ilya Maximets wrote: > On 29.06.2017 08:32, Hemant Agrawal wrote: >> On 6/27/2017 3:54 PM, Ilya Maximets wrote: >>> Currently EAL allocates hugepages one by one not paying attention >>> from which NUMA node allocation was done. >>> >>> Such behaviour leads to allocation failure if the number of available >>> hugepages for the application is limited by cgroups or hugetlbfs and >>> memory is requested not only from the first socket. >>> >>> Example: >>> # 90 x 1GB hugepages available in a system >>> >>> cgcreate -g hugetlb:/test >>> # Limit to 32GB of hugepages >>> cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test >>> # Request 4GB from each of 2 sockets >>> cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ... >>> >>> EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB >>> EAL: 32 not 90 hugepages of size 1024 MB allocated >>> EAL: Not enough memory available on socket 1! >>> Requested: 4096MB, available: 0MB >>> PANIC in rte_eal_init(): >>> Cannot init memory >>> >>> This happens because all allocated pages are >>> on socket 0. >>> >>> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage >>> to one of requested nodes using the following schema: >>> >>> 1) Allocate essential hugepages: >>> 1.1) Allocate as many hugepages from numa N to >>> only fit requested memory for this numa. >>> 1.2) repeat 1.1 for all numa nodes. >>> 2) Try to map all remaining free hugepages in a round-robin >>> fashion. >>> 3) Sort pages and choose the most suitable. >>> >>> In this case all essential memory will be allocated and all remaining >>> pages will be fairly distributed between all requested nodes. >>> >>> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and >>> enabled by default for linuxapp except armv7 and dpaa2. >>> Enabling of this option adds libnuma as a dependency for EAL.
>>> >>> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages") >>> >>> Signed-off-by: Ilya Maximets <i.maxim...@samsung.com> >>> --- >>> config/common_base | 1 + >>> config/common_linuxapp | 1 + >>> config/defconfig_arm-armv7a-linuxapp-gcc | 3 + >>> config/defconfig_arm64-dpaa2-linuxapp-gcc | 3 + >>> lib/librte_eal/linuxapp/eal/Makefile | 3 + >>> lib/librte_eal/linuxapp/eal/eal_memory.c | 120 >>> ++++++++++++++++++++++++++++-- >>> mk/rte.app.mk | 3 + >>> 7 files changed, 126 insertions(+), 8 deletions(-) >>> >>> diff --git a/config/common_base b/config/common_base >>> index f6aafd1..660588a 100644 >>> --- a/config/common_base >>> +++ b/config/common_base >>> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n >>> CONFIG_RTE_EAL_IGB_UIO=n >>> CONFIG_RTE_EAL_VFIO=n >>> CONFIG_RTE_MALLOC_DEBUG=n >>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n >>> >>> # >>> # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing. >>> diff --git a/config/common_linuxapp b/config/common_linuxapp >>> index b3cf41b..64bef87 100644 >>> --- a/config/common_linuxapp >>> +++ b/config/common_linuxapp >>> @@ -35,6 +35,7 @@ >>> CONFIG_RTE_EXEC_ENV="linuxapp" >>> CONFIG_RTE_EXEC_ENV_LINUXAPP=y >>> >>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y >>> CONFIG_RTE_EAL_IGB_UIO=y >>> CONFIG_RTE_EAL_VFIO=y >>> CONFIG_RTE_KNI_KMOD=y >>> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc >>> b/config/defconfig_arm-armv7a-linuxapp-gcc >>> index 19607eb..e06b1d4 100644 >>> --- a/config/defconfig_arm-armv7a-linuxapp-gcc >>> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc >>> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y >>> CONFIG_RTE_TOOLCHAIN="gcc" >>> CONFIG_RTE_TOOLCHAIN_GCC=y >>> >>> +# NUMA is not supported on ARM >>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n >>> + >>> # ARM doesn't have support for vmware TSC map >>> CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n >>> >>> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc >>> b/config/defconfig_arm64-dpaa2-linuxapp-gcc >>> 
index 2304ab6..f78449d 100644 >>> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc >>> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc >>> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64 >>> >>> CONFIG_RTE_PKTMBUF_HEADROOM=256 >>> >>> +# Doesn't support NUMA >>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y >>> + >> >> DPAA2 does not support NUMA, so >> CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n > > Oh, sorry. Just a typo. > Thanks for catching this.
Fixed. Hemant, please, check the new version (v10). > Sergio, I'll send v10 with only this change and will keep your > acked-by because the change is trivial. > >>> # >>> # Compile Support Libraries for DPAA2 >>> # >>> diff --git a/lib/librte_eal/linuxapp/eal/Makefile >>> b/lib/librte_eal/linuxapp/eal/Makefile >>> index 640afd0..8651e27 100644 >>> --- a/lib/librte_eal/linuxapp/eal/Makefile >>> +++ b/lib/librte_eal/linuxapp/eal/Makefile >>> @@ -50,6 +50,9 @@ LDLIBS += -ldl >>> LDLIBS += -lpthread >>> LDLIBS += -lgcc_s >>> LDLIBS += -lrt >>> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y) >>> +LDLIBS += -lnuma >>> +endif >>> >>> # specific to linuxapp exec-env >>> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c >>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c >>> b/lib/librte_eal/linuxapp/eal/eal_memory.c >>> index e17c9cb..647d89c 100644 >>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c >>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c >>> @@ -54,6 +54,10 @@ >>> #include <sys/time.h> >>> #include <signal.h> >>> #include <setjmp.h> >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> +#include <numa.h> >>> +#include <numaif.h> >>> +#endif >>> >>> #include <rte_log.h> >>> #include <rte_memory.h> >>> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void) >>> return sigsetjmp(huge_jmpenv, 1); >>> } >>> >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> +/* Callback for numa library. */ >>> +void numa_error(char *where) >>> +{ >>> + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno)); >>> +} >>> +#endif >>> + >>> /* >>> * Mmap all hugepages of hugepage table: it first open a file in >>> * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the >>> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void) >>> * map continguous physical blocks in contiguous virtual blocks. 
>>> */ >>> static unsigned >>> -map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> - struct hugepage_info *hpi, int orig) >>> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info >>> *hpi, >>> + uint64_t *essential_memory __rte_unused, int orig) >>> { >>> int fd; >>> unsigned i; >>> void *virtaddr; >>> void *vma_addr = NULL; >>> size_t vma_len = 0; >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> + int node_id = -1; >>> + int essential_prev = 0; >>> + int oldpolicy; >>> + struct bitmask *oldmask = numa_allocate_nodemask(); >>> + bool have_numa = true; >>> + unsigned long maxnode = 0; >>> + >>> + /* Check if kernel supports NUMA. */ >>> + if (numa_available() != 0) { >>> + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n"); >>> + have_numa = false; >>> + } >>> + >>> + if (orig && have_numa) { >>> + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n"); >>> + if (get_mempolicy(&oldpolicy, oldmask->maskp, >>> + oldmask->size + 1, 0, 0) < 0) { >>> + RTE_LOG(ERR, EAL, >>> + "Failed to get current mempolicy: %s. 
" >>> + "Assuming MPOL_DEFAULT.\n", strerror(errno)); >>> + oldpolicy = MPOL_DEFAULT; >>> + } >>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) >>> + if (internal_config.socket_mem[i]) >>> + maxnode = i + 1; >>> + } >>> +#endif >>> >>> for (i = 0; i < hpi->num_pages[0]; i++) { >>> uint64_t hugepage_sz = hpi->hugepage_sz; >>> >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> + if (maxnode) { >>> + unsigned int j; >>> + >>> + for (j = 0; j < maxnode; j++) >>> + if (essential_memory[j]) >>> + break; >>> + >>> + if (j == maxnode) { >>> + node_id = (node_id + 1) % maxnode; >>> + while (!internal_config.socket_mem[node_id]) { >>> + node_id++; >>> + node_id %= maxnode; >>> + } >>> + essential_prev = 0; >>> + } else { >>> + node_id = j; >>> + essential_prev = essential_memory[j]; >>> + >>> + if (essential_memory[j] < hugepage_sz) >>> + essential_memory[j] = 0; >>> + else >>> + essential_memory[j] -= hugepage_sz; >>> + } >>> + >>> + RTE_LOG(DEBUG, EAL, >>> + "Setting policy MPOL_PREFERRED for socket %d\n", >>> + node_id); >>> + numa_set_preferred(node_id); >>> + } >>> +#endif >>> + >>> if (orig) { >>> hugepg_tbl[i].file_id = i; >>> hugepg_tbl[i].size = hugepage_sz; >>> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> if (fd < 0) { >>> RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__, >>> strerror(errno)); >>> - return i; >>> + goto out; >>> } >>> >>> /* map the segment, and populate page tables, >>> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__, >>> strerror(errno)); >>> close(fd); >>> - return i; >>> + goto out; >>> } >>> >>> if (orig) { >>> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> munmap(virtaddr, hugepage_sz); >>> close(fd); >>> unlink(hugepg_tbl[i].filepath); >>> - return i; >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> + if (maxnode) >>> + essential_memory[node_id] = >>> + essential_prev; >>> +#endif >>> + goto out; >>> } 
>>> *(int *)virtaddr = 0; >>> } >>> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n", >>> __func__, strerror(errno)); >>> close(fd); >>> - return i; >>> + goto out; >>> } >>> >>> close(fd); >>> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl, >>> vma_len -= hugepage_sz; >>> } >>> >>> +out: >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> + if (maxnode) { >>> + RTE_LOG(DEBUG, EAL, >>> + "Restoring previous memory policy: %d\n", oldpolicy); >>> + if (oldpolicy == MPOL_DEFAULT) { >>> + numa_set_localalloc(); >>> + } else if (set_mempolicy(oldpolicy, oldmask->maskp, >>> + oldmask->size + 1) < 0) { >>> + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n", >>> + strerror(errno)); >>> + numa_set_localalloc(); >>> + } >>> + } >>> + numa_free_cpumask(oldmask); >>> +#endif >>> return i; >>> } >>> >>> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, >>> struct hugepage_info *hpi) >>> if (hugepg_tbl[i].orig_va == va) { >>> hugepg_tbl[i].socket_id = socket_id; >>> hp_count++; >>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES >>> + RTE_LOG(DEBUG, EAL, >>> + "Hugepage %s is on socket %d\n", >>> + hugepg_tbl[i].filepath, socket_id); >>> +#endif >>> } >>> } >>> } >>> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void) >>> >>> huge_register_sigbus(); >>> >>> + /* make a copy of socket_mem, needed for balanced allocation. 
*/ >>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++) >>> + memory[i] = internal_config.socket_mem[i]; >>> + >>> + >>> /* map all hugepages and sort them */ >>> for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){ >>> unsigned pages_old, pages_new; >>> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void) >>> >>> /* map all hugepages available */ >>> pages_old = hpi->num_pages[0]; >>> - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1); >>> + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, >>> + memory, 1); >>> if (pages_new < pages_old) { >>> RTE_LOG(DEBUG, EAL, >>> "%d not %d hugepages of size %u MB allocated\n", >>> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void) >>> sizeof(struct hugepage_file), cmp_physaddr); >>> >>> /* remap all hugepages */ >>> - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) != >>> + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) != >>> hpi->num_pages[0]) { >>> RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n", >>> (unsigned)(hpi->hugepage_sz / 0x100000)); >>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk >>> index bcaf1b3..4fe22d1 100644 >>> --- a/mk/rte.app.mk >>> +++ b/mk/rte.app.mk >>> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n) >>> # The static libraries do not know their dependencies. >>> # So linking with static library requires explicit dependencies. >>> _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt >>> +ifeq >>> ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy) >>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma >>> +endif >>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm >>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt >>> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm >>> >> >> >> >> >>