Hi Anatoly
> > Introduce an example application demonstrating the use of > external memory support. This is a simple application based on > skeleton app, but instead of using internal DPDK memory, it is > using externally allocated memory. > > The RX/TX and init path is a carbon-copy of skeleton app, with > no modifications whatsoever. The only difference is an additional > init stage to allocate memory and create a heap for it, and the > socket ID supplied to the mempool initialization function. The > memory used by this app is hugepage memory allocated anonymously. > > Anonymous hugepage memory will not be allocated in a NUMA-aware > fashion, so there is a chance of performance degradation when > using this app, but given that kernel usually gives hugepages on > local socket first, this should not be a problem in most cases. Do we need a new sample app just for that? Couldn't it be added to testpmd instead, the same way we now have 'mp-anon' to use a mempool over anonymous memory? Konstantin > > Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > --- > examples/external_mem/Makefile | 62 ++++ > examples/external_mem/extmem.c | 461 ++++++++++++++++++++++++++++++ > examples/external_mem/meson.build | 12 + > 3 files changed, 535 insertions(+) > create mode 100644 examples/external_mem/Makefile > create mode 100644 examples/external_mem/extmem.c > create mode 100644 examples/external_mem/meson.build > > diff --git a/examples/external_mem/Makefile b/examples/external_mem/Makefile > new file mode 100644 > index 000000000..3b6ab3b2f > --- /dev/null > +++ b/examples/external_mem/Makefile > @@ -0,0 +1,62 @@ > +# SPDX-License-Identifier: BSD-3-Clause > +# Copyright(c) 2010-2018 Intel Corporation > + > +# binary name > +APP = extmem > + > +# all source are stored in SRCS-y > +SRCS-y := extmem.c > + > +# Build using pkg-config variables if possible > +$(shell pkg-config --exists libdpdk) > +ifeq ($(.SHELLSTATUS),0) > + > +all: shared > +.PHONY: shared static > +shared: 
build/$(APP)-shared > + ln -sf $(APP)-shared build/$(APP) > +static: build/$(APP)-static > + ln -sf $(APP)-static build/$(APP) > + > +PC_FILE := $(shell pkg-config --path libdpdk) > +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk) > +CFLAGS += -DALLOW_EXPERIMENTAL_API > +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk) > +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk) > + > +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build > + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED) > + > +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build > + $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC) > + > +build: > + @mkdir -p $@ > + > +.PHONY: clean > +clean: > + rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared > + rmdir --ignore-fail-on-non-empty build > + > +else # Build using legacy build system > + > +ifeq ($(RTE_SDK),) > +$(error "Please define RTE_SDK environment variable") > +endif > + > +# Default target, can be overridden by command line or environment > +RTE_TARGET ?= x86_64-native-linuxapp-gcc > + > +include $(RTE_SDK)/mk/rte.vars.mk > + > +CFLAGS += $(WERROR_FLAGS) > +CFLAGS += -DALLOW_EXPERIMENTAL_API > + > +# workaround for a gcc bug with noreturn attribute > +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603 > +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) > +CFLAGS_main.o += -Wno-return-type > +endif > + > +include $(RTE_SDK)/mk/rte.extapp.mk > +endif > diff --git a/examples/external_mem/extmem.c b/examples/external_mem/extmem.c > new file mode 100644 > index 000000000..818a02171 > --- /dev/null > +++ b/examples/external_mem/extmem.c > @@ -0,0 +1,461 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright(c) 2010-2018 Intel Corporation > + */ > + > +#include <stdint.h> > +#include <inttypes.h> > +#include <stdbool.h> > +#include <unistd.h> > +#include <sys/mman.h> > + > +#include <rte_eal.h> > +#include <rte_ethdev.h> > +#include <rte_cycles.h> > +#include <rte_lcore.h> > 
+#include <rte_mbuf.h> > +#include <rte_malloc.h> > +#include <rte_memory.h> > +#include <rte_vfio.h> > + > +#define RX_RING_SIZE 1024 > +#define TX_RING_SIZE 1024 > + > +#define NUM_MBUFS 8191 > +#define MBUF_CACHE_SIZE 250 > +#define BURST_SIZE 32 > +#define EXTMEM_HEAP_NAME "extmem" > + > +static const struct rte_eth_conf port_conf_default = { > + .rxmode = { > + .max_rx_pkt_len = ETHER_MAX_LEN, > + }, > +}; > + > +/* extmem.c: Basic DPDK skeleton forwarding example using external memory. */ > + > +/* > + * Initializes a given port using global settings and with the RX buffers > + * coming from the mbuf_pool passed as a parameter. > + */ > +static inline int > +port_init(uint16_t port, struct rte_mempool *mbuf_pool) > +{ > + struct rte_eth_conf port_conf = port_conf_default; > + const uint16_t rx_rings = 1, tx_rings = 1; > + uint16_t nb_rxd = RX_RING_SIZE; > + uint16_t nb_txd = TX_RING_SIZE; > + int retval; > + uint16_t q; > + struct rte_eth_dev_info dev_info; > + struct rte_eth_txconf txconf; > + > + if (!rte_eth_dev_is_valid_port(port)) > + return -1; > + > + rte_eth_dev_info_get(port, &dev_info); > + if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE) > + port_conf.txmode.offloads |= > + DEV_TX_OFFLOAD_MBUF_FAST_FREE; > + > + /* Configure the Ethernet device. */ > + retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf); > + if (retval != 0) > + return retval; > + > + retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd); > + if (retval != 0) > + return retval; > + > + /* Allocate and set up 1 RX queue per Ethernet port. */ > + for (q = 0; q < rx_rings; q++) { > + retval = rte_eth_rx_queue_setup(port, q, nb_rxd, > + rte_eth_dev_socket_id(port), NULL, mbuf_pool); > + if (retval < 0) > + return retval; > + } > + > + txconf = dev_info.default_txconf; > + txconf.offloads = port_conf.txmode.offloads; > + /* Allocate and set up 1 TX queue per Ethernet port. 
*/ > + for (q = 0; q < tx_rings; q++) { > + retval = rte_eth_tx_queue_setup(port, q, nb_txd, > + rte_eth_dev_socket_id(port), &txconf); > + if (retval < 0) > + return retval; > + } > + > + /* Start the Ethernet port. */ > + retval = rte_eth_dev_start(port); > + if (retval < 0) > + return retval; > + > + /* Display the port MAC address. */ > + struct ether_addr addr; > + rte_eth_macaddr_get(port, &addr); > + printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8 > + " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n", > + port, > + addr.addr_bytes[0], addr.addr_bytes[1], > + addr.addr_bytes[2], addr.addr_bytes[3], > + addr.addr_bytes[4], addr.addr_bytes[5]); > + > + /* Enable RX in promiscuous mode for the Ethernet device. */ > + rte_eth_promiscuous_enable(port); > + > + return 0; > +} > + > +/* > + * The lcore main. This is the main thread that does the work, reading from > + * an input port and writing to an output port. > + */ > +static __attribute__((noreturn)) void > +lcore_main(void) > +{ > + uint16_t port; > + > + /* > + * Check that the port is on the same NUMA node as the polling thread > + * for best performance. > + */ > + RTE_ETH_FOREACH_DEV(port) > + if (rte_eth_dev_socket_id(port) > 0 && > + rte_eth_dev_socket_id(port) != > + (int)rte_socket_id()) > + printf("WARNING, port %u is on remote NUMA node to " > + "polling thread.\n\tPerformance will " > + "not be optimal.\n", port); > + > + printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n", > + rte_lcore_id()); > + > + /* Run until the application is quit or killed. */ > + for (;;) { > + /* > + * Receive packets on a port and forward them on the paired > + * port. The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc. > + */ > + RTE_ETH_FOREACH_DEV(port) { > + > + /* Get burst of RX packets, from first port of pair. 
*/ > + struct rte_mbuf *bufs[BURST_SIZE]; > + const uint16_t nb_rx = rte_eth_rx_burst(port, 0, > + bufs, BURST_SIZE); > + > + if (unlikely(nb_rx == 0)) > + continue; > + > + /* Send burst of TX packets, to second port of pair. */ > + const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0, > + bufs, nb_rx); > + > + /* Free any unsent packets. */ > + if (unlikely(nb_tx < nb_rx)) { > + uint16_t buf; > + for (buf = nb_tx; buf < nb_rx; buf++) > + rte_pktmbuf_free(bufs[buf]); > + } > + } > + } > +} > + > +/* extremely pessimistic estimation of memory required to create a mempool */ > +static int > +calc_mem_size(uint32_t nb_ports, uint32_t nb_mbufs_per_port, > + uint32_t mbuf_sz, size_t pgsz, size_t *out) > +{ > + uint32_t nb_mbufs = nb_ports * nb_mbufs_per_port; > + uint64_t total_mem, mbuf_mem, obj_sz; > + > + /* there is no good way to predict how much space the mempool will > + * occupy because it will allocate chunks on the fly, and some of those > + * will come from default DPDK memory while some will come from our > + * external memory, so just assume 16MB will be enough for everyone. 
> + */ > + uint64_t hdr_mem = 16 << 20; > + > + obj_sz = rte_mempool_calc_obj_size(mbuf_sz, 0, NULL); > + if (rte_eal_iova_mode() == RTE_IOVA_VA) { > + /* contiguous - no need to account for page boundaries */ > + mbuf_mem = nb_mbufs * obj_sz; > + } else { > + /* account for possible non-contiguousness */ > + unsigned int n_pages, mbuf_per_pg, leftover; > + > + mbuf_per_pg = pgsz / obj_sz; > + leftover = (nb_mbufs % mbuf_per_pg) > 0; > + n_pages = (nb_mbufs / mbuf_per_pg) + leftover; > + > + mbuf_mem = n_pages * pgsz; > + } > + > + total_mem = RTE_ALIGN(hdr_mem + mbuf_mem, pgsz); > + > + if (total_mem > SIZE_MAX) { > + printf("Memory size too big\n"); > + return -1; > + } > + *out = (size_t)total_mem; > + > + return 0; > +} > + > +static inline uint32_t > +bsf64(uint64_t v) > +{ > + return (uint32_t)__builtin_ctzll(v); > +} > + > +static inline uint32_t > +log2_u64(uint64_t v) > +{ > + if (v == 0) > + return 0; > + v = rte_align64pow2(v); > + return bsf64(v); > +} > + > +#ifndef MAP_HUGE_SHIFT > +#define HUGE_SHIFT 26 > +#else > +#define HUGE_SHIFT MAP_HUGE_SHIFT > +#endif > + > +static int > +pagesz_flags(uint64_t page_sz) > +{ > + /* as per mmap() manpage, all page sizes are log2 of page size > + * shifted by MAP_HUGE_SHIFT > + */ > + int log2 = log2_u64(page_sz); > + return log2 << HUGE_SHIFT; > +} > + > +static void * > +alloc_mem(size_t memsz, size_t pgsz) > +{ > + void *addr; > + int flags; > + > + /* allocate anonymous hugepages */ > + flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | pagesz_flags(pgsz); > + > + addr = mmap(NULL, memsz, PROT_READ | PROT_WRITE, flags, -1, 0); > + if (addr == MAP_FAILED) > + return NULL; > + > + return addr; > +} > + > +struct extmem_param { > + void *addr; > + size_t len; > + size_t pgsz; > + rte_iova_t *iova_table; > + unsigned int iova_table_len; > +}; > + > +static int > +create_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t > mbuf_sz, > + struct extmem_param *param) > +{ > + uint64_t pgsizes[] = 
{RTE_PGSIZE_2M, RTE_PGSIZE_1G, /* x86_64, ARM */ > + RTE_PGSIZE_16M, RTE_PGSIZE_16G}; /* POWER */ > + unsigned int n_pages, cur_page, pgsz_idx; > + size_t mem_sz, offset, cur_pgsz; > + bool vfio_supported = true; > + rte_iova_t *iovas = NULL; > + void *addr; > + int ret; > + > + for (pgsz_idx = 0; pgsz_idx < RTE_DIM(pgsizes); pgsz_idx++) { > + /* skip anything that is too big */ > + if (pgsizes[pgsz_idx] > SIZE_MAX) > + continue; > + > + cur_pgsz = pgsizes[pgsz_idx]; > + > + ret = calc_mem_size(nb_ports, nb_mbufs_per_port, > + mbuf_sz, cur_pgsz, &mem_sz); > + if (ret < 0) { > + printf("Cannot calculate memory size\n"); > + return -1; > + } > + > + /* allocate our memory */ > + addr = alloc_mem(mem_sz, cur_pgsz); > + > + /* if we couldn't allocate memory with a specified page size, > + * that doesn't mean we can't do it with other page sizes, so > + * try another one. > + */ > + if (addr == NULL) > + continue; > + > + /* store IOVA addresses for every page in this memory area */ > + n_pages = mem_sz / cur_pgsz; > + > + iovas = malloc(sizeof(*iovas) * n_pages); > + > + if (iovas == NULL) { > + printf("Cannot allocate memory for iova addresses\n"); > + goto fail; > + } > + > + /* populate IOVA table */ > + for (cur_page = 0; cur_page < n_pages; cur_page++) { > + rte_iova_t iova; > + void *cur; > + > + offset = cur_pgsz * cur_page; > + cur = RTE_PTR_ADD(addr, offset); > + > + iova = (uintptr_t)rte_mem_virt2iova(cur); > + > + iovas[cur_page] = iova; > + > + if (vfio_supported) { > + /* map memory for DMA */ > + ret = rte_vfio_dma_map((uintptr_t)addr, > + iova, cur_pgsz); > + if (ret < 0) { > + /* > + * ENODEV means VFIO is not initialized > + * ENOTSUP means current IOMMU mode > + * doesn't support mapping > + * both cases are not an error > + */ > + if (rte_errno == ENOTSUP || > + rte_errno == ENODEV) > + /* VFIO is unsupported, don't > + * try again. 
> + */ > + vfio_supported = false; > + else > + /* this is an actual error */ > + goto fail; > + } > + } > + } > + > + break; > + } > + /* if we couldn't allocate anything */ > + if (iovas == NULL) > + return -1; > + > + param->addr = addr; > + param->len = mem_sz; > + param->pgsz = cur_pgsz; > + param->iova_table = iovas; > + param->iova_table_len = n_pages; > + > + return 0; > +fail: > + if (iovas) > + free(iovas); > + if (addr) > + munmap(addr, mem_sz); > + > + return -1; > +} > + > +static int > +setup_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t mbuf_sz) > +{ > + struct extmem_param param; > + int ret; > + > + /* create our heap */ > + ret = rte_malloc_heap_create(EXTMEM_HEAP_NAME); > + if (ret < 0) { > + printf("Cannot create heap\n"); > + return -1; > + } > + > + ret = create_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz, ¶m); > + if (ret < 0) { > + printf("Cannot create memory area\n"); > + return -1; > + } > + > + /* we now have a valid memory area, so add it to heap */ > + ret = rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME, > + param.addr, param.len, param.iova_table, > + param.iova_table_len, param.pgsz); > + > + /* not needed any more */ > + free(param.iova_table); > + > + if (ret < 0) { > + printf("Cannot add memory to heap\n"); > + munmap(param.addr, param.len); > + return -1; > + } > + > + printf("Allocated %zuMB of memory\n", param.len >> 20); > + > + /* success */ > + return 0; > +} > + > + > +/* > + * The main function, which does initialization and calls the per-lcore > + * functions. > + */ > +int > +main(int argc, char *argv[]) > +{ > + struct rte_mempool *mbuf_pool; > + unsigned int nb_ports; > + int socket_id; > + uint16_t portid; > + uint32_t nb_mbufs_per_port, mbuf_sz; > + > + /* Initialize the Environment Abstraction Layer (EAL). 
*/ > + int ret = rte_eal_init(argc, argv); > + if (ret < 0) > + rte_exit(EXIT_FAILURE, "Error with EAL initialization\n"); > + > + argc -= ret; > + argv += ret; > + > + /* Check that there is an even number of ports to send/receive on. */ > + nb_ports = rte_eth_dev_count_avail(); > + if (nb_ports < 2 || (nb_ports & 1)) > + rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n"); > + > + nb_mbufs_per_port = NUM_MBUFS; > + mbuf_sz = RTE_MBUF_DEFAULT_BUF_SIZE; > + > + if (setup_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz) < 0) > + rte_exit(EXIT_FAILURE, "Error: cannot set up external > memory\n"); > + > + /* retrieve socket ID for our heap */ > + socket_id = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME); > + if (socket_id < 0) > + rte_exit(EXIT_FAILURE, "Invalid socket for external heap\n"); > + > + /* Creates a new mempool in memory to hold the mbufs. */ > + mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", > + nb_mbufs_per_port * nb_ports, MBUF_CACHE_SIZE, 0, > + mbuf_sz, socket_id); > + > + if (mbuf_pool == NULL) > + rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n"); > + > + /* Initialize all ports. */ > + RTE_ETH_FOREACH_DEV(portid) > + if (port_init(portid, mbuf_pool) != 0) > + rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu16 "\n", > + portid); > + > + if (rte_lcore_count() > 1) > + printf("\nWARNING: Too many lcores enabled. Only 1 used.\n"); > + > + /* Call lcore_main on the master core only. */ > + lcore_main(); > + > + return 0; > +} > diff --git a/examples/external_mem/meson.build > b/examples/external_mem/meson.build > new file mode 100644 > index 000000000..17a363ad2 > --- /dev/null > +++ b/examples/external_mem/meson.build > @@ -0,0 +1,12 @@ > +# SPDX-License-Identifier: BSD-3-Clause > +# Copyright(c) 2017 Intel Corporation > + > +# meson file, for building this example as part of a main DPDK build. 
> +# > +# To build this example as a standalone application with an already-installed > +# DPDK instance, use 'make' > + > +allow_experimental_apis = true > +sources = files( > + 'extmem.c' > +) > -- > 2.17.1