Hi Anatoly

> 
> Introduce an example application demonstrating the use of
> external memory support. This is a simple application based on
> skeleton app, but instead of using internal DPDK memory, it is
> using externally allocated memory.
> 
> The RX/TX and init path is a carbon copy of the skeleton app, with
> no modifications whatsoever. The only differences are an additional
> init stage that allocates memory and creates a heap for it, and the
> socket ID supplied to the mempool initialization function. The
> memory used by this app is hugepage memory allocated anonymously.
> 
> Anonymous hugepage memory will not be allocated in a NUMA-aware
> fashion, so there is a chance of performance degradation when
> using this app, but since the kernel usually serves hugepages from
> the local socket first, this should not be a problem in most cases.

Do we need a new sample app just for that?
Couldn't it be added to testpmd, similar to the existing 'mp-anon'
option that creates a mempool over anonymous memory?
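
From a quick look, the external-memory-specific glue boils down to roughly
the following (a sketch only — it reuses EXTMEM_HEAP_NAME, struct
extmem_param, MBUF_CACHE_SIZE and the new heap API from the patch below,
and assumes create_extmem() has already populated the param):

	/* sketch: back a named heap with the externally allocated memory,
	 * then create the mbuf pool on that heap's socket ID
	 */
	static struct rte_mempool *
	pool_from_extmem(const struct extmem_param *p, uint32_t nb_mbufs,
			uint32_t mbuf_sz)
	{
		int sid;

		if (rte_malloc_heap_create(EXTMEM_HEAP_NAME) < 0)
			return NULL;
		if (rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME, p->addr,
				p->len, p->iova_table, p->iova_table_len,
				p->pgsz) < 0)
			return NULL;
		sid = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME);
		if (sid < 0)
			return NULL;
		return rte_pktmbuf_pool_create("MBUF_POOL", nb_mbufs,
				MBUF_CACHE_SIZE, 0, mbuf_sz, sid);
	}

That amount of glue looks small enough to hide behind a testpmd option.
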
Konstantin

> 
> Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
> ---
>  examples/external_mem/Makefile    |  62 ++++
>  examples/external_mem/extmem.c    | 461 ++++++++++++++++++++++++++++++
>  examples/external_mem/meson.build |  12 +
>  3 files changed, 535 insertions(+)
>  create mode 100644 examples/external_mem/Makefile
>  create mode 100644 examples/external_mem/extmem.c
>  create mode 100644 examples/external_mem/meson.build
> 
> diff --git a/examples/external_mem/Makefile b/examples/external_mem/Makefile
> new file mode 100644
> index 000000000..3b6ab3b2f
> --- /dev/null
> +++ b/examples/external_mem/Makefile
> @@ -0,0 +1,62 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2010-2018 Intel Corporation
> +
> +# binary name
> +APP = extmem
> +
> +# all source are stored in SRCS-y
> +SRCS-y := extmem.c
> +
> +# Build using pkg-config variables if possible
> +$(shell pkg-config --exists libdpdk)
> +ifeq ($(.SHELLSTATUS),0)
> +
> +all: shared
> +.PHONY: shared static
> +shared: build/$(APP)-shared
> +     ln -sf $(APP)-shared build/$(APP)
> +static: build/$(APP)-static
> +     ln -sf $(APP)-static build/$(APP)
> +
> +PC_FILE := $(shell pkg-config --path libdpdk)
> +CFLAGS += -O3 $(shell pkg-config --cflags libdpdk)
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +LDFLAGS_SHARED = $(shell pkg-config --libs libdpdk)
> +LDFLAGS_STATIC = -Wl,-Bstatic $(shell pkg-config --static --libs libdpdk)
> +
> +build/$(APP)-shared: $(SRCS-y) Makefile $(PC_FILE) | build
> +     $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_SHARED)
> +
> +build/$(APP)-static: $(SRCS-y) Makefile $(PC_FILE) | build
> +     $(CC) $(CFLAGS) $(SRCS-y) -o $@ $(LDFLAGS) $(LDFLAGS_STATIC)
> +
> +build:
> +     @mkdir -p $@
> +
> +.PHONY: clean
> +clean:
> +     rm -f build/$(APP) build/$(APP)-static build/$(APP)-shared
> +     rmdir --ignore-fail-on-non-empty build
> +
> +else # Build using legacy build system
> +
> +ifeq ($(RTE_SDK),)
> +$(error "Please define RTE_SDK environment variable")
> +endif
> +
> +# Default target, can be overridden by command line or environment
> +RTE_TARGET ?= x86_64-native-linuxapp-gcc
> +
> +include $(RTE_SDK)/mk/rte.vars.mk
> +
> +CFLAGS += $(WERROR_FLAGS)
> +CFLAGS += -DALLOW_EXPERIMENTAL_API
> +
> +# workaround for a gcc bug with noreturn attribute
> +# http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12603
> +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
> +CFLAGS_extmem.o += -Wno-return-type
> +endif
> +
> +include $(RTE_SDK)/mk/rte.extapp.mk
> +endif
> diff --git a/examples/external_mem/extmem.c b/examples/external_mem/extmem.c
> new file mode 100644
> index 000000000..818a02171
> --- /dev/null
> +++ b/examples/external_mem/extmem.c
> @@ -0,0 +1,461 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright(c) 2010-2018 Intel Corporation
> + */
> +
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <stdbool.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +
> +#include <rte_eal.h>
> +#include <rte_ethdev.h>
> +#include <rte_cycles.h>
> +#include <rte_lcore.h>
> +#include <rte_mbuf.h>
> +#include <rte_malloc.h>
> +#include <rte_memory.h>
> +#include <rte_errno.h>
> +#include <rte_vfio.h>
> +
> +#define RX_RING_SIZE 1024
> +#define TX_RING_SIZE 1024
> +
> +#define NUM_MBUFS 8191
> +#define MBUF_CACHE_SIZE 250
> +#define BURST_SIZE 32
> +#define EXTMEM_HEAP_NAME "extmem"
> +
> +static const struct rte_eth_conf port_conf_default = {
> +     .rxmode = {
> +             .max_rx_pkt_len = ETHER_MAX_LEN,
> +     },
> +};
> +
> +/* extmem.c: Basic DPDK skeleton forwarding example using external memory. */
> +
> +/*
> + * Initializes a given port using global settings and with the RX buffers
> + * coming from the mbuf_pool passed as a parameter.
> + */
> +static inline int
> +port_init(uint16_t port, struct rte_mempool *mbuf_pool)
> +{
> +     struct rte_eth_conf port_conf = port_conf_default;
> +     const uint16_t rx_rings = 1, tx_rings = 1;
> +     uint16_t nb_rxd = RX_RING_SIZE;
> +     uint16_t nb_txd = TX_RING_SIZE;
> +     int retval;
> +     uint16_t q;
> +     struct rte_eth_dev_info dev_info;
> +     struct rte_eth_txconf txconf;
> +
> +     if (!rte_eth_dev_is_valid_port(port))
> +             return -1;
> +
> +     rte_eth_dev_info_get(port, &dev_info);
> +     if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
> +             port_conf.txmode.offloads |=
> +                     DEV_TX_OFFLOAD_MBUF_FAST_FREE;
> +
> +     /* Configure the Ethernet device. */
> +     retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
> +     if (retval != 0)
> +             return retval;
> +
> +     retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &nb_rxd, &nb_txd);
> +     if (retval != 0)
> +             return retval;
> +
> +     /* Allocate and set up 1 RX queue per Ethernet port. */
> +     for (q = 0; q < rx_rings; q++) {
> +             retval = rte_eth_rx_queue_setup(port, q, nb_rxd,
> +                             rte_eth_dev_socket_id(port), NULL, mbuf_pool);
> +             if (retval < 0)
> +                     return retval;
> +     }
> +
> +     txconf = dev_info.default_txconf;
> +     txconf.offloads = port_conf.txmode.offloads;
> +     /* Allocate and set up 1 TX queue per Ethernet port. */
> +     for (q = 0; q < tx_rings; q++) {
> +             retval = rte_eth_tx_queue_setup(port, q, nb_txd,
> +                             rte_eth_dev_socket_id(port), &txconf);
> +             if (retval < 0)
> +                     return retval;
> +     }
> +
> +     /* Start the Ethernet port. */
> +     retval = rte_eth_dev_start(port);
> +     if (retval < 0)
> +             return retval;
> +
> +     /* Display the port MAC address. */
> +     struct ether_addr addr;
> +     rte_eth_macaddr_get(port, &addr);
> +     printf("Port %u MAC: %02" PRIx8 " %02" PRIx8 " %02" PRIx8
> +                        " %02" PRIx8 " %02" PRIx8 " %02" PRIx8 "\n",
> +                     port,
> +                     addr.addr_bytes[0], addr.addr_bytes[1],
> +                     addr.addr_bytes[2], addr.addr_bytes[3],
> +                     addr.addr_bytes[4], addr.addr_bytes[5]);
> +
> +     /* Enable RX in promiscuous mode for the Ethernet device. */
> +     rte_eth_promiscuous_enable(port);
> +
> +     return 0;
> +}
> +
> +/*
> + * The lcore main. This is the main thread that does the work, reading from
> + * an input port and writing to an output port.
> + */
> +static __attribute__((noreturn)) void
> +lcore_main(void)
> +{
> +     uint16_t port;
> +
> +     /*
> +      * Check that the port is on the same NUMA node as the polling thread
> +      * for best performance.
> +      */
> +     RTE_ETH_FOREACH_DEV(port)
> +             if (rte_eth_dev_socket_id(port) > 0 &&
> +                             rte_eth_dev_socket_id(port) !=
> +                                             (int)rte_socket_id())
> +                     printf("WARNING, port %u is on remote NUMA node to "
> +                                     "polling thread.\n\tPerformance will "
> +                                     "not be optimal.\n", port);
> +
> +     printf("\nCore %u forwarding packets. [Ctrl+C to quit]\n",
> +                     rte_lcore_id());
> +
> +     /* Run until the application is quit or killed. */
> +     for (;;) {
> +             /*
> +              * Receive packets on a port and forward them on the paired
> +              * port. The mapping is 0 -> 1, 1 -> 0, 2 -> 3, 3 -> 2, etc.
> +              */
> +             RTE_ETH_FOREACH_DEV(port) {
> +
> +                     /* Get burst of RX packets, from first port of pair. */
> +                     struct rte_mbuf *bufs[BURST_SIZE];
> +                     const uint16_t nb_rx = rte_eth_rx_burst(port, 0,
> +                                     bufs, BURST_SIZE);
> +
> +                     if (unlikely(nb_rx == 0))
> +                             continue;
> +
> +                     /* Send burst of TX packets, to second port of pair. */
> +                     const uint16_t nb_tx = rte_eth_tx_burst(port ^ 1, 0,
> +                                     bufs, nb_rx);
> +
> +                     /* Free any unsent packets. */
> +                     if (unlikely(nb_tx < nb_rx)) {
> +                             uint16_t buf;
> +                             for (buf = nb_tx; buf < nb_rx; buf++)
> +                                     rte_pktmbuf_free(bufs[buf]);
> +                     }
> +             }
> +     }
> +}
> +
> +/* extremely pessimistic estimation of memory required to create a mempool */
> +static int
> +calc_mem_size(uint32_t nb_ports, uint32_t nb_mbufs_per_port,
> +             uint32_t mbuf_sz, size_t pgsz, size_t *out)
> +{
> +     uint32_t nb_mbufs = nb_ports * nb_mbufs_per_port;
> +     uint64_t total_mem, mbuf_mem, obj_sz;
> +
> +     /* there is no good way to predict how much space the mempool will
> +      * occupy because it will allocate chunks on the fly, and some of those
> +      * will come from default DPDK memory while some will come from our
> +      * external memory, so just assume 16MB will be enough for everyone.
> +      */
> +     uint64_t hdr_mem = 16 << 20;
> +
> +     obj_sz = rte_mempool_calc_obj_size(mbuf_sz, 0, NULL);
> +     if (rte_eal_iova_mode() == RTE_IOVA_VA) {
> +             /* contiguous - no need to account for page boundaries */
> +             mbuf_mem = nb_mbufs * obj_sz;
> +     } else {
> +             /* account for possible non-contiguousness */
> +             unsigned int n_pages, mbuf_per_pg, leftover;
> +
> +             mbuf_per_pg = pgsz / obj_sz;
> +             leftover = (nb_mbufs % mbuf_per_pg) > 0;
> +             n_pages = (nb_mbufs / mbuf_per_pg) + leftover;
> +
> +             mbuf_mem = n_pages * pgsz;
> +     }
> +
> +     total_mem = RTE_ALIGN(hdr_mem + mbuf_mem, pgsz);
> +
> +     if (total_mem > SIZE_MAX) {
> +             printf("Memory size too big\n");
> +             return -1;
> +     }
> +     *out = (size_t)total_mem;
> +
> +     return 0;
> +}
> +
> +static inline uint32_t
> +bsf64(uint64_t v)
> +{
> +     return (uint32_t)__builtin_ctzll(v);
> +}
> +
> +static inline uint32_t
> +log2_u64(uint64_t v)
> +{
> +     if (v == 0)
> +             return 0;
> +     v = rte_align64pow2(v);
> +     return bsf64(v);
> +}
> +
> +#ifndef MAP_HUGE_SHIFT
> +#define HUGE_SHIFT 26
> +#else
> +#define HUGE_SHIFT MAP_HUGE_SHIFT
> +#endif
> +
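> +/* for example, with 2MB pages: log2(2MB) == 21, so pagesz_flags() below
> + * returns 21 << MAP_HUGE_SHIFT, matching the kernel's MAP_HUGE_2MB value
> + */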
> +static int
> +pagesz_flags(uint64_t page_sz)
> +{
> +     /* as per mmap() manpage, all page sizes are log2 of page size
> +      * shifted by MAP_HUGE_SHIFT
> +      */
> +     int log2 = log2_u64(page_sz);
> +     return log2 << HUGE_SHIFT;
> +}
> +
> +static void *
> +alloc_mem(size_t memsz, size_t pgsz)
> +{
> +     void *addr;
> +     int flags;
> +
> +     /* allocate anonymous hugepages */
> +     flags = MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | pagesz_flags(pgsz);
> +
> +     addr = mmap(NULL, memsz, PROT_READ | PROT_WRITE, flags, -1, 0);
> +     if (addr == MAP_FAILED)
> +             return NULL;
> +
> +     return addr;
> +}
> +
> +struct extmem_param {
> +     void *addr;
> +     size_t len;
> +     size_t pgsz;
> +     rte_iova_t *iova_table;
> +     unsigned int iova_table_len;
> +};
> +
> +static int
> +create_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t mbuf_sz,
> +             struct extmem_param *param)
> +{
> +     uint64_t pgsizes[] = {RTE_PGSIZE_2M, RTE_PGSIZE_1G, /* x86_64, ARM */
> +                     RTE_PGSIZE_16M, RTE_PGSIZE_16G};    /* POWER */
> +     unsigned int n_pages, cur_page, pgsz_idx;
> +     size_t mem_sz, offset, cur_pgsz;
> +     bool vfio_supported = true;
> +     rte_iova_t *iovas = NULL;
> +     void *addr;
> +     int ret;
> +
> +     for (pgsz_idx = 0; pgsz_idx < RTE_DIM(pgsizes); pgsz_idx++) {
> +             /* skip anything that is too big */
> +             if (pgsizes[pgsz_idx] > SIZE_MAX)
> +                     continue;
> +
> +             cur_pgsz = pgsizes[pgsz_idx];
> +
> +             ret = calc_mem_size(nb_ports, nb_mbufs_per_port,
> +                             mbuf_sz, cur_pgsz, &mem_sz);
> +             if (ret < 0) {
> +                     printf("Cannot calculate memory size\n");
> +                     return -1;
> +             }
> +
> +             /* allocate our memory */
> +             addr = alloc_mem(mem_sz, cur_pgsz);
> +
> +             /* if we couldn't allocate memory with a specified page size,
> +              * that doesn't mean we can't do it with other page sizes, so
> +              * try another one.
> +              */
> +             if (addr == NULL)
> +                     continue;
> +
> +             /* store IOVA addresses for every page in this memory area */
> +             n_pages = mem_sz / cur_pgsz;
> +
> +             iovas = malloc(sizeof(*iovas) * n_pages);
> +
> +             if (iovas == NULL) {
> +                     printf("Cannot allocate memory for iova addresses\n");
> +                     goto fail;
> +             }
> +
> +             /* populate IOVA table */
> +             for (cur_page = 0; cur_page < n_pages; cur_page++) {
> +                     rte_iova_t iova;
> +                     void *cur;
> +
> +                     offset = cur_pgsz * cur_page;
> +                     cur = RTE_PTR_ADD(addr, offset);
> +
> +                     iova = (uintptr_t)rte_mem_virt2iova(cur);
> +
> +                     iovas[cur_page] = iova;
> +
> +                     if (vfio_supported) {
> +                             /* map memory for DMA */
> +                             ret = rte_vfio_dma_map((uintptr_t)cur,
> +                                             iova, cur_pgsz);
> +                             if (ret < 0) {
> +                                     /*
> +                                      * ENODEV means VFIO is not initialized
> +                                      * ENOTSUP means current IOMMU mode
> +                                      * doesn't support mapping
> +                                      * both cases are not an error
> +                                      */
> +                                     if (rte_errno == ENOTSUP ||
> +                                                     rte_errno == ENODEV)
> +                                             /* VFIO is unsupported, don't
> +                                              * try again.
> +                                              */
> +                                             vfio_supported = false;
> +                                     else
> +                                             /* this is an actual error */
> +                                             goto fail;
> +                             }
> +                     }
> +             }
> +
> +             break;
> +     }
> +     /* if we couldn't allocate anything */
> +     if (iovas == NULL)
> +             return -1;
> +
> +     param->addr = addr;
> +     param->len = mem_sz;
> +     param->pgsz = cur_pgsz;
> +     param->iova_table = iovas;
> +     param->iova_table_len = n_pages;
> +
> +     return 0;
> +fail:
> +     if (iovas)
> +             free(iovas);
> +     if (addr)
> +             munmap(addr, mem_sz);
> +
> +     return -1;
> +}
> +
> +static int
> +setup_extmem(uint32_t nb_ports, uint32_t nb_mbufs_per_port, uint32_t mbuf_sz)
> +{
> +     struct extmem_param param;
> +     int ret;
> +
> +     /* create our heap */
> +     ret = rte_malloc_heap_create(EXTMEM_HEAP_NAME);
> +     if (ret < 0) {
> +             printf("Cannot create heap\n");
> +             return -1;
> +     }
> +
> +     ret = create_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz, &param);
> +     if (ret < 0) {
> +             printf("Cannot create memory area\n");
> +             return -1;
> +     }
> +
> +     /* we now have a valid memory area, so add it to heap */
> +     ret = rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME,
> +                     param.addr, param.len, param.iova_table,
> +                     param.iova_table_len, param.pgsz);
> +
> +     /* not needed any more */
> +     free(param.iova_table);
> +
> +     if (ret < 0) {
> +             printf("Cannot add memory to heap\n");
> +             munmap(param.addr, param.len);
> +             return -1;
> +     }
> +
> +     printf("Allocated %zuMB of memory\n", param.len >> 20);
> +
> +     /* success */
> +     return 0;
> +}
> +
> +/*
> + * The main function, which does initialization and calls the per-lcore
> + * functions.
> + */
> +int
> +main(int argc, char *argv[])
> +{
> +     struct rte_mempool *mbuf_pool;
> +     unsigned int nb_ports;
> +     int socket_id;
> +     uint16_t portid;
> +     uint32_t nb_mbufs_per_port, mbuf_sz;
> +
> +     /* Initialize the Environment Abstraction Layer (EAL). */
> +     int ret = rte_eal_init(argc, argv);
> +     if (ret < 0)
> +             rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
> +
> +     argc -= ret;
> +     argv += ret;
> +
> +     /* Check that there is an even number of ports to send/receive on. */
> +     nb_ports = rte_eth_dev_count_avail();
> +     if (nb_ports < 2 || (nb_ports & 1))
> +             rte_exit(EXIT_FAILURE, "Error: number of ports must be even\n");
> +
> +     nb_mbufs_per_port = NUM_MBUFS;
> +     mbuf_sz = RTE_MBUF_DEFAULT_BUF_SIZE;
> +
> +     if (setup_extmem(nb_ports, nb_mbufs_per_port, mbuf_sz) < 0)
> +             rte_exit(EXIT_FAILURE, "Error: cannot set up external 
> memory\n");
> +
> +     /* retrieve socket ID for our heap */
> +     socket_id = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME);
> +     if (socket_id < 0)
> +             rte_exit(EXIT_FAILURE, "Invalid socket for external heap\n");
> +
> +     /* Creates a new mempool in memory to hold the mbufs. */
> +     mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL",
> +                     nb_mbufs_per_port * nb_ports, MBUF_CACHE_SIZE, 0,
> +                     mbuf_sz, socket_id);
> +
> +     if (mbuf_pool == NULL)
> +             rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
> +
> +     /* Initialize all ports. */
> +     RTE_ETH_FOREACH_DEV(portid)
> +             if (port_init(portid, mbuf_pool) != 0)
> +                     rte_exit(EXIT_FAILURE, "Cannot init port %"PRIu16 "\n",
> +                                     portid);
> +
> +     if (rte_lcore_count() > 1)
> +             printf("\nWARNING: Too many lcores enabled. Only 1 used.\n");
> +
> +     /* Call lcore_main on the master core only. */
> +     lcore_main();
> +
> +     return 0;
> +}
> diff --git a/examples/external_mem/meson.build b/examples/external_mem/meson.build
> new file mode 100644
> index 000000000..17a363ad2
> --- /dev/null
> +++ b/examples/external_mem/meson.build
> @@ -0,0 +1,12 @@
> +# SPDX-License-Identifier: BSD-3-Clause
> +# Copyright(c) 2017 Intel Corporation
> +
> +# meson file, for building this example as part of a main DPDK build.
> +#
> +# To build this example as a standalone application with an already-installed
> +# DPDK instance, use 'make'
> +
> +allow_experimental_apis = true
> +sources = files(
> +     'extmem.c'
> +)
> --
> 2.17.1
