Add performance tests for the pktdev library, measuring cycle counts for a ring-based implementation. The tests are modelled on the existing ring performance tests and compare the cost of accessing the same ring four ways:

* native ring calls
* calls through pktdev to the ring
* calls through the ring PMD wrapper to the ring
* calls through pktdev to the ring PMD wrapper to the ring
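Every measurement below follows the same pattern: read the TSC before and after a tight loop of paired tx/rx burst calls, with compiler barriers around the timed region, and divide the cycle delta by the number of packets moved. A minimal sketch of that pattern (the function and parameter names here are illustrative only, not part of the patch):

	/* sketch only: assumes rte_cycles.h, rte_atomic.h and rte_pktdev.h
	 * are included; "pdev", "bufs", "burst" and "iterations" are
	 * placeholder names */
	static double
	time_enq_deq_pair(struct rte_pktdev *pdev, struct rte_mbuf **bufs,
			unsigned burst, unsigned iterations)
	{
		unsigned i;
		const uint64_t start = rte_rdtsc_precise();
		rte_compiler_barrier(); /* keep the timed loop from moving */
		for (i = 0; i < iterations; i++) {
			rte_pkt_tx_burst(pdev, bufs, burst); /* enqueue */
			rte_pkt_rx_burst(pdev, bufs, burst); /* dequeue */
		}
		const uint64_t end = rte_rdtsc_precise();
		rte_compiler_barrier();
		/* average cycle cost per packet moved through the API */
		return (double)(end - start) / ((double)iterations * burst);
	}

To run the tests, start the unit-test binary and issue "pktdev_perf_autotest" at its prompt.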
Signed-off-by: Bruce Richardson <bruce.richardson at intel.com>
---
 app/test/Makefile              |   4 +-
 app/test/test_pktdev_perf.c    | 260 ++++++++++++++++++++++++++++++++++++++++
 lib/librte_pktdev/rte_pktdev.h |   8 +-
 3 files changed, 265 insertions(+), 7 deletions(-)
 create mode 100644 app/test/test_pktdev_perf.c

diff --git a/app/test/Makefile b/app/test/Makefile
index 77e48c1..8697893 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -58,9 +58,7 @@
 SRCS-y += test_ring.c
 SRCS-y += test_ring_perf.c
 SRCS-y += test_pmd_perf.c
-ifeq ($(CONFIG_RTE_LIBRTE_PKTDEV),y)
-SRCS-y += test_pktdev.c
-endif
+SRCS-$(CONFIG_RTE_LIBRTE_PKTDEV) += test_pktdev.c test_pktdev_perf.c
 
 ifeq ($(CONFIG_RTE_LIBRTE_TABLE),y)
 SRCS-y += test_table.c
diff --git a/app/test/test_pktdev_perf.c b/app/test/test_pktdev_perf.c
new file mode 100644
index 0000000..6a94e4d
--- /dev/null
+++ b/app/test/test_pktdev_perf.c
@@ -0,0 +1,260 @@
+/*-
+ * BSD LICENSE
+ *
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdio.h>
+#include <inttypes.h>
+#include <rte_ring.h>
+#include <rte_cycles.h>
+#include <rte_launch.h>
+#include <rte_pktdev.h>
+#include <rte_ethdev.h>
+#include <rte_eth_ring.h>
+
+#include "test.h"
+
+/*
+ * Pktdev performance
+ * ==================
+ *
+ * Measures, using rdtsc, the cycle cost of:
+ *  * dequeue from an empty ring
+ *  * single-item and burst enqueue/dequeue on one lcore, via the
+ *    native ring, pktdev, ethdev and pktdev-over-ethdev APIs
+ */
+
+#define RING_NAME "RING_PERF"
+#define RING_SIZE 4096
+#define MAX_BURST 32
+
+/*
+ * the sizes to enqueue and dequeue in testing
+ * (marked volatile so they won't be seen as compile-time constants)
+ */
+static const volatile unsigned bulk_sizes[] = { 1, 8, 32 };
+
+/* The ring used for the tests, and the pktdev/ethdev instances wrapping it */
+static struct rte_ring *r;
+static struct rte_pktdev *r_pdev;
+static uint8_t ring_ethdev_port;
+static struct rte_pktdev *re_pdev;
+
+/* Get cycle counts for dequeuing from an empty ring. Should be 2 or 3 cycles */
+static void
+test_empty_dequeue(void)
+{
+	const unsigned iter_shift = 26;
+	const unsigned iterations = 1 << iter_shift;
+	unsigned i = 0;
+	void *burst[MAX_BURST];
+
+	const uint64_t sc_start = rte_rdtsc();
+	for (i = 0; i < iterations; i++)
+		rte_ring_sc_dequeue_bulk(r, burst, bulk_sizes[0]);
+	const uint64_t sc_end = rte_rdtsc();
+
+	const uint64_t mc_start = rte_rdtsc();
+	for (i = 0; i < iterations; i++)
+		rte_ring_mc_dequeue_bulk(r, burst, bulk_sizes[0]);
+	const uint64_t mc_end = rte_rdtsc();
+
+	printf("SC empty dequeue: %.2F\n",
+			(double)(sc_end - sc_start) / iterations);
+	printf("MC empty dequeue: %.2F\n",
+			(double)(mc_end - mc_start) / iterations);
+}
+
+/*
+ * Test function that determines how long an enqueue + dequeue of a single item
+ * takes on a single lcore. Result is for comparison with the bulk enq+deq.
+ */
+static void
+test_single_enqueue_dequeue(void)
+{
+	const unsigned iter_shift = 24;
+	const unsigned iterations = 1 << iter_shift;
+	unsigned i = 0;
+	void *burst = NULL;
+	struct rte_mbuf *mburst[1] = { NULL };
+
+	const uint64_t sc_start = rte_rdtsc_precise();
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_ring_enqueue_bulk(r, &burst, 1);
+		rte_ring_dequeue_bulk(r, &burst, 1);
+	}
+	const uint64_t sc_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t pd_start = rte_rdtsc_precise();
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_pkt_tx_burst(r_pdev, mburst, 1);
+		rte_pkt_rx_burst(r_pdev, mburst, 1);
+	}
+	const uint64_t pd_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t eth_start = rte_rdtsc_precise();
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_eth_tx_burst(ring_ethdev_port, 0, mburst, 1);
+		rte_eth_rx_burst(ring_ethdev_port, 0, mburst, 1);
+	}
+	const uint64_t eth_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	const uint64_t pd_eth_start = rte_rdtsc_precise();
+	rte_compiler_barrier();
+	for (i = 0; i < iterations; i++) {
+		rte_pkt_tx_burst(re_pdev, mburst, 1);
+		rte_pkt_rx_burst(re_pdev, mburst, 1);
+	}
+	const uint64_t pd_eth_end = rte_rdtsc_precise();
+	rte_compiler_barrier();
+
+	printf("Ring single enq/dequeue      : %"PRIu64"\n",
+			(sc_end - sc_start) >> iter_shift);
+	printf("Pktdev(ring) single enq/deq  : %"PRIu64"\n",
+			(pd_end - pd_start) >> iter_shift);
+	printf("Ethdev single enq/dequeue    : %"PRIu64"\n",
+			(eth_end - eth_start) >> iter_shift);
+	printf("Pktdev(ethdev) single enq/deq: %"PRIu64"\n",
+			(pd_eth_end - pd_eth_start) >> iter_shift);
+}
+
+/* Times enqueue and dequeue on a single lcore */
+static void
+test_bulk_enqueue_dequeue(void)
+{
+	const unsigned iter_shift = 23;
+	const unsigned iterations = 1 << iter_shift;
+	unsigned sz, i = 0;
+	struct rte_mbuf *burst[MAX_BURST] = {0};
+
+	for (sz = 0; sz < sizeof(bulk_sizes)/sizeof(bulk_sizes[0]); sz++) {
+		const uint64_t sc_start = rte_rdtsc();
+		for (i = 0; i < iterations; i++) {
+			rte_ring_sp_enqueue_bulk(r, (void *)burst,
+					bulk_sizes[sz]);
+			rte_ring_sc_dequeue_bulk(r, (void *)burst,
+					bulk_sizes[sz]);
+		}
+		const uint64_t sc_end = rte_rdtsc();
+
+		const uint64_t pd_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_pkt_tx_burst(r_pdev, burst, bulk_sizes[sz]);
+			rte_pkt_rx_burst(r_pdev, burst, bulk_sizes[sz]);
+		}
+		const uint64_t pd_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+
+		const uint64_t eth_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_eth_tx_burst(ring_ethdev_port, 0, burst,
+					bulk_sizes[sz]);
+			rte_eth_rx_burst(ring_ethdev_port, 0, burst,
+					bulk_sizes[sz]);
+		}
+		const uint64_t eth_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+
+		const uint64_t pd_eth_start = rte_rdtsc_precise();
+		rte_compiler_barrier();
+		for (i = 0; i < iterations; i++) {
+			rte_pkt_tx_burst(re_pdev, burst, bulk_sizes[sz]);
+			rte_pkt_rx_burst(re_pdev, burst, bulk_sizes[sz]);
+		}
+		const uint64_t pd_eth_end = rte_rdtsc_precise();
+		rte_compiler_barrier();
+
+		double sc_avg = ((double)(sc_end - sc_start) /
+				(iterations * bulk_sizes[sz]));
+		double pd_avg = ((double)(pd_end - pd_start) /
+				(iterations * bulk_sizes[sz]));
+		double eth_avg = ((double)(eth_end - eth_start) /
+				(iterations * bulk_sizes[sz]));
+		double pd_eth_avg = ((double)(pd_eth_end - pd_eth_start) /
+				(iterations * bulk_sizes[sz]));
+
+		printf("ring bulk enq/dequeue (size: %u): %.1F\n",
+				bulk_sizes[sz], sc_avg);
+		printf("pktdev(ring) bulk enq/deq (%u)  : %.1F\n",
+				bulk_sizes[sz], pd_avg);
+		printf("ethdev bulk enq/dequeue (%u)    : %.1F\n",
+				bulk_sizes[sz], eth_avg);
+		printf("pktdev(ethdev) bulk enq/deq (%u): %.1F\n",
+				bulk_sizes[sz], pd_eth_avg);
+
+		printf("\n");
+	}
+}
+
+static int
+test_pktdev_perf(void)
+{
+	const struct rte_eth_conf port_conf_default = {0};
+	struct rte_mempool *p;
+
+	r = rte_ring_create(RING_NAME, RING_SIZE, rte_socket_id(),
+			RING_F_SP_ENQ|RING_F_SC_DEQ);
+	if (r == NULL && (r = rte_ring_lookup(RING_NAME)) == NULL)
+		return -1;
+
+	r_pdev = rte_pktdev_from_ring(r);
+	ring_ethdev_port = rte_eth_from_rings("TEST_RING",
+			&r, 1, &r, 1, /* one RX ring, one TX ring */
+			rte_socket_id());
+	rte_eth_dev_configure(ring_ethdev_port, 1, 1, &port_conf_default);
+	p = rte_pktmbuf_pool_create("Test pool", 1023, 32, 0, 2048,
+			rte_socket_id());
+	rte_eth_rx_queue_setup(ring_ethdev_port, 0, 128, rte_socket_id(),
+			NULL, p);
+	rte_eth_tx_queue_setup(ring_ethdev_port, 0, 128, rte_socket_id(), NULL);
+
+	re_pdev = rte_pktdev_from_ethport(ring_ethdev_port, 0, 0);
+
+	printf("### Testing single element and burst enq/deq ###\n");
+	test_single_enqueue_dequeue();
+
+	printf("\n### Testing empty dequeue ###\n");
+	test_empty_dequeue();
+
+	printf("\n### Testing bulk enq/deq on a single lcore ###\n");
+	test_bulk_enqueue_dequeue();
+
+	return 0;
+}
+
+static struct test_command ring_perf_cmd = {
+	.command = "pktdev_perf_autotest",
+	.callback = test_pktdev_perf,
+};
+REGISTER_TEST_COMMAND(ring_perf_cmd);
diff --git a/lib/librte_pktdev/rte_pktdev.h b/lib/librte_pktdev/rte_pktdev.h
index 3acbc0d..4740c67 100644
--- a/lib/librte_pktdev/rte_pktdev.h
+++ b/lib/librte_pktdev/rte_pktdev.h
@@ -46,6 +46,7 @@ extern "C" {
 #include <stdint.h>
 
 #include <rte_ring.h>
+#include <rte_mbuf.h>
 #include <rte_branch_prediction.h>
 
 /* Buffered TX works in bursts of 32 */
@@ -53,9 +54,8 @@ extern "C" {
 
 /*
  * forward definition of data structures.
- * We don't need full mbuf/kni/ethdev headers here
+ * We don't need full kni/ethdev headers here
  */
-struct rte_mbuf;
 struct rte_kni;
 struct rte_eth_dev;
 
@@ -136,7 +136,7 @@ struct rte_pktdev {
  * of pointers to *rte_mbuf* structures effectively supplied to the
  * *rx_pkts* array.
  */
-static inline uint16_t
+static inline uint16_t __attribute__((always_inline))
 rte_pkt_rx_burst(struct rte_pktdev *dev, struct rte_mbuf **rx_pkts,
 		uint16_t nb_pkts)
 {
@@ -168,7 +168,7 @@ rte_pkt_rx_burst(struct rte_pktdev *dev, struct rte_mbuf **rx_pkts,
  * the transmit ring. The return value can be less than the value of the
  * *tx_pkts* parameter when the transmit ring is full or has been filled up.
  */
-static inline uint16_t
+static inline uint16_t __attribute__((always_inline))
 rte_pkt_tx_burst(struct rte_pktdev *dev, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts)
 {
-- 
2.4.2
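
As a usage reference for the two always-inline burst functions touched above, a typical fast-path loop would look roughly as follows. This is a sketch only: "dev" stands for any pktdev handle, e.g. one obtained from rte_pktdev_from_ring() as in the test, and the burst size of 32 is arbitrary. It handles the case documented above where rte_pkt_tx_burst() returns fewer packets than were passed in:

	/* sketch: names are illustrative; "dev" is a pktdev handle */
	struct rte_mbuf *pkts[32];
	uint16_t nb_rx, nb_tx;

	for (;;) {
		nb_rx = rte_pkt_rx_burst(dev, pkts, 32);
		if (nb_rx == 0)
			continue;
		nb_tx = rte_pkt_tx_burst(dev, pkts, nb_rx);
		/* tx may accept fewer than nb_rx packets when the ring
		 * fills up; free whatever could not be queued */
		while (nb_tx < nb_rx)
			rte_pktmbuf_free(pkts[nb_tx++]);
	}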