This patch enables client DPDK rings within netdev-dpdk. It adds a new DPDK device type called dpdkr (other naming suggestions are welcome). This allows the use of shared memory to communicate with other DPDK applications, either on the host or within a virtual machine. Instructions for use are in INSTALL.DPDK.
This has been tested on Intel multi-core platforms and with the client application within the host. Signed-off-by: Gerald Rogers <gerald.rog...@intel.com> Signed-off-by: Gerald Rogers <gerald.rog...@intel.com> diff --git a/INSTALL.DPDK b/INSTALL.DPDK index 3e0247a..ec1b814 100644 --- a/INSTALL.DPDK +++ b/INSTALL.DPDK @@ -74,6 +74,44 @@ Once first DPDK port is added vswitchd, it creates Polling thread and polls dpdk device in continuous loop. Therefore CPU utilization for that thread is always 100%. +DPDK Rings : + +Following the steps above to create a bridge, you can now add dpdk rings +as a port to the vswitch. OVS will expect the DPDK ring device name to +start with dpdkr and end with portid. + + ovs-vsctl add-port br0 dpdkr0 -- set Interface dpdkr0 type=dpdkr + +DPDK rings client test application + +Included in the test directory is a sample DPDK application for testing +the rings. This is from the base dpdk directory and modified to work +with the ring naming used within ovs. + +location tests/ovs_client + +To run the client : + +ovsclient -c 1 -n 4 --proc-type=secondary -- --n "port id you gave dpdkr" + +It is essential to have --proc-type=secondary + +The application simply receives an mbuf on the receive queue of the +ethernet ring and then places that same mbuf on the transmit ring of +the ethernet ring. It is a trivial loopback application. + +In addition to executing the client in the host, you can execute it within +a guest VM. To do so you will need a patched qemu. You can download the +patch and getting started guid at : + +https://01.org/packet-processing/downloads + +A general rule of thumb for better performance is that the client +application shouldn't be assigned the same dpdk core mask "-c" as +the vswitchd. + + + Restrictions: ------------- @@ -86,6 +124,11 @@ Restrictions: device queues configured. - Work with 1500 MTU, needs few changes in DPDK lib to fix this issue. - Currently DPDK port does not make use any offload functionality. 
+ ivshmem + - The shared memory is currently restricted to the use of a 1GB + huge pages. + - All huge pages are shared amongst the host, clients, virtual + machines etc. Bug Reporting: -------------- diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index fd991ab..a90913f 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -148,6 +148,23 @@ struct dpdk_tx_queue { struct rte_mbuf *burst_pkts[MAX_TX_QUEUE_LEN]; }; +/* dpdk has no way to remove dpdk ring ethernet devices + so we have to keep them around once they've been created +*/ + +static struct list dpdk_ring_list OVS_GUARDED_BY(dpdk_mutex) + = LIST_INITIALIZER(&dpdk_ring_list); + +struct dpdk_ring { + /* For the client rings */ + struct rte_ring * cring_tx; + struct rte_ring * cring_rx; + int port_id; /* dpdkr port id */ + int eth_port_id; /* ethernet device port id */ + struct list list_node OVS_GUARDED_BY(mutex); + +}; + struct netdev_dpdk { struct netdev up; int port_id; @@ -167,9 +184,14 @@ struct netdev_dpdk { uint8_t hwaddr[ETH_ADDR_LEN]; enum netdev_flags flags; + struct rte_eth_link link; int link_reset_cnt; + /* If a shared memory ring */ + + struct dpdk_ring *ivshmem; + /* In dpdk_list. 
*/ struct list list_node OVS_GUARDED_BY(dpdk_mutex); }; @@ -571,6 +593,7 @@ dpdk_queue_flush(struct netdev_dpdk *dev, int qid) if (txq->count == 0) { return; } + rte_spinlock_lock(&txq->tx_lock); nb_tx = rte_eth_tx_burst(dev->port_id, qid, txq->burst_pkts, txq->count); if (nb_tx != txq->count) { @@ -593,6 +616,7 @@ netdev_dpdk_rxq_recv(struct netdev_rxq *rxq_, struct ofpbuf **packets, int *c) dpdk_queue_flush(dev, rxq_->queue_id); + nb_rx = rte_eth_rx_burst(rx->port_id, rxq_->queue_id, (struct rte_mbuf **) packets, MAX_RX_QUEUE_LEN); if (!nb_rx) { @@ -1170,6 +1194,200 @@ static struct netdev_class netdev_dpdk_class = { NULL, /* rxq_drain */ }; +/* Client Rings */ + +static int +dpdk_classr_init(void) +{ + VLOG_INFO("Initialized dpdk client handlers:\n"); + return 0; +} + +static int +netdev_dpdkr_construct(struct netdev *netdev_) +{ + struct netdev_dpdk *netdev = netdev_dpdk_cast(netdev_); + struct dpdk_ring *ivshmem; + unsigned int port_no; + char *cport; + int err=0; + int found=0; + char name[10]; + + + if (rte_eal_init_ret) { + return rte_eal_init_ret; + } + + ovs_mutex_lock(&dpdk_mutex); + cport = netdev_->name + 5; /* Names always start with "dpdkr" */ + + if (strncmp(netdev_->name, "dpdkr", 5)) { + VLOG_ERR("Not a valid dpdkr name %s:\n",netdev_->name); + err = ENODEV; + goto unlock_dpdk; + } + + port_no = strtol(cport, 0, 0); /* string must be null terminated */ + + ovs_mutex_init(&netdev->mutex); + + ovs_mutex_lock(&netdev->mutex); + netdev->flags = 0; + + netdev->mtu = ETHER_MTU; + netdev->max_packet_len = MTU_TO_MAX_LEN(netdev->mtu); + + /* TODO: need to discover device node at run time. 
*/ + netdev->socket_id = SOCKET0; + + netdev->dpdk_mp = dpdk_mp_get(netdev->socket_id, netdev->mtu); + if (!netdev->dpdk_mp) { + VLOG_ERR("Unable to get memory pool\n"); + err = ENOMEM; + goto unlock_dev; + } + + /* look through our list to find the device */ + LIST_FOR_EACH(ivshmem, list_node, &dpdk_ring_list){ + if (ivshmem->port_id == port_no) { + VLOG_INFO("Found dpdk ring device %s:\n", netdev_->name); + found = 1; + break; + } + } + if (found) { + netdev->ivshmem = ivshmem; + netdev->port_id = ivshmem->eth_port_id; /* really all that is needed */ + } + else { + /* Need to create the device rings */ + + ivshmem = dpdk_rte_mzalloc(sizeof *ivshmem); + if (ivshmem == NULL) + goto unlock_dev; + + snprintf(name,10,"%s_tx",netdev_->name); + ivshmem->cring_tx = rte_ring_create(name, MAX_RX_QUEUE_LEN, SOCKET0, 0); + if (ivshmem->cring_tx == NULL) + goto unlock_dev; + + snprintf(name,10,"%s_rx",netdev_->name); + ivshmem->cring_rx = rte_ring_create(name, MAX_RX_QUEUE_LEN, SOCKET0, 0); + if (ivshmem->cring_rx == NULL) + goto unlock_dev; + + err = rte_eth_from_rings(&ivshmem->cring_rx, 1, &ivshmem->cring_tx, 1, SOCKET0); + if (err < 0) { + goto unlock_dev; + } + + ivshmem->port_id = port_no; + ivshmem->eth_port_id = netdev->port_id = rte_eth_dev_count()-1 ; + + netdev->ivshmem=ivshmem; + + list_push_back(&dpdk_ring_list, &ivshmem->list_node); + + } + + err = dpdk_eth_dev_init(netdev); + if (err) { + goto unlock_dev; + } + + list_push_back(&dpdk_list, &netdev->list_node); + + err = 0; +unlock_dev: + ovs_mutex_unlock(&netdev->mutex); +unlock_dpdk: + ovs_mutex_unlock(&dpdk_mutex); + return err; +} + +static void +netdev_dpdkr_destruct(struct netdev *netdev_) +{ + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev_); + + ovs_mutex_lock(&dev->mutex); + rte_eth_dev_stop(dev->port_id); + + ovs_mutex_unlock(&dev->mutex); + + ovs_mutex_lock(&dpdk_mutex); + list_remove(&dev->list_node); + dpdk_mp_put(dev->dpdk_mp); + ovs_mutex_unlock(&dpdk_mutex); + + ovs_mutex_destroy(&dev->mutex); 
+} + +static struct netdev_class netdev_dpdkr_class = { + "dpdkr", + dpdk_classr_init, /* init */ + NULL, /* netdev_dpdk_run */ + NULL, /* netdev_dpdk_wait */ + + netdev_dpdk_alloc, + netdev_dpdkr_construct, + netdev_dpdkr_destruct, + netdev_dpdk_dealloc, + netdev_dpdk_get_config, + NULL, /* netdev_dpdk_set_config */ + NULL, /* get_tunnel_config */ + + netdev_dpdk_send, /* send */ + NULL, /* send_wait */ + + netdev_dpdk_set_etheraddr, + netdev_dpdk_get_etheraddr, + netdev_dpdk_get_mtu, + netdev_dpdk_set_mtu, + netdev_dpdk_get_ifindex, + netdev_dpdk_get_carrier, + netdev_dpdk_get_carrier_resets, + netdev_dpdk_set_miimon, + netdev_dpdk_get_stats, + netdev_dpdk_set_stats, + netdev_dpdk_get_features, + NULL, /* set_advertisements */ + + NULL, /* set_policing */ + NULL, /* get_qos_types */ + NULL, /* get_qos_capabilities */ + NULL, /* get_qos */ + NULL, /* set_qos */ + NULL, /* get_queue */ + NULL, /* set_queue */ + NULL, /* delete_queue */ + NULL, /* get_queue_stats */ + NULL, /* queue_dump_start */ + NULL, /* queue_dump_next */ + NULL, /* queue_dump_done */ + NULL, /* dump_queue_stats */ + + NULL, /* get_in4 */ + NULL, /* set_in4 */ + NULL, /* get_in6 */ + NULL, /* add_router */ + NULL, /* get_next_hop */ + netdev_dpdk_get_status, + NULL, /* arp_lookup */ + + netdev_dpdk_update_flags, + + netdev_dpdk_rxq_alloc, + netdev_dpdk_rxq_construct, + netdev_dpdk_rxq_destruct, + netdev_dpdk_rxq_dealloc, + netdev_dpdk_rxq_recv, + NULL, /* rx_wait */ + NULL, /* rxq_drain */ +}; + + int dpdk_init(int argc, char **argv) { @@ -1196,6 +1414,7 @@ void netdev_dpdk_register(void) { netdev_register_provider(&netdev_dpdk_class); + netdev_register_provider(&netdev_dpdkr_class); } int diff --git a/lib/netdev-dpdk.h b/lib/netdev-dpdk.h index 2807310..8cb715a 100644 --- a/lib/netdev-dpdk.h +++ b/lib/netdev-dpdk.h @@ -10,6 +10,7 @@ #include <rte_eal.h> #include <rte_debug.h> #include <rte_ethdev.h> +#include <rte_eth_ring.h> #include <rte_errno.h> #include <rte_memzone.h> #include 
<rte_memcpy.h> diff --git a/lib/netdev.c b/lib/netdev.c index 2fc1834..610de13 100644 --- a/lib/netdev.c +++ b/lib/netdev.c @@ -98,7 +98,7 @@ netdev_n_rxq(const struct netdev *netdev) bool netdev_is_pmd(const struct netdev *netdev) { - return !strcmp(netdev->netdev_class->type, "dpdk"); + return !strncmp(netdev->netdev_class->type, "dpdk",4); } static void diff --git a/tests/automake.mk b/tests/automake.mk index bf80702..0067812 100644 --- a/tests/automake.mk +++ b/tests/automake.mk @@ -204,6 +204,11 @@ tests/idltest.ovsidl: $(IDLTEST_IDL_FILES) tests/idltest.c: tests/idltest.h +noinst_PROGRAMS += tests/ovsclient +tests_ovsclient_SOURCES = \ + tests/ovs_client/ovs_client.c +tests_ovsclient_LDADD = lib/libopenvswitch.la $(LIBS) + noinst_PROGRAMS += tests/ovstest tests_ovstest_SOURCES = \ tests/ovstest.c \ diff --git a/tests/ovs_client/ovs_client.c b/tests/ovs_client/ovs_client.c new file mode 100644 index 0000000..07f49ed --- /dev/null +++ b/tests/ovs_client/ovs_client.c @@ -0,0 +1,217 @@ +/* + * BSD LICENSE + * + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include <getopt.h> + +#include <config.h> +#include <rte_config.h> +#include <rte_mbuf.h> +#include <rte_ether.h> +#include <rte_string_fns.h> +#include <rte_ip.h> +#include <rte_byteorder.h> + +/* Number of packets to attempt to read from queue */ +#define PKT_READ_SIZE ((uint16_t)32) + +/* define common names for structures shared between ovs_dpdk and client */ +#define MP_CLIENT_RXQ_NAME "dpdkr%u_tx" +#define MP_CLIENT_TXQ_NAME "dpdkr%u_rx" + +#define RTE_LOGTYPE_APP RTE_LOGTYPE_USER1 + +#define BASE_10 10 + +/* our client id number - tells us which rx queue to read, and tx + * queue to write to. */ +static uint8_t client_id = 0; + +int no_pkt; +int pkt; + +/* + * Given the rx queue name template above, get the queue name + */ +static inline const char * +get_rx_queue_name(unsigned id) +{ + /* buffer for return value. 
Size calculated by %u being replaced + * by maximum 3 digits (plus an extra byte for safety) */ + static char buffer[sizeof(MP_CLIENT_RXQ_NAME) + 2]; + + rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_RXQ_NAME, id); + return buffer; +} + +/* + * Given the tx queue name template above, get the queue name + */ +static inline const char * +get_tx_queue_name(unsigned id) +{ + /* buffer for return value. Size calculated by %u being replaced + * by maximum 3 digits (plus an extra byte for safety) */ + static char buffer[sizeof(MP_CLIENT_TXQ_NAME) + 2]; + + rte_snprintf(buffer, sizeof(buffer) - 1, MP_CLIENT_TXQ_NAME, id); + return buffer; +} + +/* + * print a usage message + */ +static void +usage(const char *progname) +{ + printf("\nUsage: %s [EAL args] -- -n <client_id>\n", progname); +} + +/* + * Convert the client id number from a string to an int. + */ +static int +parse_client_num(const char *client) +{ + char *end = NULL; + unsigned long temp = 0; + + if (client == NULL || *client == '\0') + return -1; + + temp = strtoul(client, &end, BASE_10); + /* If valid string argument is provided, terminating '/0' character + * is stored in 'end' */ + if (end == NULL || *end != '\0') + return -1; + + client_id = (uint8_t)temp; + return 0; +} + +/* + * Parse the application arguments to the client app. + */ +static int +parse_app_args(int argc, char *argv[]) +{ + int option_index = 0, opt = 0; + char **argvopt = argv; + const char *progname = NULL; + static struct option lgopts[] = { + {NULL, 0, 0, 0 } + }; + progname = argv[0]; + + while ((opt = getopt_long(argc, argvopt, "n:s:d:", lgopts, + &option_index)) != EOF){ + switch (opt){ + case 'n': + if (parse_client_num(optarg) != 0){ + usage(progname); + return -1; + } + break; + default: + usage(progname); + return -1; + } + } + + return 0; +} + +/* + * Application main function - loops through + * receiving and processing packets. 
Never returns + */ +int +main(int argc, char *argv[]) +{ + struct rte_ring *rx_ring = NULL; + struct rte_ring *tx_ring = NULL; + int retval = 0; + void *pkts[PKT_READ_SIZE]; + int rslt = 0; + + if ((retval = rte_eal_init(argc, argv)) < 0) + return -1; + + argc -= retval; + argv += retval; + + if (parse_app_args(argc, argv) < 0) + rte_exit(EXIT_FAILURE, "Invalid command-line arguments\n"); + + rx_ring = rte_ring_lookup(get_rx_queue_name(client_id)); + if (rx_ring == NULL) + rte_exit(EXIT_FAILURE, "Cannot get RX ring - is server process running?\n"); + + tx_ring = rte_ring_lookup(get_tx_queue_name(client_id)); + if (tx_ring == NULL) + rte_exit(EXIT_FAILURE, "Cannot get TX ring - is server process running?\n"); + + RTE_LOG(INFO, APP, "Finished Process Init.\n"); + + printf("\nClient process %d handling packets\n", client_id); + printf("[Press Ctrl-C to quit ...]\n"); + + for (;;) { + unsigned rx_pkts = PKT_READ_SIZE; + + /* try dequeuing max possible packets first, if that fails, get the + * most we can. Loop body should only execute once, maximum */ + while (rx_pkts > 0 && + unlikely(rte_ring_dequeue_bulk(rx_ring, pkts, rx_pkts) != 0)) + rx_pkts = (uint16_t)RTE_MIN(rte_ring_count(rx_ring), PKT_READ_SIZE); + + if(rx_pkts>0) + { + + pkt++; + + /* blocking enqueue */ + do { + rslt = rte_ring_enqueue_bulk(tx_ring, pkts, rx_pkts); + } while (rslt == -ENOBUFS); + } + else + no_pkt++; + + if (!(pkt % 100000)) + { + printf("pkt %d %d\n", pkt, no_pkt); + pkt=no_pkt=0; + } + } +} _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev