Get vrings' addresses and set up GPA and HPA mappings for offloading
large data movement from the CPU to DMA engines in the vhost-user
PMD.

Signed-off-by: Jiayu Hu <jiayu...@intel.com>
---
 drivers/Makefile                  |   2 +-
 drivers/net/vhost/Makefile        |   4 +-
 drivers/net/vhost/internal.h      | 141 ++++++++++++++++++++++++++++++++
 drivers/net/vhost/meson.build     |   3 +-
 drivers/net/vhost/rte_eth_vhost.c |  56 +------------
 drivers/net/vhost/virtio_net.c    | 119 +++++++++++++++++++++++++++
 drivers/net/vhost/virtio_net.h    | 168 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 438 insertions(+), 55 deletions(-)
 create mode 100644 drivers/net/vhost/internal.h
 create mode 100644 drivers/net/vhost/virtio_net.c
 create mode 100644 drivers/net/vhost/virtio_net.h

diff --git a/drivers/Makefile b/drivers/Makefile
index c70bdf9..8555ddd 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -9,7 +9,7 @@ DEPDIRS-bus := common
 DIRS-y += mempool
 DEPDIRS-mempool := common bus
 DIRS-y += net
-DEPDIRS-net := common bus mempool
+DEPDIRS-net := common bus mempool raw
 DIRS-$(CONFIG_RTE_LIBRTE_BBDEV) += baseband
 DEPDIRS-baseband := common bus mempool
 DIRS-$(CONFIG_RTE_LIBRTE_CRYPTODEV) += crypto
diff --git a/drivers/net/vhost/Makefile b/drivers/net/vhost/Makefile
index 0461e29..19cae52 100644
--- a/drivers/net/vhost/Makefile
+++ b/drivers/net/vhost/Makefile
@@ -15,13 +15,15 @@ LDLIBS += -lrte_bus_vdev
 
 CFLAGS += -O3
 CFLAGS += $(WERROR_FLAGS)
+CFLAGS += -fno-strict-aliasing
+CFLAGS += -DALLOW_EXPERIMENTAL_API
 
 EXPORT_MAP := rte_pmd_vhost_version.map
 
 #
 # all source are stored in SRCS-y
 #
-SRCS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += rte_eth_vhost.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += rte_eth_vhost.c virtio_net.c
 
 #
 # Export include files
diff --git a/drivers/net/vhost/internal.h b/drivers/net/vhost/internal.h
new file mode 100644
index 0000000..7588fdf
--- /dev/null
+++ b/drivers/net/vhost/internal.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+#ifndef _INTERNAL_H_
+#define _INTERNAL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <stdbool.h>
+
+#include <rte_pci.h>
+#include <rte_vhost.h>
+#include <rte_log.h>
+
+extern int vhost_logtype;
+
+#define VHOST_LOG(level, ...) \
+       rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
+
+/* Per-queue extended statistics counters, indexed into vhost_stats.xstats. */
+enum vhost_xstats_pkts {
+       VHOST_UNDERSIZE_PKT = 0,        /* frames shorter than 64 bytes */
+       VHOST_64_PKT,                   /* 64-byte frames */
+       VHOST_65_TO_127_PKT,
+       VHOST_128_TO_255_PKT,
+       VHOST_256_TO_511_PKT,
+       VHOST_512_TO_1023_PKT,
+       VHOST_1024_TO_1522_PKT,
+       VHOST_1523_TO_MAX_PKT,
+       VHOST_BROADCAST_PKT,
+       VHOST_MULTICAST_PKT,
+       VHOST_UNICAST_PKT,
+       VHOST_ERRORS_PKT,
+       VHOST_ERRORS_FRAGMENTED,
+       VHOST_ERRORS_JABBER,
+       VHOST_UNKNOWN_PROTOCOL,
+       VHOST_XSTATS_MAX,               /* number of counters, not a counter */
+};
+
+/* Basic and extended per-queue statistics. */
+struct vhost_stats {
+       uint64_t pkts;          /* successfully processed packets */
+       uint64_t bytes;         /* successfully processed bytes */
+       uint64_t missed_pkts;   /* packets dropped/not processed */
+       uint64_t xstats[VHOST_XSTATS_MAX];
+};
+
+/* One deferred CPU copy: dst/src are host virtual addresses. */
+struct batch_copy_elem {
+       void *dst;
+       void *src;
+       uint32_t len;
+};
+
+/* One contiguous GPA -> HPA mapping used for DMA address translation. */
+struct guest_page {
+       uint64_t guest_phys_addr;
+       uint64_t host_phys_addr;
+       uint64_t size;          /* length in bytes of this contiguous range */
+};
+
+/* Per-vring state for DMA-accelerated data path. */
+struct dma_vring {
+       struct rte_vhost_vring  vr;     /* ring addresses from rte_vhost_get_vhost_vring() */
+
+       uint16_t last_avail_idx;        /* local avail index, seeded by rte_vhost_get_vring_base() */
+       uint16_t last_used_idx;         /* local used index, seeded by rte_vhost_get_vring_base() */
+
+       /* the last used index that front end can consume */
+       uint16_t copy_done_used;
+
+       uint16_t signalled_used;        /* NOTE(review): presumably last used idx the guest was notified of — confirm in datapath */
+       bool signalled_used_valid;
+
+       struct vring_used_elem *shadow_used_split;      /* staging area for used-ring updates, vr.size entries */
+       uint16_t shadow_used_idx;       /* number of valid entries in shadow_used_split */
+
+       struct batch_copy_elem  *batch_copy_elems;      /* deferred CPU copies, vr.size entries */
+       uint16_t batch_copy_nb_elems;   /* number of valid entries in batch_copy_elems */
+
+       bool dma_enabled;               /* true when a DMA engine is bound to this vring */
+       /**
+        * DMA ID. Currently, we only support I/OAT,
+        * so it's I/OAT rawdev ID.
+        */
+       uint16_t dev_id;
+       /* DMA address */
+       struct rte_pci_addr dma_addr;
+       /**
+        * the number of copy jobs that are submitted to the DMA
+        * but may not be completed.
+        */
+       uint64_t nr_inflight;
+       int nr_batching;                /* NOTE(review): batching threshold/counter — semantics set by datapath patches, confirm */
+
+       /**
+        * host physical address of used ring index,
+        * used by the DMA.
+        */
+       phys_addr_t used_idx_hpa;
+};
+
+/* Per-queue context handed to rx/tx burst functions. */
+struct vhost_queue {
+       int vid;                        /* vhost device ID */
+       rte_atomic32_t allow_queuing;   /* gate for the datapath */
+       rte_atomic32_t while_queuing;   /* set while a burst is in progress */
+       struct pmd_internal *internal;  /* back-pointer to the device */
+       struct rte_mempool *mb_pool;    /* mbuf pool for receives */
+       uint16_t port;
+       uint16_t virtqueue_id;
+       struct vhost_stats stats;
+       struct dma_vring *dma_vring;    /* = &internal->dma_vrings[virtqueue_id] */
+};
+
+/* Per-device private data of the vhost PMD. */
+struct pmd_internal {
+       rte_atomic32_t dev_attached;
+       char *iface_name;               /* vhost-user socket path */
+       uint64_t flags;
+       uint64_t disable_flags;
+       uint16_t max_queues;
+       int vid;
+       rte_atomic32_t started;
+       uint8_t vlan_strip;
+
+       /* guest's memory regions */
+       struct rte_vhost_memory *mem;
+       /* guest and host physical address mapping table */
+       struct guest_page *guest_pages;
+       uint32_t nr_guest_pages;        /* valid entries in guest_pages */
+       uint32_t max_guest_pages;       /* allocated capacity of guest_pages */
+       /* guest's vrings, indexed by virtqueue ID (rx/tx interleaved) */
+       struct dma_vring dma_vrings[RTE_MAX_QUEUES_PER_PORT * 2];
+       uint16_t nr_vrings;
+       /* negotiated features */
+       uint64_t features;
+       size_t hdr_len;                 /* virtio-net header size per negotiated features */
+};
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INTERNAL_H_ */
diff --git a/drivers/net/vhost/meson.build b/drivers/net/vhost/meson.build
index d793086..b308dcb 100644
--- a/drivers/net/vhost/meson.build
+++ b/drivers/net/vhost/meson.build
@@ -3,6 +3,7 @@
 
 build = dpdk_conf.has('RTE_LIBRTE_VHOST')
 reason = 'missing dependency, DPDK vhost library'
-sources = files('rte_eth_vhost.c')
+sources = files('rte_eth_vhost.c',
+               'virtio_net.c')
 install_headers('rte_eth_vhost.h')
 deps += 'vhost'
diff --git a/drivers/net/vhost/rte_eth_vhost.c 
b/drivers/net/vhost/rte_eth_vhost.c
index 458ed58..b5c927c 100644
--- a/drivers/net/vhost/rte_eth_vhost.c
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -16,12 +16,10 @@
 #include <rte_vhost.h>
 #include <rte_spinlock.h>
 
+#include "internal.h"
 #include "rte_eth_vhost.h"
 
-static int vhost_logtype;
-
-#define VHOST_LOG(level, ...) \
-       rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
+int vhost_logtype;
 
 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
 
@@ -56,54 +54,6 @@ static struct rte_ether_addr base_eth_addr = {
        }
 };
 
-enum vhost_xstats_pkts {
-       VHOST_UNDERSIZE_PKT = 0,
-       VHOST_64_PKT,
-       VHOST_65_TO_127_PKT,
-       VHOST_128_TO_255_PKT,
-       VHOST_256_TO_511_PKT,
-       VHOST_512_TO_1023_PKT,
-       VHOST_1024_TO_1522_PKT,
-       VHOST_1523_TO_MAX_PKT,
-       VHOST_BROADCAST_PKT,
-       VHOST_MULTICAST_PKT,
-       VHOST_UNICAST_PKT,
-       VHOST_ERRORS_PKT,
-       VHOST_ERRORS_FRAGMENTED,
-       VHOST_ERRORS_JABBER,
-       VHOST_UNKNOWN_PROTOCOL,
-       VHOST_XSTATS_MAX,
-};
-
-struct vhost_stats {
-       uint64_t pkts;
-       uint64_t bytes;
-       uint64_t missed_pkts;
-       uint64_t xstats[VHOST_XSTATS_MAX];
-};
-
-struct vhost_queue {
-       int vid;
-       rte_atomic32_t allow_queuing;
-       rte_atomic32_t while_queuing;
-       struct pmd_internal *internal;
-       struct rte_mempool *mb_pool;
-       uint16_t port;
-       uint16_t virtqueue_id;
-       struct vhost_stats stats;
-};
-
-struct pmd_internal {
-       rte_atomic32_t dev_attached;
-       char *iface_name;
-       uint64_t flags;
-       uint64_t disable_flags;
-       uint16_t max_queues;
-       int vid;
-       rte_atomic32_t started;
-       uint8_t vlan_strip;
-};
-
 struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct rte_eth_dev *eth_dev;
@@ -698,6 +648,7 @@ queue_setup(struct rte_eth_dev *eth_dev, struct 
pmd_internal *internal)
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
+               vq->dma_vring = &internal->dma_vrings[vq->virtqueue_id];
        }
        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
                vq = eth_dev->data->tx_queues[i];
@@ -706,6 +657,7 @@ queue_setup(struct rte_eth_dev *eth_dev, struct 
pmd_internal *internal)
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
+               vq->dma_vring = &internal->dma_vrings[vq->virtqueue_id];
        }
 }
 
diff --git a/drivers/net/vhost/virtio_net.c b/drivers/net/vhost/virtio_net.c
new file mode 100644
index 0000000..11591c0
--- /dev/null
+++ b/drivers/net/vhost/virtio_net.c
@@ -0,0 +1,119 @@
+#include <stdint.h>
+#include <stdbool.h>
+#include <linux/virtio_net.h>
+
+#include <rte_malloc.h>
+#include <rte_vhost.h>
+
+#include "virtio_net.h"
+
+/**
+ * Set up DMA acceleration metadata for every vring of the device.
+ *
+ * Fetches the negotiated features, the guest memory table and each
+ * vring's addresses, builds the GPA -> HPA mapping table, and
+ * allocates the shadow used ring and batch-copy arrays per vring.
+ *
+ * Returns 0 on success, -1 on failure; on failure every resource
+ * acquired here is released again.
+ */
+int
+vhost_dma_setup(struct pmd_internal *dev)
+{
+       struct dma_vring *dma_vr;
+       int vid = dev->vid;
+       int ret;
+       uint16_t i, j, size;
+
+       rte_vhost_get_negotiated_features(vid, &dev->features);
+
+       /* features is 64-bit wide: shift a 64-bit constant */
+       if (dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF))
+               dev->hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+       else
+               dev->hdr_len = sizeof(struct virtio_net_hdr);
+
+       dev->nr_vrings = rte_vhost_get_vring_num(vid);
+
+       if (rte_vhost_get_mem_table(vid, &dev->mem) < 0) {
+               VHOST_LOG(ERR, "Failed to get guest memory regions\n");
+               return -1;
+       }
+
+       /* set up gpa and hpa mappings */
+       if (setup_guest_pages(dev, dev->mem) < 0) {
+               VHOST_LOG(ERR, "Failed to set up hpa and gpa mappings\n");
+               /*
+                * Also release a partially built page table;
+                * free(NULL) is a no-op, so this is safe even if the
+                * very first allocation failed.
+                */
+               free(dev->guest_pages);
+               dev->guest_pages = NULL;
+               free(dev->mem);
+               dev->mem = NULL;
+               return -1;
+       }
+
+       for (i = 0; i < dev->nr_vrings; i++) {
+               dma_vr = &dev->dma_vrings[i];
+
+               ret = rte_vhost_get_vring_base(vid, i, &dma_vr->last_avail_idx,
+                                              &dma_vr->last_used_idx);
+               if (ret < 0) {
+                       VHOST_LOG(ERR, "Failed to get vring index.\n");
+                       goto err;
+               }
+
+               ret = rte_vhost_get_vhost_vring(vid, i, &dma_vr->vr);
+               if (ret < 0) {
+                       VHOST_LOG(ERR, "Failed to get vring address.\n");
+                       goto err;
+               }
+
+               size = dma_vr->vr.size;
+               dma_vr->shadow_used_split =
+                       rte_malloc(NULL, size * sizeof(struct vring_used_elem),
+                                  RTE_CACHE_LINE_SIZE);
+               if (dma_vr->shadow_used_split == NULL) {
+                       VHOST_LOG(ERR, "Failed to allocate shadow used ring\n");
+                       goto err;
+               }
+
+               dma_vr->batch_copy_elems =
+                       rte_malloc(NULL, size * sizeof(struct batch_copy_elem),
+                                  RTE_CACHE_LINE_SIZE);
+               if (dma_vr->batch_copy_elems == NULL) {
+                       VHOST_LOG(ERR, "Failed to allocate batch copy array\n");
+                       goto err;
+               }
+
+               /* get HPA of used ring's index */
+               dma_vr->used_idx_hpa =
+                       rte_mem_virt2iova(&dma_vr->vr.used->idx);
+
+               dma_vr->copy_done_used = dma_vr->last_used_idx;
+               dma_vr->signalled_used = dma_vr->last_used_idx;
+               dma_vr->signalled_used_valid = false;
+               dma_vr->shadow_used_idx = 0;
+               dma_vr->batch_copy_nb_elems = 0;
+       }
+
+       return 0;
+
+err:
+       /* vring i may be partially set up; rte_free(NULL) is a no-op */
+       for (j = 0; j <= i; j++) {
+               dma_vr = &dev->dma_vrings[j];
+               rte_free(dma_vr->shadow_used_split);
+               rte_free(dma_vr->batch_copy_elems);
+               dma_vr->shadow_used_split = NULL;
+               dma_vr->batch_copy_elems = NULL;
+               dma_vr->used_idx_hpa = 0;
+       }
+
+       free(dev->mem);
+       dev->mem = NULL;
+       free(dev->guest_pages);
+       dev->guest_pages = NULL;
+
+       return -1;
+}
+
+/**
+ * Tear down all per-vring DMA metadata and release the guest memory
+ * table and the GPA -> HPA mapping table of the device.
+ */
+void
+vhost_dma_remove(struct pmd_internal *dev)
+{
+       uint16_t qid;
+
+       for (qid = 0; qid < dev->nr_vrings; qid++) {
+               struct dma_vring *vr = &dev->dma_vrings[qid];
+
+               rte_free(vr->batch_copy_elems);
+               vr->batch_copy_elems = NULL;
+               rte_free(vr->shadow_used_split);
+               vr->shadow_used_split = NULL;
+               vr->signalled_used_valid = false;
+               vr->used_idx_hpa = 0;
+       }
+
+       free(dev->mem);
+       dev->mem = NULL;
+       free(dev->guest_pages);
+       dev->guest_pages = NULL;
+}
diff --git a/drivers/net/vhost/virtio_net.h b/drivers/net/vhost/virtio_net.h
new file mode 100644
index 0000000..7f99f1d
--- /dev/null
+++ b/drivers/net/vhost/virtio_net.h
@@ -0,0 +1,168 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2020 Intel Corporation
+ */
+#ifndef _VIRTIO_NET_H_
+#define _VIRTIO_NET_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "internal.h"
+
+static uint64_t
+get_blk_size(int fd)
+{
+       struct stat stat;
+       int ret;
+
+       ret = fstat(fd, &stat);
+       return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
+}
+
+/**
+ * Append one GPA -> HPA mapping to dev->guest_pages, growing the table
+ * as needed and merging with the previous entry when both the guest
+ * and host physical ranges are contiguous.
+ *
+ * Returns 0 on success, -1 on allocation failure (the table is freed
+ * and reset so later lookups see an empty, consistent state).
+ */
+static __rte_always_inline int
+add_one_guest_page(struct pmd_internal *dev, uint64_t guest_phys_addr,
+                  uint64_t host_phys_addr, uint64_t size)
+{
+       struct guest_page *page, *last_page;
+       struct guest_page *old_pages;
+
+       if (dev->nr_guest_pages == dev->max_guest_pages) {
+               dev->max_guest_pages *= 2;
+               old_pages = dev->guest_pages;
+               dev->guest_pages = realloc(dev->guest_pages,
+                                          dev->max_guest_pages *
+                                          sizeof(*page));
+               if (!dev->guest_pages) {
+                       VHOST_LOG(ERR, "Cannot realloc guest_pages\n");
+                       free(old_pages);
+                       /* keep count consistent with the freed table */
+                       dev->nr_guest_pages = 0;
+                       return -1;
+               }
+       }
+
+       if (dev->nr_guest_pages > 0) {
+               last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
+               /*
+                * Merge only if the two pages are continuous in BOTH
+                * address spaces; merging host-continuous but
+                * guest-discontinuous pages would make gpa_to_hpa()
+                * translate addresses that were never mapped.
+                */
+               if (guest_phys_addr == last_page->guest_phys_addr +
+                   last_page->size &&
+                   host_phys_addr == last_page->host_phys_addr +
+                   last_page->size) {
+                       last_page->size += size;
+                       return 0;
+               }
+       }
+
+       page = &dev->guest_pages[dev->nr_guest_pages++];
+       page->guest_phys_addr = guest_phys_addr;
+       page->host_phys_addr  = host_phys_addr;
+       page->size = size;
+
+       return 0;
+}
+
+/**
+ * Walk guest memory region @reg host-page by host-page and record a
+ * GPA -> HPA mapping for each page via add_one_guest_page().
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+static __rte_always_inline int
+add_guest_page(struct pmd_internal *dev, struct rte_vhost_mem_region *reg)
+{
+       uint64_t reg_size = reg->size;
+       uint64_t host_user_addr  = reg->host_user_addr;
+       uint64_t guest_phys_addr = reg->guest_phys_addr;
+       uint64_t host_phys_addr;
+       uint64_t size, page_size;
+
+       page_size = get_blk_size(reg->fd);      /* backing (huge)page size */
+       if (page_size == (uint64_t)-1) {
+               VHOST_LOG(ERR, "Cannot get hugepage size through fstat\n");
+               return -1;
+       }
+
+       host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
+       /* first chunk: up to the next page boundary, capped by region size */
+       size = page_size - (guest_phys_addr & (page_size - 1));
+       size = RTE_MIN(size, reg_size);
+
+       if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
+               return -1;
+
+       host_user_addr  += size;
+       guest_phys_addr += size;
+       reg_size -= size;
+
+       /* remaining chunks are page-aligned; last one may be short */
+       while (reg_size > 0) {
+               size = RTE_MIN(reg_size, page_size);
+               host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
+                                                  host_user_addr);
+               if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
+                                      size) < 0)
+                       return -1;
+
+               host_user_addr  += size;
+               guest_phys_addr += size;
+               reg_size -= size;
+       }
+
+       return 0;
+}
+
+/**
+ * Build the GPA -> HPA mapping table (dev->guest_pages) for all guest
+ * memory regions in @mem.
+ *
+ * Returns 0 on success, -1 on failure; on failure the partially built
+ * table is freed here so callers need not clean it up.
+ */
+static __rte_always_inline int
+setup_guest_pages(struct pmd_internal *dev, struct rte_vhost_memory *mem)
+{
+       uint32_t nr_regions = mem->nregions;
+       uint32_t i;
+
+       dev->nr_guest_pages = 0;
+       dev->max_guest_pages = 8;
+
+       dev->guest_pages = malloc(dev->max_guest_pages *
+                                 sizeof(struct guest_page));
+       if (dev->guest_pages == NULL) {
+               VHOST_LOG(ERR, "(%d) failed to allocate memory "
+                         "for dev->guest_pages\n", dev->vid);
+               return -1;
+       }
+
+       for (i = 0; i < nr_regions; i++) {
+               if (add_guest_page(dev, &mem->regions[i]) < 0) {
+                       /* don't leak the table; free(NULL) is a no-op */
+                       free(dev->guest_pages);
+                       dev->guest_pages = NULL;
+                       dev->nr_guest_pages = 0;
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * Translate guest physical address @gpa of a buffer of @size bytes to
+ * its host physical (IOVA) address.
+ *
+ * Returns 0 if the buffer is not fully covered by a single contiguous
+ * mapping (0 is the "not found" sentinel, not a valid translation).
+ */
+static __rte_always_inline rte_iova_t
+gpa_to_hpa(struct pmd_internal *dev, uint64_t gpa, uint64_t size)
+{
+       uint32_t i;
+       struct guest_page *page;
+
+       for (i = 0; i < dev->nr_guest_pages; i++) {
+               page = &dev->guest_pages[i];
+
+               /*
+                * The buffer [gpa, gpa + size) must lie within
+                * [guest_phys_addr, guest_phys_addr + page->size).
+                * Use <= so a buffer ending exactly at the end of a
+                * mapped range is accepted; '<' wrongly rejected it.
+                */
+               if (gpa >= page->guest_phys_addr &&
+                   gpa + size <= page->guest_phys_addr + page->size) {
+                       return gpa - page->guest_phys_addr +
+                              page->host_phys_addr;
+               }
+       }
+
+       return 0;
+}
+
+/**
+ * This function gets front end's memory and vrings information.
+ * In addition, it sets up necessary data structures for enqueue
+ * and dequeue operations.
+ *
+ * @param dev
+ *  vhost PMD device internals; mem, guest_pages and dma_vrings are
+ *  populated on success.
+ * @return
+ *  0 on success, -1 on failure (acquired resources are released).
+ */
+int vhost_dma_setup(struct pmd_internal *dev);
+
+/**
+ * This function destroys front end's information and frees data
+ * structures for enqueue and dequeue operations.
+ *
+ * @param dev
+ *  vhost PMD device internals; safe to call after a failed or
+ *  successful vhost_dma_setup().
+ */
+void vhost_dma_remove(struct pmd_internal *dev);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VIRTIO_NET_H_ */
-- 
2.7.4

Reply via email to