On 2017年01月10日 14:11, Tan, Jianfeng wrote:

Hi Jason,


On 1/9/2017 12:39 PM, Jason Wang wrote:


On 2016年12月23日 15:14, Jianfeng Tan wrote:
This patch add support vhost kernel as the backend for virtio_user.
Three main hook functions are added:
   - vhost_kernel_setup() to open char device, each vq pair needs one
     vhostfd;
   - vhost_kernel_ioctl() to communicate control messages with vhost
     kernel module;
   - vhost_kernel_enable_queue_pair() to open tap device and set it
     as the backend of corresonding vhost fd (that is to say, vq pair).

Signed-off-by: Jianfeng Tan <jianfeng....@intel.com>
---

[...]
+/* TUNSETIFF ifr flags */
+#define IFF_TAP          0x0002
+#define IFF_NO_PI        0x1000
+#define IFF_ONE_QUEUE    0x2000
+#define IFF_VNET_HDR     0x4000
+#define IFF_MULTI_QUEUE  0x0100
+#define IFF_ATTACH_QUEUE 0x0200
+#define IFF_DETACH_QUEUE 0x0400

Do we really want to duplicate those things which has been exposed by uapi here?

You mean those defined in <linux/if_tun.h>? Redefine those common macros, or include standard header file, with respective pros and cons. DPDK prefers the redefinition way as far as I understand, doesn't it?


Well, if you really want to do this, you may want to use an independent file. Then you can sync it with linux headers with a bash script.


+
+/* Constants */
+#define TUN_DEF_SNDBUF    (1ull << 20)
+#define PATH_NET_TUN    "/dev/net/tun"
+#define VHOST_KERNEL_MAX_REGIONS    64

Unfortunate not a constant any more since c9ce42f72fd0 vhost: add max_mem_regions module parameter.

Yes, I was considering to ignore this in the initial release. But it's not a big effort, I'll try to fix it in latest version.


+
+static uint64_t vhost_req_user_to_kernel[] = {
+    [VHOST_USER_SET_OWNER] = VHOST_SET_OWNER,
+    [VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER,
+    [VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES,
+    [VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES,
+    [VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL,
+    [VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM,
+    [VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE,
+    [VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE,
+    [VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR,
+    [VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK,
+    [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
+};
+
+/* By default, vhost kernel module allows 64 regions, but DPDK allows
+ * 256 segments. As a relief, below function merges those virtually
+ * adjacent memsegs into one region.
+ */
+static struct vhost_memory_kernel *
+prepare_vhost_memory_kernel(void)
+{
+    uint32_t i, j, k = 0;
+    struct rte_memseg *seg;
+    struct vhost_memory_region *mr;
+    struct vhost_memory_kernel *vm;
+
+    vm = malloc(sizeof(struct vhost_memory_kernel) +
+            VHOST_KERNEL_MAX_REGIONS *
+            sizeof(struct vhost_memory_region));
+
+    for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
+        seg = &rte_eal_get_configuration()->mem_config->memseg[i];
+        if (!seg->addr)
+            break;

If we're sure the number of regions is less than 64(or the module parameter read from /sys), can we avoid the iteration here?

The "if" statement under the "for" statement can save us from all RTE_MAX_MEMSEG iteration.

But if we know the number of regions is short than the limit, there's even no need for this?



+
+        int new_region = 1;
+
+        for (j = 0; j < k; ++j) {
+            mr = &vm->regions[j];
+
+            if (mr->userspace_addr + mr->memory_size ==
+                (uint64_t)seg->addr) {
+                mr->memory_size += seg->len;
+                new_region = 0;
+                break;
+            }
+
+            if ((uint64_t)seg->addr + seg->len ==
+                mr->userspace_addr) {
+                mr->guest_phys_addr = (uint64_t)seg->addr;
+                mr->userspace_addr = (uint64_t)seg->addr;
+                mr->memory_size += seg->len;
+                new_region = 0;
+                break;
+            }
+        }
+
+        if (new_region == 0)
+            continue;
+
+        mr = &vm->regions[k++];
+ mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */
+        mr->userspace_addr = (uint64_t)seg->addr;
+        mr->memory_size = seg->len;
+        mr->mmap_offset = 0;
+
+        if (k >= VHOST_KERNEL_MAX_REGIONS) {
+            free(vm);
+            return NULL;
+        }
+    }
+
+    vm->nregions = k;
+    vm->padding = 0;
+    return vm;
+}
+
+static int
+vhost_kernel_ioctl(struct virtio_user_dev *dev,
+           enum vhost_user_request req,
+           void *arg)
+{
+    int i, ret = -1;
+    uint64_t req_kernel;
+    struct vhost_memory_kernel *vm = NULL;
+
+    req_kernel = vhost_req_user_to_kernel[req];
+
+    if (req_kernel == VHOST_SET_MEM_TABLE) {
+        vm = prepare_vhost_memory_kernel();
+        if (!vm)
+            return -1;
+        arg = (void *)vm;
+    }
+
+    /* Does not work when VIRTIO_F_IOMMU_PLATFORM now, why? */

I think the reason is when VIRTIO_F_IOMMU_PLATFORM is negotiated, all address should be iova instead of gpa.


Yes, I agree. As we don't have to do memory protection in a single process, so it's completely useless here, right?

Yes if there's no IOMMU concept in this case.


+    if (req_kernel == VHOST_SET_FEATURES)
+        *(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
+
+    for (i = 0; i < VHOST_KERNEL_MAX_QUEUES; ++i) {
+        if (dev->vhostfds[i] < 0)
+            continue;
+
[...]
+    if (!enable) {
+        if (dev->tapfds[pair_idx]) {
+            close(dev->tapfds[pair_idx]);
+            dev->tapfds[pair_idx] = -1;
+        }
+        return vhost_kernel_set_backend(vhostfd, -1);

If this is used to for thing like ethtool -L in guest, we should use TUNSETQUEUE here.

Oops, I was missing this ioctl operation. Let me fix it in next version.


+    } else if (dev->tapfds[pair_idx] >= 0) {
+        return 0;
+    }
+
+    if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
+        (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
+        hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+    else
+        hdr_size = sizeof(struct virtio_net_hdr);
+
+    /* TODO:
+     * 1. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len
+     * 2. get number of memory regions from vhost module parameter
+     * max_mem_regions, supported in newer version linux kernel
+     */
+    tapfd = open(PATH_NET_TUN, O_RDWR);
+    if (tapfd < 0) {
+        PMD_DRV_LOG(ERR, "fail to open %s: %s",
+                PATH_NET_TUN, strerror(errno));
+        return -1;
+    }
+
+    /* Construct ifr */
+    memset(&ifr, 0, sizeof(ifr));
+    ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
+
+    if (ioctl(tapfd, TUNGETFEATURES, &features) == -1) {
+ PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
+        goto error;
+    }
+    if (features & IFF_ONE_QUEUE)
+        ifr.ifr_flags |= IFF_ONE_QUEUE;
+
+ /* Let tap instead of vhost-net handle vnet header, as the latter does + * not support offloading. And in this case, we should not set feature
+     * bit VHOST_NET_F_VIRTIO_NET_HDR.
+     */
+    if (features & IFF_VNET_HDR) {
+        ifr.ifr_flags |= IFF_VNET_HDR;
+    } else {
+        PMD_DRV_LOG(ERR, "TAP does not support IFF_VNET_HDR");
+        goto error;
+    }
+
+    if (dev->ifname)
+        strncpy(ifr.ifr_name, dev->ifname, IFNAMSIZ);
+    else
+        strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ);
+    if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) {
+        PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno));
+        goto error;
+    }

This requires CAP_NET_ADMIN, so we should really consider to accept a pre-created fd here.

It sounds like a future work for me. So far, all DPDK apps are running in privileged mode (including CAP_NET_ADMIN?).


That's not safe. Accepting a per-created fd can solve this, and can even support macvtap.


+
+    fcntl(tapfd, F_SETFL, O_NONBLOCK);
+
+    if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) < 0) {
+ PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno));
+        goto error;
+    }
+
+    if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) {
+        PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno));
+        goto error;
+    }

Let's use INT_MAX as default here to survive from evil consumer here.

Oh yes, I will fix it.

Thanks,
Jianfeng

Reply via email to