> -----邮件原件----- > 发件人: Maxime Coquelin [mailto:maxime.coque...@redhat.com] > 发送时间: 2019年5月17日 23:47 > 收件人: Li Lin <chuckylinchucky...@gmail.com>; tiwei....@intel.com; > zhihong.w...@intel.com > 抄送: dev@dpdk.org; dariusz.stojac...@intel.com; changpeng....@intel.com; > james.r.har...@intel.com; Xie,Yongji <xieyon...@baidu.com>; Li,Lin(ACG Cloud) > <lili...@baidu.com>; Ni,Xun <ni...@baidu.com>; Zhang,Yu(ACG Cloud) > <zhangy...@baidu.com> > 主题: Re: [PATCH v4] vhost: support inflight share memory protocol feature > > > > On 5/5/19 11:02 AM, Li Lin wrote: > > From: Li Lin <lili...@baidu.com> > > > > This patch introduces two new messages VHOST_USER_GET_INFLIGHT_FD > and > > VHOST_USER_SET_INFLIGHT_FD to support transferring a shared buffer > > between qemu and backend. > > > > Firstly, qemu uses VHOST_USER_GET_INFLIGHT_FD to get the shared buffer > > from backend. Then qemu should send it back through > > VHOST_USER_SET_INFLIGHT_FD each time we start vhost-user. > > > > This shared buffer is used to process inflight I/O when backend > > reconnect. > > > > Signed-off-by: Li Lin <lili...@baidu.com> > > Signed-off-by: Ni Xun <ni...@baidu.com> > > Signed-off-by: Zhang Yu <zhangy...@baidu.com> > > --- > > v2 - add set&clr inflight desc functions > > v3 - simplified some functions implementation > > v4 - rework some data structures according to qemu doc > > > > lib/librte_vhost/rte_vhost.h | 83 ++++++++++ > > lib/librte_vhost/rte_vhost_version.map | 3 + > > lib/librte_vhost/vhost.c | 105 ++++++++++++ > > lib/librte_vhost/vhost.h | 12 ++ > > lib/librte_vhost/vhost_user.c | 292 > +++++++++++++++++++++++++++++++++ > > lib/librte_vhost/vhost_user.h | 13 +- > > 6 files changed, 507 insertions(+), 1 deletion(-) > > > > diff --git a/lib/librte_vhost/rte_vhost.h > > b/lib/librte_vhost/rte_vhost.h index d2c0c93f4..d4d709717 100644 > > --- a/lib/librte_vhost/rte_vhost.h > > +++ b/lib/librte_vhost/rte_vhost.h > > @@ -71,6 +71,10 @@ extern "C" { > > #define VHOST_USER_PROTOCOL_F_HOST_NOTIFIER 11 > > #endif > > > > +#ifndef VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD > > +#define VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD 12 #endif > > + > > /** Indicate whether protocol features negotiation is supported. */ > > #ifndef VHOST_USER_F_PROTOCOL_FEATURES > > #define VHOST_USER_F_PROTOCOL_FEATURES 30 > > @@ -98,12 +102,41 @@ struct rte_vhost_memory { > > struct rte_vhost_mem_region regions[]; > > }; > > > > +struct inflight_desc { > > + uint8_t inflight; > > + uint8_t padding[5]; > > + uint16_t next; > > + uint64_t counter; > > +}; > > + > > +struct inflight_info { > > + uint64_t features; > > + uint16_t version; > > + uint16_t desc_num; > > + uint16_t last_inflight_io; > > + uint16_t used_idx; > > + struct inflight_desc desc[0]; > > +}; > > + > > +struct resubmit_desc { > > + uint16_t index; > > + uint64_t counter; > > +}; > > + > > +struct resubmit_info { > > + struct resubmit_desc *resubmit_list; > > + uint16_t resubmit_num; > > +}; > > + > > struct rte_vhost_vring { > > struct vring_desc *desc; > > struct vring_avail *avail; > > struct vring_used *used; > > uint64_t log_guest_addr; > > > > + struct inflight_info *inflight; > > + struct resubmit_info *resubmit; > > + > > /** Deprecated, use rte_vhost_vring_call() instead. */ > > int callfd; > > > > @@ -632,6 +665,56 @@ int rte_vhost_get_vhost_vring(int vid, uint16_t > vring_idx, > > int rte_vhost_vring_call(int vid, uint16_t vring_idx); > > > > /** > > + * set inflight flag for a desc. > > + * > > + * @param vid > > + * vhost device ID > > + * @param vring_idx > > + * vring index > > + * @param idx > > + * inflight entry index > > + * @return > > + * 0 on success, -1 on failure > > + */ > > +int __rte_experimental > > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, > > + uint16_t idx); > > + > > +/** > > + * clear inflight flag for a desc. > > + * > > + * @param vid > > + * vhost device ID > > + * @param vring_idx > > + * vring index > > + * @param last_used_idx > > + * next free used_idx > > + * @param idx > > + * inflight entry index > > + * @return > > + * 0 on success, -1 on failure > > + */ > > +int __rte_experimental > > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx, > > + uint16_t last_used_idx, uint16_t idx); > > + > > +/** > > + * set last inflight io index. > > + * > > + * @param vid > > + * vhost device ID > > + * @param vring_idx > > + * vring index > > + * @param idx > > + * inflight entry index > > + * @return > > + * 0 on success, -1 on failure > > + */ > > +int __rte_experimental > > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, > > + uint16_t idx); > > + > > +/** > > * Get vhost RX queue avail count. > > * > > * @param vid > > diff --git a/lib/librte_vhost/rte_vhost_version.map > > b/lib/librte_vhost/rte_vhost_version.map > > index 5f1d4a75c..a992b3935 100644 > > --- a/lib/librte_vhost/rte_vhost_version.map > > +++ b/lib/librte_vhost/rte_vhost_version.map > > @@ -87,4 +87,7 @@ EXPERIMENTAL { > > rte_vdpa_relay_vring_used; > > rte_vhost_extern_callback_register; > > rte_vhost_driver_set_protocol_features; > > + rte_vhost_set_inflight_desc; > > + rte_vhost_clr_inflight_desc; > > + rte_vhost_set_last_inflight_io; > > }; > > diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c index > > 163f4595e..03eab3551 100644 > > --- a/lib/librte_vhost/vhost.c > > +++ b/lib/librte_vhost/vhost.c > > @@ -76,6 +76,16 @@ cleanup_vq(struct vhost_virtqueue *vq, int destroy) > > close(vq->callfd); > > if (vq->kickfd >= 0) > > close(vq->kickfd); > > + if (vq->inflight) > > + vq->inflight = NULL; > > + if (vq->resubmit->resubmit_list) { > > + free(vq->resubmit->resubmit_list); > > + vq->resubmit->resubmit_list = NULL; > > + } > > + if (vq->resubmit) { > > + free(vq->resubmit); > > + vq->resubmit = NULL; > > + } > > } > > > > /* > > @@ -589,6 +599,9 @@ rte_vhost_get_vhost_vring(int vid, uint16_t > vring_idx, > > vring->kickfd = vq->kickfd; > > vring->size = vq->size; > > > > + vring->inflight = vq->inflight; > > + vring->resubmit = vq->resubmit; > > + > > return 0; > > } > > > > @@ -617,6 +630,98 @@ rte_vhost_vring_call(int vid, uint16_t vring_idx) > > return 0; > > } > > > > +int > > +rte_vhost_set_inflight_desc(int vid, uint16_t vring_idx, uint16_t > > +idx) { > > + struct virtio_net *dev; > > + struct vhost_virtqueue *vq; > > + > > + dev = get_device(vid); > > + if (!dev) > > + return -1; > > + > > + if (!(dev->features & > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) > > + return 0; > > Shouldn't -1 be returned here also? > I think this function should be called only if the feature negotiation > succeeded. > > Returning an error here would help the calling application to know something > went wrong. > > Same comment applies for the clear function.
If the feature is not set,set & clr function returns directly. They are similar to empty functions. The return value is - 1, which represents an error in function execution and unexpected behavior. > > + > > + if (vring_idx >= VHOST_MAX_VRING) > > + return -1; > > + > > + vq = dev->virtqueue[vring_idx]; > > + if (!vq) > > + return -1; > > + > > + if (unlikely(!vq->inflight)) > > + return -1; > > + > > + vq->inflight->desc[idx].counter = vq->counter++; > > + vq->inflight->desc[idx].inflight = 1; > > + return 0; > > +} > > + > > +int > > +rte_vhost_clr_inflight_desc(int vid, uint16_t vring_idx, > > + uint16_t last_used_idx, uint16_t idx) { > > + struct virtio_net *dev; > > + struct vhost_virtqueue *vq; > > + > > + dev = get_device(vid); > > + if (!dev) > > + return -1; > > + > > + if (!(dev->features & > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) > > + return 0; > > + > > + if (vring_idx >= VHOST_MAX_VRING) > > + return -1; > > + > > + vq = dev->virtqueue[vring_idx]; > > + if (!vq) > > + return -1; > > + > > + if (unlikely(!vq->inflight)) > > + return -1; > > + > > + rte_compiler_barrier(); > > + > > + vq->inflight->desc[idx].inflight = 0; > > + > > + rte_compiler_barrier(); > > + > > + vq->inflight->used_idx = last_used_idx; > > + return 0; > > +} > > + > > +int > > +rte_vhost_set_last_inflight_io(int vid, uint16_t vring_idx, uint16_t > > +idx) { > > + struct virtio_net *dev; > > + struct vhost_virtqueue *vq; > > + > > + dev = get_device(vid); > > + if (!dev) > > + return -1; > > + > > + if (!(dev->features & > > + (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD))) > > + return 0; > > + > > + if (vring_idx >= VHOST_MAX_VRING) > > + return -1; > > + > > + vq = dev->virtqueue[vring_idx]; > > + if (!vq) > > + return -1; > > + > > + if (unlikely(!vq->inflight)) > > + return -1; > > + > > + vq->inflight->last_inflight_io = idx; > > + return 0; > > +} > > Are above functions supposed to be called in the hot path? > If so, you might want to use unlikely for the error checks everywhere. You're right. I'll modify it in the next version. > > > + > > uint16_t > > rte_vhost_avail_entries(int vid, uint16_t queue_id) > > { > > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h index > > e9138dfab..3dace2ec3 100644 > > --- a/lib/librte_vhost/vhost.h > > +++ b/lib/librte_vhost/vhost.h > > @@ -128,6 +128,11 @@ struct vhost_virtqueue { > > /* Physical address of used ring, for logging */ > > uint64_t log_guest_addr; > > > > + /* Inflight share memory info */ > > + struct inflight_info *inflight; > > + struct resubmit_info *resubmit; > > + uint64_t counter; > > counter and resubmit are too generic names. I'll modify the names in the next version. > > > + > > uint16_t nr_zmbuf; > > uint16_t zmbuf_size; > > uint16_t last_zmbuf_idx; > > @@ -286,6 +291,12 @@ struct guest_page { > > uint64_t size; > > }; > > > > +struct inflight_mem_info { > > + int fd; > > + void *addr; > > + uint64_t size; > > +}; > > + > > /** > > * Device structure contains all configuration information relating > > * to the device. > > @@ -303,6 +314,7 @@ struct virtio_net { > > uint32_t nr_vring; > > int dequeue_zero_copy; > > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; > > + struct inflight_mem_info inflight_info; > > #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) > > char ifname[IF_NAME_SZ]; > > uint64_t log_size; > > Do you have some code example using these new APIs? > It would help for reviewing the patch. This patch is only valid for vhost blk. The use of new two messages and the implementation of packed queue are described in docs/vhost-user.txt. I have submitted SPDK-related patches to use new API. > > Thanks, > Maxime >