Yes, it depends on the low-level implementation. In my earlier tests, using one CQ for both send and receive could cause packet loss under heavy load: the destination thinks it sent the READY message successfully, but the source is still waiting for it. This always happens when the destination polls the receive CQE first.
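To make the failure mode concrete, here is a rough sketch of the kind of poll loop involved (illustrative only, not code from the patch; the function name and the "wanted" parameter are made up). With a single CQ shared by send and receive, a loop waiting for one wr_id can dequeue a completion belonging to the other direction; if that completion is not recorded for the code path that waits on it, that path blocks forever:

    #include <infiniband/verbs.h>

    /* Sketch only: spin on one CQ until the requested wr_id completes. */
    static int wait_for_wrid(struct ibv_cq *cq, uint64_t wanted)
    {
        struct ibv_wc wc;

        for (;;) {
            int n = ibv_poll_cq(cq, 1, &wc);   /* non-blocking poll */
            if (n < 0) {
                return -1;                     /* polling error */
            }
            if (n == 0) {
                continue;                      /* CQ empty, keep spinning */
            }
            if (wc.status != IBV_WC_SUCCESS) {
                return -1;                     /* failed work request */
            }
            if (wc.wr_id == wanted) {
                return 0;                      /* the completion we wanted */
            }
            /* A completion for the *other* direction was consumed here;
             * unless it is stashed somewhere, whoever waits on it hangs. */
        }
    }

With separate send and receive CQs, a waiter only ever dequeues completions of its own direction, so this cross-consumption cannot happen.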
So I think using only one CQ may cause a packet conflict or something similar, which is probably a driver bug. In any case, using two CQs fixes the problem.

2013/9/2 Isaku Yamahata <yamah...@private.email.ne.jp>

> Hi. Can you elaborate on why two CQs fix it? Does it depend on
> the HCA implementation?
>
> I'm not against two CQs for sending and receiving. In fact I'm for it,
> because I use two CQs for postcopy RDMA support.
>
> thanks,
>
> On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
> > When several VMs migrate with RDMA at the same time, the increased
> > pressure causes packet loss probabilistically and makes the source and
> > destination wait for each other, so some VMs may be blocked during the
> > migration.
> >
> > Fix the bug by using two completion queues, for sending and receiving
> > respectively.
> >
> > From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
> > From: Frank Yang <frank.yang...@gmail.com>
> > Date: Fri, 30 Aug 2013 17:53:34 +0800
> > Subject: [PATCH] rdma: fix multiple VMs parallel migration
> >
> > Signed-off-by: Frank Yang <frank.yang...@gmail.com>
> > ---
> >  migration-rdma.c | 57 +++++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 37 insertions(+), 20 deletions(-)
> >
> > diff --git a/migration-rdma.c b/migration-rdma.c
> > index 3d1266f..d0eacbb 100644
> > --- a/migration-rdma.c
> > +++ b/migration-rdma.c
> > @@ -362,7 +362,8 @@ typedef struct RDMAContext {
> >      struct ibv_qp *qp;                      /* queue pair */
> >      struct ibv_comp_channel *comp_channel;  /* completion channel */
> >      struct ibv_pd *pd;                      /* protection domain */
> > -    struct ibv_cq *cq;                      /* completion queue */
> > +    struct ibv_cq *send_cq;                 /* send completion queue */
> > +    struct ibv_cq *recv_cq;                 /* receive completion queue */
> >
> >      /*
> >       * If a previous write failed (perhaps because of a failed
> > @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
> >       * Completion queue can be filled by both read and write work requests,
> >       * so must reflect the sum of both possible queue sizes.
> >       */
> > -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> > +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
> >                               NULL, rdma->comp_channel, 0);
> > -    if (!rdma->cq) {
> > +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
> > +                                  rdma->comp_channel, 0);
> > +
> > +    if (!rdma->send_cq || !rdma->recv_cq) {
> >          fprintf(stderr, "failed to allocate completion queue\n");
> >          goto err_alloc_pd_cq;
> >      }
> > @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> >      attr.cap.max_recv_wr = 3;
> >      attr.cap.max_send_sge = 1;
> >      attr.cap.max_recv_sge = 1;
> > -    attr.send_cq = rdma->cq;
> > -    attr.recv_cq = rdma->cq;
> > +    attr.send_cq = rdma->send_cq;
> > +    attr.recv_cq = rdma->recv_cq;
> >      attr.qp_type = IBV_QPT_RC;
> >
> >      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> > @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
> >   * Return the work request ID that completed.
> >   */
> >  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> > -                               uint32_t *byte_len)
> > +                               uint32_t *byte_len, int wrid_requested)
> >  {
> >      int ret;
> >      struct ibv_wc wc;
> >      uint64_t wr_id;
> >
> > -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> > +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> > +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> > +    }
> >
> >      if (!ret) {
> >          *wr_id_out = RDMA_WRID_NONE;
> > @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >      void *cq_ctx;
> >      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
> >
> > -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> > -        return -1;
> > -    }
> >      /* poll cq first */
> >      while (wr_id != wrid_requested) {
> > -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
> >          if (ret < 0) {
> >              return ret;
> >          }
> > @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >      }
> >
> >      while (1) {
> > +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> > +                return -1;
> > +            }
> > +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> > +                return -1;
> > +            }
> > +        }
> > +
> >          /*
> >           * Coroutine doesn't start until process_incoming_migration()
> >           * so don't yield unless we know we're running inside of a coroutine.
> > @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >
> >          num_cq_events++;
> >
> > -        if (ibv_req_notify_cq(cq, 0)) {
> > -            goto err_block_for_wrid;
> > -        }
> > -
> >          while (wr_id != wrid_requested) {
> > -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
> >              if (ret < 0) {
> >                  goto err_block_for_wrid;
> >              }
> > @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
> >          ibv_destroy_qp(rdma->qp);
> >          rdma->qp = NULL;
> >      }
> > -    if (rdma->cq) {
> > -        ibv_destroy_cq(rdma->cq);
> > -        rdma->cq = NULL;
> > +    if (rdma->send_cq) {
> > +        ibv_destroy_cq(rdma->send_cq);
> > +        rdma->send_cq = NULL;
> > +    }
> > +    if (rdma->recv_cq) {
> > +        ibv_destroy_cq(rdma->recv_cq);
> > +        rdma->recv_cq = NULL;
> >      }
> >      if (rdma->comp_channel) {
> >          ibv_destroy_comp_channel(rdma->comp_channel);
> > @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
> >       */
> >      while (1) {
> >          uint64_t wr_id, wr_id_in;
> > -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> > +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
> >          if (ret < 0) {
> >              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
> >              goto err;
> > --
> > 1.8.3.msysgit.0
> >
>
> --
> yamahata
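One more note on the approach: in the patch, both new CQs are attached to the same completion channel, so after ibv_get_cq_event() returns, the handler has to look at which CQ actually fired before re-arming and draining it. A minimal sketch of that pattern (my own names, not code from migration-rdma.c):

    #include <infiniband/verbs.h>

    /* Sketch only: two CQs sharing one ibv_comp_channel, as in the patch.
     * ibv_get_cq_event() tells us which CQ generated the event. */
    static int handle_cq_event(struct ibv_comp_channel *ch)
    {
        struct ibv_cq *ev_cq;   /* will point at send_cq or recv_cq */
        void *ev_ctx;
        struct ibv_wc wc;

        if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx)) {  /* blocks for an event */
            return -1;
        }
        ibv_ack_cq_events(ev_cq, 1);                  /* every event must be acked */
        if (ibv_req_notify_cq(ev_cq, 0)) {            /* re-arm before draining */
            return -1;
        }
        while (ibv_poll_cq(ev_cq, 1, &wc) > 0) {
            /* dispatch on wc.wr_id: send-side or receive-side completion */
        }
        return 0;
    }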