Yes, it depends on the low-level implementation. In my earlier tests,
using one CQ for both sending and receiving could cause packet loss under
heavy load: the destination thinks it sent the READY message successfully,
but the source is still waiting for it. This always happens when the
destination polls the receive CQE first.

So I think using only one CQ may lead to some kind of packet conflict,
which is probably a driver bug. In any case, using two CQs fixes the
problem.
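
At the verbs level the split looks roughly like this (a minimal sketch,
not the actual QEMU code; setup_split_cqs and the queue depths are just
placeholders for illustration):

    /*
     * Sketch: give the QP separate send and receive completion queues
     * (libibverbs + librdmacm). Names and depths are illustrative only.
     */
    #include <infiniband/verbs.h>
    #include <rdma/rdma_cma.h>

    static int setup_split_cqs(struct rdma_cm_id *cm_id, struct ibv_pd *pd,
                               struct ibv_comp_channel *channel,
                               struct ibv_cq **send_cq, struct ibv_cq **recv_cq)
    {
        /* One CQ per direction, so a burst of send completions can no
         * longer be confused with the READY message's receive completion. */
        *send_cq = ibv_create_cq(cm_id->verbs, 64, NULL, channel, 0);
        *recv_cq = ibv_create_cq(cm_id->verbs, 16, NULL, channel, 0);
        if (!*send_cq || !*recv_cq) {
            return -1;
        }

        struct ibv_qp_init_attr attr = {
            .send_cq = *send_cq,    /* send/RDMA-write completions */
            .recv_cq = *recv_cq,    /* receive completions */
            .qp_type = IBV_QPT_RC,
            .cap = { .max_send_wr = 64, .max_recv_wr = 16,
                     .max_send_sge = 1, .max_recv_sge = 1 },
        };
        return rdma_create_qp(cm_id, pd, &attr);
    }

The caller then polls whichever CQ matches the work request it is blocking
on, which is what the patch below does in qemu_rdma_poll().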



2013/9/2 Isaku Yamahata <yamah...@private.email.ne.jp>

> Hi. Can you elaborate why two CQs fix it? Does it depend on
> HCA implementation?
>
> I'm not against two CQs for sending and receiving. In fact I'm for it
> because I use two CQs for postcopy RDMA support.
>
> thanks,
>
> On Fri, Aug 30, 2013 at 08:39:31PM +0800, Frank Yang wrote:
> > When several VMs migrate with RDMA at the same time, the increased
> > pressure causes packet loss probabilistically and makes the source and
> > destination wait for each other. Some of the VMs may become blocked
> > during the migration.
> >
> > Fix the bug by using two completion queues, for sending and receiving
> > respectively.
> >
> > From 0c4829495cdc89eea2e94b103ac42c3f6a4b32c2 Mon Sep 17 00:00:00 2001
> > From: Frank Yang <frank.yang...@gmail.com>
> > Date: Fri, 30 Aug 2013 17:53:34 +0800
> > Subject: [PATCH] rdma: fix multiple VMs parallel migration
> >
> > Signed-off-by: Frank Yang <frank.yang...@gmail.com>
> > ---
> >  migration-rdma.c | 57 ++++++++++++++++++++++++++++++++++++--------------------
> >  1 file changed, 37 insertions(+), 20 deletions(-)
> >
> > diff --git a/migration-rdma.c b/migration-rdma.c
> > index 3d1266f..d0eacbb 100644
> > --- a/migration-rdma.c
> > +++ b/migration-rdma.c
> > @@ -362,7 +362,8 @@ typedef struct RDMAContext {
> >      struct ibv_qp *qp;                      /* queue pair */
> >      struct ibv_comp_channel *comp_channel;  /* completion channel */
> >      struct ibv_pd *pd;                      /* protection domain */
> > -    struct ibv_cq *cq;                      /* completion queue */
> > +    struct ibv_cq *send_cq;                 /* send completion queue */
> > +    struct ibv_cq *recv_cq;                 /* receive completion queue */
> >
> >      /*
> >       * If a previous write failed (perhaps because of a failed
> > @@ -1006,9 +1007,12 @@ static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
> >       * Completion queue can be filled by both read and write work requests,
> >       * so must reflect the sum of both possible queue sizes.
> >       */
> > -    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
> > +    rdma->send_cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 2),
> >              NULL, rdma->comp_channel, 0);
> > -    if (!rdma->cq) {
> > +    rdma->recv_cq = ibv_create_cq(rdma->verbs, RDMA_SIGNALED_SEND_MAX, NULL,
> > +            rdma->comp_channel, 0);
> > +
> > +    if (!rdma->send_cq || !rdma->recv_cq) {
> >          fprintf(stderr, "failed to allocate completion queue\n");
> >          goto err_alloc_pd_cq;
> >      }
> > @@ -1040,8 +1044,8 @@ static int qemu_rdma_alloc_qp(RDMAContext *rdma)
> >      attr.cap.max_recv_wr = 3;
> >      attr.cap.max_send_sge = 1;
> >      attr.cap.max_recv_sge = 1;
> > -    attr.send_cq = rdma->cq;
> > -    attr.recv_cq = rdma->cq;
> > +    attr.send_cq = rdma->send_cq;
> > +    attr.recv_cq = rdma->recv_cq;
> >      attr.qp_type = IBV_QPT_RC;
> >
> >      ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
> > @@ -1361,13 +1365,18 @@ static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
> >   * Return the work request ID that completed.
> >   */
> >  static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
> > -                               uint32_t *byte_len)
> > +                               uint32_t *byte_len, int wrid_requested)
> >  {
> >      int ret;
> >      struct ibv_wc wc;
> >      uint64_t wr_id;
> >
> > -    ret = ibv_poll_cq(rdma->cq, 1, &wc);
> > +    if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +        wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->send_cq, 1, &wc);
> > +    } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +        ret = ibv_poll_cq(rdma->recv_cq, 1, &wc);
> > +    }
> >
> >      if (!ret) {
> >          *wr_id_out = RDMA_WRID_NONE;
> > @@ -1460,12 +1469,9 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >      void *cq_ctx;
> >      uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
> >
> > -    if (ibv_req_notify_cq(rdma->cq, 0)) {
> > -        return -1;
> > -    }
> >      /* poll cq first */
> >      while (wr_id != wrid_requested) {
> > -        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +        ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
> >          if (ret < 0) {
> >              return ret;
> >          }
> > @@ -1487,6 +1493,17 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >      }
> >
> >      while (1) {
> > +        if (wrid_requested == RDMA_WRID_RDMA_WRITE ||
> > +            wrid_requested == RDMA_WRID_SEND_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->send_cq, 0)) {
> > +                return -1;
> > +            }
> > +        } else if (wrid_requested >= RDMA_WRID_RECV_CONTROL) {
> > +            if (ibv_req_notify_cq(rdma->recv_cq, 0)) {
> > +                return -1;
> > +            }
> > +        }
> > +
> >          /*
> >           * Coroutine doesn't start until process_incoming_migration()
> >           * so don't yield unless we know we're running inside of a coroutine.
> > @@ -1502,12 +1519,8 @@ static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
> >
> >          num_cq_events++;
> >
> > -        if (ibv_req_notify_cq(cq, 0)) {
> > -            goto err_block_for_wrid;
> > -        }
> > -
> >          while (wr_id != wrid_requested) {
> > -            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
> > +            ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len, wrid_requested);
> >              if (ret < 0) {
> >                  goto err_block_for_wrid;
> >              }
> > @@ -2236,9 +2249,13 @@ static void qemu_rdma_cleanup(RDMAContext *rdma)
> >          ibv_destroy_qp(rdma->qp);
> >          rdma->qp = NULL;
> >      }
> > -    if (rdma->cq) {
> > -        ibv_destroy_cq(rdma->cq);
> > -        rdma->cq = NULL;
> > +    if (rdma->send_cq) {
> > +        ibv_destroy_cq(rdma->send_cq);
> > +        rdma->send_cq = NULL;
> > +    }
> > +    if (rdma->recv_cq) {
> > +        ibv_destroy_cq(rdma->recv_cq);
> > +        rdma->recv_cq = NULL;
> >      }
> >      if (rdma->comp_channel) {
> >          ibv_destroy_comp_channel(rdma->comp_channel);
> > @@ -2770,7 +2787,7 @@ static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
> >       */
> >      while (1) {
> >          uint64_t wr_id, wr_id_in;
> > -        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
> > +        int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL, RDMA_WRID_RDMA_WRITE);
> >          if (ret < 0) {
> >              fprintf(stderr, "rdma migration: polling error! %d\n", ret);
> >              goto err;
> > --
> > 1.8.3.msysgit.0
> >
> >
>
> --
> yamahata
>
