Hi Michael, I think I've got an RDMA race condition, but I'm being a little cautious at the moment and wondered if you agree with the following diagnosis.
It's showing up in a world of mine that's sending more control messages from the destination->source and I'm seeing the following. We normally expect: src dest ----------->control ready-> Sees SEND_CONTROL signal to ack that it has been sent <-----control message-- Sees RECV_CONTROL message from dest but what I'm seeing is: src dest ----------->control ready-> <-----control message-- Sees RECV_CONTROL message from dest Sees SEND_CONTROL signal to ack that it has been sent which seems a bit odd - that means that the source is sending a message, and getting the reply before it gets the acknowledgment from it's own stack that it sent the message the reply is to ?! It's rare (~1 in 100 migrations ish) and only in my world where I'm sending more control messages. But that confuses qemu_rdma_post_send_control when it sends the READY, because it calls block_for_wrid(SEND_CONTROL), that sees the RECV_CONTROL (which it loses) and then the SEND_CONTROL. Does this sound a sane explanation? My current fix (below) I only use it for RDMA_CONTROL_READY , I'm torn between worrying that the race is potentially more general, but worry what will happen if I stop making post_send_control wait at the end for all the other control messages. It seems to work :-) What do you reckon? Dave From 332b867fb0f63be47d89822b7a10b2a519b431fc Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" <dgilb...@redhat.com> Date: Fri, 11 Dec 2015 14:53:11 +0000 Subject: [PATCH] Don't wait for send-control on a ready --- migration/rdma.c | 46 +++++++++++++++++++++++++++++++++++++++++----- trace-events | 1 + 2 files changed, 42 insertions(+), 5 deletions(-) diff --git a/migration/rdma.c b/migration/rdma.c index b0feddc..6a0e59f 100644 --- a/migration/rdma.c +++ b/migration/rdma.c @@ -314,7 +314,13 @@ typedef struct RDMAContext { * the READY message before send() does, in which case we need to * know if it completed. */ - int control_ready_expected; + bool control_ready_expected; + + /* + * Set when we've sent a control message but not yet waited for the + * result. + */ + bool control_sent_expected; /* number of outstanding writes */ int nb_sent; @@ -1454,6 +1460,13 @@ static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out, rdma->control_ready_expected = 0; } + if (rdma->control_sent_expected && + (wr_id >= RDMA_WRID_SEND_CONTROL)) { + trace_qemu_rdma_poll_send(wrid_desc[RDMA_WRID_SEND_CONTROL], + wr_id - RDMA_WRID_SEND_CONTROL, wr_id, rdma->nb_sent); + rdma->control_sent_expected = 0; + } + if (wr_id == RDMA_WRID_RDMA_WRITE) { uint64_t chunk = (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; @@ -1609,6 +1622,7 @@ static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf, RDMAControlHeader *head) { int ret = 0; + bool wait; RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL]; struct ibv_send_wr *bad_wr; struct ibv_sge sge = { @@ -1626,6 +1640,26 @@ static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf, trace_qemu_rdma_post_send_control(control_desc[head->type]); + if (rdma->control_sent_expected) { + ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL); + if (ret < 0) { + error_report("%s: send polling control error (entry) %s", + __func__, strerror(ret)); + return ret; + } + } + + switch (head->type) { + case RDMA_CONTROL_READY: + wait=false; + rdma->control_sent_expected = true; + break; + + default: + wait=true; + break; + } + /* * We don't actually need to do a memcpy() in here if we used * the "sge" properly, but since we're only sending control messages @@ -1646,13 +1680,15 @@ static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf, ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); if (ret > 0) { - error_report("Failed to use post IB SEND for control"); + error_report("Failed to use post IB SEND for control: %s", strerror(ret)); return -ret; } - ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL); - if (ret < 0) { - error_report("rdma migration: send polling control error"); + if (wait) { + ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL); + if (ret < 0) { + error_report("rdma migration: send polling control error"); + } } return ret; diff --git a/trace-events b/trace-events index 9043b56..0eeb6b2 100644 --- a/trace-events +++ b/trace-events @@ -1447,6 +1447,7 @@ qemu_rdma_exchange_send_received(const char *desc) "Response %s received." qemu_rdma_fill(size_t control_len, size_t size) "RDMA %zd of %zd bytes already in buffer" qemu_rdma_init_ram_blocks(int blocks) "Allocated %d local ram block structures" qemu_rdma_poll_recv(const char *compstr, int64_t comp, int64_t id, int sent) "completion %s #%" PRId64 " received (%" PRId64 ") left %d" +qemu_rdma_poll_send(const char *compstr, int64_t comp, int64_t id, int sent) "completion %s #%" PRId64 " received (%" PRId64 ") left %d" qemu_rdma_poll_write(const char *compstr, int64_t comp, int left, uint64_t block, uint64_t chunk, void *local, void *remote) "completions %s (%" PRId64 ") left %d, block %" PRIu64 ", chunk: %" PRIu64 " %p %p" qemu_rdma_poll_other(const char *compstr, int64_t comp, int left) "other completion %s (%" PRId64 ") received left %d" qemu_rdma_post_send_control(const char *desc) "CONTROL: sending %s.." -- 2.5.0 -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK