Hi, On Mon, Jun 24, 2013 at 09:58:01PM -0400, mrhi...@linux.vnet.ibm.com wrote: > From: "Michael R. Hines" <mrhi...@us.ibm.com> > [...] > +/* > + * Put in the log file which RDMA device was opened and the details > + * associated with that device. > + */ > +static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs) > +{ > + printf("%s RDMA Device opened: kernel name %s " > + "uverbs device name %s, " > + "infiniband_verbs class device path %s," > + " infiniband class device path %s\n", > + who, > + verbs->device->name, > + verbs->device->dev_name, > + verbs->device->dev_path, > + verbs->device->ibdev_path); > +}
see below > +static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) > +{ > + int ret = -EINVAL, idx; > + struct sockaddr_in sin; > + struct rdma_cm_id *listen_id; > + char ip[40] = "unknown"; > + > + for (idx = 0; idx < RDMA_CONTROL_MAX_WR; idx++) { > + rdma->wr_data[idx].control_len = 0; > + rdma->wr_data[idx].control_curr = NULL; > + } > + > + if (rdma->host == NULL) { > + ERROR(errp, "RDMA host is not set!\n"); > + rdma->error_state = -EINVAL; > + return -1; > + } > + /* create CM channel */ > + rdma->channel = rdma_create_event_channel(); > + if (!rdma->channel) { > + ERROR(errp, "could not create rdma event channel\n"); > + rdma->error_state = -EINVAL; > + return -1; > + } > + > + /* create CM id */ > + ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP); > + if (ret) { > + ERROR(errp, "could not create cm_id!\n"); > + goto err_dest_init_create_listen_id; > + } > + > + memset(&sin, 0, sizeof(sin)); > + sin.sin_family = AF_INET; > + sin.sin_port = htons(rdma->port); > + > + if (rdma->host && strcmp("", rdma->host)) { > + struct hostent *dest_addr; > + dest_addr = gethostbyname(rdma->host); > + if (!dest_addr) { > + ERROR(errp, "migration could not gethostbyname!\n"); > + ret = -EINVAL; > + goto err_dest_init_bind_addr; > + } > + memcpy(&sin.sin_addr.s_addr, dest_addr->h_addr, > + dest_addr->h_length); > + inet_ntop(AF_INET, dest_addr->h_addr, ip, sizeof ip); > + } else { > + sin.sin_addr.s_addr = INADDR_ANY; > + } > + > + DPRINTF("%s => %s\n", rdma->host, ip); > + > + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin); > + if (ret) { > + ERROR(errp, "Error: could not rdma_bind_addr!\n"); > + goto err_dest_init_bind_addr; > + } > + > + rdma->listen_id = listen_id; > + if (listen_id->verbs) { > + rdma->verbs = listen_id->verbs; > + } > + qemu_rdma_dump_id("dest_init", rdma->verbs); I wonder if you have ever hit the case where rdma_bind_addr() does not set the verbs structure in listen_id because we are binding to the loopback device (also see linux kernel commit 8523c048). I keep hitting this case on my destination VM ("incoming x-rdma:host:port) Then I think qemu_rdma_dump_id can segfault trying to dereference a null verbs structure. The dump_id function should check for non-NULL verbs argument, or the dump should be made only in the (verbs != NULL) if clause. Disabling the dump_id above, I have rdma_resolve_addr() problems on the source VM side (getting RDMA_CM_EVENT_ADDR_ERROR instead of RDMA_CM_EVENT_ADDR_RESOLVED). I assume that is because of the null verbs structure destination problem above. qemu_rdma_dest_prepare() will always fail with a NULL verbs argument: > + > +static int qemu_rdma_dest_prepare(RDMAContext *rdma, Error **errp) > +{ > + int ret; > + int idx; > + > + if (!rdma->verbs) { > + ERROR(errp, "no verbs context!\n"); > + return 0; > + } It is first called from rdma_start_incoming_migration() and will fail with the loopback binding case (rdma->verbs == NULL). however later qemu_rdma_accept() will check against the incoming cm_event verbs structure and set the RDMAContext's verb struct, calling qemu_rdma_dest_prepare with that struct: [...] > +static int qemu_rdma_accept(RDMAContext *rdma) [...] > + if (!rdma->verbs) { > + rdma->verbs = verbs; > + /* > + * Cannot propagate errp, as there is no error pointer > + * to be propagated. > + */ > + ret = qemu_rdma_dest_prepare(rdma, NULL); > + if (ret) { > + fprintf(stderr, "rdma migration: error preparing dest!\n"); > + goto err_rdma_dest_wait; > + } Are these two cases intentionally different? thanks, - Vasilis