2018-07-31 20:28 GMT+09:00 Anatoly Burakov <anatoly.bura...@intel.com>: > Currently, DPDK will skip mapping some areas (or even an entire BAR) > if MSI-X table happens to be in them but is smaller than page size. > > Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this > as a capability flag. Capability flags themselves are also only > supported since kernel 4.6 [2]. > > This commit will introduce support for checking VFIO capabilities, > and will use it to check if we are allowed to map BARs with MSI-X > tables in them, along with backwards compatibility for older > kernels, including a workaround for a variable rename in VFIO > region info structure [3]. > > [1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6 > > [2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279 > > [3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/ > linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4 > > Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > --- > > Notes: > v2->v1: > - Fix pointer in pci_vfio_get_region_info > - Fix commit message > > drivers/bus/pci/linux/pci_vfio.c | 127 ++++++++++++++++++++--- > lib/librte_eal/common/include/rte_vfio.h | 26 +++++ > 2 files changed, 140 insertions(+), 13 deletions(-) > > diff --git a/drivers/bus/pci/linux/pci_vfio.c > b/drivers/bus/pci/linux/pci_vfio.c > index 686386d6a..24f665c20 100644 > --- a/drivers/bus/pci/linux/pci_vfio.c > +++ b/drivers/bus/pci/linux/pci_vfio.c > @@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct > mapped_pci_resource *vfio_res, > return 0; > } > > +/* > + * region info may contain capability headers, so we need to keep > reallocating > + * the memory until we match allocated memory size with argsz. > + */ > +static int > +pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info, > + int region) > +{ > + struct vfio_region_info *ri; > + size_t argsz = sizeof(*ri); > + int ret; > + > + ri = malloc(sizeof(*ri)); > + if (ri == NULL) { > + RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n"); > + return -1; > + } > +again: > + memset(ri, 0, argsz); > + ri->argsz = argsz; > + ri->index = region; > + > + ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ri); > + if (ret) { > + free(ri); > + return ret; > + } > + if (ri->argsz != argsz) { > + argsz = ri->argsz; > + ri = realloc(ri, argsz); > + > + if (ri == NULL) { > + RTE_LOG(ERR, EAL, "Cannot reallocate memory for > region info\n"); > + return -1; > + } > + goto again; > + } > + *info = ri; > + > + return 0; > +} > + > +static struct vfio_info_cap_header * > +pci_vfio_info_cap(struct vfio_region_info *info, int cap) > +{ > + struct vfio_info_cap_header *h; > + size_t offset; > + > + if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) { > + /* VFIO info does not advertise capabilities */ > + return NULL; > + } > + > + offset = VFIO_CAP_OFFSET(info); > + while (offset != 0) { > + h = RTE_PTR_ADD(info, offset); > + if (h->id == cap) > + return h; > + offset = h->next; > + } > + return NULL; > +} > + > +static int > +pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region) > +{ > + struct vfio_region_info *info; > + int ret; > + > + ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region); > + if (ret < 0) > + return -1; > + > + ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL; > + > + /* cleanup */ > + free(info); > + > + return ret; > +} > + > + > static int > pci_vfio_map_resource_primary(struct rte_pci_device *dev) > { > @@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device > *dev) > if (ret < 0) { > RTE_LOG(ERR, EAL, " %s cannot get MSI-X BAR number!\n", > pci_addr); > - goto err_vfio_dev_fd; > + goto err_vfio_res; > + } > + /* if we found our MSI-X BAR region, check if we can mmap it */ > + if (vfio_res->msix_table.bar_index != -1) { > + int ret = pci_vfio_msix_is_mappable(vfio_dev_fd, > + vfio_res->msix_table.bar_index); > + if (ret < 0) { > + RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is > mappable\n"); > + goto err_vfio_res; > + } else if (ret != 0) { > + /* we can map it, so we don't care where it is */ > + RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as > mappable\n"); > + vfio_res->msix_table.bar_index = -1; > + } > } > > for (i = 0; i < (int) vfio_res->nb_maps; i++) { > - struct vfio_region_info reg = { .argsz = sizeof(reg) }; > + struct vfio_region_info *reg; > void *bar_addr; > > - reg.index = i; > - > - ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, ®); > - if (ret) { > + ret = pci_vfio_get_region_info(vfio_dev_fd, ®, i); > + if (ret < 0) { > RTE_LOG(ERR, EAL, " %s cannot get device region info > " > - "error %i (%s)\n", pci_addr, errno, > strerror(errno)); > + "error %i (%s)\n", pci_addr, errno, > + strerror(errno)); > goto err_vfio_res; > } > > /* chk for io port region */ > ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i); > - if (ret < 0) > + if (ret < 0) { > + free(reg); > goto err_vfio_res; > - else if (ret) { > + } else if (ret) { > RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n", > i); > + free(reg); > continue; > } > > /* skip non-mmapable BARs */ > - if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) > + if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) { > + free(reg); > continue; > + } > > /* try mapping somewhere close to the end of hugepages */ > if (pci_map_addr == NULL) > pci_map_addr = pci_find_max_end_va(); > > bar_addr = pci_map_addr; > - pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size); > + pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size); > > maps[i].addr = bar_addr; > - maps[i].offset = reg.offset; > - maps[i].size = reg.size; > + maps[i].offset = reg->offset; > + maps[i].size = reg->size; > maps[i].path = NULL; /* vfio doesn't have per-resource paths > */ > > ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0); > if (ret < 0) { > RTE_LOG(ERR, EAL, " %s mapping BAR%i failed: %s\n", > pci_addr, i, strerror(errno)); > + free(reg); > goto err_vfio_res; > } > > dev->mem_resource[i].addr = maps[i].addr; > + > + free(reg); > } > > if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) { > diff --git a/lib/librte_eal/common/include/rte_vfio.h > b/lib/librte_eal/common/include/rte_vfio.h > index 5ca13fcce..f6617e004 100644 > --- a/lib/librte_eal/common/include/rte_vfio.h > +++ b/lib/librte_eal/common/include/rte_vfio.h > @@ -14,6 +14,8 @@ > extern "C" { > #endif > > +#include <stdint.h> > + > /* > * determine if VFIO is present on the system > */ > @@ -44,6 +46,30 @@ extern "C" { > #define RTE_VFIO_NOIOMMU 8 > #endif > > +/* > + * capabilities are only supported on kernel 4.6+. there were also some API > + * changes as well, so add a macro to get cap offset. > + */ > +#ifdef VFIO_REGION_INFO_FLAG_CAPS > +#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS > +#define VFIO_CAP_OFFSET(x) (x->cap_offset) > +#else > +#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3) > +#define VFIO_CAP_OFFSET(x) (x->resv) > +struct vfio_info_cap_header { > + uint16_t id; > + uint16_t version; > + uint32_t next; > +}; > +#endif > + > +/* kernels 4.16+ can map BAR containing MSI-X table */ > +#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE > +#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE > +#else > +#define RTE_VFIO_CAP_MSIX_MAPPABLE 3 > +#endif > + > #else /* not VFIO_PRESENT */ > > /* we don't need an actual definition, only pointer is used */ > -- > 2.17.1
Hi Anatoly, Please fix the error check for ioctl in pci_vfio_region_info() from "if (ret)" to "if (ret < 0)" My environment reported compiler errors with -Werror=maybe-uninitialized). dpdk/drivers/bus/pci/linux/pci_vfio.c: In function ‘pci_vfio_map_resource_primary’: dpdk/drivers/bus/pci/linux/pci_vfio.c:612:4: error: ‘reg’ may be used uninitialized in this function [-Werror=maybe-uninitialized] free(reg); ^~~~~~~~~ dpdk/drivers/bus/pci/linux/pci_vfio.c:495:2: error: ‘info’ may be used uninitialized in this function [-Werror=maybe-uninitialized] free(info); ^~~~~~~~~~ dpdk/drivers/bus/pci/linux/pci_vfio.c:485:27: note: ‘info’ was declared here struct vfio_region_info *info; Other code looks good to me. I tested the updated patch with the above change and confirmed it could mmap BAR on my ppc64le machine. Thanks, Takeshi