On 31-Jul-18 10:38 AM, Takeshi Yoshimura wrote:
2018-07-30 20:17 GMT+09:00 Anatoly Burakov <anatoly.bura...@intel.com>:
Currently, DPDK will skip mapping some areas (or even an entire BAR)
if MSI-X happens to be in it but is smaller than page address.

Kernels 4.16+ will allow mapping MSI-X BARs [1], and will report this
as a capability flag. Capability flags themselves are also only
supported since kernel 4.6 [2].

This commit will introduce support for checking VFIO capabilities,
and will use it to check if we are allowed to map BARs with MSI-X
tables in them, along with backwards compatibility for older
kernels, including a workaround for a variable rename in VFIO
region info structure [3].

[1] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=a32295c612c57990d17fb0f41e7134394b2f35f6

[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=c84982adb23bcf3b99b79ca33527cd2625fbe279

[3] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/
linux.git/commit/?id=ff63eb638d63b95e489f976428f1df01391e15e4

Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
  drivers/bus/pci/linux/pci_vfio.c         | 127 ++++++++++++++++++++---
  lib/librte_eal/common/include/rte_vfio.h |  26 +++++
  2 files changed, 140 insertions(+), 13 deletions(-)

diff --git a/drivers/bus/pci/linux/pci_vfio.c b/drivers/bus/pci/linux/pci_vfio.c
index 686386d6a..e7765ee11 100644
--- a/drivers/bus/pci/linux/pci_vfio.c
+++ b/drivers/bus/pci/linux/pci_vfio.c
@@ -415,6 +415,88 @@ pci_vfio_mmap_bar(int vfio_dev_fd, struct 
mapped_pci_resource *vfio_res,
         return 0;
  }

+/*
+ * region info may contain capability headers, so we need to keep reallocating
+ * the memory until we match allocated memory size with argsz.
+ */
+static int
+pci_vfio_get_region_info(int vfio_dev_fd, struct vfio_region_info **info,
+               int region)
+{
+       struct vfio_region_info *ri;
+       size_t argsz = sizeof(*ri);
+       int ret;
+
+       ri = malloc(sizeof(*ri));
+       if (ri == NULL) {
+               RTE_LOG(ERR, EAL, "Cannot allocate memory for region info\n");
+               return -1;
+       }
+again:
+       memset(ri, 0, argsz);
+       ri->argsz = argsz;
+       ri->index = region;
+
+       ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info);
+       if (ret) {
+               free(ri);
+               return ret;
+       }
+       if (ri->argsz != argsz) {
+               argsz = ri->argsz;
+               ri = realloc(ri, argsz);
+
+               if (ri == NULL) {
+                       RTE_LOG(ERR, EAL, "Cannot reallocate memory for region 
info\n");
+                       return -1;
+               }
+               goto again;
+       }
+       *info = ri;
+
+       return 0;
+}
+
+static struct vfio_info_cap_header *
+pci_vfio_info_cap(struct vfio_region_info *info, int cap)
+{
+       struct vfio_info_cap_header *h;
+       size_t offset;
+
+       if ((info->flags & RTE_VFIO_INFO_FLAG_CAPS) == 0) {
+               /* VFIO info does not advertise capabilities */
+               return NULL;
+       }
+
+       offset = VFIO_CAP_OFFSET(info);
+       while (offset != 0) {
+               h = RTE_PTR_ADD(info, offset);
+               if (h->id == cap)
+                       return h;
+               offset = h->next;
+       }
+       return NULL;
+}
+
+static int
+pci_vfio_msix_is_mappable(int vfio_dev_fd, int msix_region)
+{
+       struct vfio_region_info *info;
+       int ret;
+
+       ret = pci_vfio_get_region_info(vfio_dev_fd, &info, msix_region);
+       if (ret < 0)
+               return -1;
+
+       ret = pci_vfio_info_cap(info, RTE_VFIO_CAP_MSIX_MAPPABLE) != NULL;
+
+       /* cleanup */
+       free(info);
+
+       return ret;
+}
+
+
  static int
  pci_vfio_map_resource_primary(struct rte_pci_device *dev)
  {
@@ -464,56 +546,75 @@ pci_vfio_map_resource_primary(struct rte_pci_device *dev)
         if (ret < 0) {
                 RTE_LOG(ERR, EAL, "  %s cannot get MSI-X BAR number!\n",
                                 pci_addr);
-               goto err_vfio_dev_fd;
+               goto err_vfio_res;
+       }
+       /* if we found our MSI-X BAR region, check if we can mmap it */
+       if (vfio_res->msix_table.bar_index != -1) {
+               int ret = pci_vfio_msix_is_mappable(vfio_dev_fd,
+                               vfio_res->msix_table.bar_index);
+               if (ret < 0) {
+                       RTE_LOG(ERR, EAL, "Couldn't check if MSI-X BAR is 
mappable\n");
+                       goto err_vfio_res;
+               } else if (ret != 0) {
+                       /* we can map it, so we don't care where it is */
+                       RTE_LOG(DEBUG, EAL, "VFIO reports MSI-X BAR as 
mappable\n");
+                       vfio_res->msix_table.bar_index = -1;
+               }
         }

         for (i = 0; i < (int) vfio_res->nb_maps; i++) {
-               struct vfio_region_info reg = { .argsz = sizeof(reg) };
+               struct vfio_region_info *reg;
                 void *bar_addr;

-               reg.index = i;
-
-               ret = ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
-               if (ret) {
+               ret = pci_vfio_get_region_info(vfio_dev_fd, &reg, i);
+               if (ret < 0) {
                         RTE_LOG(ERR, EAL, "  %s cannot get device region info "
-                                       "error %i (%s)\n", pci_addr, errno, 
strerror(errno));
+                               "error %i (%s)\n", pci_addr, errno,
+                               strerror(errno));
                         goto err_vfio_res;
                 }

                 /* chk for io port region */
                 ret = pci_vfio_is_ioport_bar(vfio_dev_fd, i);
-               if (ret < 0)
+               if (ret < 0) {
+                       free(reg);
                         goto err_vfio_res;
-               else if (ret) {
+               } else if (ret) {
                         RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d)\n",
                                         i);
+                       free(reg);
                         continue;
                 }

                 /* skip non-mmapable BARs */
-               if ((reg.flags & VFIO_REGION_INFO_FLAG_MMAP) == 0)
+               if ((reg->flags & VFIO_REGION_INFO_FLAG_MMAP) == 0) {
+                       free(reg);
                         continue;
+               }

                 /* try mapping somewhere close to the end of hugepages */
                 if (pci_map_addr == NULL)
                         pci_map_addr = pci_find_max_end_va();

                 bar_addr = pci_map_addr;
-               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg.size);
+               pci_map_addr = RTE_PTR_ADD(bar_addr, (size_t) reg->size);

                 maps[i].addr = bar_addr;
-               maps[i].offset = reg.offset;
-               maps[i].size = reg.size;
+               maps[i].offset = reg->offset;
+               maps[i].size = reg->size;
                 maps[i].path = NULL; /* vfio doesn't have per-resource paths */

                 ret = pci_vfio_mmap_bar(vfio_dev_fd, vfio_res, i, 0);
                 if (ret < 0) {
                         RTE_LOG(ERR, EAL, "  %s mapping BAR%i failed: %s\n",
                                         pci_addr, i, strerror(errno));
+                       free(reg);
                         goto err_vfio_res;
                 }

                 dev->mem_resource[i].addr = maps[i].addr;
+
+               free(reg);
         }

         if (pci_rte_vfio_setup_device(dev, vfio_dev_fd) < 0) {
diff --git a/lib/librte_eal/common/include/rte_vfio.h 
b/lib/librte_eal/common/include/rte_vfio.h
index 5ca13fcce..f6617e004 100644
--- a/lib/librte_eal/common/include/rte_vfio.h
+++ b/lib/librte_eal/common/include/rte_vfio.h
@@ -14,6 +14,8 @@
  extern "C" {
  #endif

+#include <stdint.h>
+
  /*
   * determine if VFIO is present on the system
   */
@@ -44,6 +46,30 @@ extern "C" {
  #define RTE_VFIO_NOIOMMU 8
  #endif

+/*
+ * capabilities are only supported on kernel 4.6+. there were also some API
+ * changes as well, so add a macro to get cap offset.
+ */
+#ifdef VFIO_REGION_INFO_FLAG_CAPS
+#define RTE_VFIO_INFO_FLAG_CAPS VFIO_REGION_INFO_FLAG_CAPS
+#define VFIO_CAP_OFFSET(x) (x->cap_offset)
+#else
+#define RTE_VFIO_INFO_FLAG_CAPS (1 << 3)
+#define VFIO_CAP_OFFSET(x) (x->resv)
+struct vfio_info_cap_header {
+       uint16_t id;
+       uint16_t version;
+       uint32_t next;
+};
+#endif
+
+/* kernels 4.16+ can map BAR containing MSI-X table */
+#ifdef VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#define RTE_VFIO_CAP_MSIX_MAPPABLE VFIO_REGION_INFO_CAP_MSIX_MAPPABLE
+#else
+#define RTE_VFIO_CAP_MSIX_MAPPABLE 3
+#endif
+
  #else /* not VFIO_PRESENT */

  /* we don't need an actual definition, only pointer is used */
--
2.17.1

Hi Anatoly,
I have tested the patch on our ppc64le machine, but the
ioctl(vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, info) in
pci_vfio_get_region_info() failed.
This may be an issue of ppc64le VFIO implementation. Let me investigate more...

Thanks,
Takeshi


Hi Takeshi, i think there's a bug in my patch. I'll submit a v2.

--
Thanks,
Anatoly

Reply via email to