Tested-by: Michael Qiu <michael.qiu at intel.com>

On 10/28/2014 1:49 AM, Danny Zhou wrote:
> The Linux kernel provides both the UIO and VFIO mechanisms to support writing
> user space device drivers. Compared to UIO, which has been available since the
> 2.6.32 kernel, VFIO was introduced in kernel version 3.6.0 with better
> interrupt and memory protection support (built on top of Intel VT-d technology).
> Basically, UIO and VFIO do the two common things below:
> 1) Map the PCIe device's I/O memory space to the user space driver
> 2) Support a device interrupt notification mechanism that allows the user space
>    driver/application to be notified when a device interrupt triggers.
>
> To run a DPDK application and make use of VFIO, two in-kernel modules,
> vfio and vfio-pci, must be loaded. But to use UIO, a DPDK kernel
> module, igb_uio, which has been there since DPDK was invented, must be loaded
> to attach to the in-kernel uio module. As a possible solution to deprecate
> igb_uio, this RFC patch leverages the in-kernel uio_pci_generic module to
> support DPDK user space PMDs in a generic fashion (like how VFIO works today),
> to partially (the DPDK KNI module rte_kni still sits in the kernel) remove the
> DPDK dependency on the GPL code igb_uio in the kernel. Tests with
> uio_pci_generic show that performance remains the same and LSC interrupts work
> as before.
>
> Example to bind Network Ports to uio_pci_generic:
>       modprobe uio
>       modprobe uio_pci_generic
>       /* to bind device 08:00.0, to the uio_pci_generic driver */
>       ./tools/dpdk_nic_bind.py -b uio_pci_generic 08:00.0
>
> Note: this RFC patch does not completely remove igb_uio support, because
> igb_uio supports creating the maximum number of SR-IOV VFs (Virtual Functions)
> by using the max_vfs kernel parameter on older kernels (kernel 3.7.x and below).
> Specifically, igb_uio explicitly calls pci_enable_sriov() to create VFs, while
> it is not invoked in either uio or uio_pci_generic. On kernel 3.8.x
> and above, the user can use the standard sysfs interface to enable VFs.
> For example:
>
> #echo $num_vf_enabled > /sys/class/net/$dev/device/sriov_numvfs   //enable VFs
> #echo 0 > /sys/class/net/$dev/device/sriov_numvfs                 //disable VFs
>
> ---
>  lib/librte_eal/common/include/rte_pci.h            |   1 +
>  lib/librte_eal/linuxapp/eal/eal_interrupts.c       |  67 +++++--
>  lib/librte_eal/linuxapp/eal/eal_pci.c              |   2 +-
>  lib/librte_eal/linuxapp/eal/eal_pci_uio.c          | 203 ++++++++++++++++-----
>  .../linuxapp/eal/include/exec-env/rte_interrupts.h |   1 +
>  tools/dpdk_nic_bind.py                             |   2 +-
>  6 files changed, 214 insertions(+), 62 deletions(-)
>
> diff --git a/lib/librte_eal/common/include/rte_pci.h 
> b/lib/librte_eal/common/include/rte_pci.h
> index 66ed793..71ca882 100644
> --- a/lib/librte_eal/common/include/rte_pci.h
> +++ b/lib/librte_eal/common/include/rte_pci.h
> @@ -148,6 +148,7 @@ struct rte_pci_device {
>       struct rte_pci_id id;                   /**< PCI ID. */
>       struct rte_pci_resource mem_resource[PCI_MAX_RESOURCE];   /**< PCI 
> Memory Resource */
>       struct rte_intr_handle intr_handle;     /**< Interrupt handle */
> +     char driver_name[BUFSIZ];               /**< driver name */
>       const struct rte_pci_driver *driver;    /**< Associated driver */
>       uint16_t max_vfs;                       /**< sriov enable if not zero */
>       int numa_node;                          /**< NUMA node connection */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_interrupts.c 
> b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> index dc2668a..f2da778 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_interrupts.c
> @@ -361,6 +361,53 @@ vfio_disable_msix(struct rte_intr_handle *intr_handle) {
>  }
>  #endif
>  
> +static int
> +uio_intr_enable(struct rte_intr_handle *intr_handle)
> +{
> +     unsigned char command_high;
> +
> +     /* use config file descriptor for uio_pci_generic */
> +     if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +             RTE_LOG(ERR, EAL,
> +                     "Error reading interrupts status for fd %d\n",
> +                     intr_handle->uio_cfg_fd);
> +             return -1;
> +     }
> +     /* disable interrupts */
> +     command_high |= 0x4;
> +     if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +             RTE_LOG(ERR, EAL,
> +                     "Error disabling interrupts for fd %d\n",
> +                     intr_handle->uio_cfg_fd);
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> +
> +static int
> +uio_intr_disable(struct rte_intr_handle *intr_handle)
> +{
> +     unsigned char command_high;
> +
> +     if (pread(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +             RTE_LOG(ERR, EAL,
> +                     "Error reading interrupts status for fd %d\n",
> +                     intr_handle->uio_cfg_fd);
> +             return -1;
> +     }
> +     /* enable interrupts */
> +     command_high &= ~0x4;
> +     if (pwrite(intr_handle->uio_cfg_fd, &command_high, 1, 5) != 1) {
> +             RTE_LOG(ERR, EAL,
> +                     "Error enabling interrupts for fd %d\n",
> +                     intr_handle->uio_cfg_fd);
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> +
>  int
>  rte_intr_callback_register(struct rte_intr_handle *intr_handle,
>                       rte_intr_callback_fn cb, void *cb_arg)
> @@ -500,20 +547,14 @@ rte_intr_callback_unregister(struct rte_intr_handle 
> *intr_handle,
>  int
>  rte_intr_enable(struct rte_intr_handle *intr_handle)
>  {
> -     const int value = 1;
> -
> -     if (!intr_handle || intr_handle->fd < 0)
> +     if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
>               return -1;
>  
>       switch (intr_handle->type){
>       /* write to the uio fd to enable the interrupt */
>       case RTE_INTR_HANDLE_UIO:
> -             if (write(intr_handle->fd, &value, sizeof(value)) < 0) {
> -                     RTE_LOG(ERR, EAL,
> -                             "Error enabling interrupts for fd %d\n",
> -                                                     intr_handle->fd);
> +             if (uio_intr_enable(intr_handle))
>                       return -1;
> -             }
>               break;
>       /* not used at this moment */
>       case RTE_INTR_HANDLE_ALARM:
> @@ -546,20 +587,14 @@ rte_intr_enable(struct rte_intr_handle *intr_handle)
>  int
>  rte_intr_disable(struct rte_intr_handle *intr_handle)
>  {
> -     const int value = 0;
> -
> -     if (!intr_handle || intr_handle->fd < 0)
> +     if (!intr_handle || intr_handle->fd < 0 || intr_handle->uio_cfg_fd)
>               return -1;
>  
>       switch (intr_handle->type){
>       /* write to the uio fd to disable the interrupt */
>       case RTE_INTR_HANDLE_UIO:
> -             if (write(intr_handle->fd, &value, sizeof(value)) < 0){
> -                     RTE_LOG(ERR, EAL,
> -                             "Error disabling interrupts for fd %d\n",
> -                                                     intr_handle->fd);
> +             if (uio_intr_disable(intr_handle))
>                       return -1;
> -             }
>               break;
>       /* not used at this moment */
>       case RTE_INTR_HANDLE_ALARM:
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c 
> b/lib/librte_eal/linuxapp/eal/eal_pci.c
> index 5fe3961..5c73908 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
> @@ -498,7 +498,7 @@ pci_map_device(struct rte_pci_device *dev)
>                       return ret;
>       }
>  #endif
> -     /* map resources for devices that use igb_uio */
> +     /* map resources for devices that use uio_pci_generic or igb_uio */
>       if (!mapped) {
>               ret = pci_uio_map_resource(dev);
>               if (ret != 0)
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c 
> b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> index 7e62266..4b159e2 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_uio.c
> @@ -35,6 +35,7 @@
>  #include <fcntl.h>
>  #include <dirent.h>
>  #include <sys/stat.h>
> +#include <linux/pci_regs.h>
>  
>  #include <rte_log.h>
>  #include <rte_pci.h>
> @@ -46,72 +47,130 @@
>  #include "eal_filesystem.h"
>  #include "eal_pci_init.h"
>  
> +#define IORESOURCE_MEM        0x00000200
> +#define UIO_PCI_GENERIC_NAME  "uio_pci_generic"
> +
>  static int pci_parse_sysfs_value(const char *filename, uint64_t *val);
>  
>  
>  #define OFF_MAX              ((uint64_t)(off_t)-1)
>  static int
> -pci_uio_get_mappings(const char *devname, struct pci_map maps[], int nb_maps)
> +pci_uio_get_mappings(struct rte_pci_device *dev, const char *devname,
> +                             struct pci_map maps[], int nb_maps)
>  {
> -     int i;
> +     struct rte_pci_addr *loc = &dev->addr;
> +     int i = 0;
>       char dirname[PATH_MAX];
>       char filename[PATH_MAX];
>       uint64_t offset, size;
> +     unsigned long long start_addr, end_addr, flags;
> +     FILE *f;
>  
> -     for (i = 0; i != nb_maps; i++) {
> -
> -             /* check if map directory exists */
> -             snprintf(dirname, sizeof(dirname),
> -                     "%s/maps/map%u", devname, i);
> -
> -             if (access(dirname, F_OK) != 0)
> -                     break;
> -
> -             /* get mapping offset */
> +     if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> +                     strlen(UIO_PCI_GENERIC_NAME)) == 0) {
>               snprintf(filename, sizeof(filename),
> -                     "%s/offset", dirname);
> -             if (pci_parse_sysfs_value(filename, &offset) < 0) {
> -                     RTE_LOG(ERR, EAL,
> -                             "%s(): cannot parse offset of %s\n",
> -                             __func__, dirname);
> -                     return -1;
> -             }
> +                     SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource",
> +                     loc->domain, loc->bus, loc->devid, loc->function);
>  
> -             /* get mapping size */
> -             snprintf(filename, sizeof(filename),
> -                     "%s/size", dirname);
> -             if (pci_parse_sysfs_value(filename, &size) < 0) {
> +             f = fopen(filename, "r");
> +             if (f == NULL) {
>                       RTE_LOG(ERR, EAL,
> -                             "%s(): cannot parse size of %s\n",
> -                             __func__, dirname);
> +                     "%s(): cannot open sysfs %s\n",
> +                     __func__, filename);
>                       return -1;
>               }
>  
> -             /* get mapping physical address */
> -             snprintf(filename, sizeof(filename),
> -                     "%s/addr", dirname);
> -             if (pci_parse_sysfs_value(filename, &maps[i].phaddr) < 0) {
> -                     RTE_LOG(ERR, EAL,
> -                             "%s(): cannot parse addr of %s\n",
> -                             __func__, dirname);
> -                     return -1;
> +             while (fscanf(f, "%llx %llx %llx", &start_addr,
> +                             &end_addr, &flags) == 3 && i < nb_maps) {
> +                     if (flags & IORESOURCE_MEM) {
> +                             maps[i].offset = 0x0;
> +                             maps[i].size = end_addr - start_addr + 1;
> +                             maps[i].phaddr = start_addr;
> +                             i++;
> +                     }
>               }
> +             fclose(f);
> +     } else {
> +             for (i = 0; i != nb_maps; i++) {
> +                     /* check if map directory exists */
> +                     snprintf(dirname, sizeof(dirname),
> +                             "%s/maps/map%u", devname, i);
> +
> +                     if (access(dirname, F_OK) != 0)
> +                             break;
> +
> +                     /* get mapping offset */
> +                     snprintf(filename, sizeof(filename),
> +                             "%s/offset", dirname);
> +                     if (pci_parse_sysfs_value(filename, &offset) < 0) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "%s(): cannot parse offset of %s\n",
> +                                     __func__, dirname);
> +                             return -1;
> +                     }
>  
> -             if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> -                     RTE_LOG(ERR, EAL,
> -                             "%s(): offset/size exceed system max value\n",
> -                             __func__);
> -                     return -1;
> -             }
> +                     /* get mapping size */
> +                     snprintf(filename, sizeof(filename),
> +                             "%s/size", dirname);
> +                     if (pci_parse_sysfs_value(filename, &size) < 0) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "%s(): cannot parse size of %s\n",
> +                                     __func__, dirname);
> +                             return -1;
> +                     }
>  
> -             maps[i].offset = offset;
> -             maps[i].size = size;
> +                     /* get mapping physical address */
> +                     snprintf(filename, sizeof(filename),
> +                             "%s/addr", dirname);
> +                     if (pci_parse_sysfs_value(filename,
> +                                             &maps[i].phaddr) < 0) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "%s(): cannot parse addr of %s\n",
> +                                     __func__, dirname);
> +                             return -1;
> +                     }
> +
> +                     if ((offset > OFF_MAX) || (size > SIZE_MAX)) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "%s(): offset/size exceed system max 
> value\n",
> +                                     __func__);
> +                             return -1;
> +                     }
> +
> +                     maps[i].offset = offset;
> +                     maps[i].size = size;
> +             }
>       }
>  
>       return i;
>  }
>  
>  static int
> +pci_uio_set_bus_master(int dev_fd)
> +{
> +     uint16_t reg;
> +     int ret;
> +
> +     ret = pread(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
> +     if (ret != sizeof(reg)) {
> +             RTE_LOG(ERR, EAL,
> +                     "Cannot read command from PCI config space!\n");
> +             return -1;
> +     }
> +
> +     reg |= PCI_COMMAND_MASTER;
> +
> +     ret = pwrite(dev_fd, &reg, sizeof(reg), PCI_COMMAND);
> +     if (ret != sizeof(reg)) {
> +             RTE_LOG(ERR, EAL,
> +                     "Cannot write command to PCI config space!\n");
> +             return -1;
> +     }
> +
> +     return 0;
> +}
> +
> +static int
>  pci_uio_map_secondary(struct rte_pci_device *dev)
>  {
>       int fd, i;
> @@ -210,6 +269,10 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
>       struct dirent *e;
>       DIR *dir;
>       char dirname[PATH_MAX];
> +     FILE *f;
> +     char fullpath[PATH_MAX];
> +     char buf[BUFSIZ];
> +     char *s;
>  
>       /* depending on kernel version, uio can be located in uio/uioX
>        * or uio:uioX */
> @@ -265,6 +328,30 @@ pci_get_uio_dev(struct rte_pci_device *dev, char *dstbuf,
>       if (e == NULL)
>               return -1;
>  
> +     /* remember driver name of uio device */
> +     snprintf(fullpath, sizeof(fullpath),
> +                     SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/uio/%s/name" ,
> +                     loc->domain, loc->bus, loc->devid,
> +                     loc->function, e->d_name);
> +
> +     f = fopen(fullpath, "r");
> +     if (f == NULL) {
> +             RTE_LOG(ERR, EAL,
> +                     "%s(): cannot open sysfs to get driver name\n",
> +                     __func__);
> +             return -1;
> +     }
> +     s = fgets(buf, sizeof(buf), f);
> +     if (s == NULL) {
> +             RTE_LOG(ERR, EAL,
> +                     "%s(): cannot read sysfs to get driver name\n",
> +                     __func__);
> +             fclose(f);
> +             return -1;
> +     }
> +     strncpy(dev->driver_name, buf, sizeof(buf));
> +     fclose(f);
> +
>       /* create uio device if we've been asked to */
>       if (internal_config.create_uio_dev &&
>                       pci_mknod_uio_dev(dstbuf, uio_num) < 0)
> @@ -279,6 +366,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>  {
>       int i, j;
>       char dirname[PATH_MAX];
> +     char cfgname[PATH_MAX];
>       char devname[PATH_MAX]; /* contains the /dev/uioX */
>       void *mapaddr;
>       int uio_num;
> @@ -291,6 +379,7 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>       struct pci_map *maps;
>  
>       dev->intr_handle.fd = -1;
> +     dev->intr_handle.uio_cfg_fd = -1;
>       dev->intr_handle.type = RTE_INTR_HANDLE_UNKNOWN;
>  
>       /* secondary processes - use already recorded details */
> @@ -315,6 +404,33 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>       }
>       dev->intr_handle.type = RTE_INTR_HANDLE_UIO;
>  
> +     snprintf(cfgname, sizeof(cfgname),
> +                     "/sys/class/uio/uio%u/device/config", uio_num);
> +     dev->intr_handle.uio_cfg_fd = open(cfgname, O_RDWR);
> +     if (dev->intr_handle.uio_cfg_fd < 0) {
> +             RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +                     cfgname, strerror(errno));
> +             return -1;
> +     }
> +
> +     /* handle Misc. stuff for uio_pci_generic  */
> +     if (strncmp(dev->driver_name, UIO_PCI_GENERIC_NAME,
> +                     strlen(UIO_PCI_GENERIC_NAME)) == 0) {
> +             /* update devname for uio_pci_generic  */
> +             snprintf(devname, sizeof(devname),
> +                     SYSFS_PCI_DEVICES "/" PCI_PRI_FMT "/resource%d",
> +                     loc->domain, loc->bus, loc->devid, loc->function, 0);
> +
> +             /* set bus master that is not done by uio_pci_generic */
> +             if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
> +                     if 
> (pci_uio_set_bus_master(dev->intr_handle.uio_cfg_fd)) {
> +                             RTE_LOG(ERR, EAL,
> +                                     "Cannot set up bus mastering!\n");
> +                             return -1;
> +                     }
> +             }
> +     }
> +
>       /* allocate the mapping details for secondary processes*/
>       uio_res = rte_zmalloc("UIO_RES", sizeof(*uio_res), 0);
>       if (uio_res == NULL) {
> @@ -327,13 +443,12 @@ pci_uio_map_resource(struct rte_pci_device *dev)
>       memcpy(&uio_res->pci_addr, &dev->addr, sizeof(uio_res->pci_addr));
>  
>       /* collect info about device mappings */
> -     nb_maps = pci_uio_get_mappings(dirname, uio_res->maps,
> -                                    RTE_DIM(uio_res->maps));
> +     nb_maps = pci_uio_get_mappings(dev, dirname, uio_res->maps,
> +                                             RTE_DIM(uio_res->maps));
>       if (nb_maps < 0) {
>               rte_free(uio_res);
>               return nb_maps;
>       }
> -
>       uio_res->nb_maps = nb_maps;
>  
>       /* Map all BARs */
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h 
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> index 23eafd9..151323d 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_interrupts.h
> @@ -51,6 +51,7 @@ enum rte_intr_handle_type {
>  /** Handle for interrupts. */
>  struct rte_intr_handle {
>       int vfio_dev_fd;                 /**< VFIO device file descriptor */
> +     int uio_cfg_fd;                  /**< UIO config file descriptor */
>       int fd;                          /**< file descriptor */
>       enum rte_intr_handle_type type;  /**< handle type */
>  };
> diff --git a/tools/dpdk_nic_bind.py b/tools/dpdk_nic_bind.py
> index 812b6a1..2483056 100755
> --- a/tools/dpdk_nic_bind.py
> +++ b/tools/dpdk_nic_bind.py
> @@ -43,7 +43,7 @@ ETHERNET_CLASS = "0200"
>  # Each device within this is itself a dictionary of device properties
>  devices = {}
>  # list of supported DPDK drivers
> -dpdk_drivers = [ "igb_uio", "vfio-pci" ]
> +dpdk_drivers = [ "igb_uio", "vfio-pci", "uio_pci_generic" ]
>  
>  # command-line arg flags
>  b_flag = None

Reply via email to