Hi Jeff

Maybe I'm touching on points from previous discussions, but please see some
comments/questions below.

From: Jeff Guo:
> This patch aim to add a general uevent mechanism in eal device layer,
> to enable all linux kernel object hot plug monitoring, so user could use these
> APIs to monitor and read out the device status info that sent from the kernel
> side, then corresponding to handle it, such as detach or attach the
> device, and even benefit to use it to do smoothly fail safe work.
> 
> 1) About uevent monitoring:
> a: add one epolling to poll the netlink socket, to monitor the uevent of
>    the device, add device_state in struct of rte_device, to identify the
>    device state machine.
> b: add enum of rte_eal_dev_event_type and struct of rte_eal_uevent.
> c: add below API in rte eal device common layer.
>    rte_eal_dev_monitor_enable
>    rte_dev_callback_register
>    rte_dev_callback_unregister
>    _rte_dev_callback_process
>    rte_dev_monitor_start
>    rte_dev_monitor_stop
> 
> 2) About failure handler, use pci uio for example,
>    add pci_remap_device in bus layer and below function to process it:
>    rte_pci_remap_device
>    pci_uio_remap_resource
>    pci_map_private_resource
>    add rte_pci_dev_bind_driver to bind pci device with explicit driver.
> 
> Signed-off-by: Jeff Guo <jia....@intel.com>
> ---
> v7->v6:
> a.modify vdev part according to the vdev rework
> b.re-define and split the func into common and bus specific code
> c.fix some incorrect issue.
> b.fix the system hung after send packcet issue.
> ---
>  drivers/bus/pci/bsd/pci.c                          |  30 ++
>  drivers/bus/pci/linux/pci.c                        |  87 +++++
>  drivers/bus/pci/linux/pci_init.h                   |   1 +
>  drivers/bus/pci/pci_common.c                       |  43 +++
>  drivers/bus/pci/pci_common_uio.c                   |  28 ++
>  drivers/bus/pci/private.h                          |  12 +
>  drivers/bus/pci/rte_bus_pci.h                      |  25 ++
>  drivers/bus/vdev/vdev.c                            |  36 +++
>  lib/librte_eal/bsdapp/eal/eal_dev.c                |  64 ++++
>  .../bsdapp/eal/include/exec-env/rte_dev.h          | 106 ++++++
>  lib/librte_eal/common/eal_common_bus.c             |  30 ++
>  lib/librte_eal/common/eal_common_dev.c             | 169 ++++++++++
>  lib/librte_eal/common/include/rte_bus.h            |  69 ++++
>  lib/librte_eal/common/include/rte_dev.h            |  89 ++++++
>  lib/librte_eal/linuxapp/eal/Makefile               |   3 +-
>  lib/librte_eal/linuxapp/eal/eal_alarm.c            |   5 +
>  lib/librte_eal/linuxapp/eal/eal_dev.c              | 356
> +++++++++++++++++++++
>  .../linuxapp/eal/include/exec-env/rte_dev.h        | 106 ++++++
>  lib/librte_eal/linuxapp/igb_uio/igb_uio.c          |   6 +
>  lib/librte_pci/rte_pci.c                           |  20 ++
>  lib/librte_pci/rte_pci.h                           |  17 +
>  21 files changed, 1301 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_eal/bsdapp/eal/eal_dev.c
>  create mode 100644 lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
>  create mode 100644 lib/librte_eal/linuxapp/eal/eal_dev.c
>  create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> 
> diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c
> index b8e2178..d58dbf6 100644
> --- a/drivers/bus/pci/bsd/pci.c
> +++ b/drivers/bus/pci/bsd/pci.c
> @@ -126,6 +126,29 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
>       }
>  }
> 
> +/* re-map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> +     int ret;
> +
> +     if (dev == NULL)
> +             return -EINVAL;
> +
> +     switch (dev->kdrv) {
> +     case RTE_KDRV_NIC_UIO:
> +             ret = pci_uio_remap_resource(dev);
> +             break;
> +     default:
> +             RTE_LOG(DEBUG, EAL,
> +                     "  Not managed by a supported kernel driver,
> skipped\n");
> +             ret = 1;
> +             break;
> +     }
> +
> +     return ret;
> +}
> +
>  void
>  pci_uio_free_resource(struct rte_pci_device *dev,
>               struct mapped_pci_resource *uio_res)
> @@ -678,3 +701,10 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
> 
>       return ret;
>  }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +     return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c
> index 5da6728..792fd2c 100644
> --- a/drivers/bus/pci/linux/pci.c
> +++ b/drivers/bus/pci/linux/pci.c
> @@ -145,6 +145,38 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
>       }
>  }
> 
> +/* Map pci device */
> +int
> +rte_pci_remap_device(struct rte_pci_device *dev)
> +{
> +     int ret = -1;
> +
> +     if (dev == NULL)
> +             return -EINVAL;
> +
> +     switch (dev->kdrv) {
> +     case RTE_KDRV_VFIO:
> +#ifdef VFIO_PRESENT
> +             /* no thing to do */
> +#endif
> +             break;
> +     case RTE_KDRV_IGB_UIO:
> +     case RTE_KDRV_UIO_GENERIC:
> +             if (rte_eal_using_phys_addrs()) {
> +                     /* map resources for devices that use uio */
> +                     ret = pci_uio_remap_resource(dev);
> +             }
> +             break;
> +     default:
> +             RTE_LOG(DEBUG, EAL,
> +                     "  Not managed by a supported kernel driver,
> skipped\n");
> +             ret = 1;
> +             break;
> +     }
> +
> +     return ret;
> +}
> +
>  void *
>  pci_find_max_end_va(void)
>  {
> @@ -386,6 +418,8 @@ pci_scan_one(const char *dirname, const struct
> rte_pci_addr *addr)
>               rte_pci_add_device(dev);
>       }
> 
> +     dev->device.state = DEVICE_PARSED;
> +     TAILQ_INIT(&(dev->device.uev_cbs));
>       return 0;
>  }
> 
> @@ -854,3 +888,56 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p)
> 
>       return ret;
>  }
> +
> +int
> +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +     char drv_bind_path[1024];
> +     char drv_override_path[1024]; /* contains the /dev/uioX */
> +     int drv_override_fd;
> +     int drv_bind_fd;
> +
> +     RTE_SET_USED(drv_type);
> +
> +     snprintf(drv_override_path, sizeof(drv_override_path),
> +             "/sys/bus/pci/devices/%s/driver_override", dev_name);
> +
> +     /* specify the driver for a device by writing to driver_override */
> +     drv_override_fd = open(drv_override_path, O_WRONLY);
> +     if (drv_override_fd < 0) {
> +             RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +                     drv_override_path, strerror(errno));
> +             goto err;
> +     }
> +
> +     if (write(drv_override_fd, drv_type, sizeof(drv_type)) < 0) {
> +             RTE_LOG(ERR, EAL,
> +                     "Error: bind failed - Cannot write "
> +                     "driver %s to device %s\n", drv_type, dev_name);
> +             goto err;
> +     }
> +
> +     close(drv_override_fd);
> +
> +     snprintf(drv_bind_path, sizeof(drv_bind_path),
> +             "/sys/bus/pci/drivers/%s/bind", drv_type);
> +
> +     /* do the bind by writing device to the specific driver  */
> +     drv_bind_fd = open(drv_bind_path, O_WRONLY | O_APPEND);
> +     if (drv_bind_fd < 0) {
> +             RTE_LOG(ERR, EAL, "Cannot open %s: %s\n",
> +                     drv_bind_path, strerror(errno));
> +             goto err;
> +     }
> +
> +     if (write(drv_bind_fd, dev_name, sizeof(dev_name)) < 0)
> +             goto err;
> +
> +     close(drv_bind_fd);
> +     return 0;
> +err:
> +     close(drv_override_fd);
> +     close(drv_bind_fd);
> +     return -1;
> +}
> +
> diff --git a/drivers/bus/pci/linux/pci_init.h 
> b/drivers/bus/pci/linux/pci_init.h
> index f342c47..5838402 100644
> --- a/drivers/bus/pci/linux/pci_init.h
> +++ b/drivers/bus/pci/linux/pci_init.h
> @@ -58,6 +58,7 @@ int pci_uio_alloc_resource(struct rte_pci_device *dev,
>               struct mapped_pci_resource **uio_res);
>  void pci_uio_free_resource(struct rte_pci_device *dev,
>               struct mapped_pci_resource *uio_res);
> +int pci_uio_remap_resource(struct rte_pci_device *dev);
>  int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int
> res_idx,
>               struct mapped_pci_resource *uio_res, int map_idx);
> 
> diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c
> index 104fdf9..5417b32 100644
> --- a/drivers/bus/pci/pci_common.c
> +++ b/drivers/bus/pci/pci_common.c
> @@ -282,6 +282,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev)
>               if (rc > 0)
>                       /* positive value means driver doesn't support it */
>                       continue;
> +             dev->device.state = DEVICE_PROBED;
>               return 0;
>       }
>       return 1;
> @@ -481,6 +482,7 @@ rte_pci_insert_device(struct rte_pci_device
> *exist_pci_dev,
>  void
>  rte_pci_remove_device(struct rte_pci_device *pci_dev)
>  {
> +     RTE_LOG(DEBUG, EAL, " rte_pci_remove_device for device list\n");
>       TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next);
>  }
> 
> @@ -502,6 +504,44 @@ pci_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
>       return NULL;
>  }
> 
> +static struct rte_device *
> +pci_find_device_by_name(const struct rte_device *start,
> +             rte_dev_cmp_name_t cmp_name,
> +             const void *data)
> +{
> +     struct rte_pci_device *dev;
> +
> +     FOREACH_DEVICE_ON_PCIBUS(dev) {
> +             if (start && &dev->device == start) {
> +                     start = NULL; /* starting point found */
> +                     continue;
> +             }
> +             if (cmp_name(dev->device.name, data) == 0)
> +                     return &dev->device;
> +     }
> +
> +     return NULL;
> +}
> +
> +static int
> +pci_remap_device(struct rte_device *dev)
> +{
> +     struct rte_pci_device *pdev;
> +     int ret;
> +
> +     if (dev == NULL)
> +             return -EINVAL;
> +
> +     pdev = RTE_DEV_TO_PCI(dev);
> +
> +     /* remap resources for devices that use igb_uio */
> +     ret = rte_pci_remap_device(pdev);
> +     if (ret != 0)
> +             RTE_LOG(ERR, EAL, "failed to remap device %s",
> +                     dev->name);
> +     return ret;
> +}
> +
>  static int
>  pci_plug(struct rte_device *dev)
>  {
> @@ -528,10 +568,13 @@ struct rte_pci_bus rte_pci_bus = {
>               .scan = rte_pci_scan,
>               .probe = rte_pci_probe,
>               .find_device = pci_find_device,
> +             .find_device_by_name = pci_find_device_by_name,
>               .plug = pci_plug,
>               .unplug = pci_unplug,
>               .parse = pci_parse,
>               .get_iommu_class = rte_pci_get_iommu_class,
> +             .remap_device = pci_remap_device,
> +             .bind_driver = rte_pci_dev_bind_driver,
>       },
>       .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list),
>       .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list),
> diff --git a/drivers/bus/pci/pci_common_uio.c
> b/drivers/bus/pci/pci_common_uio.c
> index 0671131..8cb4009 100644
> --- a/drivers/bus/pci/pci_common_uio.c
> +++ b/drivers/bus/pci/pci_common_uio.c
> @@ -176,6 +176,34 @@ pci_uio_unmap(struct mapped_pci_resource
> *uio_res)
>       }
>  }
> 
> +/* remap the PCI resource of a PCI device in private virtual memory */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev)
> +{
> +     int i;
> +     uint64_t phaddr;
> +     void *map_address;
> +
> +     /* Map all BARs */
> +     for (i = 0; i != PCI_MAX_RESOURCE; i++) {
> +             /* skip empty BAR */
> +             phaddr = dev->mem_resource[i].phys_addr;
> +             if (phaddr == 0)
> +                     continue;
> +             map_address = pci_map_private_resource(
> +                             dev->mem_resource[i].addr, 0,
> +                             (size_t)dev->mem_resource[i].len);
> +             if (map_address == MAP_FAILED)
> +                     goto error;
> +             memset(map_address, 0xFF, (size_t)dev-
> >mem_resource[i].len);
> +             dev->mem_resource[i].addr = map_address;
> +     }
> +
> +     return 0;
> +error:
> +     return -1;
> +}
> +
>  static struct mapped_pci_resource *
>  pci_uio_find_resource(struct rte_pci_device *dev)
>  {
> diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h
> index 2283f09..10baa1a 100644
> --- a/drivers/bus/pci/private.h
> +++ b/drivers/bus/pci/private.h
> @@ -202,6 +202,18 @@ void pci_uio_free_resource(struct rte_pci_device
> *dev,
>               struct mapped_pci_resource *uio_res);
> 
>  /**
> + * remap the pci uio resource..
> + *
> + * @param dev
> + *   Point to the struct rte pci device.
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +pci_uio_remap_resource(struct rte_pci_device *dev);
> +
> +/**
>   * Map device memory to uio resource
>   *
>   * This function is private to EAL.
> diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h
> index d4a2996..1662f3b 100644
> --- a/drivers/bus/pci/rte_bus_pci.h
> +++ b/drivers/bus/pci/rte_bus_pci.h
> @@ -52,6 +52,8 @@ extern "C" {
>  #include <sys/queue.h>
>  #include <stdint.h>
>  #include <inttypes.h>
> +#include <unistd.h>
> +#include <fcntl.h>
> 
>  #include <rte_debug.h>
>  #include <rte_interrupts.h>
> @@ -197,6 +199,15 @@ int rte_pci_map_device(struct rte_pci_device *dev);
>  void rte_pci_unmap_device(struct rte_pci_device *dev);
> 
>  /**
> + * Remap this device
> + *
> + * @param dev
> + *   A pointer to a rte_pci_device structure describing the device
> + *   to use
> + */
> +int rte_pci_remap_device(struct rte_pci_device *dev);
> +
> +/**
>   * Dump the content of the PCI bus.
>   *
>   * @param f
> @@ -333,6 +344,20 @@ void rte_pci_ioport_read(struct rte_pci_ioport *p,
>  void rte_pci_ioport_write(struct rte_pci_ioport *p,
>               const void *data, size_t len, off_t offset);
> 
> +/**
> + * It can be used to bind a device to a specific type of driver.
> + *
> + * @param dev_name
> + *  The device name.
> + * @param drv_type
> + *  The specific driver's type.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +int rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c
> index fd7736d..773f6e0 100644
> --- a/drivers/bus/vdev/vdev.c
> +++ b/drivers/bus/vdev/vdev.c
> @@ -323,6 +323,39 @@ vdev_find_device(const struct rte_device *start,
> rte_dev_cmp_t cmp,
>       return NULL;
>  }
> 
> +static struct rte_device *
> +vdev_find_device_by_name(const struct rte_device *start,
> +             rte_dev_cmp_name_t cmp_name,
> +             const void *data)
> +{
> +     struct rte_vdev_device *dev;
> +
> +     TAILQ_FOREACH(dev, &vdev_device_list, next) {
> +             if (start && &dev->device == start) {
> +                     start = NULL;
> +                     continue;
> +             }
> +             if (cmp_name(dev->device.name, data) == 0)
> +                     return &dev->device;
> +     }
> +     return NULL;
> +}
> +
> +static int
> +vdev_remap_device(struct rte_device *dev)
> +{
> +     RTE_SET_USED(dev);
> +     return 0;
> +}
> +
> +static int
> +vdev_bind_driver(const char *dev_name, const char *drv_type)
> +{
> +     RTE_SET_USED(dev_name);
> +     RTE_SET_USED(drv_type);
> +     return 0;
> +}
> +
>  static int
>  vdev_plug(struct rte_device *dev)
>  {
> @@ -339,9 +372,12 @@ static struct rte_bus rte_vdev_bus = {
>       .scan = vdev_scan,
>       .probe = vdev_probe,
>       .find_device = vdev_find_device,
> +     .find_device_by_name = vdev_find_device_by_name,
>       .plug = vdev_plug,
>       .unplug = vdev_unplug,
>       .parse = vdev_parse,
> +     .remap_device = vdev_remap_device,
> +     .bind_driver = vdev_bind_driver,
>  };
> 
>  RTE_REGISTER_BUS(vdev, rte_vdev_bus);
> diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c
> b/lib/librte_eal/bsdapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..6ea9a74
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c
> @@ -0,0 +1,64 @@
> +/*-
> + *   Copyright(c) 2010-2017 Intel Corporation.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +int
> +rte_dev_monitor_start(void)
> +{
> +     return -1;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> +     return -1;
> +}
> diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> +     UEV_SUBSYSTEM_UIO,
> +     UEV_SUBSYSTEM_VFIO,
> +     UEV_SUBSYSTEM_PCI,
> +     UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> +     UEV_MONITOR_KERNEL,
> +     UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> +     RTE_EAL_DEV_EVENT_UNKNOWN,      /**< unknown event type */
> +     RTE_EAL_DEV_EVENT_ADD,          /**< device adding event */
> +     RTE_EAL_DEV_EVENT_REMOVE,
> +                                     /**< device removing event */
> +     RTE_EAL_DEV_EVENT_CHANGE,
> +                                     /**< device status change event */
> +     RTE_EAL_DEV_EVENT_MOVE,         /**< device sys path move
> event */
> +     RTE_EAL_DEV_EVENT_ONLINE,       /**< device online event */
> +     RTE_EAL_DEV_EVENT_OFFLINE,      /**< device offline event */
> +     RTE_EAL_DEV_EVENT_MAX           /**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> +     enum rte_eal_dev_event_type type;       /**< device event type */
> +     int subsystem;                          /**< subsystem id */
> +     char *devname;                          /**< device name */
> +     enum uev_monitor_netlink_group group;   /**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/common/eal_common_bus.c
> b/lib/librte_eal/common/eal_common_bus.c
> index 3e022d5..b7219c9 100644
> --- a/lib/librte_eal/common/eal_common_bus.c
> +++ b/lib/librte_eal/common/eal_common_bus.c
> @@ -51,8 +51,11 @@ rte_bus_register(struct rte_bus *bus)
>       RTE_VERIFY(bus->scan);
>       RTE_VERIFY(bus->probe);
>       RTE_VERIFY(bus->find_device);
> +     RTE_VERIFY(bus->find_device_by_name);
>       /* Buses supporting driver plug also require unplug. */
>       RTE_VERIFY(!bus->plug || bus->unplug);
> +     RTE_VERIFY(bus->remap_device);
> +     RTE_VERIFY(bus->bind_driver);
> 
>       TAILQ_INSERT_TAIL(&rte_bus_list, bus, next);
>       RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name);
> @@ -170,6 +173,14 @@ cmp_rte_device(const struct rte_device *dev1,
> const void *_dev2)
>  }
> 
>  static int
> +cmp_rte_device_name(const char *dev_name1, const void *_dev_name2)
> +{
> +     const char *dev_name2 = _dev_name2;
> +
> +     return strcmp(dev_name1, dev_name2);
> +}
> +
> +static int
>  bus_find_device(const struct rte_bus *bus, const void *_dev)
>  {
>       struct rte_device *dev;
> @@ -178,6 +189,25 @@ bus_find_device(const struct rte_bus *bus, const
> void *_dev)
>       return dev == NULL;
>  }
> 
> +static struct rte_device *
> +bus_find_device_by_name(const struct rte_bus *bus, const void
> *_dev_name)
> +{
> +     struct rte_device *dev;
> +
> +     dev = bus->find_device_by_name(NULL, cmp_rte_device_name,
> _dev_name);
> +     return dev;
> +}
> +
> +struct rte_device *
> +
> +rte_bus_find_device(const struct rte_bus *bus, const void *_dev_name)
> +{
> +     struct rte_device *dev;
> +
> +     dev = bus_find_device_by_name(bus, _dev_name);
> +     return dev;
> +}
> +
>  struct rte_bus *
>  rte_bus_find_by_device(const struct rte_device *dev)
>  {
> diff --git a/lib/librte_eal/common/eal_common_dev.c
> b/lib/librte_eal/common/eal_common_dev.c
> index dda8f58..47909e8 100644
> --- a/lib/librte_eal/common/eal_common_dev.c
> +++ b/lib/librte_eal/common/eal_common_dev.c
> @@ -42,9 +42,31 @@
>  #include <rte_devargs.h>
>  #include <rte_debug.h>
>  #include <rte_log.h>
> +#include <rte_spinlock.h>
> +#include <rte_malloc.h>
> 
>  #include "eal_private.h"
> 
> +/* spinlock for device callbacks */
> +static rte_spinlock_t rte_dev_cb_lock = RTE_SPINLOCK_INITIALIZER;
> +
> +/**
> + * The user application callback description.
> + *
> + * It contains callback address to be registered by user application,
> + * the pointer to the parameters for callback, and the event type.
> + */
> +struct rte_eal_dev_callback {
> +     TAILQ_ENTRY(rte_eal_dev_callback) next; /**< Callbacks list */
> +     rte_eal_dev_cb_fn cb_fn;                /**< Callback address */
> +     void *cb_arg;                           /**< Parameter for callback */
> +     void *ret_param;                        /**< Return parameter */
> +     enum rte_eal_dev_event_type event;      /**< device event type */
> +     uint32_t active;                        /**< Callback is executing */
> +};
> +
> +static struct rte_eal_dev_callback *dev_add_cb;
> +
>  static int cmp_detached_dev_name(const struct rte_device *dev,
>       const void *_name)
>  {
> @@ -234,3 +256,150 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname)
>       rte_eal_devargs_remove(busname, devname);
>       return ret;
>  }
> +
> +int
> +rte_eal_dev_monitor_enable(void)
> +{
> +     int ret;
> +
> +     ret = rte_dev_monitor_start();
> +     if (ret)
> +             RTE_LOG(ERR, EAL, "Can not init device monitor\n");
> +     return ret;
> +}
> +
> +int
> +rte_dev_callback_register(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> +     struct rte_eal_dev_callback *user_cb;
> +
> +     if (!cb_fn)
> +             return -EINVAL;
> +

What about checking that the device pointer is not NULL?

> +     rte_spinlock_lock(&rte_dev_cb_lock);
> +
> +     if (TAILQ_EMPTY(&(device->uev_cbs)))
> +             TAILQ_INIT(&(device->uev_cbs));
> +
> +     if (event == RTE_EAL_DEV_EVENT_ADD) {
> +             user_cb = NULL;
> +     } else {
> +             TAILQ_FOREACH(user_cb, &(device->uev_cbs), next) {
> +                     if (user_cb->cb_fn == cb_fn &&
> +                             user_cb->cb_arg == cb_arg &&
> +                             user_cb->event == event) {
> +                             break;
> +                     }
> +             }
> +     }
> +
> +     /* create a new callback. */
> +     if (user_cb == NULL) {
> +             /* allocate a new interrupt callback entity */
> +             user_cb = rte_zmalloc("eal device event",
> +                                     sizeof(*user_cb), 0);
> +             if (user_cb == NULL) {
> +                     RTE_LOG(ERR, EAL, "Can not allocate memory\n");

Missing rte_spinlock_unlock() before this error return; the lock is left held.

> +                     return -ENOMEM;
> +             }
> +             user_cb->cb_fn = cb_fn;
> +             user_cb->cb_arg = cb_arg;
> +             user_cb->event = event;
> +             if (event == RTE_EAL_DEV_EVENT_ADD)
> +                     dev_add_cb = user_cb;

Can only one DPDK entity register for the ADD callback?

I suggest adding an option to register for all devices, maybe by using a dummy
device which would hold all the "ALL_DEVICES" callbacks per event.
"All" means past, present and future devices; this way one callback can be
called for all the devices, and more than one DPDK entity could register for an
ADD/NEW event.
What about NEW instead of ADD?

I also suggest adding the device pointer as a parameter to the callback (which
will be managed by EAL).

> +             else
> +                     TAILQ_INSERT_TAIL(&(device->uev_cbs), user_cb,
> next);
> +     }
> +
> +     rte_spinlock_unlock(&rte_dev_cb_lock);
> +     return 0;
> +}
> +
> +int
> +rte_dev_callback_unregister(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     rte_eal_dev_cb_fn cb_fn, void *cb_arg)
> +{
> +     int ret;
> +     struct rte_eal_dev_callback *cb, *next;
> +
> +     if (!cb_fn)
> +             return -EINVAL;
> +
> +     rte_spinlock_lock(&rte_dev_cb_lock);
> +
> +     ret = 0;
> +     if (event == RTE_EAL_DEV_EVENT_ADD) {
> +             rte_free(dev_add_cb);
> +             dev_add_cb = NULL;
> +     } else {

Should the device pointer be checked for NULL here?

> +             for (cb = TAILQ_FIRST(&(device->uev_cbs)); cb != NULL;
> +                   cb = next) {
> +
> +                     next = TAILQ_NEXT(cb, next);
> +
> +                     if (cb->cb_fn != cb_fn || cb->event != event ||
> +                                     (cb->cb_arg != (void *)-1 &&
> +                                     cb->cb_arg != cb_arg))
> +                             continue;
> +
> +                     /*
> +                      * if this callback is not executing right now,
> +                      * then remove it.
> +                      */
> +                     if (cb->active == 0) {
> +                             TAILQ_REMOVE(&(device->uev_cbs), cb,
> next);
> +                             rte_free(cb);
> +                     } else {
> +                             ret = -EAGAIN;
> +                     }
> +             }
> +     }
> +     rte_spinlock_unlock(&rte_dev_cb_lock);
> +     return ret;
> +}
> +
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     void *cb_arg, void *ret_param)
> +{
> +     struct rte_eal_dev_callback dev_cb;
> +     struct rte_eal_dev_callback *cb_lst;
> +     int rc = 0;
> +
> +     rte_spinlock_lock(&rte_dev_cb_lock);
> +     if (event == RTE_EAL_DEV_EVENT_ADD) {
> +             if (cb_arg != NULL)
> +                     dev_add_cb->cb_arg = cb_arg;
> +
> +             if (ret_param != NULL)
> +                     dev_add_cb->ret_param = ret_param;
> +
> +             rte_spinlock_unlock(&rte_dev_cb_lock);

Can't someone free it while it is running?
I suggest keeping the lock held.
To prevent deadlock, callbacks would not be allowed to use this mechanism.

> +             rc = dev_add_cb->cb_fn(dev_add_cb->event,
> +                             dev_add_cb->cb_arg, dev_add_cb-
> >ret_param);
> +             rte_spinlock_lock(&rte_dev_cb_lock);
> +     } else {
> +             TAILQ_FOREACH(cb_lst, &(device->uev_cbs), next) {
> +                     if (cb_lst->cb_fn == NULL || cb_lst->event != event)
> +                             continue;
> +                     dev_cb = *cb_lst;
> +                     cb_lst->active = 1;
> +                     if (cb_arg != NULL)
> +                             dev_cb.cb_arg = cb_arg;
> +                     if (ret_param != NULL)
> +                             dev_cb.ret_param = ret_param;
> +
> +                     rte_spinlock_unlock(&rte_dev_cb_lock);

The current active flag doesn't make this thread safe; I suggest keeping the
lock held.
Scenario:
        1. Thread A sees active = 0 in the unregister function.
        2. Context switch.
        3. Thread B starts the callback.
        4. Context switch.
        5. Thread A frees it.
        6. Context switch.
        7. Seg fault in Thread B.

> +                     rc = dev_cb.cb_fn(dev_cb.event,
> +                                     dev_cb.cb_arg, dev_cb.ret_param);
> +                     rte_spinlock_lock(&rte_dev_cb_lock);
> +                     cb_lst->active = 0;
> +             }
> +     }
> +     rte_spinlock_unlock(&rte_dev_cb_lock);
> +     return rc;
> +}
> diff --git a/lib/librte_eal/common/include/rte_bus.h
> b/lib/librte_eal/common/include/rte_bus.h
> index 6fb0834..6c4ae31 100644
> --- a/lib/librte_eal/common/include/rte_bus.h
> +++ b/lib/librte_eal/common/include/rte_bus.h
> @@ -122,6 +122,34 @@ typedef struct rte_device *
>                        const void *data);
> 
>  /**
> + * Device iterator to find a device on a bus.
> + *
> + * This function returns an rte_device if one of those held by the bus
> + * matches the data passed as parameter.
> + *
> + * If the comparison function returns zero this function should stop 
> iterating
> + * over any more devices. To continue a search the device of a previous
> search
> + * can be passed via the start parameter.
> + *
> + * @param cmp
> + *   the device name comparison function.
> + *
> + * @param data
> + *   Data to compare each device against.
> + *
> + * @param start
> + *   starting point for the iteration
> + *
> + * @return
> + *   The first device matching the data, NULL if none exists.
> + */
> +typedef struct rte_device *
> +(*rte_bus_find_device_by_name_t)(const struct rte_device *start,
> +                      rte_dev_cmp_name_t cmp,
> +                      const void *data);
> +
> +
> +/**
>   * Implementation specific probe function which is responsible for linking
>   * devices on that bus with applicable drivers.
>   *
> @@ -168,6 +196,37 @@ typedef int (*rte_bus_unplug_t)(struct rte_device
> *dev);
>  typedef int (*rte_bus_parse_t)(const char *name, void *addr);
> 
>  /**
> + * Implementation specific remap function which is responsible for
> remmaping
> + * devices on that bus from original share memory resource to a private
> memory
> + * resource for the sake of device has been removal.
> + *
> + * @param dev
> + *   Device pointer that was returned by a previous call to find_device.
> + *
> + * @return
> + *   0 on success.
> + *   !0 on error.
> + */
> +typedef int (*rte_bus_remap_device_t)(struct rte_device *dev);
> +
> +/**
> + * Implementation specific bind driver function which is responsible for bind
> + * a explicit type of driver with a devices on that bus.
> + *
> + * @param dev_name
> + *   device textual description.
> + *
> + * @param drv_type
> + *   driver type textual description.
> + *
> + * @return
> + *   0 on success.
> + *   !0 on error.
> + */
> +typedef int (*rte_bus_bind_driver_t)(const char *dev_name,
> +                             const char *drv_type);
> +
> +/**
>   * Bus scan policies
>   */
>  enum rte_bus_scan_mode {
> @@ -206,9 +265,13 @@ struct rte_bus {
>       rte_bus_scan_t scan;         /**< Scan for devices attached to bus */
>       rte_bus_probe_t probe;       /**< Probe devices on bus */
>       rte_bus_find_device_t find_device; /**< Find a device on the bus */
> +     rte_bus_find_device_by_name_t find_device_by_name;
> +                                  /**< Find a device on the bus */
>       rte_bus_plug_t plug;         /**< Probe single device for drivers */
>       rte_bus_unplug_t unplug;     /**< Remove single device from driver
> */
>       rte_bus_parse_t parse;       /**< Parse a device name */
> +     rte_bus_remap_device_t remap_device;       /**< remap a device */
> +     rte_bus_bind_driver_t bind_driver; /**< bind a driver for bus device
> */
>       struct rte_bus_conf conf;    /**< Bus configuration */
>       rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu
> class */
>  };
> @@ -306,6 +369,12 @@ struct rte_bus *rte_bus_find(const struct rte_bus
> *start, rte_bus_cmp_t cmp,
>  struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev);
> 
>  /**
> + * Find the registered bus for a particular device.
> + */
> +struct rte_device *rte_bus_find_device(const struct rte_bus *bus,
> +                             const void *dev_name);
> +
> +/**
>   * Find the registered bus for a given name.
>   */
>  struct rte_bus *rte_bus_find_by_name(const char *busname);
> diff --git a/lib/librte_eal/common/include/rte_dev.h
> b/lib/librte_eal/common/include/rte_dev.h
> index 9342e0c..19971d0 100644
> --- a/lib/librte_eal/common/include/rte_dev.h
> +++ b/lib/librte_eal/common/include/rte_dev.h
> @@ -51,6 +51,15 @@ extern "C" {
> 
>  #include <rte_log.h>
> 
> +#include <exec-env/rte_dev.h>
> +
> +typedef int (*rte_eal_dev_cb_fn)(enum rte_eal_dev_event_type event,
> +                                     void *cb_arg, void *ret_param);
> +
> +struct rte_eal_dev_callback;
> +/** @internal Structure to keep track of registered callbacks */
> +TAILQ_HEAD(rte_eal_dev_cb_list, rte_eal_dev_callback);
> +
>  __attribute__((format(printf, 2, 0)))
>  static inline void
>  rte_pmd_debug_trace(const char *func_name, const char *fmt, ...)
> @@ -157,6 +166,13 @@ struct rte_driver {
>   */
>  #define RTE_DEV_NAME_MAX_LEN 64
> 
> +enum device_state {
> +     DEVICE_UNDEFINED,
> +     DEVICE_FAULT,
> +     DEVICE_PARSED,
> +     DEVICE_PROBED,
> +};
> +
>  /**
>   * A structure describing a generic device.
>   */
> @@ -166,6 +182,9 @@ struct rte_device {
>       const struct rte_driver *driver;/**< Associated driver */
>       int numa_node;                /**< NUMA node connection */
>       struct rte_devargs *devargs;  /**< Device user arguments */
> +     enum device_state state;  /**< Device state */
> +     /** User application callbacks for device event */
> +     struct rte_eal_dev_cb_list uev_cbs;
>  };
> 
>  /**
> @@ -248,6 +267,8 @@ int rte_eal_hotplug_remove(const char *busname,
> const char *devname);
>   */
>  typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void
> *data);
> 
> +typedef int (*rte_dev_cmp_name_t)(const char *dev_name, const void
> *data);
> +
>  #define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[]
> 
>  #define RTE_PMD_EXPORT_NAME(name, idx) \
> @@ -293,4 +314,72 @@ __attribute__((used)) = str
>  }
>  #endif
> 
> +/**
> + * It enable the device event monitoring for a specific event.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_eal_dev_monitor_enable(void);
> +/**
> + * It registers the callback for the specific event. Multiple
> + * callbacks cal be registered at the same time.
> + * @param event
> + *  The device event type.
> + * @param cb_fn
> + *  callback address.
> + * @param cb_arg
> + *  address of parameter for callback.
> + *
> + * @return
> + *  - On success, zero.
> + *  - On failure, a negative value.
> + */
> +int rte_dev_callback_register(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * It unregisters the callback according to the specified event.
> + *
> + * @param event
> + *  The event type which corresponding to the callback.
> + * @param cb_fn
> + *  callback address.
> + *  address of parameter for callback, (void *)-1 means to remove all
> + *  registered which has the same callback address.
> + *
> + * @return
> + *  - On success, return the number of callback entities removed.
> + *  - On failure, a negative value.
> + */
> +int rte_dev_callback_unregister(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     rte_eal_dev_cb_fn cb_fn, void *cb_arg);
> +
> +/**
> + * @internal Executes all the user application registered callbacks for
> + * the specific device. It is for DPDK internal user only. User
> + * application should not call it directly.
> + *
> + * @param event
> + *  The device event type.
> + * @param cb_arg
> + *  callback parameter.
> + * @param ret_param
> + *  To pass data back to user application.
> + *  This allows the user application to decide if a particular function
> + *  is permitted or not.
> + *
> + * @return
> + *  - On success, return zero.
> + *  - On failure, a negative value.
> + */
> +int
> +_rte_dev_callback_process(struct rte_device *device,
> +                     enum rte_eal_dev_event_type event,
> +                     void *cb_arg, void *ret_param);
>  #endif /* _RTE_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile
> b/lib/librte_eal/linuxapp/eal/Makefile
> index 5a7b8b2..05a2437 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) +=
> eal_lcore.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c
> +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c
> 
>  # from common dir
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c
> @@ -120,7 +121,7 @@ ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
>  CFLAGS_eal_thread.o += -Wno-return-type
>  endif
> 
> -INC := rte_kni_common.h
> +INC := rte_kni_common.h rte_dev.h
> 
>  SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \
>       $(addprefix include/exec-env/,$(INC))
> diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> index 8e4a775..29e73a7 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c
> @@ -209,6 +209,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
>       int count = 0;
>       int err = 0;
>       int executing;
> +     int ret;
> 
>       if (!cb_fn) {
>               rte_errno = EINVAL;
> @@ -259,6 +260,10 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn,
> void *cb_arg)
>                       }
>                       ap_prev = ap;
>               }
> +
> +             ret |= rte_intr_callback_unregister(&intr_handle,
> +                             eal_alarm_callback, NULL);
> +
>               rte_spinlock_unlock(&alarm_list_lk);
>       } while (executing != 0);
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c
> b/lib/librte_eal/linuxapp/eal/eal_dev.c
> new file mode 100644
> index 0000000..49fd0dc
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
> @@ -0,0 +1,356 @@
> +/*-
> + *   Copyright(c) 2010-2017 Intel Corporation.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <inttypes.h>
> +#include <sys/queue.h>
> +#include <sys/signalfd.h>
> +#include <sys/ioctl.h>
> +#include <sys/socket.h>
> +#include <linux/netlink.h>
> +#include <sys/epoll.h>
> +#include <unistd.h>
> +#include <signal.h>
> +#include <stdbool.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_bus.h>
> +#include <rte_dev.h>
> +#include <rte_devargs.h>
> +#include <rte_debug.h>
> +#include <rte_log.h>
> +
> +#include "eal_thread.h"
> +
> +/* uev monitoring thread */
> +static pthread_t uev_monitor_thread;
> +
> +bool udev_exit = true;
> +
> +bool no_request_thread = true;
> +
> +static void sig_handler(int signum)
> +{
> +     if (signum == SIGINT || signum == SIGTERM)
> +             rte_dev_monitor_stop();
> +}
> +
> +static int
> +dev_monitor_fd_new(void)
> +{
> +
> +     int uevent_fd;
> +
> +     uevent_fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC |
> +                     SOCK_NONBLOCK,
> +                     NETLINK_KOBJECT_UEVENT);
> +     if (uevent_fd < 0) {
> +             RTE_LOG(ERR, EAL, "create uevent fd failed\n");
> +             return -1;
> +     }
> +     return uevent_fd;
> +}
> +
> +static int
> +dev_monitor_enable(int netlink_fd)
> +{
> +     struct sockaddr_nl addr;
> +     int ret;
> +     int size = 64 * 1024;
> +     int nonblock = 1;
> +
> +     memset(&addr, 0, sizeof(addr));
> +     addr.nl_family = AF_NETLINK;
> +     addr.nl_pid = 0;
> +     addr.nl_groups = 0xffffffff;
> +
> +     if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) {
> +             RTE_LOG(ERR, EAL, "bind failed\n");
> +             goto err;
> +     }
> +
> +     setsockopt(netlink_fd, SOL_SOCKET, SO_PASSCRED, &size,
> sizeof(size));
> +
> +     ret = ioctl(netlink_fd, FIONBIO, &nonblock);
> +     if (ret != 0) {
> +             RTE_LOG(ERR, EAL, "ioctl(FIONBIO) failed\n");
> +             goto err;
> +     }
> +     return 0;
> +err:
> +     close(netlink_fd);
> +     return -1;
> +}
> +
> +static void
> +dev_uev_parse(const char *buf, struct rte_eal_uevent *event)
> +{
> +     char action[RTE_EAL_UEV_MSG_ELEM_LEN];
> +     char subsystem[RTE_EAL_UEV_MSG_ELEM_LEN];
> +     char dev_path[RTE_EAL_UEV_MSG_ELEM_LEN];
> +     char pci_slot_name[RTE_EAL_UEV_MSG_ELEM_LEN];
> +     int i = 0;
> +
> +     memset(action, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +     memset(subsystem, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +     memset(dev_path, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +     memset(pci_slot_name, 0, RTE_EAL_UEV_MSG_ELEM_LEN);
> +
> +     while (i < RTE_EAL_UEV_MSG_LEN) {
> +             for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> +                     if (*buf)
> +                             break;
> +                     buf++;
> +             }
> +             if (!strncmp(buf, "libudev", 7)) {
> +                     buf += 7;
> +                     i += 7;
> +                     event->group = UEV_MONITOR_UDEV;
> +             }
> +             if (!strncmp(buf, "ACTION=", 7)) {
> +                     buf += 7;
> +                     i += 7;
> +                     snprintf(action, sizeof(action), "%s", buf);
> +             } else if (!strncmp(buf, "DEVPATH=", 8)) {
> +                     buf += 8;
> +                     i += 8;
> +                     snprintf(dev_path, sizeof(dev_path), "%s", buf);
> +             } else if (!strncmp(buf, "SUBSYSTEM=", 10)) {
> +                     buf += 10;
> +                     i += 10;
> +                     snprintf(subsystem, sizeof(subsystem), "%s", buf);
> +             } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) {
> +                     buf += 14;
> +                     i += 14;
> +                     snprintf(pci_slot_name, sizeof(subsystem), "%s",
> buf);
> +                     event->devname = pci_slot_name;
> +             }
> +             for (; i < RTE_EAL_UEV_MSG_LEN; i++) {
> +                     if (*buf == '\0')
> +                             break;
> +                     buf++;
> +             }
> +     }
> +
> +     if (!strncmp(subsystem, "pci", 3))
> +             event->subsystem = UEV_SUBSYSTEM_PCI;
> +     if (!strncmp(action, "add", 3))
> +             event->type = RTE_EAL_DEV_EVENT_ADD;
> +     if (!strncmp(action, "remove", 6))
> +             event->type = RTE_EAL_DEV_EVENT_REMOVE;
> +     event->devname = pci_slot_name;
> +}
> +
> +static int
> +dev_uev_receive(int fd, struct rte_eal_uevent *uevent)
> +{
> +     int ret;
> +     char buf[RTE_EAL_UEV_MSG_LEN];
> +
> +     memset(uevent, 0, sizeof(struct rte_eal_uevent));
> +     memset(buf, 0, RTE_EAL_UEV_MSG_LEN);
> +
> +     ret = recv(fd, buf, RTE_EAL_UEV_MSG_LEN - 1, MSG_DONTWAIT);
> +     if (ret < 0) {
> +             RTE_LOG(ERR, EAL,
> +             "Socket read error(%d): %s\n",
> +             errno, strerror(errno));
> +             return -1;
> +     } else if (ret == 0)
> +             /* connection closed */
> +             return -1;
> +
> +     dev_uev_parse(buf, uevent);
> +
> +     return 0;
> +}
> +
> +static int
> +dev_uev_process(struct epoll_event *events, int nfds)
> +{
> +     struct rte_bus *bus;
> +     struct rte_device *dev;
> +     struct rte_eal_uevent uevent;
> +     int ret;
> +     int i;
> +
> +     for (i = 0; i < nfds; i++) {
> +             /**
> +              * check device uevent from kernel side, no need to check
> +              * uevent from udev.
> +              */
> +             if ((dev_uev_receive(events[i].data.fd, &uevent)) ||
> +                     (uevent.group == UEV_MONITOR_UDEV))
> +                     return 0;
> +
> +             /* default handle all pci devcie when is being hot plug */
> +             if (uevent.subsystem == UEV_SUBSYSTEM_PCI) {
> +                     bus = rte_bus_find_by_name("pci");
> +                     dev = rte_bus_find_device(bus, uevent.devname);
> +                     if (uevent.type == RTE_EAL_DEV_EVENT_REMOVE) {
> +
> +                             if ((!dev) || dev->state ==
> DEVICE_UNDEFINED)
> +                                     return 0;
> +                             dev->state = DEVICE_FAULT;
> +
> +                             /**
> +                              * remap the resource to be fake
> +                              * before user's removal processing
> +                              */
> +                             ret = bus->remap_device(dev);
> +                             if (!ret)
> +
>       return(_rte_dev_callback_process(dev,
> +                                       RTE_EAL_DEV_EVENT_REMOVE,
> +                                       NULL, NULL));

What is the reason for keeping this device in the EAL device list after the
removal?
I suggest removing it (driver remove, bus remove and EAL remove) after the
callbacks have run.
This way EAL can initiate all device removals.

> +                     } else if (uevent.type == RTE_EAL_DEV_EVENT_ADD)
> {
> +                             if (dev == NULL) {
> +                                     /**
> +                                      * bind the driver to the device
> +                                      * before user's add processing
> +                                      */
> +                                     bus->bind_driver(
> +                                             uevent.devname,
> +                                             "igb_uio");
> +

Similar comment here:
EAL can initiate all device probe operations by adding the device and probing
it here, before the callbacks run.
Then the device pointer can also be passed to the callbacks.

>       return(_rte_dev_callback_process(NULL,
> +                                       RTE_EAL_DEV_EVENT_ADD,
> +                                       uevent.devname, NULL));
> +                             }
> +                     }
> +             }
> +     }
> +     return 0;
> +}
> +
> +/**
> + * It builds/rebuilds up the epoll file descriptor with all the
> + * file descriptors being waited on. Then handles the interrupts.
> + *
> + * @param arg
> + *  pointer. (unused)
> + *
> + * @return
> + *  never return;
> + */
> +static __attribute__((noreturn)) void *
> +dev_uev_monitoring(__rte_unused void *arg)
> +{
> +     struct sigaction act;
> +     sigset_t mask;
> +     int netlink_fd;
> +     struct epoll_event ep_kernel;
> +     int fd_ep;
> +
> +     udev_exit = false;
> +
> +     /* set signal handlers */
> +     memset(&act, 0x00, sizeof(struct sigaction));
> +     act.sa_handler = sig_handler;
> +     sigemptyset(&act.sa_mask);
> +     act.sa_flags = SA_RESTART;
> +     sigaction(SIGINT, &act, NULL);
> +     sigaction(SIGTERM, &act, NULL);
> +     sigemptyset(&mask);
> +     sigaddset(&mask, SIGINT);
> +     sigaddset(&mask, SIGTERM);
> +     sigprocmask(SIG_UNBLOCK, &mask, NULL);
> +
> +     fd_ep = epoll_create1(EPOLL_CLOEXEC);
> +     if (fd_ep < 0) {
> +             RTE_LOG(ERR, EAL, "error creating epoll fd: %m\n");
> +             goto out;
> +     }
> +
> +     netlink_fd = dev_monitor_fd_new();
> +
> +     if (dev_monitor_enable(netlink_fd) < 0) {
> +             RTE_LOG(ERR, EAL, "error subscribing to kernel events\n");
> +             goto out;
> +     }
> +
> +     memset(&ep_kernel, 0, sizeof(struct epoll_event));
> +     ep_kernel.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP;
> +     ep_kernel.data.fd = netlink_fd;
> +     if (epoll_ctl(fd_ep, EPOLL_CTL_ADD, netlink_fd,
> +             &ep_kernel) < 0) {
> +             RTE_LOG(ERR, EAL, "error addding fd to epoll: %m\n");
> +             goto out;
> +     }
> +
> +     while (!udev_exit) {
> +             int fdcount;
> +             struct epoll_event ev[1];
> +
> +             fdcount = epoll_wait(fd_ep, ev, 1, -1);
> +             if (fdcount < 0) {
> +                     if (errno != EINTR)
> +                             RTE_LOG(ERR, EAL, "error receiving uevent "
> +                                     "message: %m\n");
> +                             continue;
> +                     }
> +
> +             /* epoll_wait has at least one fd ready to read */
> +             if (dev_uev_process(ev, fdcount) < 0) {
> +                     if (errno != EINTR)
> +                             RTE_LOG(ERR, EAL, "error processing uevent
> "
> +                                     "message: %m\n");
> +             }
> +     }
> +out:
> +     if (fd_ep >= 0)
> +             close(fd_ep);
> +     if (netlink_fd >= 0)
> +             close(netlink_fd);
> +     rte_panic("uev monitoring fail\n");
> +}
> +
> +int
> +rte_dev_monitor_start(void)
> +{

Maybe also add an option to start it via a new EAL command line parameter?

> +     int ret;
> +
> +     if (!no_request_thread)
> +             return 0;
> +     no_request_thread = false;
> +
> +     /* create the host thread to wait/handle the uevent from kernel */
> +     ret = pthread_create(&uev_monitor_thread, NULL,
> +             dev_uev_monitoring, NULL);

What is the reason for creating a new thread for hotplug?
Why not use the existing DPDK host thread via the alarm mechanism?

> +     return ret;
> +}
> +
> +int
> +rte_dev_monitor_stop(void)
> +{
> +     udev_exit = true;
> +     no_request_thread = true;
> +     return 0;
> +}
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> new file mode 100644
> index 0000000..6a6feb5
> --- /dev/null
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h
> @@ -0,0 +1,106 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
> NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
> AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
> TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#ifndef _RTE_DEV_H_
> +#error "don't include this file directly, please include generic <rte_dev.h>"
> +#endif
> +
> +#ifndef _RTE_LINUXAPP_DEV_H_
> +#define _RTE_LINUXAPP_DEV_H_
> +
> +#include <stdio.h>
> +
> +#include <rte_dev.h>
> +
> +#define RTE_EAL_UEV_MSG_LEN 4096
> +#define RTE_EAL_UEV_MSG_ELEM_LEN 128
> +
> +enum uev_subsystem {
> +     UEV_SUBSYSTEM_UIO,
> +     UEV_SUBSYSTEM_VFIO,
> +     UEV_SUBSYSTEM_PCI,
> +     UEV_SUBSYSTEM_MAX
> +};
> +
> +enum uev_monitor_netlink_group {
> +     UEV_MONITOR_KERNEL,
> +     UEV_MONITOR_UDEV,
> +};
> +
> +/**
> + * The device event type.
> + */
> +enum rte_eal_dev_event_type {
> +     RTE_EAL_DEV_EVENT_UNKNOWN,      /**< unknown event type */
> +     RTE_EAL_DEV_EVENT_ADD,          /**< device adding event */
> +     RTE_EAL_DEV_EVENT_REMOVE,
> +                                     /**< device removing event */
> +     RTE_EAL_DEV_EVENT_CHANGE,
> +                                     /**< device status change event */
> +     RTE_EAL_DEV_EVENT_MOVE,         /**< device sys path move
> event */
> +     RTE_EAL_DEV_EVENT_ONLINE,       /**< device online event */
> +     RTE_EAL_DEV_EVENT_OFFLINE,      /**< device offline event */
> +     RTE_EAL_DEV_EVENT_MAX           /**< max value of this enum
> */
> +};
> +
> +struct rte_eal_uevent {
> +     enum rte_eal_dev_event_type type;       /**< device event type */
> +     int subsystem;                          /**< subsystem id */
> +     char *devname;                          /**< device name */
> +     enum uev_monitor_netlink_group group;   /**< device netlink
> group */
> +};
> +
> +/**
> + * Start the device uevent monitoring.
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +int
> +rte_dev_monitor_start(void);
> +
> +/**
> + * Stop the device uevent monitoring .
> + *
> + * @param none
> + * @return
> + *   - On success, zero.
> + *   - On failure, a negative value.
> + */
> +
> +int
> +rte_dev_monitor_stop(void);
> +
> +#endif /* _RTE_LINUXAPP_DEV_H_ */
> diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> index a3a98c1..d0e07b4 100644
> --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c
> @@ -354,6 +354,12 @@ igbuio_pci_release(struct uio_info *info, struct
> inode *inode)
>       struct rte_uio_pci_dev *udev = info->priv;
>       struct pci_dev *dev = udev->pdev;
> 
> +     /* check if device have been remove before release */
> +     if ((&dev->dev.kobj)->state_remove_uevent_sent == 1) {
> +             pr_info("The device have been removed\n");
> +             return -1;
> +     }
> +
>       /* disable interrupts */
>       igbuio_pci_disable_interrupts(udev);
> 
> diff --git a/lib/librte_pci/rte_pci.c b/lib/librte_pci/rte_pci.c
> index 0160fc1..feb5fd7 100644
> --- a/lib/librte_pci/rte_pci.c
> +++ b/lib/librte_pci/rte_pci.c
> @@ -172,6 +172,26 @@ rte_pci_addr_parse(const char *str, struct
> rte_pci_addr *addr)
>       return -1;
>  }
> 
> +/* map a private resource from an address*/
> +void *
> +pci_map_private_resource(void *requested_addr, off_t offset, size_t size)
> +{
> +     void *mapaddr;
> +
> +     mapaddr = mmap(requested_addr, size,
> +                        PROT_READ | PROT_WRITE,
> +                        MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED,
> -1, 0);
> +     if (mapaddr == MAP_FAILED) {
> +             RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): "
> +                     "%s (%p)\n",
> +                     __func__, requested_addr,
> +                     (unsigned long)size, (unsigned long)offset,
> +                     strerror(errno), mapaddr);
> +     } else
> +             RTE_LOG(DEBUG, EAL, "  PCI memory mapped at %p\n",
> mapaddr);
> +
> +     return mapaddr;
> +}
> 
>  /* map a particular resource from a file */
>  void *
> diff --git a/lib/librte_pci/rte_pci.h b/lib/librte_pci/rte_pci.h
> index 4f2cd18..f6091a6 100644
> --- a/lib/librte_pci/rte_pci.h
> +++ b/lib/librte_pci/rte_pci.h
> @@ -227,6 +227,23 @@ int rte_pci_addr_cmp(const struct rte_pci_addr
> *addr,
>  int rte_pci_addr_parse(const char *str, struct rte_pci_addr *addr);
> 
>  /**
> + * @internal
> + * Map to a particular private resource.
> + *
> + * @param requested_addr
> + *      The starting address for the new mapping range.
> + * @param offset
> + *      The offset for the mapping range.
> + * @param size
> + *      The size for the mapping range.
> + * @return
> + *   - On success, the function returns a pointer to the mapped area.
> + *   - On error, the value MAP_FAILED is returned.
> + */
> +void *pci_map_private_resource(void *requested_addr, off_t offset,
> +             size_t size);
> +
> +/**
>   * Map a particular resource from a file.
>   *
>   * @param requested_addr
> --
> 2.7.4

Reply via email to