Hi Jeff, maybe I'm touching on previous discussions, but please see some comments/questions below.
From: Jeff Guo: > This patch aim to add a general uevent mechanism in eal device layer, > to enable all linux kernel object hot plug monitoring, so user could use these > APIs to monitor and read out the device status info that sent from the kernel > side, then corresponding to handle it, such as detach or attach the > device, and even benefit to use it to do smoothly fail safe work. > > 1) About uevent monitoring: > a: add one epolling to poll the netlink socket, to monitor the uevent of > the device, add device_state in struct of rte_device, to identify the > device state machine. > b: add enum of rte_eal_dev_event_type and struct of rte_eal_uevent. > c: add below API in rte eal device common layer. > rte_eal_dev_monitor_enable > rte_dev_callback_register > rte_dev_callback_unregister > _rte_dev_callback_process > rte_dev_monitor_start > rte_dev_monitor_stop > > 2) About failure handler, use pci uio for example, > add pci_remap_device in bus layer and below function to process it: > rte_pci_remap_device > pci_uio_remap_resource > pci_map_private_resource > add rte_pci_dev_bind_driver to bind pci device with explicit driver. > > Signed-off-by: Jeff Guo <jia....@intel.com> > --- > v7->v6: > a.modify vdev part according to the vdev rework > b.re-define and split the func into common and bus specific code > c.fix some incorrect issue. > b.fix the system hung after send packcet issue. 
> --- > drivers/bus/pci/bsd/pci.c | 30 ++ > drivers/bus/pci/linux/pci.c | 87 +++++ > drivers/bus/pci/linux/pci_init.h | 1 + > drivers/bus/pci/pci_common.c | 43 +++ > drivers/bus/pci/pci_common_uio.c | 28 ++ > drivers/bus/pci/private.h | 12 + > drivers/bus/pci/rte_bus_pci.h | 25 ++ > drivers/bus/vdev/vdev.c | 36 +++ > lib/librte_eal/bsdapp/eal/eal_dev.c | 64 ++++ > .../bsdapp/eal/include/exec-env/rte_dev.h | 106 ++++++ > lib/librte_eal/common/eal_common_bus.c | 30 ++ > lib/librte_eal/common/eal_common_dev.c | 169 ++++++++++ > lib/librte_eal/common/include/rte_bus.h | 69 ++++ > lib/librte_eal/common/include/rte_dev.h | 89 ++++++ > lib/librte_eal/linuxapp/eal/Makefile | 3 +- > lib/librte_eal/linuxapp/eal/eal_alarm.c | 5 + > lib/librte_eal/linuxapp/eal/eal_dev.c | 356 > +++++++++++++++++++++ > .../linuxapp/eal/include/exec-env/rte_dev.h | 106 ++++++ > lib/librte_eal/linuxapp/igb_uio/igb_uio.c | 6 + > lib/librte_pci/rte_pci.c | 20 ++ > lib/librte_pci/rte_pci.h | 17 + > 21 files changed, 1301 insertions(+), 1 deletion(-) > create mode 100644 lib/librte_eal/bsdapp/eal/eal_dev.c > create mode 100644 lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h > create mode 100644 lib/librte_eal/linuxapp/eal/eal_dev.c > create mode 100644 lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h > > diff --git a/drivers/bus/pci/bsd/pci.c b/drivers/bus/pci/bsd/pci.c > index b8e2178..d58dbf6 100644 > --- a/drivers/bus/pci/bsd/pci.c > +++ b/drivers/bus/pci/bsd/pci.c > @@ -126,6 +126,29 @@ rte_pci_unmap_device(struct rte_pci_device *dev) > } > } > > +/* re-map pci device */ > +int > +rte_pci_remap_device(struct rte_pci_device *dev) > +{ > + int ret; > + > + if (dev == NULL) > + return -EINVAL; > + > + switch (dev->kdrv) { > + case RTE_KDRV_NIC_UIO: > + ret = pci_uio_remap_resource(dev); > + break; > + default: > + RTE_LOG(DEBUG, EAL, > + " Not managed by a supported kernel driver, > skipped\n"); > + ret = 1; > + break; > + } > + > + return ret; > +} > + > void > 
pci_uio_free_resource(struct rte_pci_device *dev, > struct mapped_pci_resource *uio_res) > @@ -678,3 +701,10 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p) > > return ret; > } > + > +int > +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type) > +{ > + return -1; > +} > + > diff --git a/drivers/bus/pci/linux/pci.c b/drivers/bus/pci/linux/pci.c > index 5da6728..792fd2c 100644 > --- a/drivers/bus/pci/linux/pci.c > +++ b/drivers/bus/pci/linux/pci.c > @@ -145,6 +145,38 @@ rte_pci_unmap_device(struct rte_pci_device *dev) > } > } > > +/* Map pci device */ > +int > +rte_pci_remap_device(struct rte_pci_device *dev) > +{ > + int ret = -1; > + > + if (dev == NULL) > + return -EINVAL; > + > + switch (dev->kdrv) { > + case RTE_KDRV_VFIO: > +#ifdef VFIO_PRESENT > + /* no thing to do */ > +#endif > + break; > + case RTE_KDRV_IGB_UIO: > + case RTE_KDRV_UIO_GENERIC: > + if (rte_eal_using_phys_addrs()) { > + /* map resources for devices that use uio */ > + ret = pci_uio_remap_resource(dev); > + } > + break; > + default: > + RTE_LOG(DEBUG, EAL, > + " Not managed by a supported kernel driver, > skipped\n"); > + ret = 1; > + break; > + } > + > + return ret; > +} > + > void * > pci_find_max_end_va(void) > { > @@ -386,6 +418,8 @@ pci_scan_one(const char *dirname, const struct > rte_pci_addr *addr) > rte_pci_add_device(dev); > } > > + dev->device.state = DEVICE_PARSED; > + TAILQ_INIT(&(dev->device.uev_cbs)); > return 0; > } > > @@ -854,3 +888,56 @@ rte_pci_ioport_unmap(struct rte_pci_ioport *p) > > return ret; > } > + > +int > +rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type) > +{ > + char drv_bind_path[1024]; > + char drv_override_path[1024]; /* contains the /dev/uioX */ > + int drv_override_fd; > + int drv_bind_fd; > + > + RTE_SET_USED(drv_type); > + > + snprintf(drv_override_path, sizeof(drv_override_path), > + "/sys/bus/pci/devices/%s/driver_override", dev_name); > + > + /* specify the driver for a device by writing to driver_override */ > + 
drv_override_fd = open(drv_override_path, O_WRONLY); > + if (drv_override_fd < 0) { > + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", > + drv_override_path, strerror(errno)); > + goto err; > + } > + > + if (write(drv_override_fd, drv_type, sizeof(drv_type)) < 0) { > + RTE_LOG(ERR, EAL, > + "Error: bind failed - Cannot write " > + "driver %s to device %s\n", drv_type, dev_name); > + goto err; > + } > + > + close(drv_override_fd); > + > + snprintf(drv_bind_path, sizeof(drv_bind_path), > + "/sys/bus/pci/drivers/%s/bind", drv_type); > + > + /* do the bind by writing device to the specific driver */ > + drv_bind_fd = open(drv_bind_path, O_WRONLY | O_APPEND); > + if (drv_bind_fd < 0) { > + RTE_LOG(ERR, EAL, "Cannot open %s: %s\n", > + drv_bind_path, strerror(errno)); > + goto err; > + } > + > + if (write(drv_bind_fd, dev_name, sizeof(dev_name)) < 0) > + goto err; > + > + close(drv_bind_fd); > + return 0; > +err: > + close(drv_override_fd); > + close(drv_bind_fd); > + return -1; > +} > + > diff --git a/drivers/bus/pci/linux/pci_init.h > b/drivers/bus/pci/linux/pci_init.h > index f342c47..5838402 100644 > --- a/drivers/bus/pci/linux/pci_init.h > +++ b/drivers/bus/pci/linux/pci_init.h > @@ -58,6 +58,7 @@ int pci_uio_alloc_resource(struct rte_pci_device *dev, > struct mapped_pci_resource **uio_res); > void pci_uio_free_resource(struct rte_pci_device *dev, > struct mapped_pci_resource *uio_res); > +int pci_uio_remap_resource(struct rte_pci_device *dev); > int pci_uio_map_resource_by_index(struct rte_pci_device *dev, int > res_idx, > struct mapped_pci_resource *uio_res, int map_idx); > > diff --git a/drivers/bus/pci/pci_common.c b/drivers/bus/pci/pci_common.c > index 104fdf9..5417b32 100644 > --- a/drivers/bus/pci/pci_common.c > +++ b/drivers/bus/pci/pci_common.c > @@ -282,6 +282,7 @@ pci_probe_all_drivers(struct rte_pci_device *dev) > if (rc > 0) > /* positive value means driver doesn't support it */ > continue; > + dev->device.state = DEVICE_PROBED; > return 0; > } > return 1; > 
@@ -481,6 +482,7 @@ rte_pci_insert_device(struct rte_pci_device > *exist_pci_dev, > void > rte_pci_remove_device(struct rte_pci_device *pci_dev) > { > + RTE_LOG(DEBUG, EAL, " rte_pci_remove_device for device list\n"); > TAILQ_REMOVE(&rte_pci_bus.device_list, pci_dev, next); > } > > @@ -502,6 +504,44 @@ pci_find_device(const struct rte_device *start, > rte_dev_cmp_t cmp, > return NULL; > } > > +static struct rte_device * > +pci_find_device_by_name(const struct rte_device *start, > + rte_dev_cmp_name_t cmp_name, > + const void *data) > +{ > + struct rte_pci_device *dev; > + > + FOREACH_DEVICE_ON_PCIBUS(dev) { > + if (start && &dev->device == start) { > + start = NULL; /* starting point found */ > + continue; > + } > + if (cmp_name(dev->device.name, data) == 0) > + return &dev->device; > + } > + > + return NULL; > +} > + > +static int > +pci_remap_device(struct rte_device *dev) > +{ > + struct rte_pci_device *pdev; > + int ret; > + > + if (dev == NULL) > + return -EINVAL; > + > + pdev = RTE_DEV_TO_PCI(dev); > + > + /* remap resources for devices that use igb_uio */ > + ret = rte_pci_remap_device(pdev); > + if (ret != 0) > + RTE_LOG(ERR, EAL, "failed to remap device %s", > + dev->name); > + return ret; > +} > + > static int > pci_plug(struct rte_device *dev) > { > @@ -528,10 +568,13 @@ struct rte_pci_bus rte_pci_bus = { > .scan = rte_pci_scan, > .probe = rte_pci_probe, > .find_device = pci_find_device, > + .find_device_by_name = pci_find_device_by_name, > .plug = pci_plug, > .unplug = pci_unplug, > .parse = pci_parse, > .get_iommu_class = rte_pci_get_iommu_class, > + .remap_device = pci_remap_device, > + .bind_driver = rte_pci_dev_bind_driver, > }, > .device_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.device_list), > .driver_list = TAILQ_HEAD_INITIALIZER(rte_pci_bus.driver_list), > diff --git a/drivers/bus/pci/pci_common_uio.c > b/drivers/bus/pci/pci_common_uio.c > index 0671131..8cb4009 100644 > --- a/drivers/bus/pci/pci_common_uio.c > +++ 
b/drivers/bus/pci/pci_common_uio.c > @@ -176,6 +176,34 @@ pci_uio_unmap(struct mapped_pci_resource > *uio_res) > } > } > > +/* remap the PCI resource of a PCI device in private virtual memory */ > +int > +pci_uio_remap_resource(struct rte_pci_device *dev) > +{ > + int i; > + uint64_t phaddr; > + void *map_address; > + > + /* Map all BARs */ > + for (i = 0; i != PCI_MAX_RESOURCE; i++) { > + /* skip empty BAR */ > + phaddr = dev->mem_resource[i].phys_addr; > + if (phaddr == 0) > + continue; > + map_address = pci_map_private_resource( > + dev->mem_resource[i].addr, 0, > + (size_t)dev->mem_resource[i].len); > + if (map_address == MAP_FAILED) > + goto error; > + memset(map_address, 0xFF, (size_t)dev- > >mem_resource[i].len); > + dev->mem_resource[i].addr = map_address; > + } > + > + return 0; > +error: > + return -1; > +} > + > static struct mapped_pci_resource * > pci_uio_find_resource(struct rte_pci_device *dev) > { > diff --git a/drivers/bus/pci/private.h b/drivers/bus/pci/private.h > index 2283f09..10baa1a 100644 > --- a/drivers/bus/pci/private.h > +++ b/drivers/bus/pci/private.h > @@ -202,6 +202,18 @@ void pci_uio_free_resource(struct rte_pci_device > *dev, > struct mapped_pci_resource *uio_res); > > /** > + * remap the pci uio resource.. > + * > + * @param dev > + * Point to the struct rte pci device. > + * @return > + * - On success, zero. > + * - On failure, a negative value. > + */ > +int > +pci_uio_remap_resource(struct rte_pci_device *dev); > + > +/** > * Map device memory to uio resource > * > * This function is private to EAL. 
> diff --git a/drivers/bus/pci/rte_bus_pci.h b/drivers/bus/pci/rte_bus_pci.h > index d4a2996..1662f3b 100644 > --- a/drivers/bus/pci/rte_bus_pci.h > +++ b/drivers/bus/pci/rte_bus_pci.h > @@ -52,6 +52,8 @@ extern "C" { > #include <sys/queue.h> > #include <stdint.h> > #include <inttypes.h> > +#include <unistd.h> > +#include <fcntl.h> > > #include <rte_debug.h> > #include <rte_interrupts.h> > @@ -197,6 +199,15 @@ int rte_pci_map_device(struct rte_pci_device *dev); > void rte_pci_unmap_device(struct rte_pci_device *dev); > > /** > + * Remap this device > + * > + * @param dev > + * A pointer to a rte_pci_device structure describing the device > + * to use > + */ > +int rte_pci_remap_device(struct rte_pci_device *dev); > + > +/** > * Dump the content of the PCI bus. > * > * @param f > @@ -333,6 +344,20 @@ void rte_pci_ioport_read(struct rte_pci_ioport *p, > void rte_pci_ioport_write(struct rte_pci_ioport *p, > const void *data, size_t len, off_t offset); > > +/** > + * It can be used to bind a device to a specific type of driver. > + * > + * @param dev_name > + * The device name. > + * @param drv_type > + * The specific driver's type. > + * > + * @return > + * - On success, zero. > + * - On failure, a negative value. 
> + */ > +int rte_pci_dev_bind_driver(const char *dev_name, const char *drv_type); > + > #ifdef __cplusplus > } > #endif > diff --git a/drivers/bus/vdev/vdev.c b/drivers/bus/vdev/vdev.c > index fd7736d..773f6e0 100644 > --- a/drivers/bus/vdev/vdev.c > +++ b/drivers/bus/vdev/vdev.c > @@ -323,6 +323,39 @@ vdev_find_device(const struct rte_device *start, > rte_dev_cmp_t cmp, > return NULL; > } > > +static struct rte_device * > +vdev_find_device_by_name(const struct rte_device *start, > + rte_dev_cmp_name_t cmp_name, > + const void *data) > +{ > + struct rte_vdev_device *dev; > + > + TAILQ_FOREACH(dev, &vdev_device_list, next) { > + if (start && &dev->device == start) { > + start = NULL; > + continue; > + } > + if (cmp_name(dev->device.name, data) == 0) > + return &dev->device; > + } > + return NULL; > +} > + > +static int > +vdev_remap_device(struct rte_device *dev) > +{ > + RTE_SET_USED(dev); > + return 0; > +} > + > +static int > +vdev_bind_driver(const char *dev_name, const char *drv_type) > +{ > + RTE_SET_USED(dev_name); > + RTE_SET_USED(drv_type); > + return 0; > +} > + > static int > vdev_plug(struct rte_device *dev) > { > @@ -339,9 +372,12 @@ static struct rte_bus rte_vdev_bus = { > .scan = vdev_scan, > .probe = vdev_probe, > .find_device = vdev_find_device, > + .find_device_by_name = vdev_find_device_by_name, > .plug = vdev_plug, > .unplug = vdev_unplug, > .parse = vdev_parse, > + .remap_device = vdev_remap_device, > + .bind_driver = vdev_bind_driver, > }; > > RTE_REGISTER_BUS(vdev, rte_vdev_bus); > diff --git a/lib/librte_eal/bsdapp/eal/eal_dev.c > b/lib/librte_eal/bsdapp/eal/eal_dev.c > new file mode 100644 > index 0000000..6ea9a74 > --- /dev/null > +++ b/lib/librte_eal/bsdapp/eal/eal_dev.c > @@ -0,0 +1,64 @@ > +/*- > + * Copyright(c) 2010-2017 Intel Corporation. > + * All rights reserved. 
> + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE > COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, > INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS > OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED > AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR > TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF > THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH > DAMAGE. 
> + */ > + > +#include <stdio.h> > +#include <string.h> > +#include <inttypes.h> > +#include <sys/queue.h> > +#include <sys/signalfd.h> > +#include <sys/ioctl.h> > +#include <sys/socket.h> > +#include <linux/netlink.h> > +#include <sys/epoll.h> > +#include <unistd.h> > +#include <signal.h> > +#include <stdbool.h> > + > +#include <rte_malloc.h> > +#include <rte_bus.h> > +#include <rte_dev.h> > +#include <rte_devargs.h> > +#include <rte_debug.h> > +#include <rte_log.h> > + > +#include "eal_thread.h" > + > +int > +rte_dev_monitor_start(void) > +{ > + return -1; > +} > + > +int > +rte_dev_monitor_stop(void) > +{ > + return -1; > +} > diff --git a/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h > b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h > new file mode 100644 > index 0000000..6a6feb5 > --- /dev/null > +++ b/lib/librte_eal/bsdapp/eal/include/exec-env/rte_dev.h > @@ -0,0 +1,106 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE > COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, > INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS > OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED > AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR > TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF > THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH > DAMAGE. > + */ > + > +#ifndef _RTE_DEV_H_ > +#error "don't include this file directly, please include generic <rte_dev.h>" > +#endif > + > +#ifndef _RTE_LINUXAPP_DEV_H_ > +#define _RTE_LINUXAPP_DEV_H_ > + > +#include <stdio.h> > + > +#include <rte_dev.h> > + > +#define RTE_EAL_UEV_MSG_LEN 4096 > +#define RTE_EAL_UEV_MSG_ELEM_LEN 128 > + > +enum uev_subsystem { > + UEV_SUBSYSTEM_UIO, > + UEV_SUBSYSTEM_VFIO, > + UEV_SUBSYSTEM_PCI, > + UEV_SUBSYSTEM_MAX > +}; > + > +enum uev_monitor_netlink_group { > + UEV_MONITOR_KERNEL, > + UEV_MONITOR_UDEV, > +}; > + > +/** > + * The device event type. 
> + */ > +enum rte_eal_dev_event_type { > + RTE_EAL_DEV_EVENT_UNKNOWN, /**< unknown event type */ > + RTE_EAL_DEV_EVENT_ADD, /**< device adding event */ > + RTE_EAL_DEV_EVENT_REMOVE, > + /**< device removing event */ > + RTE_EAL_DEV_EVENT_CHANGE, > + /**< device status change event */ > + RTE_EAL_DEV_EVENT_MOVE, /**< device sys path move > event */ > + RTE_EAL_DEV_EVENT_ONLINE, /**< device online event */ > + RTE_EAL_DEV_EVENT_OFFLINE, /**< device offline event */ > + RTE_EAL_DEV_EVENT_MAX /**< max value of this enum > */ > +}; > + > +struct rte_eal_uevent { > + enum rte_eal_dev_event_type type; /**< device event type */ > + int subsystem; /**< subsystem id */ > + char *devname; /**< device name */ > + enum uev_monitor_netlink_group group; /**< device netlink > group */ > +}; > + > +/** > + * Start the device uevent monitoring. > + * > + * @param none > + * @return > + * - On success, zero. > + * - On failure, a negative value. > + */ > +int > +rte_dev_monitor_start(void); > + > +/** > + * Stop the device uevent monitoring . > + * > + * @param none > + * @return > + * - On success, zero. > + * - On failure, a negative value. > + */ > + > +int > +rte_dev_monitor_stop(void); > + > +#endif /* _RTE_LINUXAPP_DEV_H_ */ > diff --git a/lib/librte_eal/common/eal_common_bus.c > b/lib/librte_eal/common/eal_common_bus.c > index 3e022d5..b7219c9 100644 > --- a/lib/librte_eal/common/eal_common_bus.c > +++ b/lib/librte_eal/common/eal_common_bus.c > @@ -51,8 +51,11 @@ rte_bus_register(struct rte_bus *bus) > RTE_VERIFY(bus->scan); > RTE_VERIFY(bus->probe); > RTE_VERIFY(bus->find_device); > + RTE_VERIFY(bus->find_device_by_name); > /* Buses supporting driver plug also require unplug. 
*/ > RTE_VERIFY(!bus->plug || bus->unplug); > + RTE_VERIFY(bus->remap_device); > + RTE_VERIFY(bus->bind_driver); > > TAILQ_INSERT_TAIL(&rte_bus_list, bus, next); > RTE_LOG(DEBUG, EAL, "Registered [%s] bus.\n", bus->name); > @@ -170,6 +173,14 @@ cmp_rte_device(const struct rte_device *dev1, > const void *_dev2) > } > > static int > +cmp_rte_device_name(const char *dev_name1, const void *_dev_name2) > +{ > + const char *dev_name2 = _dev_name2; > + > + return strcmp(dev_name1, dev_name2); > +} > + > +static int > bus_find_device(const struct rte_bus *bus, const void *_dev) > { > struct rte_device *dev; > @@ -178,6 +189,25 @@ bus_find_device(const struct rte_bus *bus, const > void *_dev) > return dev == NULL; > } > > +static struct rte_device * > +bus_find_device_by_name(const struct rte_bus *bus, const void > *_dev_name) > +{ > + struct rte_device *dev; > + > + dev = bus->find_device_by_name(NULL, cmp_rte_device_name, > _dev_name); > + return dev; > +} > + > +struct rte_device * > + > +rte_bus_find_device(const struct rte_bus *bus, const void *_dev_name) > +{ > + struct rte_device *dev; > + > + dev = bus_find_device_by_name(bus, _dev_name); > + return dev; > +} > + > struct rte_bus * > rte_bus_find_by_device(const struct rte_device *dev) > { > diff --git a/lib/librte_eal/common/eal_common_dev.c > b/lib/librte_eal/common/eal_common_dev.c > index dda8f58..47909e8 100644 > --- a/lib/librte_eal/common/eal_common_dev.c > +++ b/lib/librte_eal/common/eal_common_dev.c > @@ -42,9 +42,31 @@ > #include <rte_devargs.h> > #include <rte_debug.h> > #include <rte_log.h> > +#include <rte_spinlock.h> > +#include <rte_malloc.h> > > #include "eal_private.h" > > +/* spinlock for device callbacks */ > +static rte_spinlock_t rte_dev_cb_lock = RTE_SPINLOCK_INITIALIZER; > + > +/** > + * The user application callback description. > + * > + * It contains callback address to be registered by user application, > + * the pointer to the parameters for callback, and the event type. 
> + */ > +struct rte_eal_dev_callback { > + TAILQ_ENTRY(rte_eal_dev_callback) next; /**< Callbacks list */ > + rte_eal_dev_cb_fn cb_fn; /**< Callback address */ > + void *cb_arg; /**< Parameter for callback */ > + void *ret_param; /**< Return parameter */ > + enum rte_eal_dev_event_type event; /**< device event type */ > + uint32_t active; /**< Callback is executing */ > +}; > + > +static struct rte_eal_dev_callback *dev_add_cb; > + > static int cmp_detached_dev_name(const struct rte_device *dev, > const void *_name) > { > @@ -234,3 +256,150 @@ int rte_eal_hotplug_remove(const char *busname, > const char *devname) > rte_eal_devargs_remove(busname, devname); > return ret; > } > + > +int > +rte_eal_dev_monitor_enable(void) > +{ > + int ret; > + > + ret = rte_dev_monitor_start(); > + if (ret) > + RTE_LOG(ERR, EAL, "Can not init device monitor\n"); > + return ret; > +} > + > +int > +rte_dev_callback_register(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + rte_eal_dev_cb_fn cb_fn, void *cb_arg) > +{ > + struct rte_eal_dev_callback *user_cb; > + > + if (!cb_fn) > + return -EINVAL; > + What about also checking that the device pointer is not NULL? > + rte_spinlock_lock(&rte_dev_cb_lock); > + > + if (TAILQ_EMPTY(&(device->uev_cbs))) > + TAILQ_INIT(&(device->uev_cbs)); > + > + if (event == RTE_EAL_DEV_EVENT_ADD) { > + user_cb = NULL; > + } else { > + TAILQ_FOREACH(user_cb, &(device->uev_cbs), next) { > + if (user_cb->cb_fn == cb_fn && > + user_cb->cb_arg == cb_arg && > + user_cb->event == event) { > + break; > + } > + } > + } > + > + /* create a new callback. */ > + if (user_cb == NULL) { > + /* allocate a new interrupt callback entity */ > + user_cb = rte_zmalloc("eal device event", > + sizeof(*user_cb), 0); > + if (user_cb == NULL) { > + RTE_LOG(ERR, EAL, "Can not allocate memory\n"); Missing rte_spinlock_unlock() on this error path.
> + return -ENOMEM; > + } > + user_cb->cb_fn = cb_fn; > + user_cb->cb_arg = cb_arg; > + user_cb->event = event; > + if (event == RTE_EAL_DEV_EVENT_ADD) > + dev_add_cb = user_cb; Can only one DPDK entity register for the ADD callback? I suggest adding an option to register for all devices, maybe by using a dummy device that holds all the "ALL_DEVICES" callbacks per event. "All" means past, present and future devices; this way one callback can be called for all the devices, and more than one DPDK entity can register for an ADD/NEW event. What about NEW instead of ADD? I also suggest adding the device pointer as a parameter to the callback (which will be managed by EAL). > + else > + TAILQ_INSERT_TAIL(&(device->uev_cbs), user_cb, > next); > + } > + > + rte_spinlock_unlock(&rte_dev_cb_lock); > + return 0; > +} > + > +int > +rte_dev_callback_unregister(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + rte_eal_dev_cb_fn cb_fn, void *cb_arg) > +{ > + int ret; > + struct rte_eal_dev_callback *cb, *next; > + > + if (!cb_fn) > + return -EINVAL; > + > + rte_spinlock_lock(&rte_dev_cb_lock); > + > + ret = 0; > + if (event == RTE_EAL_DEV_EVENT_ADD) { > + rte_free(dev_add_cb); > + dev_add_cb = NULL; > + } else { Should the device pointer be checked for NULL here? > + for (cb = TAILQ_FIRST(&(device->uev_cbs)); cb != NULL; > + cb = next) { > + > + next = TAILQ_NEXT(cb, next); > + > + if (cb->cb_fn != cb_fn || cb->event != event || > + (cb->cb_arg != (void *)-1 && > + cb->cb_arg != cb_arg)) > + continue; > + > + /* > + * if this callback is not executing right now, > + * then remove it.
> + */ > + if (cb->active == 0) { > + TAILQ_REMOVE(&(device->uev_cbs), cb, > next); > + rte_free(cb); > + } else { > + ret = -EAGAIN; > + } > + } > + } > + rte_spinlock_unlock(&rte_dev_cb_lock); > + return ret; > +} > + > +int > +_rte_dev_callback_process(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + void *cb_arg, void *ret_param) > +{ > + struct rte_eal_dev_callback dev_cb; > + struct rte_eal_dev_callback *cb_lst; > + int rc = 0; > + > + rte_spinlock_lock(&rte_dev_cb_lock); > + if (event == RTE_EAL_DEV_EVENT_ADD) { > + if (cb_arg != NULL) > + dev_add_cb->cb_arg = cb_arg; > + > + if (ret_param != NULL) > + dev_add_cb->ret_param = ret_param; > + > + rte_spinlock_unlock(&rte_dev_cb_lock); Can't someone free it while it is running? I suggest keeping the lock held. To prevent deadlock, callbacks are then not allowed to use this mechanism. > + rc = dev_add_cb->cb_fn(dev_add_cb->event, > + dev_add_cb->cb_arg, dev_add_cb- > >ret_param); > + rte_spinlock_lock(&rte_dev_cb_lock); > + } else { > + TAILQ_FOREACH(cb_lst, &(device->uev_cbs), next) { > + if (cb_lst->cb_fn == NULL || cb_lst->event != event) > + continue; > + dev_cb = *cb_lst; > + cb_lst->active = 1; > + if (cb_arg != NULL) > + dev_cb.cb_arg = cb_arg; > + if (ret_param != NULL) > + dev_cb.ret_param = ret_param; > + > + rte_spinlock_unlock(&rte_dev_cb_lock); The current active flag doesn't make this thread-safe; I suggest keeping the lock held. Scenario: 1. Thread A sees active = 0 in the unregister function. 2. Context switch. 3. Thread B starts the callback. 4. Context switch. 5. Thread A frees it. 6. Context switch. 7. Segfault in Thread B.
> + rc = dev_cb.cb_fn(dev_cb.event, > + dev_cb.cb_arg, dev_cb.ret_param); > + rte_spinlock_lock(&rte_dev_cb_lock); > + cb_lst->active = 0; > + } > + } > + rte_spinlock_unlock(&rte_dev_cb_lock); > + return rc; > +} > diff --git a/lib/librte_eal/common/include/rte_bus.h > b/lib/librte_eal/common/include/rte_bus.h > index 6fb0834..6c4ae31 100644 > --- a/lib/librte_eal/common/include/rte_bus.h > +++ b/lib/librte_eal/common/include/rte_bus.h > @@ -122,6 +122,34 @@ typedef struct rte_device * > const void *data); > > /** > + * Device iterator to find a device on a bus. > + * > + * This function returns an rte_device if one of those held by the bus > + * matches the data passed as parameter. > + * > + * If the comparison function returns zero this function should stop > iterating > + * over any more devices. To continue a search the device of a previous > search > + * can be passed via the start parameter. > + * > + * @param cmp > + * the device name comparison function. > + * > + * @param data > + * Data to compare each device against. > + * > + * @param start > + * starting point for the iteration > + * > + * @return > + * The first device matching the data, NULL if none exists. > + */ > +typedef struct rte_device * > +(*rte_bus_find_device_by_name_t)(const struct rte_device *start, > + rte_dev_cmp_name_t cmp, > + const void *data); > + > + > +/** > * Implementation specific probe function which is responsible for linking > * devices on that bus with applicable drivers. > * > @@ -168,6 +196,37 @@ typedef int (*rte_bus_unplug_t)(struct rte_device > *dev); > typedef int (*rte_bus_parse_t)(const char *name, void *addr); > > /** > + * Implementation specific remap function which is responsible for > remmaping > + * devices on that bus from original share memory resource to a private > memory > + * resource for the sake of device has been removal. > + * > + * @param dev > + * Device pointer that was returned by a previous call to find_device. 
> + * > + * @return > + * 0 on success. > + * !0 on error. > + */ > +typedef int (*rte_bus_remap_device_t)(struct rte_device *dev); > + > +/** > + * Implementation specific bind driver function which is responsible for bind > + * a explicit type of driver with a devices on that bus. > + * > + * @param dev_name > + * device textual description. > + * > + * @param drv_type > + * driver type textual description. > + * > + * @return > + * 0 on success. > + * !0 on error. > + */ > +typedef int (*rte_bus_bind_driver_t)(const char *dev_name, > + const char *drv_type); > + > +/** > * Bus scan policies > */ > enum rte_bus_scan_mode { > @@ -206,9 +265,13 @@ struct rte_bus { > rte_bus_scan_t scan; /**< Scan for devices attached to bus */ > rte_bus_probe_t probe; /**< Probe devices on bus */ > rte_bus_find_device_t find_device; /**< Find a device on the bus */ > + rte_bus_find_device_by_name_t find_device_by_name; > + /**< Find a device on the bus */ > rte_bus_plug_t plug; /**< Probe single device for drivers */ > rte_bus_unplug_t unplug; /**< Remove single device from driver > */ > rte_bus_parse_t parse; /**< Parse a device name */ > + rte_bus_remap_device_t remap_device; /**< remap a device */ > + rte_bus_bind_driver_t bind_driver; /**< bind a driver for bus device > */ > struct rte_bus_conf conf; /**< Bus configuration */ > rte_bus_get_iommu_class_t get_iommu_class; /**< Get iommu > class */ > }; > @@ -306,6 +369,12 @@ struct rte_bus *rte_bus_find(const struct rte_bus > *start, rte_bus_cmp_t cmp, > struct rte_bus *rte_bus_find_by_device(const struct rte_device *dev); > > /** > + * Find the registered bus for a particular device. > + */ > +struct rte_device *rte_bus_find_device(const struct rte_bus *bus, > + const void *dev_name); > + > +/** > * Find the registered bus for a given name. 
> */ > struct rte_bus *rte_bus_find_by_name(const char *busname); > diff --git a/lib/librte_eal/common/include/rte_dev.h > b/lib/librte_eal/common/include/rte_dev.h > index 9342e0c..19971d0 100644 > --- a/lib/librte_eal/common/include/rte_dev.h > +++ b/lib/librte_eal/common/include/rte_dev.h > @@ -51,6 +51,15 @@ extern "C" { > > #include <rte_log.h> > > +#include <exec-env/rte_dev.h> > + > +typedef int (*rte_eal_dev_cb_fn)(enum rte_eal_dev_event_type event, > + void *cb_arg, void *ret_param); > + > +struct rte_eal_dev_callback; > +/** @internal Structure to keep track of registered callbacks */ > +TAILQ_HEAD(rte_eal_dev_cb_list, rte_eal_dev_callback); > + > __attribute__((format(printf, 2, 0))) > static inline void > rte_pmd_debug_trace(const char *func_name, const char *fmt, ...) > @@ -157,6 +166,13 @@ struct rte_driver { > */ > #define RTE_DEV_NAME_MAX_LEN 64 > > +enum device_state { > + DEVICE_UNDEFINED, > + DEVICE_FAULT, > + DEVICE_PARSED, > + DEVICE_PROBED, > +}; > + > /** > * A structure describing a generic device. > */ > @@ -166,6 +182,9 @@ struct rte_device { > const struct rte_driver *driver;/**< Associated driver */ > int numa_node; /**< NUMA node connection */ > struct rte_devargs *devargs; /**< Device user arguments */ > + enum device_state state; /**< Device state */ > + /** User application callbacks for device event */ > + struct rte_eal_dev_cb_list uev_cbs; > }; > > /** > @@ -248,6 +267,8 @@ int rte_eal_hotplug_remove(const char *busname, > const char *devname); > */ > typedef int (*rte_dev_cmp_t)(const struct rte_device *dev, const void > *data); > > +typedef int (*rte_dev_cmp_name_t)(const char *dev_name, const void > *data); > + > #define RTE_PMD_EXPORT_NAME_ARRAY(n, idx) n##idx[] > > #define RTE_PMD_EXPORT_NAME(name, idx) \ > @@ -293,4 +314,72 @@ __attribute__((used)) = str > } > #endif > > +/** > + * It enable the device event monitoring for a specific event. > + * > + * @param none > + * @return > + * - On success, zero. 
> + * - On failure, a negative value. > + */ > +int > +rte_eal_dev_monitor_enable(void); > +/** > + * It registers the callback for the specific event. Multiple > + * callbacks cal be registered at the same time. > + * @param event > + * The device event type. > + * @param cb_fn > + * callback address. > + * @param cb_arg > + * address of parameter for callback. > + * > + * @return > + * - On success, zero. > + * - On failure, a negative value. > + */ > +int rte_dev_callback_register(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + rte_eal_dev_cb_fn cb_fn, void *cb_arg); > + > +/** > + * It unregisters the callback according to the specified event. > + * > + * @param event > + * The event type which corresponding to the callback. > + * @param cb_fn > + * callback address. > + * address of parameter for callback, (void *)-1 means to remove all > + * registered which has the same callback address. > + * > + * @return > + * - On success, return the number of callback entities removed. > + * - On failure, a negative value. > + */ > +int rte_dev_callback_unregister(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + rte_eal_dev_cb_fn cb_fn, void *cb_arg); > + > +/** > + * @internal Executes all the user application registered callbacks for > + * the specific device. It is for DPDK internal user only. User > + * application should not call it directly. > + * > + * @param event > + * The device event type. > + * @param cb_arg > + * callback parameter. > + * @param ret_param > + * To pass data back to user application. > + * This allows the user application to decide if a particular function > + * is permitted or not. > + * > + * @return > + * - On success, return zero. > + * - On failure, a negative value. 
> + */ > +int > +_rte_dev_callback_process(struct rte_device *device, > + enum rte_eal_dev_event_type event, > + void *cb_arg, void *ret_param); > #endif /* _RTE_DEV_H_ */ > diff --git a/lib/librte_eal/linuxapp/eal/Makefile > b/lib/librte_eal/linuxapp/eal/Makefile > index 5a7b8b2..05a2437 100644 > --- a/lib/librte_eal/linuxapp/eal/Makefile > +++ b/lib/librte_eal/linuxapp/eal/Makefile > @@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += > eal_lcore.c > SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c > SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_interrupts.c > SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_alarm.c > +SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_dev.c > > # from common dir > SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_lcore.c > @@ -120,7 +121,7 @@ ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y) > CFLAGS_eal_thread.o += -Wno-return-type > endif > > -INC := rte_kni_common.h > +INC := rte_kni_common.h rte_dev.h > > SYMLINK-$(CONFIG_RTE_EXEC_ENV_LINUXAPP)-include/exec-env := \ > $(addprefix include/exec-env/,$(INC)) > diff --git a/lib/librte_eal/linuxapp/eal/eal_alarm.c > b/lib/librte_eal/linuxapp/eal/eal_alarm.c > index 8e4a775..29e73a7 100644 > --- a/lib/librte_eal/linuxapp/eal/eal_alarm.c > +++ b/lib/librte_eal/linuxapp/eal/eal_alarm.c > @@ -209,6 +209,7 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, > void *cb_arg) > int count = 0; > int err = 0; > int executing; > + int ret; > > if (!cb_fn) { > rte_errno = EINVAL; > @@ -259,6 +260,10 @@ rte_eal_alarm_cancel(rte_eal_alarm_callback cb_fn, > void *cb_arg) > } > ap_prev = ap; > } > + > + ret |= rte_intr_callback_unregister(&intr_handle, > + eal_alarm_callback, NULL); > + > rte_spinlock_unlock(&alarm_list_lk); > } while (executing != 0); > > diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c > b/lib/librte_eal/linuxapp/eal/eal_dev.c > new file mode 100644 > index 0000000..49fd0dc > --- /dev/null > +++ b/lib/librte_eal/linuxapp/eal/eal_dev.c > @@ -0,0 +1,356 @@ > +/*- > + * Copyright(c) 2010-2017 
Intel Corporation. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. > + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE > COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, > INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS > OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED > AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR > TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF > THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH > DAMAGE. 
> + */ > + > +#include <stdio.h> > +#include <string.h> > +#include <inttypes.h> > +#include <sys/queue.h> > +#include <sys/signalfd.h> > +#include <sys/ioctl.h> > +#include <sys/socket.h> > +#include <linux/netlink.h> > +#include <sys/epoll.h> > +#include <unistd.h> > +#include <signal.h> > +#include <stdbool.h> > + > +#include <rte_malloc.h> > +#include <rte_bus.h> > +#include <rte_dev.h> > +#include <rte_devargs.h> > +#include <rte_debug.h> > +#include <rte_log.h> > + > +#include "eal_thread.h" > + > +/* uev monitoring thread */ > +static pthread_t uev_monitor_thread; > + > +bool udev_exit = true; > + > +bool no_request_thread = true; > + > +static void sig_handler(int signum) > +{ > + if (signum == SIGINT || signum == SIGTERM) > + rte_dev_monitor_stop(); > +} > + > +static int > +dev_monitor_fd_new(void) > +{ > + > + int uevent_fd; > + > + uevent_fd = socket(PF_NETLINK, SOCK_RAW | SOCK_CLOEXEC | > + SOCK_NONBLOCK, > + NETLINK_KOBJECT_UEVENT); > + if (uevent_fd < 0) { > + RTE_LOG(ERR, EAL, "create uevent fd failed\n"); > + return -1; > + } > + return uevent_fd; > +} > + > +static int > +dev_monitor_enable(int netlink_fd) > +{ > + struct sockaddr_nl addr; > + int ret; > + int size = 64 * 1024; > + int nonblock = 1; > + > + memset(&addr, 0, sizeof(addr)); > + addr.nl_family = AF_NETLINK; > + addr.nl_pid = 0; > + addr.nl_groups = 0xffffffff; > + > + if (bind(netlink_fd, (struct sockaddr *) &addr, sizeof(addr)) < 0) { > + RTE_LOG(ERR, EAL, "bind failed\n"); > + goto err; > + } > + > + setsockopt(netlink_fd, SOL_SOCKET, SO_PASSCRED, &size, > sizeof(size)); > + > + ret = ioctl(netlink_fd, FIONBIO, &nonblock); > + if (ret != 0) { > + RTE_LOG(ERR, EAL, "ioctl(FIONBIO) failed\n"); > + goto err; > + } > + return 0; > +err: > + close(netlink_fd); > + return -1; > +} > + > +static void > +dev_uev_parse(const char *buf, struct rte_eal_uevent *event) > +{ > + char action[RTE_EAL_UEV_MSG_ELEM_LEN]; > + char subsystem[RTE_EAL_UEV_MSG_ELEM_LEN]; > + char 
dev_path[RTE_EAL_UEV_MSG_ELEM_LEN]; > + char pci_slot_name[RTE_EAL_UEV_MSG_ELEM_LEN]; > + int i = 0; > + > + memset(action, 0, RTE_EAL_UEV_MSG_ELEM_LEN); > + memset(subsystem, 0, RTE_EAL_UEV_MSG_ELEM_LEN); > + memset(dev_path, 0, RTE_EAL_UEV_MSG_ELEM_LEN); > + memset(pci_slot_name, 0, RTE_EAL_UEV_MSG_ELEM_LEN); > + > + while (i < RTE_EAL_UEV_MSG_LEN) { > + for (; i < RTE_EAL_UEV_MSG_LEN; i++) { > + if (*buf) > + break; > + buf++; > + } > + if (!strncmp(buf, "libudev", 7)) { > + buf += 7; > + i += 7; > + event->group = UEV_MONITOR_UDEV; > + } > + if (!strncmp(buf, "ACTION=", 7)) { > + buf += 7; > + i += 7; > + snprintf(action, sizeof(action), "%s", buf); > + } else if (!strncmp(buf, "DEVPATH=", 8)) { > + buf += 8; > + i += 8; > + snprintf(dev_path, sizeof(dev_path), "%s", buf); > + } else if (!strncmp(buf, "SUBSYSTEM=", 10)) { > + buf += 10; > + i += 10; > + snprintf(subsystem, sizeof(subsystem), "%s", buf); > + } else if (!strncmp(buf, "PCI_SLOT_NAME=", 14)) { > + buf += 14; > + i += 14; > + snprintf(pci_slot_name, sizeof(subsystem), "%s", > buf); > + event->devname = pci_slot_name; > + } > + for (; i < RTE_EAL_UEV_MSG_LEN; i++) { > + if (*buf == '\0') > + break; > + buf++; > + } > + } > + > + if (!strncmp(subsystem, "pci", 3)) > + event->subsystem = UEV_SUBSYSTEM_PCI; > + if (!strncmp(action, "add", 3)) > + event->type = RTE_EAL_DEV_EVENT_ADD; > + if (!strncmp(action, "remove", 6)) > + event->type = RTE_EAL_DEV_EVENT_REMOVE; > + event->devname = pci_slot_name; > +} > + > +static int > +dev_uev_receive(int fd, struct rte_eal_uevent *uevent) > +{ > + int ret; > + char buf[RTE_EAL_UEV_MSG_LEN]; > + > + memset(uevent, 0, sizeof(struct rte_eal_uevent)); > + memset(buf, 0, RTE_EAL_UEV_MSG_LEN); > + > + ret = recv(fd, buf, RTE_EAL_UEV_MSG_LEN - 1, MSG_DONTWAIT); > + if (ret < 0) { > + RTE_LOG(ERR, EAL, > + "Socket read error(%d): %s\n", > + errno, strerror(errno)); > + return -1; > + } else if (ret == 0) > + /* connection closed */ > + return -1; > + > + 
dev_uev_parse(buf, uevent); > + > + return 0; > +} > + > +static int > +dev_uev_process(struct epoll_event *events, int nfds) > +{ > + struct rte_bus *bus; > + struct rte_device *dev; > + struct rte_eal_uevent uevent; > + int ret; > + int i; > + > + for (i = 0; i < nfds; i++) { > + /** > + * check device uevent from kernel side, no need to check > + * uevent from udev. > + */ > + if ((dev_uev_receive(events[i].data.fd, &uevent)) || > + (uevent.group == UEV_MONITOR_UDEV)) > + return 0; > + > + /* default handle all pci devcie when is being hot plug */ > + if (uevent.subsystem == UEV_SUBSYSTEM_PCI) { > + bus = rte_bus_find_by_name("pci"); > + dev = rte_bus_find_device(bus, uevent.devname); > + if (uevent.type == RTE_EAL_DEV_EVENT_REMOVE) { > + > + if ((!dev) || dev->state == > DEVICE_UNDEFINED) > + return 0; > + dev->state = DEVICE_FAULT; > + > + /** > + * remap the resource to be fake > + * before user's removal processing > + */ > + ret = bus->remap_device(dev); > + if (!ret) > + > return(_rte_dev_callback_process(dev, > + RTE_EAL_DEV_EVENT_REMOVE, > + NULL, NULL)); What is the reason to keep this device in the EAL device list after the removal? I suggest removing it (driver remove, bus remove and EAL remove) after the callbacks have run. This way, EAL can initiate all device removals. > + } else if (uevent.type == RTE_EAL_DEV_EVENT_ADD) > { > + if (dev == NULL) { > + /** > + * bind the driver to the device > + * before user's add processing > + */ > + bus->bind_driver( > + uevent.devname, > + "igb_uio"); > + Similar comment here: EAL can initiate all device probe operations by adding the device and probing it here before the callbacks run. Then the device pointer can also be passed to the callbacks. > return(_rte_dev_callback_process(NULL, > + RTE_EAL_DEV_EVENT_ADD, > + uevent.devname, NULL)); > + } > + } > + } > + } > + return 0; > +} > + > +/** > + * It builds/rebuilds up the epoll file descriptor with all the > + * file descriptors being waited on. 
Then handles the interrupts. > + * > + * @param arg > + * pointer. (unused) > + * > + * @return > + * never return; > + */ > +static __attribute__((noreturn)) void * > +dev_uev_monitoring(__rte_unused void *arg) > +{ > + struct sigaction act; > + sigset_t mask; > + int netlink_fd; > + struct epoll_event ep_kernel; > + int fd_ep; > + > + udev_exit = false; > + > + /* set signal handlers */ > + memset(&act, 0x00, sizeof(struct sigaction)); > + act.sa_handler = sig_handler; > + sigemptyset(&act.sa_mask); > + act.sa_flags = SA_RESTART; > + sigaction(SIGINT, &act, NULL); > + sigaction(SIGTERM, &act, NULL); > + sigemptyset(&mask); > + sigaddset(&mask, SIGINT); > + sigaddset(&mask, SIGTERM); > + sigprocmask(SIG_UNBLOCK, &mask, NULL); > + > + fd_ep = epoll_create1(EPOLL_CLOEXEC); > + if (fd_ep < 0) { > + RTE_LOG(ERR, EAL, "error creating epoll fd: %m\n"); > + goto out; > + } > + > + netlink_fd = dev_monitor_fd_new(); > + > + if (dev_monitor_enable(netlink_fd) < 0) { > + RTE_LOG(ERR, EAL, "error subscribing to kernel events\n"); > + goto out; > + } > + > + memset(&ep_kernel, 0, sizeof(struct epoll_event)); > + ep_kernel.events = EPOLLIN | EPOLLPRI | EPOLLRDHUP | EPOLLHUP; > + ep_kernel.data.fd = netlink_fd; > + if (epoll_ctl(fd_ep, EPOLL_CTL_ADD, netlink_fd, > + &ep_kernel) < 0) { > + RTE_LOG(ERR, EAL, "error addding fd to epoll: %m\n"); > + goto out; > + } > + > + while (!udev_exit) { > + int fdcount; > + struct epoll_event ev[1]; > + > + fdcount = epoll_wait(fd_ep, ev, 1, -1); > + if (fdcount < 0) { > + if (errno != EINTR) > + RTE_LOG(ERR, EAL, "error receiving uevent " > + "message: %m\n"); > + continue; > + } > + > + /* epoll_wait has at least one fd ready to read */ > + if (dev_uev_process(ev, fdcount) < 0) { > + if (errno != EINTR) > + RTE_LOG(ERR, EAL, "error processing uevent > " > + "message: %m\n"); > + } > + } > +out: > + if (fd_ep >= 0) > + close(fd_ep); > + if (netlink_fd >= 0) > + close(netlink_fd); > + rte_panic("uev monitoring fail\n"); > +} > + > +int > 
+rte_dev_monitor_start(void) > +{ Maybe also add an option to run it via a new EAL command line parameter? > + int ret; > + > + if (!no_request_thread) > + return 0; > + no_request_thread = false; > + > + /* create the host thread to wait/handle the uevent from kernel */ > + ret = pthread_create(&uev_monitor_thread, NULL, > + dev_uev_monitoring, NULL); What is the reason to open a new thread for hotplug? Why not use the current DPDK host thread via the alarm mechanism? > + return ret; > +} > + > +int > +rte_dev_monitor_stop(void) > +{ > + udev_exit = true; > + no_request_thread = true; > + return 0; > +} > diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h > new file mode 100644 > index 0000000..6a6feb5 > --- /dev/null > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_dev.h > @@ -0,0 +1,106 @@ > +/*- > + * BSD LICENSE > + * > + * Copyright(c) 2010-2014 Intel Corporation. All rights reserved. > + * All rights reserved. > + * > + * Redistribution and use in source and binary forms, with or without > + * modification, are permitted provided that the following conditions > + * are met: > + * > + * * Redistributions of source code must retain the above copyright > + * notice, this list of conditions and the following disclaimer. > + * * Redistributions in binary form must reproduce the above copyright > + * notice, this list of conditions and the following disclaimer in > + * the documentation and/or other materials provided with the > + * distribution. > + * * Neither the name of Intel Corporation nor the names of its > + * contributors may be used to endorse or promote products derived > + * from this software without specific prior written permission. 
> + * > + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > CONTRIBUTORS > + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > FITNESS FOR > + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE > COPYRIGHT > + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, > INCIDENTAL, > + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT > NOT > + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS > OF USE, > + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED > AND ON ANY > + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR > TORT > + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF > THE USE > + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH > DAMAGE. > + */ > + > +#ifndef _RTE_DEV_H_ > +#error "don't include this file directly, please include generic <rte_dev.h>" > +#endif > + > +#ifndef _RTE_LINUXAPP_DEV_H_ > +#define _RTE_LINUXAPP_DEV_H_ > + > +#include <stdio.h> > + > +#include <rte_dev.h> > + > +#define RTE_EAL_UEV_MSG_LEN 4096 > +#define RTE_EAL_UEV_MSG_ELEM_LEN 128 > + > +enum uev_subsystem { > + UEV_SUBSYSTEM_UIO, > + UEV_SUBSYSTEM_VFIO, > + UEV_SUBSYSTEM_PCI, > + UEV_SUBSYSTEM_MAX > +}; > + > +enum uev_monitor_netlink_group { > + UEV_MONITOR_KERNEL, > + UEV_MONITOR_UDEV, > +}; > + > +/** > + * The device event type. 
> + */ > +enum rte_eal_dev_event_type { > + RTE_EAL_DEV_EVENT_UNKNOWN, /**< unknown event type */ > + RTE_EAL_DEV_EVENT_ADD, /**< device adding event */ > + RTE_EAL_DEV_EVENT_REMOVE, > + /**< device removing event */ > + RTE_EAL_DEV_EVENT_CHANGE, > + /**< device status change event */ > + RTE_EAL_DEV_EVENT_MOVE, /**< device sys path move > event */ > + RTE_EAL_DEV_EVENT_ONLINE, /**< device online event */ > + RTE_EAL_DEV_EVENT_OFFLINE, /**< device offline event */ > + RTE_EAL_DEV_EVENT_MAX /**< max value of this enum > */ > +}; > + > +struct rte_eal_uevent { > + enum rte_eal_dev_event_type type; /**< device event type */ > + int subsystem; /**< subsystem id */ > + char *devname; /**< device name */ > + enum uev_monitor_netlink_group group; /**< device netlink > group */ > +}; > + > +/** > + * Start the device uevent monitoring. > + * > + * @param none > + * @return > + * - On success, zero. > + * - On failure, a negative value. > + */ > +int > +rte_dev_monitor_start(void); > + > +/** > + * Stop the device uevent monitoring . > + * > + * @param none > + * @return > + * - On success, zero. > + * - On failure, a negative value. 
> + */ > + > +int > +rte_dev_monitor_stop(void); > + > +#endif /* _RTE_LINUXAPP_DEV_H_ */ > diff --git a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c > b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c > index a3a98c1..d0e07b4 100644 > --- a/lib/librte_eal/linuxapp/igb_uio/igb_uio.c > +++ b/lib/librte_eal/linuxapp/igb_uio/igb_uio.c > @@ -354,6 +354,12 @@ igbuio_pci_release(struct uio_info *info, struct > inode *inode) > struct rte_uio_pci_dev *udev = info->priv; > struct pci_dev *dev = udev->pdev; > > + /* check if device have been remove before release */ > + if ((&dev->dev.kobj)->state_remove_uevent_sent == 1) { > + pr_info("The device have been removed\n"); > + return -1; > + } > + > /* disable interrupts */ > igbuio_pci_disable_interrupts(udev); > > diff --git a/lib/librte_pci/rte_pci.c b/lib/librte_pci/rte_pci.c > index 0160fc1..feb5fd7 100644 > --- a/lib/librte_pci/rte_pci.c > +++ b/lib/librte_pci/rte_pci.c > @@ -172,6 +172,26 @@ rte_pci_addr_parse(const char *str, struct > rte_pci_addr *addr) > return -1; > } > > +/* map a private resource from an address*/ > +void * > +pci_map_private_resource(void *requested_addr, off_t offset, size_t size) > +{ > + void *mapaddr; > + > + mapaddr = mmap(requested_addr, size, > + PROT_READ | PROT_WRITE, > + MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, > -1, 0); > + if (mapaddr == MAP_FAILED) { > + RTE_LOG(ERR, EAL, "%s(): cannot mmap(%p, 0x%lx, 0x%lx): " > + "%s (%p)\n", > + __func__, requested_addr, > + (unsigned long)size, (unsigned long)offset, > + strerror(errno), mapaddr); > + } else > + RTE_LOG(DEBUG, EAL, " PCI memory mapped at %p\n", > mapaddr); > + > + return mapaddr; > +} > > /* map a particular resource from a file */ > void * > diff --git a/lib/librte_pci/rte_pci.h b/lib/librte_pci/rte_pci.h > index 4f2cd18..f6091a6 100644 > --- a/lib/librte_pci/rte_pci.h > +++ b/lib/librte_pci/rte_pci.h > @@ -227,6 +227,23 @@ int rte_pci_addr_cmp(const struct rte_pci_addr > *addr, > int rte_pci_addr_parse(const char *str, struct 
rte_pci_addr *addr); > > /** > + * @internal > + * Map to a particular private resource. > + * > + * @param requested_addr > + * The starting address for the new mapping range. > + * @param offset > + * The offset for the mapping range. > + * @param size > + * The size for the mapping range. > + * @return > + * - On success, the function returns a pointer to the mapped area. > + * - On error, the value MAP_FAILED is returned. > + */ > +void *pci_map_private_resource(void *requested_addr, off_t offset, > + size_t size); > + > +/** > * Map a particular resource from a file. > * > * @param requested_addr > -- > 2.7.4