This patch introduces a failure handler mechanism to handle device
hot-unplug events. When a device is hot-unplugged, its resources
become invalid; if they are still unexpectedly read or written, the
system will crash. This patch lets EAL help the application handle
this fault: when a SIGBUS error occurs, check the failure address and
remap the invalid memory for the corresponding device accordingly,
which guarantees that the application is not shut down on hot unplug.
Signed-off-by: Jeff Guo <jia....@intel.com>
---
v21->v20:
synchronize failure handling to fix the multi-process issue
---
lib/librte_eal/linuxapp/eal/eal_dev.c | 154 +++++++++++++++++++++++++++++++++-
1 file changed, 153 insertions(+), 1 deletion(-)
diff --git a/lib/librte_eal/linuxapp/eal/eal_dev.c
b/lib/librte_eal/linuxapp/eal/eal_dev.c
index 1cf6aeb..3067f39 100644
--- a/lib/librte_eal/linuxapp/eal/eal_dev.c
+++ b/lib/librte_eal/linuxapp/eal/eal_dev.c
@@ -4,6 +4,8 @@
#include <string.h>
#include <unistd.h>
+#include <fcntl.h>
+#include <signal.h>
#include <sys/socket.h>
#include <linux/netlink.h>
@@ -14,15 +16,27 @@
#include <rte_malloc.h>
#include <rte_interrupts.h>
#include <rte_alarm.h>
+#include <rte_bus.h>
+#include <rte_eal.h>
+#include <rte_spinlock.h>
#include "eal_private.h"
static struct rte_intr_handle intr_handle = {.fd = -1 };
static bool monitor_started;
+extern struct rte_bus_list rte_bus_list;
+
#define EAL_UEV_MSG_LEN 4096
#define EAL_UEV_MSG_ELEM_LEN 128
+/* spinlock for device failure process */
+static rte_spinlock_t dev_failure_lock = RTE_SPINLOCK_INITIALIZER;
+
+static struct sigaction sigbus_action_old;
+
+static int sigbus_need_recover;
+
static void dev_uev_handler(__rte_unused void *param);
/* identify the system layer which reports this event. */
@@ -34,6 +48,93 @@ enum eal_dev_event_subsystem {
};
+/**
+ * Ask the bus layer to handle a device hot-unplug failure.
+ *
+ * If @p dev is given, only the bus found for dev->name is asked. Otherwise
+ * every registered bus that provides a handle_hot_unplug op is tried in
+ * turn with the failure address until one of them returns 0.
+ *
+ * @param dev
+ *   Device that failed, or NULL when only the failure address is known.
+ * @param dev_addr
+ *   Failure address, or NULL when only the device is known.
+ * @return
+ *   0 on success, -EINVAL if both arguments are NULL, -ENOENT if no bus
+ *   could be found, -ENOTSUP if the bus has no handle_hot_unplug op,
+ *   otherwise the non-zero value returned by the bus op.
+ */
static int
+dev_uev_failure_process(struct rte_device *dev, void *dev_addr)
+{
+	struct rte_bus *bus;
+	int ret;
+
+	if (!dev && !dev_addr)
+		return -EINVAL;
+
+	if (dev) {
+		bus = rte_bus_find_by_device_name(dev->name);
+		/* The lookup may fail; do not dereference a NULL bus. */
+		if (!bus) {
+			RTE_LOG(ERR, EAL,
+				"Cannot find bus for device %s.\n",
+				dev->name);
+			return -ENOENT;
+		}
+		if (!bus->handle_hot_unplug) {
+			RTE_LOG(ERR, EAL,
+				"Not support handle hot unplug for bus %s!\n",
+				bus->name);
+			return -ENOTSUP;
+		}
+		/* call bus ops to handle hot unplug. */
+		ret = bus->handle_hot_unplug(dev, dev_addr);
+		if (ret) {
+			RTE_LOG(ERR, EAL,
+				"Cannot handle hot unplug "
+				"for device %s "
+				"on the bus %s.\n",
+				dev->name, bus->name);
+		}
+		return ret;
+	}
+
+	/*
+	 * Only the failure address is known: try each bus in turn.
+	 * Start from an error so that an empty bus list is not
+	 * reported as a successfully handled failure.
+	 */
+	ret = -ENOENT;
+	TAILQ_FOREACH(bus, &rte_bus_list, next) {
+		if (!bus->handle_hot_unplug) {
+			RTE_LOG(ERR, EAL,
+				"Not support handle hot unplug "
+				"for bus %s!\n", bus->name);
+			ret = -ENOTSUP;
+			continue;
+		}
+		/* call bus ops to handle hot unplug. */
+		ret = bus->handle_hot_unplug(dev, dev_addr);
+		if (!ret)
+			break;
+		RTE_LOG(ERR, EAL,
+			"Cannot handle hot unplug "
+			"for the device "
+			"on the bus %s!\n", bus->name);
+	}
+	return ret;
+}
+
+/*
+ * Install the SIGBUS action stored in sigbus_action_old, but only while
+ * sigbus_need_recover is set; the flag is cleared afterwards so that a
+ * repeated call does nothing.
+ */
+static void
+sigbus_action_recover(void)
+{
+	if (!sigbus_need_recover)
+		return;
+
+	sigaction(SIGBUS, &sigbus_action_old, NULL);
+	sigbus_need_recover = 0;
+}
+
+static void sigbus_handler(int signum __rte_unused, siginfo_t *info,
+ void *ctx __rte_unused)
+{
+ int ret;
+
+ RTE_LOG(DEBUG, EAL, "Thread[%d] catch SIGBUS, fault address:%p\n",
+ (int)pthread_self(), info->si_addr);
+ rte_spinlock_lock(&dev_failure_lock);
+ ret = dev_uev_failure_process(NULL, info->si_addr);
+ rte_spinlock_unlock(&dev_failure_lock);
+ if (!ret)
+ RTE_LOG(DEBUG, EAL,
+ "Success to handle SIGBUS error for hot unplug!\n");
+ else
+ rte_exit(EXIT_FAILURE, "exit for SIGBUS error!");