This commit adds a preliminary Linux implementation of the IF Proxy
library.  It should allow one to play around with the idea and check its
usefulness.

Signed-off-by: Andrzej Ostruszka <aostrus...@marvell.com>
---
 config/common_base                            |   5 +
 lib/Makefile                                  |   2 +
 .../common/include/rte_eal_interrupts.h       |   2 +
 lib/librte_eal/linux/eal/eal_interrupts.c     |  14 +-
 lib/librte_if_proxy/Makefile                  |  25 +
 lib/librte_if_proxy/meson.build               |   7 +
 lib/librte_if_proxy/rte_if_proxy.c            | 803 ++++++++++++++++++
 lib/meson.build                               |   2 +-
 8 files changed, 855 insertions(+), 5 deletions(-)
 create mode 100644 lib/librte_if_proxy/Makefile
 create mode 100644 lib/librte_if_proxy/meson.build
 create mode 100644 lib/librte_if_proxy/rte_if_proxy.c

diff --git a/config/common_base b/config/common_base
index 7dec7ed45..f20296750 100644
--- a/config/common_base
+++ b/config/common_base
@@ -1056,6 +1056,11 @@ CONFIG_RTE_LIBRTE_BPF_ELF=n
 #
 CONFIG_RTE_LIBRTE_IPSEC=y
 
+#
+# Compile librte_if_proxy
+#
+CONFIG_RTE_LIBRTE_IF_PROXY=y
+
 #
 # Compile the test application
 #
diff --git a/lib/Makefile b/lib/Makefile
index 46b91ae1a..0a60f3656 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -118,6 +118,8 @@ DIRS-$(CONFIG_RTE_LIBRTE_TELEMETRY) += librte_telemetry
 DEPDIRS-librte_telemetry := librte_eal librte_metrics librte_ethdev
 DIRS-$(CONFIG_RTE_LIBRTE_RCU) += librte_rcu
 DEPDIRS-librte_rcu := librte_eal
+DIRS-$(CONFIG_RTE_LIBRTE_IF_PROXY) += librte_if_proxy
+DEPDIRS-librte_if_proxy := librte_eal
 
 ifeq ($(CONFIG_RTE_EXEC_ENV_LINUX),y)
 DIRS-$(CONFIG_RTE_LIBRTE_KNI) += librte_kni
diff --git a/lib/librte_eal/common/include/rte_eal_interrupts.h b/lib/librte_eal/common/include/rte_eal_interrupts.h
index b370c0d26..f3d39a5ce 100644
--- a/lib/librte_eal/common/include/rte_eal_interrupts.h
+++ b/lib/librte_eal/common/include/rte_eal_interrupts.h
@@ -35,7 +35,9 @@ enum rte_intr_handle_type {
        RTE_INTR_HANDLE_EXT,          /**< external handler */
        RTE_INTR_HANDLE_VDEV,         /**< virtual device */
        RTE_INTR_HANDLE_DEV_EVENT,    /**< device event handle */
+       RTE_INTR_HANDLE_NETLINK,      /**< netlink notification handle */
        RTE_INTR_HANDLE_VFIO_REQ,     /**< VFIO request handle */
+
        RTE_INTR_HANDLE_MAX           /**< count of elements */
 };
 
diff --git a/lib/librte_eal/linux/eal/eal_interrupts.c b/lib/librte_eal/linux/eal/eal_interrupts.c
index 14ebb108c..ccdd94002 100644
--- a/lib/librte_eal/linux/eal/eal_interrupts.c
+++ b/lib/librte_eal/linux/eal/eal_interrupts.c
@@ -680,6 +680,9 @@ rte_intr_enable(const struct rte_intr_handle *intr_handle)
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+       case RTE_INTR_HANDLE_NETLINK:
+#endif
                return -1;
 #ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -796,6 +799,9 @@ rte_intr_disable(const struct rte_intr_handle *intr_handle)
                break;
        /* not used at this moment */
        case RTE_INTR_HANDLE_ALARM:
+#if RTE_LIBRTE_IF_PROXY
+       case RTE_INTR_HANDLE_NETLINK:
+#endif
                return -1;
 #ifdef VFIO_PRESENT
        case RTE_INTR_HANDLE_VFIO_MSIX:
@@ -889,12 +895,12 @@ eal_intr_process_interrupts(struct epoll_event *events, int nfds)
                        break;
 #endif
 #endif
-               case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_EXT:
-                       bytes_read = 0;
-                       call = true;
-                       break;
+               case RTE_INTR_HANDLE_VDEV:
                case RTE_INTR_HANDLE_DEV_EVENT:
+#if RTE_LIBRTE_IF_PROXY
+               case RTE_INTR_HANDLE_NETLINK:
+#endif
                        bytes_read = 0;
                        call = true;
                        break;
diff --git a/lib/librte_if_proxy/Makefile b/lib/librte_if_proxy/Makefile
new file mode 100644
index 000000000..9dd5f4791
--- /dev/null
+++ b/lib/librte_if_proxy/Makefile
@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# library name
+LIB = librte_if_proxy.a
+
+CFLAGS += -DALLOW_EXPERIMENTAL_API
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR)
+LDLIBS += -lrte_eal
+
+EXPORT_MAP := rte_if_proxy_version.map
+
+LIBABIVER := 1
+
+# all source are stored in SRCS-y
+SRCS-$(CONFIG_RTE_LIBRTE_IF_PROXY) := rte_if_proxy.c
+
+# install this header file
+SYMLINK-$(CONFIG_RTE_LIBRTE_IF_PROXY)-include := rte_if_proxy.h
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_if_proxy/meson.build b/lib/librte_if_proxy/meson.build
new file mode 100644
index 000000000..f9ed410b6
--- /dev/null
+++ b/lib/librte_if_proxy/meson.build
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright(C) 2019 Marvell International Ltd.
+
+version = 1
+allow_experimental_apis = true
+sources = files('rte_if_proxy.c')
+headers = files('rte_if_proxy.h')
diff --git a/lib/librte_if_proxy/rte_if_proxy.c b/lib/librte_if_proxy/rte_if_proxy.c
new file mode 100644
index 000000000..770462702
--- /dev/null
+++ b/lib/librte_if_proxy/rte_if_proxy.c
@@ -0,0 +1,803 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(C) 2020 Marvell International Ltd.
+ */
+
+#include <rte_if_proxy.h>
+#include <rte_interrupts.h>
+#include <rte_spinlock.h>
+#include <rte_string_fns.h>
+
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/rtnetlink.h>
+#include <linux/if.h>
+#include <sys/socket.h>
+#include <sys/queue.h>
+
+/* Log type used by IFPX_LOG; assigned from rte_log_register() in the
+ * library constructor at the end of this file.
+ */
+static
+int ifpx_log_type;
+/* Log helper: prefixes the message with the calling function name and
+ * appends a newline, so messages are passed without a trailing '\n'.
+ */
+#define IFPX_LOG(level, fmt, args...) \
+       rte_log(RTE_LOG_ ## level, ifpx_log_type, "%s(): " fmt "\n", \
+               __func__, ##args)
+
+/* Interrupt handle wrapping the netlink socket.  fd == -1 means that the
+ * library is not listening (no socket is open).
+ */
+static
+struct rte_intr_handle ifpx_irq = {
+       .type = RTE_INTR_HANDLE_NETLINK,
+       .fd = -1,
+};
+
+/* Netlink port ID of our socket (read back with getsockname()).  It is
+ * compared against nlmsg_pid to recognize replies to our own requests.
+ */
+static
+unsigned int ifpx_pid;
+
+/* Port to proxy mapping table: indexed by port id, holds the id of the
+ * proxy the port is bound to or RTE_MAX_ETHPORTS when it is not bound.
+ */
+static uint16_t ifpx_p2p[RTE_MAX_ETHPORTS];
+
+/* Since this library is really slow/config path we guard data structures with
+ * a lock - and only one for all of them should be enough.  But only callback
+ * and proxies lists are protected, I don't expect the need to protect port to
+ * proxy map table above.
+ */
+static
+rte_spinlock_t ifpx_lock = RTE_SPINLOCK_INITIALIZER;
+
+/* List of configured proxies: one node per proxy port together with the
+ * interface information cached for it.
+ */
+struct ifpx_proxies_node {
+       TAILQ_ENTRY(ifpx_proxies_node) elem;
+       uint16_t proxy_id;
+       struct rte_ifpx_info info;
+};
+static
+TAILQ_HEAD(ifpx_proxies_head, ifpx_proxies_node) ifpx_proxies =
+               TAILQ_HEAD_INITIALIZER(ifpx_proxies);
+
+/* List of registered callbacks: each node holds a private copy of the
+ * callback set; the address of that copy serves as the user's handle.
+ */
+struct ifpx_cbs_node {
+       TAILQ_ENTRY(ifpx_cbs_node) elem;
+       struct rte_ifpx_callbacks cbs;
+};
+static
+TAILQ_HEAD(ifpx_cbs_head, ifpx_cbs_node) ifpx_callbacks =
+               TAILQ_HEAD_INITIALIZER(ifpx_callbacks);
+
+/* Forward declaration: sends a netlink dump request of the given type
+ * (RTM_GETLINK/RTM_GETADDR/RTM_GETROUTE); index 0 is used for global dumps.
+ */
+static
+int request_info(int type, int index);
+
+/* Return the bitmask of all event kinds this implementation can report. */
+uint64_t rte_ifpx_callbacks_available(void)
+{
+       const uint64_t link_events = RTE_IFPX_MAC_CHANGE |
+               RTE_IFPX_MTU_CHANGE | RTE_IFPX_LINK_CHANGE;
+       const uint64_t addr_events = RTE_IFPX_ADDR_ADD | RTE_IFPX_ADDR_DEL |
+               RTE_IFPX_ADDR6_ADD | RTE_IFPX_ADDR6_DEL;
+       const uint64_t route_events = RTE_IFPX_ROUTE_ADD |
+               RTE_IFPX_ROUTE_DEL | RTE_IFPX_ROUTE6_ADD |
+               RTE_IFPX_ROUTE6_DEL;
+
+       return link_events | addr_events | route_events;
+}
+
+/* Create a proxy port of the requested type.
+ *
+ * The device name is built from the driver name ("net_tap" or "net_kni")
+ * followed by the number of ethdev ports already served by that driver.
+ * Returns the result of rte_ifpx_create_by_devarg(), or RTE_MAX_ETHPORTS
+ * when the type is not known.
+ */
+uint16_t rte_ifpx_create(enum rte_ifpx_type type)
+{
+       const char *drv_name;
+       char name[16] = { '\0' };
+       uint16_t pid;
+       int same_drv = 0;
+       int prefix_len;
+
+       /* Map the proxy type onto the driver implementing it. */
+       if (type == RTE_IFPX_DEFAULT || type == RTE_IFPX_TAP) {
+               drv_name = "net_tap";
+       } else if (type == RTE_IFPX_KNI) {
+               drv_name = "net_kni";
+       } else {
+               IFPX_LOG(ERR, "Unknown proxy type: %d", type);
+               return RTE_MAX_ETHPORTS;
+       }
+       prefix_len = strlcpy(name, drv_name, sizeof(name));
+
+       /* Count the ports that already use this driver ... */
+       RTE_ETH_FOREACH_DEV(pid) {
+               if (strcmp(rte_eth_devices[pid].device->driver->name,
+                          name) != 0)
+                       continue;
+               ++same_drv;
+       }
+       /* ... and use that count as the numeric suffix of the new device. */
+       snprintf(name + prefix_len, sizeof(name) - prefix_len, "%d",
+                same_drv);
+
+       return rte_ifpx_create_by_devarg(name);
+}
+
+/* Create a proxy port from a device argument string.
+ *
+ * Probes the device and returns the id of the first ethdev port matching
+ * 'devarg'.  Returns RTE_MAX_ETHPORTS when 'devarg' is NULL, when probing
+ * fails or when no matching port is found.
+ */
+uint16_t rte_ifpx_create_by_devarg(const char *devarg)
+{
+       uint16_t port_id = RTE_MAX_ETHPORTS;
+       struct rte_dev_iterator iter;
+
+       if (!devarg) {
+               IFPX_LOG(ERR, "Missing proxy device argument");
+               return RTE_MAX_ETHPORTS;
+       }
+
+       if (rte_dev_probe(devarg) < 0) {
+               /* IFPX_LOG appends the newline itself */
+               IFPX_LOG(ERR, "Failed to create proxy port %s", devarg);
+               return RTE_MAX_ETHPORTS;
+       }
+
+       /* Only the first match is of interest. */
+       RTE_ETH_FOREACH_MATCHING_DEV(port_id, devarg, &iter) {
+               break;
+       }
+       /* The iteration was left early, so release the iterator by hand. */
+       if (port_id != RTE_MAX_ETHPORTS)
+               rte_eth_iterator_cleanup(&iter);
+
+       return port_id;
+}
+
+/* Destroy a proxy: drop it from the list of proxies, clear every port
+ * binding that refers to it and remove the underlying device.
+ *
+ * Returns -EINVAL when 'proxy_id' is not a known proxy, otherwise the
+ * result of rte_dev_remove().
+ */
+int rte_ifpx_destroy(uint16_t proxy_id)
+{
+       struct ifpx_proxies_node *px;
+       unsigned int i;
+       int ec = 0;
+
+       rte_spinlock_lock(&ifpx_lock);
+       /* Stop at the matching node - TAILQ_FOREACH leaves 'px' NULL only
+        * when the whole list has been walked without a match.
+        */
+       TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+               if (px->proxy_id == proxy_id)
+                       break;
+       }
+       if (!px) {
+               ec = -EINVAL;
+               goto exit;
+       }
+       TAILQ_REMOVE(&ifpx_proxies, px, elem);
+       free(px);
+
+       /* Clear any bindings for this proxy. */
+       for (i = 0; i < RTE_DIM(ifpx_p2p); ++i) {
+               if (ifpx_p2p[i] == proxy_id)
+                       ifpx_p2p[i] = RTE_MAX_ETHPORTS;
+       }
+
+       ec = rte_dev_remove(rte_eth_devices[proxy_id].device);
+exit:
+       rte_spinlock_unlock(&ifpx_lock);
+       return ec;
+}
+
+/* Bind a port to a proxy.
+ *
+ * The proxy is added to the list of known proxies on first use.  A port that
+ * is already bound gets rebound (with a warning).  When the library is
+ * listening, a link info request is issued for the proxy interface.
+ *
+ * Returns 0 on success, -EINVAL for invalid ids or a proxy without interface
+ * index, -ENOMEM on allocation failure, or the error reported by
+ * rte_eth_dev_info_get().
+ */
+int rte_ifpx_port_bind(uint16_t port_id, uint16_t proxy_id)
+{
+       struct rte_eth_dev_info proxy_eth_info;
+       struct ifpx_proxies_node *px;
+       unsigned int if_index;
+       int ec;
+
+       if (port_id >= RTE_MAX_ETHPORTS || proxy_id >= RTE_MAX_ETHPORTS) {
+               IFPX_LOG(ERR, "Invalid port_id/proxy_id: %d/%d", port_id,
+                        proxy_id);
+               return -EINVAL;
+       }
+
+       /* Do automatic rebinding but issue a warning since this is not
+        * considered to be a valid behaviour.
+        */
+       if (ifpx_p2p[port_id] != RTE_MAX_ETHPORTS) {
+               IFPX_LOG(WARNING, "Port already bound: %d -> %d", port_id,
+                        ifpx_p2p[port_id]);
+       }
+
+       ec = rte_eth_dev_info_get(proxy_id, &proxy_eth_info);
+       if (ec < 0) {
+               IFPX_LOG(ERR, "Failed to read proxy dev info: %d", ec);
+               return ec;
+       }
+       if (proxy_eth_info.if_index == 0) {
+               IFPX_LOG(ERR, "Proxy with no IF index");
+               return -EINVAL;
+       }
+
+       /* Search for existing proxy - if not found add one to the list. */
+       rte_spinlock_lock(&ifpx_lock);
+       TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+               if (px->proxy_id == proxy_id)
+                       break;
+       }
+       if (!px) {
+               px = malloc(sizeof(*px));
+               if (!px) {
+                       rte_spinlock_unlock(&ifpx_lock);
+                       return -ENOMEM;
+               }
+               px->proxy_id = proxy_id;
+               px->info.if_index = proxy_eth_info.if_index;
+               rte_eth_dev_get_mtu(proxy_id, &px->info.mtu);
+               rte_eth_macaddr_get(proxy_id, &px->info.mac);
+               memset(px->info.if_name, 0, sizeof(px->info.if_name));
+               TAILQ_INSERT_TAIL(&ifpx_proxies, px, elem);
+       }
+       /* Copy what is needed later while the node is still protected: once
+        * the lock is dropped the node may be freed by destroy/close.
+        */
+       if_index = px->info.if_index;
+       rte_spinlock_unlock(&ifpx_lock);
+       ifpx_p2p[port_id] = proxy_id;
+
+       if (ifpx_irq.fd != -1)
+               request_info(RTM_GETLINK, if_index);
+
+       return 0;
+}
+
+/* Remove the binding of a port.  Returns 0 on success and -EINVAL when the
+ * port id is out of range or the port is not bound.
+ */
+int rte_ifpx_port_unbind(uint16_t port_id)
+{
+       if (port_id >= RTE_MAX_ETHPORTS)
+               return -EINVAL;
+       if (ifpx_p2p[port_id] == RTE_MAX_ETHPORTS)
+               return -EINVAL;
+
+       /* The proxy itself is left alone: a proxy with no port bound is the
+        * state right after creation and it can still report routing
+        * information, so there is nothing to check or clean up here.
+        */
+       ifpx_p2p[port_id] = RTE_MAX_ETHPORTS;
+
+       return 0;
+}
+
+/* Register a set of callbacks.
+ *
+ * The set is copied into a newly allocated list node and the address of that
+ * copy is returned as the handle.  Passing a handle that is already on the
+ * list returns it unchanged.  Returns NULL for a NULL argument or when the
+ * allocation fails.
+ */
+rte_ifpx_cbs_hndl rte_ifpx_callbacks_register(const
+                                             struct rte_ifpx_callbacks *cbs)
+{
+       struct ifpx_cbs_node *it, *fresh;
+       rte_ifpx_cbs_hndl handle;
+
+       if (cbs == NULL)
+               return NULL;
+
+       rte_spinlock_lock(&ifpx_lock);
+
+       /* Is 'cbs' the handle of an already registered set? */
+       TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+               if (&it->cbs == cbs)
+                       break;
+       }
+
+       if (it != NULL) {
+               handle = cbs;
+       } else {
+               fresh = malloc(sizeof(*fresh));
+               if (fresh == NULL) {
+                       handle = NULL;
+               } else {
+                       fresh->cbs = *cbs;
+                       TAILQ_INSERT_TAIL(&ifpx_callbacks, fresh, elem);
+                       handle = &fresh->cbs;
+               }
+       }
+
+       rte_spinlock_unlock(&ifpx_lock);
+
+       return handle;
+}
+
+/* Unregister a previously registered set of callbacks.  Returns 0 when the
+ * handle was found and released, -EINVAL otherwise (including NULL).
+ */
+int rte_ifpx_callbacks_unregister(rte_ifpx_cbs_hndl cbs)
+{
+       struct ifpx_cbs_node *it;
+       int status = -EINVAL;
+
+       if (cbs == NULL)
+               return status;
+
+       rte_spinlock_lock(&ifpx_lock);
+       /* Locate the node that owns this handle. */
+       TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+               if (&it->cbs == cbs)
+                       break;
+       }
+       if (it != NULL) {
+               TAILQ_REMOVE(&ifpx_callbacks, it, elem);
+               free(it);
+               status = 0;
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+
+       return status;
+}
+
+/* Return the proxy a port is bound to, or RTE_MAX_ETHPORTS when the port id
+ * is out of range or the port is not bound.
+ */
+uint16_t rte_ifpx_proxy_get(uint16_t port_id)
+{
+       return port_id < RTE_MAX_ETHPORTS ? ifpx_p2p[port_id]
+                                         : RTE_MAX_ETHPORTS;
+}
+
+/* Query the ports bound to a proxy.
+ *
+ * Up to 'num' ids of ports bound to 'proxy_id' are stored in 'ports' (which
+ * may be NULL when only the count is of interest).  The return value is the
+ * total number of bound ports, which may exceed 'num'.
+ */
+unsigned int rte_ifpx_port_get(uint16_t proxy_id,
+                               uint16_t *ports, unsigned int num)
+{
+       unsigned int p, cnt = 0;
+
+       for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+               if (ifpx_p2p[p] == proxy_id) {
+                       ++cnt;
+                       if (ports && num > 0) {
+                               /* report the port, not the proxy it maps to */
+                               *ports++ = p;
+                               --num;
+                       }
+               }
+       }
+       return cnt;
+}
+
+/* Return the cached interface information of the proxy a port is bound to.
+ *
+ * Returns NULL when the port id is out of range, the port is not bound or
+ * the proxy is (no longer) on the list of proxies.
+ */
+const struct rte_ifpx_info *rte_ifpx_info_get(uint16_t port_id)
+{
+       struct ifpx_proxies_node *px;
+
+       if (port_id >= RTE_MAX_ETHPORTS ||
+           ifpx_p2p[port_id] == RTE_MAX_ETHPORTS)
+               return NULL;
+
+       rte_spinlock_lock(&ifpx_lock);
+       TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+               if (px->proxy_id == ifpx_p2p[port_id])
+                       break;
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+
+       /* The mapping table is not protected by the lock, so the proxy may
+        * have vanished in the meantime.  Never build a pointer from NULL -
+        * it would be non-NULL and pass the caller's checks.
+        */
+       if (!px)
+               return NULL;
+
+       return &px->info;
+}
+
+/* Handle a link message (RTM_NEWLINK/RTM_DELLINK).
+ *
+ * Messages for interfaces that are not proxies are dropped.  For a proxy the
+ * link state, MTU and MAC address are examined and the matching callbacks
+ * are invoked for every port bound to that proxy; the cached MTU and MAC are
+ * updated on change.  Callbacks are invoked with ifpx_lock held.
+ *
+ * For replies to our own requests: a reply to an interface specific request
+ * is followed by an address request for that interface, a reply to the
+ * global dump only fills in the interface name (if not known yet).
+ */
+static
+void handle_link(const struct nlmsghdr *h)
+{
+       const struct ifinfomsg *ifi = NLMSG_DATA(h);
+       int alen = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifi));
+       const struct rtattr *attrs[IFLA_MAX+1] = { NULL };
+       const struct rtattr *attr;
+       struct ifpx_proxies_node *px;
+       struct ifpx_cbs_node *cb;
+       uint16_t p;
+
+       IFPX_LOG(DEBUG, "\tLink action (%u): %u, 0x%x/0x%x (flags/changed)",
+                ifi->ifi_index, h->nlmsg_type, ifi->ifi_flags,
+                ifi->ifi_change);
+
+       rte_spinlock_lock(&ifpx_lock);
+       TAILQ_FOREACH(px, &ifpx_proxies, elem) {
+               if (px->info.if_index == (unsigned int)ifi->ifi_index)
+                       break;
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+
+       /* Drop messages that are not associated with any proxy */
+       if (!px)
+               return;
+       /* When message is a reply to request for specific interface then keep
+        * it only when it contains info for this interface.
+        */
+       if (h->nlmsg_pid == ifpx_pid && h->nlmsg_seq >> 8 &&
+           (h->nlmsg_seq >> 8) != (unsigned int)ifi->ifi_index)
+               return;
+
+       /* Index the attributes by type; unknown (newer) types are skipped. */
+       for (attr = IFLA_RTA(ifi); RTA_OK(attr, alen);
+                                  attr = RTA_NEXT(attr, alen)) {
+               if (attr->rta_type > IFLA_MAX)
+                       continue;
+               attrs[attr->rta_type] = attr;
+       }
+
+       rte_spinlock_lock(&ifpx_lock);
+       /* ifi_change marks the flags that changed, ifi_flags holds the
+        * current state.
+        */
+       if (ifi->ifi_change & IFF_UP) {
+               TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+                       if (!cb->cbs.link_change)
+                               continue;
+                       for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+                               if (ifpx_p2p[p] != px->proxy_id)
+                                       continue;
+                               cb->cbs.link_change(p,
+                                                   ifi->ifi_flags & IFF_UP);
+                       }
+               }
+       }
+       if (attrs[IFLA_MTU]) {
+               uint16_t mtu = *(const int *)RTA_DATA(attrs[IFLA_MTU]);
+               if (mtu != px->info.mtu) {
+                       px->info.mtu = mtu;
+                       TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+                               if (!cb->cbs.mtu_change)
+                                       continue;
+                               for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+                                       if (ifpx_p2p[p] != px->proxy_id)
+                                               continue;
+                                       cb->cbs.mtu_change(p, mtu);
+                               }
+                       }
+               }
+       }
+       /* Only a payload of exactly RTE_ETHER_ADDR_LEN bytes is treated as a
+        * MAC address - checked at run time, since an assert may be compiled
+        * out and a shorter payload would then be read past its end.
+        */
+       if (attrs[IFLA_ADDRESS] &&
+           RTA_PAYLOAD(attrs[IFLA_ADDRESS]) == RTE_ETHER_ADDR_LEN) {
+               const struct rte_ether_addr *mac =
+                               RTA_DATA(attrs[IFLA_ADDRESS]);
+
+               if (memcmp(mac, &px->info.mac, RTE_ETHER_ADDR_LEN) != 0) {
+                       memcpy(px->info.mac.addr_bytes, mac,
+                              RTE_ETHER_ADDR_LEN);
+                       TAILQ_FOREACH(cb, &ifpx_callbacks, elem) {
+                               if (!cb->cbs.mac_change)
+                                       continue;
+                               for (p = 0; p < RTE_DIM(ifpx_p2p); ++p) {
+                                       if (ifpx_p2p[p] != px->proxy_id)
+                                               continue;
+                                       cb->cbs.mac_change(p, mac);
+                               }
+                       }
+               }
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+       if (h->nlmsg_pid == ifpx_pid) {
+               RTE_ASSERT((h->nlmsg_seq & 0xFF) == RTM_GETLINK);
+               /* If this is reply for specific link request (not initial
+                * global dump) then follow up with address request, otherwise
+                * just store the interface name.
+                */
+               if (h->nlmsg_seq >> 8)
+                       request_info(RTM_GETADDR, ifi->ifi_index);
+               else if (!px->info.if_name[0] && attrs[IFLA_IFNAME])
+                       strlcpy(px->info.if_name, RTA_DATA(attrs[IFLA_IFNAME]),
+                               sizeof(px->info.if_name));
+       }
+}
+
+/* Handle an address message (RTM_NEWADDR/RTM_DELADDR).
+ *
+ * Messages for interfaces that are not proxies are dropped.  Otherwise, for
+ * an IPv4/IPv6 address the add or delete callback (selected by 'needs_del')
+ * of every registered callback set is invoked for each port bound to the
+ * proxy.  Callbacks are invoked with ifpx_lock held.
+ */
+static
+void handle_addr(const struct nlmsghdr *h, bool needs_del)
+{
+       const struct ifaddrmsg *ifa = NLMSG_DATA(h);
+       const struct rtattr *tb[IFA_MAX+1] = { NULL };
+       const struct rtattr *rta;
+       struct ifpx_proxies_node *proxy;
+       struct ifpx_cbs_node *node;
+       const uint32_t req_index = h->nlmsg_seq >> 8;
+       const uint8_t *ip;
+       uint32_t ipv4 = 0;
+       bool is_v4, is_v6;
+       uint16_t port;
+       int rem = h->nlmsg_len - NLMSG_LENGTH(sizeof(*ifa));
+
+       /* Find the proxy this message is about, if any. */
+       rte_spinlock_lock(&ifpx_lock);
+       TAILQ_FOREACH(proxy, &ifpx_proxies, elem) {
+               if (proxy->info.if_index == ifa->ifa_index)
+                       break;
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+
+       if (proxy == NULL)
+               return;
+
+       /* A reply to an interface specific request of ours is only of
+        * interest when it describes that very interface.
+        */
+       if (h->nlmsg_pid == ifpx_pid && req_index != 0 &&
+           req_index != ifa->ifa_index)
+               return;
+
+       /* Index the attributes by type; unknown types are skipped. */
+       for (rta = IFA_RTA(ifa); RTA_OK(rta, rem); rta = RTA_NEXT(rta, rem)) {
+               if (rta->rta_type <= IFA_MAX)
+                       tb[rta->rta_type] = rta;
+       }
+
+       is_v4 = ifa->ifa_family == AF_INET;
+       is_v6 = ifa->ifa_family == AF_INET6;
+
+       rte_spinlock_lock(&ifpx_lock);
+       if (tb[IFA_ADDRESS] != NULL && (is_v4 || is_v6)) {
+               ip = RTA_DATA(tb[IFA_ADDRESS]);
+               /* address is in network order */
+               if (is_v4)
+                       ipv4 = RTE_IPV4(ip[0], ip[1], ip[2], ip[3]);
+
+               TAILQ_FOREACH(node, &ifpx_callbacks, elem) {
+                       const struct rte_ifpx_callbacks *cbs = &node->cbs;
+
+                       for (port = 0; port < RTE_DIM(ifpx_p2p); ++port) {
+                               if (ifpx_p2p[port] != proxy->proxy_id)
+                                       continue;
+                               if (is_v4) {
+                                       if (needs_del && cbs->addr_del)
+                                               cbs->addr_del(port, ipv4);
+                                       else if (!needs_del && cbs->addr_add)
+                                               cbs->addr_add(port, ipv4);
+                               } else {
+                                       if (needs_del && cbs->addr6_del)
+                                               cbs->addr6_del(port, ip);
+                                       else if (!needs_del && cbs->addr6_add)
+                                               cbs->addr6_add(port, ip);
+                               }
+                       }
+               }
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Handle a route message (RTM_NEWROUTE/RTM_DELROUTE).
+ *
+ * When the message carries a destination, the IPv4/IPv6 route add or delete
+ * callback (selected by 'needs_del') of every registered callback set is
+ * invoked with the destination and its prefix length.  Callbacks are invoked
+ * with ifpx_lock held.
+ */
+static
+void handle_route(const struct nlmsghdr *h, bool needs_del)
+{
+       const struct rtmsg *r = NLMSG_DATA(h);
+       const struct rtattr *tb[RTA_MAX+1] = { NULL };
+       const struct rtattr *rta;
+       struct ifpx_cbs_node *it;
+       const uint8_t *dst;
+       uint32_t ipv4;
+       int rem = h->nlmsg_len - NLMSG_LENGTH(sizeof(*r));
+
+       /* Index the attributes by type; unknown types are skipped. */
+       for (rta = RTM_RTA(r); RTA_OK(rta, rem); rta = RTA_NEXT(rta, rem)) {
+               if (rta->rta_type <= RTA_MAX)
+                       tb[rta->rta_type] = rta;
+       }
+
+       rte_spinlock_lock(&ifpx_lock);
+       if (tb[RTA_DST] != NULL) {
+               dst = RTA_DATA(tb[RTA_DST]);
+
+               TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+                       const struct rte_ifpx_callbacks *cbs = &it->cbs;
+
+                       switch (r->rtm_family) {
+                       case AF_INET:
+                               /* address is in network order */
+                               ipv4 = RTE_IPV4(dst[0], dst[1], dst[2],
+                                               dst[3]);
+                               if (needs_del) {
+                                       if (cbs->route_del)
+                                               cbs->route_del(ipv4,
+                                                       r->rtm_dst_len);
+                               } else if (cbs->route_add) {
+                                       cbs->route_add(ipv4, r->rtm_dst_len);
+                               }
+                               break;
+                       case AF_INET6:
+                               if (needs_del) {
+                                       if (cbs->route6_del)
+                                               cbs->route6_del(dst,
+                                                       r->rtm_dst_len);
+                               } else if (cbs->route6_add) {
+                                       cbs->route6_add(dst, r->rtm_dst_len);
+                               }
+                               break;
+                       default:
+                               break;
+                       }
+               }
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Send a netlink dump request.
+ *
+ * 'type' is one of RTM_GETLINK, RTM_GETADDR or RTM_GETROUTE and 'index' is
+ * the interface index stored in link/address requests (0 is used for global
+ * dumps).  Both values are packed into the sequence number so that the reply
+ * handlers can tell which request a reply belongs to.
+ *
+ * Returns the result of send() (negative on failure, with rte_errno set), or
+ * -EINVAL for an unsupported request type.
+ */
+static
+int request_info(int type, int index)
+{
+       static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER;
+       struct info_get {
+               struct nlmsghdr h;
+               union {
+                       struct ifinfomsg ifm;
+                       struct ifaddrmsg ifa;
+                       struct rtmsg rtm;
+               } __rte_aligned(NLMSG_ALIGNTO);
+       } req;
+       size_t payload;
+       int rc;
+
+       IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index);
+
+       /* The family is the first byte of each of the payloads and zeroing
+        * the request selects all of them - provided AF_UNSPEC is zero.
+        */
+       memset(&req, 0, sizeof(req));
+       RTE_ASSERT(AF_UNSPEC == 0);
+
+       if (type == RTM_GETLINK) {
+               payload = sizeof(req.ifm);
+               req.ifm.ifi_index = index;
+       } else if (type == RTM_GETADDR) {
+               payload = sizeof(req.ifa);
+               req.ifa.ifa_index = index;
+       } else if (type == RTM_GETROUTE) {
+               payload = sizeof(req.rtm);
+       } else {
+               return -EINVAL;
+       }
+
+       req.h.nlmsg_pid = ifpx_pid;
+       req.h.nlmsg_type = type;
+       req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+       req.h.nlmsg_len = offsetof(struct info_get, ifm) + payload;
+       /* Request type in the low byte, interface index (0 for a global
+        * request) above it - used when handling the reply to decide whether
+        * more information needs to be requested.
+        */
+       req.h.nlmsg_seq = index << 8 | type;
+
+       rte_spinlock_lock(&send_lock);
+       rc = send(ifpx_irq.fd, &req, req.h.nlmsg_len, 0);
+       if (rc < 0) {
+               IFPX_LOG(ERR, "Failed to send netlink msg: %d", errno);
+               rte_errno = errno;
+       }
+       rte_spinlock_unlock(&send_lock);
+
+       return rc;
+}
+
+/* Invoke the 'cfg_finished' callback of every registered callback set that
+ * provides one.  Callbacks are invoked with ifpx_lock held.
+ */
+static
+void notify_cfg_finished(void)
+{
+       struct ifpx_cbs_node *it;
+
+       rte_spinlock_lock(&ifpx_lock);
+       TAILQ_FOREACH(it, &ifpx_callbacks, elem) {
+               if (it->cbs.cfg_finished != NULL)
+                       it->cbs.cfg_finished();
+       }
+       rte_spinlock_unlock(&ifpx_lock);
+}
+
+/* Interrupt callback for the netlink socket.
+ *
+ * Reads one datagram from the socket and dispatches every netlink message
+ * in it to the link/address/route handler.  The end of a reply to one of our
+ * global dump requests triggers the next stage of the initial dump
+ * (links -> addresses -> routes) and finally the 'cfg_finished'
+ * notification.
+ */
+static
+void if_proxy_intr_callback(void *arg __rte_unused)
+{
+       struct nlmsghdr *h;
+       struct sockaddr_nl addr;
+       socklen_t addr_len;
+       char buf[8192];
+       ssize_t len;
+
+       for (;;) {
+               /* Value-result argument: it has to hold the size of 'addr'
+                * on every call.
+                */
+               addr_len = sizeof(addr);
+               len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0,
+                              (struct sockaddr *)&addr, &addr_len);
+               if (len >= 0)
+                       break;
+               if (errno != EINTR) {
+                       IFPX_LOG(ERR,
+                                "Failed to read netlink msg: %zd (errno %d)",
+                                len, errno);
+                       return;
+               }
+               IFPX_LOG(DEBUG, "recvfrom() interrupted");
+       }
+       if (addr_len != sizeof(addr)) {
+               IFPX_LOG(ERR, "Invalid netlink addr size: %u", addr_len);
+               return;
+       }
+       IFPX_LOG(DEBUG, "Read %zd bytes (buf %zu) from %u/%u", len,
+                sizeof(buf), addr.nl_pid, addr.nl_groups);
+
+       for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len);
+                                        h = NLMSG_NEXT(h, len)) {
+               IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)",
+                        h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags,
+                        h->nlmsg_pid);
+
+               switch (h->nlmsg_type) {
+               case RTM_NEWLINK:
+               case RTM_DELLINK:
+                       handle_link(h);
+                       break;
+               case RTM_NEWADDR:
+               case RTM_DELADDR:
+                       handle_addr(h, h->nlmsg_type == RTM_DELADDR);
+                       break;
+               case RTM_NEWROUTE:
+               case RTM_DELROUTE:
+                       handle_route(h, h->nlmsg_type == RTM_DELROUTE);
+                       break;
+               }
+
+               /* If this is a reply for global request then follow up with
+                * additional requests and notify about finish.
+                */
+               if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 &&
+                   h->nlmsg_type == NLMSG_DONE) {
+                       if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK)
+                               request_info(RTM_GETADDR, 0);
+                       else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR)
+                               request_info(RTM_GETROUTE, 0);
+                       else {
+                               RTE_ASSERT((h->nlmsg_seq & 0xFF) ==
+                                                               RTM_GETROUTE);
+                               notify_cfg_finished();
+                       }
+               }
+       }
+       IFPX_LOG(DEBUG, "Finished msg loop: %zd bytes left", len);
+}
+
+/* Fallback for C libraries whose headers do not provide SOL_NETLINK. */
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+/* Start listening for network configuration changes.
+ *
+ * Opens a NETLINK_ROUTE socket subscribed to the link, IPv4/IPv6 address and
+ * IPv4/IPv6 route groups, registers it with the interrupt thread and kicks
+ * off the initial dump of the configuration with a global link request.
+ *
+ * Returns 0 on success and -1 on failure with rte_errno set (EBUSY when the
+ * library is already listening).
+ */
+int rte_ifpx_listen(void)
+{
+       struct sockaddr_nl addr = {
+               .nl_family = AF_NETLINK,
+               .nl_pid = 0,
+       };
+       socklen_t addr_len = sizeof(addr);
+#ifdef NETLINK_DUMP_STRICT_CHK
+       int strict = 1;
+#endif
+       int ret;
+
+       if (ifpx_irq.fd != -1) {
+               rte_errno = EBUSY;
+               return -1;
+       }
+
+       addr.nl_groups = 1 << (RTNLGRP_LINK-1)
+                       | 1 << (RTNLGRP_IPV4_IFADDR-1)
+                       | 1 << (RTNLGRP_IPV6_IFADDR-1)
+                       | 1 << (RTNLGRP_IPV4_ROUTE-1)
+                       | 1 << (RTNLGRP_IPV6_ROUTE-1);
+
+       ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC,
+                                NETLINK_ROUTE);
+       if (ifpx_irq.fd == -1) {
+               IFPX_LOG(ERR, "Failed to create netlink socket: %d", errno);
+               goto error;
+       }
+       /* Starting with kernel 4.19 you can request dump for a specific
+        * interface and kernel will filter out and send only relevant info.
+        * Otherwise NLM_F_DUMP will generate info for all interfaces and you
+        * need to filter them yourself.
+        */
+#ifdef NETLINK_DUMP_STRICT_CHK
+       /* This is a netlink level option: the same number at SOL_SOCKET
+        * level denotes an unrelated socket option.
+        */
+       ret = setsockopt(ifpx_irq.fd, SOL_NETLINK, NETLINK_DUMP_STRICT_CHK,
+                        &strict, sizeof(strict));
+       if (ret < 0) {
+               IFPX_LOG(ERR, "Failed to set socket option: %d", errno);
+               goto error;
+       }
+#endif
+
+       ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len);
+       if (ret < 0) {
+               IFPX_LOG(ERR, "Failed to bind socket: %d", errno);
+               goto error;
+       }
+       /* nl_pid was left as 0 for bind(), so read back the port ID that
+        * the socket ended up with.
+        */
+       ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len);
+       if (ret < 0) {
+               IFPX_LOG(ERR, "Failed to get socket addr: %d", errno);
+               goto error;
+       } else {
+               ifpx_pid = addr.nl_pid;
+               IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid);
+       }
+
+       ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback,
+                                        NULL);
+       if (ret < 0)
+               goto error;
+
+       /* Start the initial dump: links first, the rest follows from the
+        * interrupt callback.
+        */
+       request_info(RTM_GETLINK, 0);
+
+       return 0;
+
+error:
+       rte_errno = errno;
+       if (ifpx_irq.fd != -1) {
+               close(ifpx_irq.fd);
+               ifpx_irq.fd = -1;
+       }
+       return -1;
+}
+
+/* Stop listening and release all library state.
+ *
+ * Unregisters the interrupt callback, closes the netlink socket, frees all
+ * registered callback sets and proxy nodes and clears every port binding.
+ * Returns -EBADFD when the library is not listening and 0 otherwise.
+ */
+int rte_ifpx_close(void)
+{
+       struct ifpx_proxies_node *proxy;
+       struct ifpx_cbs_node *node;
+       unsigned int port;
+       int rc;
+
+       if (ifpx_irq.fd < 0)
+               return -EBADFD;
+
+       /* Keep trying for as long as the unregistration reports -EAGAIN
+        * (unlikely but possible - at least I think so).
+        */
+       do {
+               rc = rte_intr_callback_unregister(&ifpx_irq,
+                                                 if_proxy_intr_callback,
+                                                 NULL);
+       } while (rc == -EAGAIN);
+
+       rte_spinlock_lock(&ifpx_lock);
+
+       close(ifpx_irq.fd);
+       ifpx_irq.fd = -1;
+       ifpx_pid = 0;
+
+       /* Release the registered callbacks ... */
+       while ((node = TAILQ_FIRST(&ifpx_callbacks)) != NULL) {
+               TAILQ_REMOVE(&ifpx_callbacks, node, elem);
+               free(node);
+       }
+
+       /* ... the known proxies ... */
+       while ((proxy = TAILQ_FIRST(&ifpx_proxies)) != NULL) {
+               TAILQ_REMOVE(&ifpx_proxies, proxy, elem);
+               free(proxy);
+       }
+
+       /* ... and all the port bindings. */
+       for (port = 0; port < RTE_DIM(ifpx_p2p); ++port)
+               ifpx_p2p[port] = RTE_MAX_ETHPORTS;
+
+       rte_spinlock_unlock(&ifpx_lock);
+
+       return 0;
+}
+
+/* Library constructor: marks every port as not bound and registers the log
+ * type of the library (with WARNING as its level).
+ */
+RTE_INIT(if_proxy_init)
+{
+       uint16_t *slot;
+
+       for (slot = ifpx_p2p; slot < ifpx_p2p + RTE_DIM(ifpx_p2p); ++slot)
+               *slot = RTE_MAX_ETHPORTS;
+
+       ifpx_log_type = rte_log_register("lib.if_proxy");
+       if (ifpx_log_type < 0)
+               return;
+       rte_log_set_level(ifpx_log_type, RTE_LOG_WARNING);
+}
diff --git a/lib/meson.build b/lib/meson.build
index 0af3efab2..c913b33dd 100644
--- a/lib/meson.build
+++ b/lib/meson.build
@@ -19,7 +19,7 @@ libraries = [
        'acl', 'bbdev', 'bitratestats', 'cfgfile',
        'compressdev', 'cryptodev',
        'distributor', 'efd', 'eventdev',
-       'gro', 'gso', 'ip_frag', 'jobstats',
+       'gro', 'gso', 'if_proxy', 'ip_frag', 'jobstats',
        'kni', 'latencystats', 'lpm', 'member',
        'power', 'pdump', 'rawdev',
        'rcu', 'rib', 'reorder', 'sched', 'security', 'stack', 'vhost',
-- 
2.17.1

Reply via email to