On 3/31/20 2:36 PM, Harman Kalra wrote: > On Fri, Mar 06, 2020 at 05:41:01PM +0100, Andrzej Ostruszka wrote: [...] >> +void ifpx_notify_event(struct rte_ifpx_event *ev, struct ifpx_proxy_node >> *px) >> +{ >> + struct ifpx_queue_node *q; >> + int done = 0; >> + uint16_t p, proxy_id; >> + >> + if (px) { >> + if (px->state & DEL_PENDING) >> + return; >> + proxy_id = px->proxy_id; >> + RTE_ASSERT(proxy_id != RTE_MAX_ETHPORTS); >> + px->state |= IN_USE; >> + } else >> + proxy_id = RTE_MAX_ETHPORTS; >> + >> + RTE_ASSERT(ev); >> + /* This function is expected to be called with a lock held. */ >> + RTE_ASSERT(rte_spinlock_trylock(&ifpx_lock) == 0); >> + >> + if (ifpx_callbacks.funcs[ev->type].f_ptr) { >> + union cb_ptr_t cb = ifpx_callbacks.funcs[ev->type]; >> + >> + /* Drop the lock for the time of callback call. */ >> + rte_spinlock_unlock(&ifpx_lock); >> + if (px) { >> + for (p = 0; p < RTE_DIM(ifpx_ports); ++p) { >> + if (ifpx_ports[p] != proxy_id || >> + ifpx_ports[p] == p) >> + continue; >> + ev->data.port_id = p; >> + done = cb.f_ptr(&ev->data) || done; > Since callback are handled as DPDK interrupts, hope there is no event > which gets lost. Cannot afford to loose a route change event as kernel > might not send it again.
We have some protection against this in form of netlink socket buffer. In general, callbacks (as noted previously by Morten) can't block so this should be fine - we might need to play around with SO_RCVBUF socket option of the netlink socket but so far I have not experienced any problem. > >> + } >> + } else { >> + RTE_ASSERT(ev->type == RTE_IFPX_CFG_DONE); >> + done = cb.cfg_done(); >> + } >> + rte_spinlock_lock(&ifpx_lock); >> + } >> + if (done) >> + goto exit; >> + >> + /* Event not "consumed" yet so try to notify via queues. */ >> + TAILQ_FOREACH(q, &ifpx_queues, elem) { >> + if (px) { >> + for (p = 0; p < RTE_DIM(ifpx_ports); ++p) { >> + if (ifpx_ports[p] != proxy_id || >> + ifpx_ports[p] == p) >> + continue; >> + /* Set the port_id - the remaining params should >> + * be filled before calling this function. >> + */ >> + ev->data.port_id = p; >> + queue_event(ev, q->r); >> + } >> + } else >> + queue_event(ev, q->r); >> + } >> +exit: >> + if (px) >> + px->state &= ~IN_USE; >> +} >> + >> +void ifpx_cleanup_proxies(void) >> +{ >> + struct ifpx_proxy_node *px, *next; >> + for (px = TAILQ_FIRST(&ifpx_proxies); px; px = next) { >> + next = TAILQ_NEXT(px, elem); >> + if (px->state & DEL_PENDING) >> + ifpx_proxy_destroy(px); >> + } >> +} >> + >> +int rte_ifpx_listen(void) >> +{ >> + int ec; >> + >> + if (!ifpx_platform.listen) >> + return -ENOTSUP; >> + >> + ec = ifpx_platform.listen(); >> + if (ec == 0 && ifpx_platform.get_info) >> + ifpx_platform.get_info(0); > nlink_get_info calls request_info with a if_index, passing 0 might > be good in current scenario but valid index should be passed to > get_info. 0 is an invalid if_index (on Windows too) so I've used it to encode "all interfaces". This is related to your next comment. So I'll expand on this there. [...] >> +static >> +int request_info(int type, int index) >> +{ >> + static rte_spinlock_t send_lock = RTE_SPINLOCK_INITIALIZER; >> + struct info_get { >> + struct nlmsghdr h; >> + union { >> + struct ifinfomsg ifm; >> + struct ifaddrmsg ifa; >> + struct rtmsg rtm; >> + struct ndmsg ndm; >> + } __rte_aligned(NLMSG_ALIGNTO); >> + } info_req; >> + int ret; >> + >> + memset(&info_req, 0, sizeof(info_req)); >> + /* First byte of these messages is family, so just make sure that this >> + * memset is enough to get all families. >> + */ >> + RTE_ASSERT(AF_UNSPEC == 0); >> + >> + info_req.h.nlmsg_pid = ifpx_pid; >> + info_req.h.nlmsg_type = type; >> + info_req.h.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; >> + info_req.h.nlmsg_len = offsetof(struct info_get, ifm); >> + >> + switch (type) { >> + case RTM_GETLINK: >> + info_req.h.nlmsg_len += sizeof(info_req.ifm); >> + info_req.ifm.ifi_index = index; >> + break; >> + case RTM_GETADDR: >> + info_req.h.nlmsg_len += sizeof(info_req.ifa); >> + info_req.ifa.ifa_index = index; >> + break; >> + case RTM_GETROUTE: >> + info_req.h.nlmsg_len += sizeof(info_req.rtm); >> + break; >> + case RTM_GETNEIGH: >> + info_req.h.nlmsg_len += sizeof(info_req.ndm); >> + break; >> + default: >> + IFPX_LOG(WARNING, "Unhandled message type: %d", type); >> + return -EINVAL; >> + } >> + /* Store request type (and if it is global or link specific) in 'seq'. >> + * Later it is used during handling of reply to continue requesting of >> + * information dump from system - if needed. >> + */ >> + info_req.h.nlmsg_seq = index << 8 | type; >> + >> + IFPX_LOG(DEBUG, "\tRequesting msg %d for: %u", type, index); >> + >> + rte_spinlock_lock(&send_lock); >> + ret = send(ifpx_irq.fd, &info_req, info_req.h.nlmsg_len, 0); >> + if (ret < 0) { >> + IFPX_LOG(ERR, "Failed to send netlink msg: %d", errno); >> + rte_errno = errno; >> + } >> + rte_spinlock_unlock(&send_lock); >> + >> + return ret; >> +} [...] >> +static >> +void if_proxy_intr_callback(void *arg __rte_unused) >> +{ >> + struct nlmsghdr *h; >> + struct sockaddr_nl addr; >> + socklen_t addr_len; >> + char buf[8192]; >> + ssize_t len; >> + >> +restart: >> + len = recvfrom(ifpx_irq.fd, buf, sizeof(buf), 0, >> + (struct sockaddr *)&addr, &addr_len); >> + if (len < 0) { >> + if (errno == EINTR) { >> + IFPX_LOG(DEBUG, "recvmsg() interrupted"); >> + goto restart; >> + } >> + IFPX_LOG(ERR, "Failed to read netlink msg: %ld (errno %d)", >> + len, errno); >> + return; >> + } >> + if (addr_len != sizeof(addr)) { >> + IFPX_LOG(ERR, "Invalid netlink addr size: %d", addr_len); >> + return; >> + } >> + IFPX_LOG(DEBUG, "Read %lu bytes (buf %lu) from %u/%u", len, >> + sizeof(buf), addr.nl_pid, addr.nl_groups); >> + >> + for (h = (struct nlmsghdr *)buf; NLMSG_OK(h, len); >> + h = NLMSG_NEXT(h, len)) { >> + IFPX_LOG(DEBUG, "Recv msg: %u (%u/%u/%u seq/flags/pid)", >> + h->nlmsg_type, h->nlmsg_seq, h->nlmsg_flags, >> + h->nlmsg_pid); >> + >> + switch (h->nlmsg_type) { >> + case RTM_NEWLINK: >> + case RTM_DELLINK: >> + handle_link(h); >> + break; >> + case RTM_NEWADDR: >> + case RTM_DELADDR: >> + handle_addr(h, h->nlmsg_type == RTM_DELADDR); >> + break; >> + case RTM_NEWROUTE: >> + case RTM_DELROUTE: >> + handle_route(h, h->nlmsg_type == RTM_DELROUTE); >> + break; >> + case RTM_NEWNEIGH: >> + case RTM_DELNEIGH: >> + handle_neigh(h, h->nlmsg_type == RTM_DELNEIGH); >> + break; >> + } >> + >> + /* If this is a reply for global request then follow up with >> + * additional requests and notify about finish. >> + */ >> + if (h->nlmsg_pid == ifpx_pid && (h->nlmsg_seq >> 8) == 0 && >> + h->nlmsg_type == NLMSG_DONE) { > Sorry, but in what scenario will the flow reach here. OK. So let me describe the initialization flow on Linux (the only available implementation right now). When we start listening we first request dumping of the whole configuration. We call get_info(0). Again this '0' is invalid if_index so is used as "all intefaces" value. This index is written in Netlink msg headers and is coupled with possible filtering of messages on kernel side (see comment in nlink_listen() below). When we request info we always use REQUEST|DUMP flags but on newer kernels there is an option (when if_index is non-zero) to send out only information for that interace instead of dumping all info. In addition it is encoded in nlmsg_seq. So there are different types of info we get from kernel: link/address/routing/neighbouring. Instead of requesting them all at once I do that sequentially and in get_info() I start with a request for link info. This code that you asked about above is a check that: - this message is a direct reply to us (pid) - and reply for global request (index = 0) - and this is the last part of multi-segmented message (this is how Linux dumps info - sends couple of messages with the additional "DONE" msg at the end). And the logic below is sequencing LINK->ADDR->ROUTE->NEIGH-> we are done so notify the user about that. That way we have at most one active "transaction" with kernel. >> + if ((h->nlmsg_seq & 0xFF) == RTM_GETLINK) >> + request_info(RTM_GETADDR, 0); >> + else if ((h->nlmsg_seq & 0xFF) == RTM_GETADDR) >> + request_info(RTM_GETROUTE, 0); >> + else if ((h->nlmsg_seq & 0xFF) == RTM_GETROUTE) >> + request_info(RTM_GETNEIGH, 0); >> + else { >> + struct rte_ifpx_event ev = { >> + .type = RTE_IFPX_CFG_DONE >> + }; >> + >> + RTE_ASSERT((h->nlmsg_seq & 0xFF) == >> + RTM_GETNEIGH); >> + rte_spinlock_lock(&ifpx_lock); >> + ifpx_notify_event(&ev, NULL); >> + rte_spinlock_unlock(&ifpx_lock); >> + } >> + } >> + } >> + IFPX_LOG(DEBUG, "Finished msg loop: %ld bytes left", len); >> +} >> + >> +static >> +int nlink_listen(void) >> +{ >> + struct sockaddr_nl addr = { >> + .nl_family = AF_NETLINK, >> + .nl_pid = 0, >> + }; >> + socklen_t addr_len = sizeof(addr); >> + int ret; >> + >> + if (ifpx_irq.fd != -1) { >> + rte_errno = EBUSY; >> + return -1; >> + } >> + >> + addr.nl_groups = 1 << (RTNLGRP_LINK-1) >> + | 1 << (RTNLGRP_NEIGH-1) >> + | 1 << (RTNLGRP_IPV4_IFADDR-1) >> + | 1 << (RTNLGRP_IPV6_IFADDR-1) >> + | 1 << (RTNLGRP_IPV4_ROUTE-1) >> + | 1 << (RTNLGRP_IPV6_ROUTE-1); >> + >> + ifpx_irq.fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, >> + NETLINK_ROUTE); >> + if (ifpx_irq.fd == -1) { >> + IFPX_LOG(ERR, "Failed to create netlink socket: %d", errno); >> + goto error; >> + } >> + /* Starting with kernel 4.19 you can request dump for a specific >> + * interface and kernel will filter out and send only relevant info. >> + * Otherwise NLM_F_DUMP will generate info for all interfaces and you >> + * need to filter them yourself. >> + */ >> +#ifdef NETLINK_DUMP_STRICT_CHK >> + ret = 1; /* use this var also as an input param */ >> + ret = setsockopt(ifpx_irq.fd, SOL_SOCKET, NETLINK_DUMP_STRICT_CHK, >> + &ret, sizeof(ret)); >> + if (ret < 0) { >> + IFPX_LOG(ERR, "Failed to set socket option: %d", errno); >> + goto error; >> + } >> +#endif >> + >> + ret = bind(ifpx_irq.fd, (struct sockaddr *)&addr, addr_len); >> + if (ret < 0) { >> + IFPX_LOG(ERR, "Failed to bind socket: %d", errno); >> + goto error; >> + } >> + ret = getsockname(ifpx_irq.fd, (struct sockaddr *)&addr, &addr_len); >> + if (ret < 0) { >> + IFPX_LOG(ERR, "Failed to get socket addr: %d", errno); >> + goto error; >> + } else { >> + ifpx_pid = addr.nl_pid; >> + IFPX_LOG(DEBUG, "Assigned port ID: %u", addr.nl_pid); >> + } >> + >> + ret = rte_intr_callback_register(&ifpx_irq, if_proxy_intr_callback, >> + NULL); >> + if (ret == 0) >> + return 0; >> + >> +error: >> + rte_errno = errno; >> + if (ifpx_irq.fd != -1) { >> + close(ifpx_irq.fd); >> + ifpx_irq.fd = -1; >> + } >> + return -1; >> +} [...] If you are playing with this library (running test case or the exemplary application) and would like to have better view what is going on you can add "--log=lib.if_proxy:debug" to the arguments list. Thanks for taking a look at this. The more people do this the better this should be. E.g. explaining initialization flow to you made me realize that the there is another case where I request info which is not handled well - normally user should bind the proxies and start listening. But if for some reason user binds proxy later, during listening, I request info for that particular interface but implementation will request only link level and will not follow with request for other info. I will fix this in the next version. With regards Andrzej Ostruszka