On Mon, Oct 15, 2018 at 02:13:35PM +0000, Viacheslav Ovsiienko wrote: > The last part of patchset contains the rule cleanup routines. > These ones is the part of outer interface initialization at > the moment of VXLAN VTEP attaching. These routines query > the list of attached VXLAN devices, the list of local IP > addresses with peer and link scope attribute and the list > of permanent neigh rules, then all found abovementioned > items on the specified outer device are flushed. > > Suggested-by: Adrien Mazarguil <adrien.mazarg...@6wind.com> > Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com> > --- > drivers/net/mlx5/mlx5_flow_tcf.c | 505 > ++++++++++++++++++++++++++++++++++++++- > 1 file changed, 499 insertions(+), 6 deletions(-) > > diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c > b/drivers/net/mlx5/mlx5_flow_tcf.c > index a1d7733..a3348ea 100644 > --- a/drivers/net/mlx5/mlx5_flow_tcf.c > +++ b/drivers/net/mlx5/mlx5_flow_tcf.c > @@ -4012,6 +4012,502 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep) > } > #endif /* HAVE_IFLA_VXLAN_COLLECT_METADATA */ > > +#define MNL_REQUEST_SIZE_MIN 256 > +#define MNL_REQUEST_SIZE_MAX 2048 > +#define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \ > + MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX) > + > +/* Data structures used by flow_tcf_xxx_cb() routines. */ > +struct tcf_nlcb_buf { > + LIST_ENTRY(tcf_nlcb_buf) next; > + uint32_t size; > + alignas(struct nlmsghdr) > + uint8_t msg[]; /**< Netlink message data. */ > +}; > + > +struct tcf_nlcb_context { > + unsigned int ifindex; /**< Base interface index. */ > + uint32_t bufsize; > + LIST_HEAD(, tcf_nlcb_buf) nlbuf; > +}; > + > +/** > + * Allocate space for netlink command in buffer list > + * > + * @param[in, out] ctx > + * Pointer to callback context with command buffers list. > + * @param[in] size > + * Required size of data buffer to be allocated. > + * > + * @return > + * Pointer to allocated memory, aligned as message header. > + * NULL if some error occurred. > + */ > +static struct nlmsghdr * > +flow_tcf_alloc_nlcmd(struct tcf_nlcb_context *ctx, uint32_t size) > +{ > + struct tcf_nlcb_buf *buf; > + struct nlmsghdr *nlh; > + > + size = NLMSG_ALIGN(size); > + buf = LIST_FIRST(&ctx->nlbuf); > + if (buf && (buf->size + size) <= ctx->bufsize) { > + nlh = (struct nlmsghdr *)&buf->msg[buf->size]; > + buf->size += size; > + return nlh; > + } > + if (size > ctx->bufsize) { > + DRV_LOG(WARNING, "netlink: too long command buffer requested"); > + return NULL; > + } > + buf = rte_malloc(__func__, > + ctx->bufsize + sizeof(struct tcf_nlcb_buf), > + alignof(struct tcf_nlcb_buf)); > + if (!buf) { > + DRV_LOG(WARNING, "netlink: no memory for command buffer"); > + return NULL; > + } > + LIST_INSERT_HEAD(&ctx->nlbuf, buf, next); > + buf->size = size; > + nlh = (struct nlmsghdr *)&buf->msg[0]; > + return nlh; > +} > + > +/** > + * Set NLM_F_ACK flags in the last netlink command in buffer. > + * Only last command in the buffer will be acked by system. > + * > + * @param[in, out] buf > + * Pointer to buffer with netlink commands. > + */ > +static void > +flow_tcf_setack_nlcmd(struct tcf_nlcb_buf *buf) > +{ > + struct nlmsghdr *nlh; > + uint32_t size = 0; > + > + assert(buf->size); > + do { > + nlh = (struct nlmsghdr *)&buf->msg[size]; > + size += NLMSG_ALIGN(nlh->nlmsg_len); > + if (size >= buf->size) { > + nlh->nlmsg_flags |= NLM_F_ACK; > + break; > + } > + } while (true); > +} > + > +/** > + * Send the buffers with prepared netlink commands. Scans the list and > + * sends all found buffers. Buffers are sent and freed anyway in order > + * to prevent memory leakage if some every message in received packet. > + * > + * @param[in] tcf > + * Context object initialized by mlx5_flow_tcf_context_create(). > + * @param[in, out] ctx > + * Pointer to callback context with command buffers list. > + * > + * @return > + * Zero value on success, negative errno value otherwise > + * and rte_errno is set. > + */ > +static int > +flow_tcf_send_nlcmd(struct mlx5_flow_tcf_context *tcf, > + struct tcf_nlcb_context *ctx) > +{ > + struct tcf_nlcb_buf *bc, *bn; > + struct nlmsghdr *nlh; > + int ret = 0; > + > + bc = LIST_FIRST(&ctx->nlbuf); > + while (bc) { > + int rc; > + > + bn = LIST_NEXT(bc, next); > + if (bc->size) { > + flow_tcf_setack_nlcmd(bc); > + nlh = (struct nlmsghdr *)&bc->msg; > + rc = flow_tcf_nl_ack(tcf, nlh, bc->size, NULL, NULL); > + if (rc && !ret) > + ret = rc; > + } > + rte_free(bc); > + bc = bn; > + } > + LIST_INIT(&ctx->nlbuf); > + return ret; > +} > + > +/** > + * Collect local IP address rules with scope link attribute on specified > + * network device. This is callback routine called by libmnl mnl_cb_run() > + * in loop for every message in received packet. > + * > + * @param[in] nlh > + * Pointer to reply header. > + * @param[in, out] arg > + * Opaque data pointer for this callback. > + * > + * @return > + * A positive, nonzero value on success, negative errno value otherwise > + * and rte_errno is set. > + */ > +static int > +flow_tcf_collect_local_cb(const struct nlmsghdr *nlh, void *arg) > +{ > + struct tcf_nlcb_context *ctx = arg; > + struct nlmsghdr *cmd; > + struct ifaddrmsg *ifa; > + struct nlattr *na; > + struct nlattr *na_local = NULL; > + struct nlattr *na_peer = NULL; > + unsigned char family; > + > + if (nlh->nlmsg_type != RTM_NEWADDR) { > + rte_errno = EINVAL; > + return -rte_errno; > + } > + ifa = mnl_nlmsg_get_payload(nlh); > + family = ifa->ifa_family; > + if (ifa->ifa_index != ctx->ifindex || > + ifa->ifa_scope != RT_SCOPE_LINK || > + !(ifa->ifa_flags & IFA_F_PERMANENT) || > + (family != AF_INET && family != AF_INET6)) > + return 1; > + mnl_attr_for_each(na, nlh, sizeof(*ifa)) { > + switch (mnl_attr_get_type(na)) { > + case IFA_LOCAL: > + na_local = na; > + break; > + case IFA_ADDRESS: > + na_peer = na; > + break; > + } > + if (na_local && na_peer) > + break; > + } > + if (!na_local || !na_peer) > + return 1; > + /* Local rule found with scope link, permanent and assigned peer. */ > + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + > + MNL_ALIGN(sizeof(struct ifaddrmsg)) + > + (family == AF_INET6 > + ? 2 * SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) > + : 2 * SZ_NLATTR_TYPE_OF(uint32_t)));
Better to use IPV4_ADDR_LEN instead? > + if (!cmd) { > + rte_errno = ENOMEM; > + return -rte_errno; > + } > + cmd = mnl_nlmsg_put_header(cmd); > + cmd->nlmsg_type = RTM_DELADDR; > + cmd->nlmsg_flags = NLM_F_REQUEST; > + ifa = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifa)); > + ifa->ifa_flags = IFA_F_PERMANENT; > + ifa->ifa_scope = RT_SCOPE_LINK; > + ifa->ifa_index = ctx->ifindex; > + if (family == AF_INET) { > + ifa->ifa_family = AF_INET; > + ifa->ifa_prefixlen = 32; > + mnl_attr_put_u32(cmd, IFA_LOCAL, mnl_attr_get_u32(na_local)); > + mnl_attr_put_u32(cmd, IFA_ADDRESS, mnl_attr_get_u32(na_peer)); > + } else { > + ifa->ifa_family = AF_INET6; > + ifa->ifa_prefixlen = 128; > + mnl_attr_put(cmd, IFA_LOCAL, IPV6_ADDR_LEN, > + mnl_attr_get_payload(na_local)); > + mnl_attr_put(cmd, IFA_ADDRESS, IPV6_ADDR_LEN, > + mnl_attr_get_payload(na_peer)); > + } > + return 1; > +} > + > +/** > + * Cleanup the local IP addresses on outer interface. > + * > + * @param[in] tcf > + * Context object initialized by mlx5_flow_tcf_context_create(). > + * @param[in] ifindex > + * Network inferface index to perform cleanup. > + */ > +static void > +flow_tcf_encap_local_cleanup(struct mlx5_flow_tcf_context *tcf, > + unsigned int ifindex) > +{ > + struct nlmsghdr *nlh; > + struct ifaddrmsg *ifa; > + struct tcf_nlcb_context ctx = { > + .ifindex = ifindex, > + .bufsize = MNL_REQUEST_SIZE, > + .nlbuf = LIST_HEAD_INITIALIZER(), > + }; > + int ret; > + > + assert(ifindex); > + /* > + * Seek and destroy leftovers of local IP addresses with > + * matching properties "scope link". > + */ > + nlh = mnl_nlmsg_put_header(tcf->buf); > + nlh->nlmsg_type = RTM_GETADDR; > + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; > + ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa)); > + ifa->ifa_family = AF_UNSPEC; > + ifa->ifa_index = ifindex; > + ifa->ifa_scope = RT_SCOPE_LINK; > + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_local_cb, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: query device list error %d", ret); > + ret = flow_tcf_send_nlcmd(tcf, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: device delete error %d", ret); > +} > + > +/** > + * Collect neigh permament rules on specified network device. > + * This is callback routine called by libmnl mnl_cb_run() in loop for > + * every message in received packet. > + * > + * @param[in] nlh > + * Pointer to reply header. > + * @param[in, out] arg > + * Opaque data pointer for this callback. > + * > + * @return > + * A positive, nonzero value on success, negative errno value otherwise > + * and rte_errno is set. > + */ > +static int > +flow_tcf_collect_neigh_cb(const struct nlmsghdr *nlh, void *arg) > +{ > + struct tcf_nlcb_context *ctx = arg; > + struct nlmsghdr *cmd; > + struct ndmsg *ndm; > + struct nlattr *na; > + struct nlattr *na_ip = NULL; > + struct nlattr *na_mac = NULL; > + unsigned char family; > + > + if (nlh->nlmsg_type != RTM_NEWNEIGH) { > + rte_errno = EINVAL; > + return -rte_errno; > + } > + ndm = mnl_nlmsg_get_payload(nlh); > + family = ndm->ndm_family; > + if (ndm->ndm_ifindex != (int)ctx->ifindex || > + !(ndm->ndm_state & NUD_PERMANENT) || > + (family != AF_INET && family != AF_INET6)) > + return 1; > + mnl_attr_for_each(na, nlh, sizeof(*ndm)) { > + switch (mnl_attr_get_type(na)) { > + case NDA_DST: > + na_ip = na; > + break; > + case NDA_LLADDR: > + na_mac = na; > + break; > + } > + if (na_mac && na_ip) > + break; > + } > + if (!na_mac || !na_ip) > + return 1; > + /* Neigh rule with permenent attribute found. */ > + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + > + MNL_ALIGN(sizeof(struct ndmsg)) + > + SZ_NLATTR_DATA_OF(ETHER_ADDR_LEN) + > + (family == AF_INET6 > + ? SZ_NLATTR_DATA_OF(IPV6_ADDR_LEN) > + : SZ_NLATTR_TYPE_OF(uint32_t))); Better to use IPV4_ADDR_LEN instead? > + if (!cmd) { > + rte_errno = ENOMEM; > + return -rte_errno; > + } > + cmd = mnl_nlmsg_put_header(cmd); > + cmd->nlmsg_type = RTM_DELNEIGH; > + cmd->nlmsg_flags = NLM_F_REQUEST; > + ndm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ndm)); > + ndm->ndm_ifindex = ctx->ifindex; > + ndm->ndm_state = NUD_PERMANENT; > + ndm->ndm_flags = 0; > + ndm->ndm_type = 0; > + if (family == AF_INET) { > + ndm->ndm_family = AF_INET; > + mnl_attr_put_u32(cmd, NDA_DST, mnl_attr_get_u32(na_ip)); > + } else { > + ndm->ndm_family = AF_INET6; > + mnl_attr_put(cmd, NDA_DST, IPV6_ADDR_LEN, > + mnl_attr_get_payload(na_ip)); > + } > + mnl_attr_put(cmd, NDA_LLADDR, ETHER_ADDR_LEN, > + mnl_attr_get_payload(na_mac)); > + return 1; > +} > + > +/** > + * Cleanup the neigh rules on outer interface. > + * > + * @param[in] tcf > + * Context object initialized by mlx5_flow_tcf_context_create(). > + * @param[in] ifindex > + * Network inferface index to perform cleanup. > + */ > +static void > +flow_tcf_encap_neigh_cleanup(struct mlx5_flow_tcf_context *tcf, > + unsigned int ifindex) > +{ > + struct nlmsghdr *nlh; > + struct ndmsg *ndm; > + struct tcf_nlcb_context ctx = { > + .ifindex = ifindex, > + .bufsize = MNL_REQUEST_SIZE, > + .nlbuf = LIST_HEAD_INITIALIZER(), > + }; > + int ret; > + > + assert(ifindex); > + /* Seek and destroy leftovers of neigh rules. */ > + nlh = mnl_nlmsg_put_header(tcf->buf); > + nlh->nlmsg_type = RTM_GETNEIGH; > + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; > + ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm)); > + ndm->ndm_family = AF_UNSPEC; > + ndm->ndm_ifindex = ifindex; > + ndm->ndm_state = NUD_PERMANENT; > + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_neigh_cb, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: query device list error %d", ret); > + ret = flow_tcf_send_nlcmd(tcf, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: device delete error %d", ret); > +} > + > +/** > + * Collect indices of VXLAN encap/decap interfaces associated with device. > + * This is callback routine called by libmnl mnl_cb_run() in loop for > + * every message in received packet. > + * > + * @param[in] nlh > + * Pointer to reply header. > + * @param[in, out] arg > + * Opaque data pointer for this callback. > + * > + * @return > + * A positive, nonzero value on success, negative errno value otherwise > + * and rte_errno is set. > + */ > +static int > +flow_tcf_collect_vxlan_cb(const struct nlmsghdr *nlh, void *arg) > +{ > + struct tcf_nlcb_context *ctx = arg; > + struct nlmsghdr *cmd; > + struct ifinfomsg *ifm; > + struct nlattr *na; > + struct nlattr *na_info = NULL; > + struct nlattr *na_vxlan = NULL; > + bool found = false; > + unsigned int vxindex; > + > + if (nlh->nlmsg_type != RTM_NEWLINK) { > + rte_errno = EINVAL; > + return -rte_errno; > + } > + ifm = mnl_nlmsg_get_payload(nlh); > + if (!ifm->ifi_index) { > + rte_errno = EINVAL; > + return -rte_errno; > + } > + mnl_attr_for_each(na, nlh, sizeof(*ifm)) > + if (mnl_attr_get_type(na) == IFLA_LINKINFO) { > + na_info = na; > + break; > + } > + if (!na_info) > + return 1; > + mnl_attr_for_each_nested(na, na_info) { > + switch (mnl_attr_get_type(na)) { > + case IFLA_INFO_KIND: > + if (!strncmp("vxlan", mnl_attr_get_str(na), > + mnl_attr_get_len(na))) > + found = true; > + break; > + case IFLA_INFO_DATA: > + na_vxlan = na; > + break; > + } > + if (found && na_vxlan) > + break; > + } > + if (!found || !na_vxlan) > + return 1; > + found = false; > + mnl_attr_for_each_nested(na, na_vxlan) { > + if (mnl_attr_get_type(na) == IFLA_VXLAN_LINK && > + mnl_attr_get_u32(na) == ctx->ifindex) { > + found = true; > + break; > + } > + } > + if (!found) > + return 1; > + /* Attached VXLAN device found, store the command to delete. */ > + vxindex = ifm->ifi_index; > + cmd = flow_tcf_alloc_nlcmd(ctx, MNL_ALIGN(sizeof(struct nlmsghdr)) + > + MNL_ALIGN(sizeof(struct ifinfomsg))); > + if (!nlh) { > + rte_errno = ENOMEM; > + return -rte_errno; > + } > + cmd = mnl_nlmsg_put_header(cmd); > + cmd->nlmsg_type = RTM_DELLINK; > + cmd->nlmsg_flags = NLM_F_REQUEST; > + ifm = mnl_nlmsg_put_extra_header(cmd, sizeof(*ifm)); > + ifm->ifi_family = AF_UNSPEC; > + ifm->ifi_index = vxindex; > + return 1; > +} > + > +/** > + * Cleanup the outer interface. Removes all found vxlan devices > + * attached to specified index, flushes the meigh and local IP > + * datavase. > + * > + * @param[in] tcf > + * Context object initialized by mlx5_flow_tcf_context_create(). > + * @param[in] ifindex > + * Network inferface index to perform cleanup. > + */ > +static void > +flow_tcf_encap_iface_cleanup(struct mlx5_flow_tcf_context *tcf, > + unsigned int ifindex) > +{ > + struct nlmsghdr *nlh; > + struct ifinfomsg *ifm; > + struct tcf_nlcb_context ctx = { > + .ifindex = ifindex, > + .bufsize = MNL_REQUEST_SIZE, > + .nlbuf = LIST_HEAD_INITIALIZER(), > + }; > + int ret; > + > + assert(ifindex); > + /* > + * Seek and destroy leftover VXLAN encap/decap interfaces with > + * matching properties. > + */ > + nlh = mnl_nlmsg_put_header(tcf->buf); > + nlh->nlmsg_type = RTM_GETLINK; > + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; > + ifm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifm)); > + ifm->ifi_family = AF_UNSPEC; > + ret = flow_tcf_nl_ack(tcf, nlh, 0, flow_tcf_collect_vxlan_cb, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: query device list error %d", ret); > + ret = flow_tcf_send_nlcmd(tcf, &ctx); > + if (ret) > + DRV_LOG(WARNING, "netlink: device delete error %d", ret); > +} > + > + > /** > * Create target interface index for VXLAN tunneling decapsulation. > * In order to share the UDP port within the other interfaces the > @@ -4100,12 +4596,9 @@ static LIST_HEAD(, mlx5_flow_tcf_vtep) > uint16_t pcnt; > > /* Not found, we should create the new attached VTEP. */ > -/* > - * TODO: not implemented yet > - * flow_tcf_encap_iface_cleanup(tcf, ifouter); > - * flow_tcf_encap_local_cleanup(tcf, ifouter); > - * flow_tcf_encap_neigh_cleanup(tcf, ifouter); > - */ > + flow_tcf_encap_iface_cleanup(tcf, ifouter); > + flow_tcf_encap_local_cleanup(tcf, ifouter); > + flow_tcf_encap_neigh_cleanup(tcf, ifouter); I have a fundamental questioin. Why are these cleanups needed? If I read the code correctly, it looks like cleaning up vtep, ip assginment and neigh entry which are not created/set by PMD. The reason why we have to clean up things is that PMD exclusively owns the interface (ifouter). Is my understanding correct? Thanks, Yongseok > for (pcnt = 0; pcnt <= (MLX5_VXLAN_PORT_RANGE_MAX > - MLX5_VXLAN_PORT_RANGE_MIN); pcnt++) { > encap_port++; >