On Wed, Jun 27, 2018 at 08:08:10PM +0200, Adrien Mazarguil wrote: > With mlx5, unlike normal flow rules implemented through Verbs for traffic > emitted and received by the application, those targeting different logical > ports of the device (VF representors for instance) are offloaded at the > switch level and must be configured through Netlink (TC interface). > > This patch adds preliminary support to manage such flow rules through the > flow API (rte_flow). > > Instead of rewriting tons of Netlink helpers and as previously suggested by > Stephen [1], this patch introduces a new dependency to libmnl [2] > (LGPL-2.1) when compiling mlx5. > > [1] > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-March%2F092676.html&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=JrAyzK1s3JG5CnuquNcA7XRN4d2WYtHUi1KXyloGdvA%3D&reserved=0 > [2] > https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fnetfilter.org%2Fprojects%2Flibmnl%2F&data=02%7C01%7Cyskoh%40mellanox.com%7C1250093eca0c4ad6d9f008d5dc58fbb4%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636657197116524482&sdata=yLYa0NzsTyE62BHDCZDoDah31snt6w4Coq47pY913Oo%3D&reserved=0 > > Signed-off-by: Adrien Mazarguil <adrien.mazarg...@6wind.com> > --- > drivers/net/mlx5/Makefile | 2 + > drivers/net/mlx5/mlx5.c | 32 ++++++++ > drivers/net/mlx5/mlx5.h | 10 +++ > drivers/net/mlx5/mlx5_nl_flow.c | 139 +++++++++++++++++++++++++++++++++++ > mk/rte.app.mk | 2 +- > 5 files changed, 184 insertions(+), 1 deletion(-) > > diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile > index 8a5229e61..3325eed06 100644 > --- a/drivers/net/mlx5/Makefile > +++ b/drivers/net/mlx5/Makefile > @@ -33,6 +33,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_mr.c > SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_flow.c > SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_socket.c > SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl.c > +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += mlx5_nl_flow.c > > ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) > INSTALL-$(CONFIG_RTE_LIBRTE_MLX5_PMD)-lib += $(LIB_GLUE) > @@ -56,6 +57,7 @@ LDLIBS += -ldl > else > LDLIBS += -libverbs -lmlx5 > endif > +LDLIBS += -lmnl > LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring > LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs > LDLIBS += -lrte_bus_pci > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c > index 665a3c31f..d9b9097b1 100644 > --- a/drivers/net/mlx5/mlx5.c > +++ b/drivers/net/mlx5/mlx5.c > @@ -279,6 +279,8 @@ mlx5_dev_close(struct rte_eth_dev *dev) > mlx5_nl_mac_addr_flush(dev); > if (priv->nl_socket >= 0) > close(priv->nl_socket); > + if (priv->mnl_socket) > + mlx5_nl_flow_socket_destroy(priv->mnl_socket); > ret = mlx5_hrxq_ibv_verify(dev); > if (ret) > DRV_LOG(WARNING, "port %u some hash Rx queue still remain", > @@ -1077,6 +1079,34 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > priv->nl_socket = -1; > mlx5_nl_mac_addr_sync(eth_dev); > } > + priv->mnl_socket = mlx5_nl_flow_socket_create(); > + if (!priv->mnl_socket) { > + err = -rte_errno; > + DRV_LOG(WARNING, > + "flow rules relying on switch offloads will not be" > + " supported: cannot open libmnl socket: %s", > + strerror(rte_errno)); > + } else { > + struct rte_flow_error error; > + unsigned int ifindex = mlx5_ifindex(eth_dev); > + > + if (!ifindex) { > + err = -rte_errno; > + error.message = > + "cannot retrieve network interface index"; > + } else { > + err = mlx5_nl_flow_init(priv->mnl_socket, ifindex, > + &error); > + } > + if (err) { > + DRV_LOG(WARNING, > + "flow rules relying on switch offloads will" > + " not be supported: %s: %s", > + error.message, strerror(rte_errno)); > + mlx5_nl_flow_socket_destroy(priv->mnl_socket); > + priv->mnl_socket = NULL; > + } > + } > TAILQ_INIT(&priv->flows); > TAILQ_INIT(&priv->ctrl_flows); > /* Hint libmlx5 to use PMD allocator for data plane resources */ > @@ -1127,6 +1157,8 @@ mlx5_dev_spawn_one(struct rte_device *dpdk_dev, > if (priv) { > unsigned int i; > > + if (priv->mnl_socket) > + mlx5_nl_flow_socket_destroy(priv->mnl_socket); > i = mlx5_domain_to_port_id(priv->domain_id, NULL, 0); > if (i == 1) > claim_zero(rte_eth_switch_domain_free(priv->domain_id)); > diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h > index 1d8e156c8..390249adb 100644 > --- a/drivers/net/mlx5/mlx5.h > +++ b/drivers/net/mlx5/mlx5.h > @@ -148,6 +148,8 @@ struct mlx5_drop { > struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */ > }; > > +struct mnl_socket; > + > struct priv { > LIST_ENTRY(priv) mem_event_cb; /* Called by memory event callback. */ > struct rte_eth_dev_data *dev_data; /* Pointer to device data. */ > @@ -207,6 +209,7 @@ struct priv { > /* Context for Verbs allocator. */ > int nl_socket; /* Netlink socket. */ > uint32_t nl_sn; /* Netlink message sequence number. */ > + struct mnl_socket *mnl_socket; /* Libmnl socket. */ > }; > > #define PORT_ID(priv) ((priv)->dev_data->port_id) > @@ -369,4 +372,11 @@ void mlx5_nl_mac_addr_flush(struct rte_eth_dev *dev); > int mlx5_nl_promisc(struct rte_eth_dev *dev, int enable); > int mlx5_nl_allmulti(struct rte_eth_dev *dev, int enable); > > +/* mlx5_nl_flow.c */ > + > +int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, > + struct rte_flow_error *error); > +struct mnl_socket *mlx5_nl_flow_socket_create(void); > +void mlx5_nl_flow_socket_destroy(struct mnl_socket *nl); > + > #endif /* RTE_PMD_MLX5_H_ */ > diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c > new file mode 100644 > index 000000000..7a8683b03 > --- /dev/null > +++ b/drivers/net/mlx5/mlx5_nl_flow.c > @@ -0,0 +1,139 @@ > +/* SPDX-License-Identifier: BSD-3-Clause > + * Copyright 2018 6WIND S.A. > + * Copyright 2018 Mellanox Technologies, Ltd > + */ > + > +#include <errno.h> > +#include <libmnl/libmnl.h> > +#include <linux/netlink.h> > +#include <linux/pkt_sched.h> > +#include <linux/rtnetlink.h> > +#include <stdalign.h> > +#include <stddef.h> > +#include <stdint.h> > +#include <stdlib.h> > +#include <sys/socket.h> > + > +#include <rte_errno.h> > +#include <rte_flow.h> > + > +#include "mlx5.h" > + > +/** > + * Send Netlink message with acknowledgment. > + * > + * @param nl > + * Libmnl socket to use. > + * @param nlh > + * Message to send. This function always raises the NLM_F_ACK flag before > + * sending. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set. > + */ > +static int > +mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh) > +{ > + alignas(struct nlmsghdr) > + uint8_t ans[MNL_SOCKET_BUFFER_SIZE];
There are total 3 of this buffer. On a certain host having large pagesize, this can be 8kB * 3 = 24kB. This is not a gigantic buffer but as all the functions here are sequentially accessed, how about having just one global buffer instead? > + uint32_t seq = random(); > + int ret; > + > + nlh->nlmsg_flags |= NLM_F_ACK; > + nlh->nlmsg_seq = seq; > + ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len); > + if (ret != -1) > + ret = mnl_socket_recvfrom(nl, ans, sizeof(ans)); > + if (ret != -1) > + ret = mnl_cb_run > + (ans, ret, seq, mnl_socket_get_portid(nl), NULL, NULL); > + if (!ret) > + return 0; > + rte_errno = errno; > + return -rte_errno; > +} > + > +/** > + * Initialize ingress qdisc of a given network interface. > + * > + * @param nl > + * Libmnl socket of the @p NETLINK_ROUTE kind. > + * @param ifindex > + * Index of network interface to initialize. > + * @param[out] error > + * Perform verbose error reporting if not NULL. > + * > + * @return > + * 0 on success, a negative errno value otherwise and rte_errno is set. > + */ > +int > +mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex, > + struct rte_flow_error *error) > +{ > + uint8_t buf[MNL_SOCKET_BUFFER_SIZE]; > + struct nlmsghdr *nlh; > + struct tcmsg *tcm; > + > + /* Destroy existing ingress qdisc and everything attached to it. */ > + nlh = mnl_nlmsg_put_header(buf); > + nlh->nlmsg_type = RTM_DELQDISC; > + nlh->nlmsg_flags = NLM_F_REQUEST; > + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); > + tcm->tcm_family = AF_UNSPEC; > + tcm->tcm_ifindex = ifindex; > + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); > + tcm->tcm_parent = TC_H_INGRESS; > + /* Ignore errors when qdisc is already absent. */ > + if (mlx5_nl_flow_nl_ack(nl, nlh) && > + rte_errno != EINVAL && rte_errno != ENOENT) > + return rte_flow_error_set > + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "netlink: failed to remove ingress qdisc"); > + /* Create fresh ingress qdisc. */ > + nlh = mnl_nlmsg_put_header(buf); > + nlh->nlmsg_type = RTM_NEWQDISC; > + nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL; > + tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm)); > + tcm->tcm_family = AF_UNSPEC; > + tcm->tcm_ifindex = ifindex; > + tcm->tcm_handle = TC_H_MAKE(TC_H_INGRESS, 0); > + tcm->tcm_parent = TC_H_INGRESS; > + mnl_attr_put_strz_check(nlh, sizeof(buf), TCA_KIND, "ingress"); > + if (mlx5_nl_flow_nl_ack(nl, nlh)) > + return rte_flow_error_set > + (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, > + NULL, "netlink: failed to create ingress qdisc"); > + return 0; > +} > + > +/** > + * Create and configure a libmnl socket for Netlink flow rules. > + * > + * @return > + * A valid libmnl socket object pointer on success, NULL otherwise and > + * rte_errno is set. > + */ > +struct mnl_socket * > +mlx5_nl_flow_socket_create(void) > +{ > + struct mnl_socket *nl = mnl_socket_open(NETLINK_ROUTE); > + > + if (nl && > + !mnl_socket_setsockopt(nl, NETLINK_CAP_ACK, &(int){ 1 }, > + sizeof(int)) && > + !mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID)) > + return nl; > + rte_errno = errno; > + if (nl) > + mnl_socket_close(nl); > + return NULL; > +} > + > +/** > + * Destroy a libmnl socket. > + */ > +void > +mlx5_nl_flow_socket_destroy(struct mnl_socket *nl) > +{ > + mnl_socket_close(nl); > +} > diff --git a/mk/rte.app.mk b/mk/rte.app.mk > index 7bcf6308d..414f1b967 100644 > --- a/mk/rte.app.mk > +++ b/mk/rte.app.mk > @@ -145,7 +145,7 @@ endif > ifeq ($(CONFIG_RTE_LIBRTE_MLX5_DLOPEN_DEPS),y) > _LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -lrte_pmd_mlx5 -ldl > else > -_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -lrte_pmd_mlx5 -libverbs > -lmlx5 > +_LDLIBS-$(CONFIG_RTE_LIBRTE_MLX5_PMD) += -lrte_pmd_mlx5 -libverbs > -lmlx5 -lmnl > endif > _LDLIBS-$(CONFIG_RTE_LIBRTE_MVPP2_PMD) += -lrte_pmd_mvpp2 > -L$(LIBMUSDK_PATH)/lib -lmusdk > _LDLIBS-$(CONFIG_RTE_LIBRTE_NFP_PMD) += -lrte_pmd_nfp > -- > 2.11.0