Hi Pascal, Please see inline > -----Original Message----- > From: Pascal Mazon [mailto:pascal.ma...@6wind.com] > Sent: Wednesday, January 10, 2018 12:20 PM > To: Ophir Munk <ophi...@mellanox.com>; dev@dpdk.org > Cc: Thomas Monjalon <tho...@monjalon.net>; Olga Shern > <ol...@mellanox.com> > Subject: Re: [PATCH v3 2/2] net/tap: implement RSS with eBPF classifier and > action > > Ok, now I get why ARRAY_SIZE was where it was. Nevermind. > > I think you're doing too much in one patch. As the subject is not trivial, I > would rather have it split. > Can you make a first patch that just adds support for several actions, then > your patch that adds BPF rule support? > > Otherwise your code looks neat. > > Comments inline.
All comments accepted for v4, see inline > > Best regards, > > Pascal > > On 10/01/2018 08:06, Ophir Munk wrote: > > Add BPF classifier for each TAP queue. According to this classifier > > packets marked with an RSS queue will be directed to this queue using > > a traffic control with "skbedit" action. > > The BPF classifier program is downloaded to the kernel once per TAP Rx > > queue. > > > > Add RSS BPF map entry for each new RSS rule. A BPF map entry contains > > the RSS queues of that rule. > > > > Add BPF action for each new RSS rule in which the Toeplitz RSS hash is > > calculated based on L3 addresses and L4 ports. Mark the packet with > > the RSS queue according the resulting RSS hash, then reclassify the packet. > > > > Signed-off-by: Ophir Munk <ophi...@mellanox.com> > > --- > > drivers/net/tap/Makefile | 20 ++ > > drivers/net/tap/rte_eth_tap.h | 9 +- > > drivers/net/tap/tap_bpf_insns.c | 62 +++- > > drivers/net/tap/tap_flow.c | 635 > ++++++++++++++++++++++++++++++++++------ > > drivers/net/tap/tap_flow.h | 10 + > > drivers/net/tap/tap_rss.h | 32 ++ > > drivers/net/tap/tap_tcmsgs.h | 4 + > > 7 files changed, 675 insertions(+), 97 deletions(-) create mode > > 100644 drivers/net/tap/tap_rss.h > > > > diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile index > > feaa5b7..0e7c494 100644 > > --- a/drivers/net/tap/Makefile > > +++ b/drivers/net/tap/Makefile > > @@ -60,6 +60,26 @@ tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto- > config-h.sh > > enum TCA_FLOWER_KEY_VLAN_PRIO \ > > $(AUTOCONF_OUTPUT) > > $Q sh -- '$<' '$@' \ > > + HAVE_TC_BPF \ > > + linux/pkt_cls.h \ > > + enum TCA_BPF_UNSPEC \ > > + $(AUTOCONF_OUTPUT) > > + $Q sh -- '$<' '$@' \ > > + HAVE_TC_BPF_FD \ > > + linux/pkt_cls.h \ > > + enum TCA_BPF_FD \ > > + $(AUTOCONF_OUTPUT) > > + $Q sh -- '$<' '$@' \ > > + HAVE_TC_ACT_BPF \ > > + linux/tc_act/tc_bpf.h \ > > + enum TCA_ACT_BPF_UNSPEC \ > > + $(AUTOCONF_OUTPUT) > > + $Q sh -- '$<' '$@' \ > > + HAVE_TC_ACT_BPF_FD \ > > + linux/tc_act/tc_bpf.h \ > > + enum TCA_ACT_BPF_FD \ > > + $(AUTOCONF_OUTPUT) > > + $Q sh -- '$<' '$@' \ > > HAVE_BPF_PROG_LOAD \ > > linux/bpf.h \ > > enum BPF_PROG_LOAD \ > > diff --git a/drivers/net/tap/rte_eth_tap.h > > b/drivers/net/tap/rte_eth_tap.h index 829f32f..c185473 100644 > > --- a/drivers/net/tap/rte_eth_tap.h > > +++ b/drivers/net/tap/rte_eth_tap.h > > @@ -45,7 +45,7 @@ > > #include <rte_ether.h> > > > > #ifdef IFF_MULTI_QUEUE > > -#define RTE_PMD_TAP_MAX_QUEUES 16 > > +#define RTE_PMD_TAP_MAX_QUEUES TAP_MAX_QUEUES > > #else > > #define RTE_PMD_TAP_MAX_QUEUES 1 > > #endif > > @@ -90,6 +90,13 @@ struct pmd_internals { > > int ioctl_sock; /* socket for ioctl calls */ > > int nlsk_fd; /* Netlink socket fd */ > > int flow_isolate; /* 1 if flow isolation is enabled */ > > + int flower_support; /* 1 if kernel supports, else 0 */ > > + int flower_vlan_support; /* 1 if kernel supports, else 0 */ > > + int rss_enabled; /* 1 if RSS is enabled, else 0 */ > > + /* implicit rules set when RSS is enabled */ > > + int map_fd; /* BPF RSS map fd */ > > + int bpf_fd[RTE_PMD_TAP_MAX_QUEUES];/* List of bpf fds per > queue */ > > + LIST_HEAD(tap_rss_flows, rte_flow) rss_flows; > > LIST_HEAD(tap_flows, rte_flow) flows; /* rte_flow rules */ > > /* implicit rte_flow rules set when a remote device is active */ > > LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows; diff --git > > a/drivers/net/tap/tap_bpf_insns.c b/drivers/net/tap/tap_bpf_insns.c > > index 25aa82c..bf95f2d 100644 > > --- a/drivers/net/tap/tap_bpf_insns.c > > +++ b/drivers/net/tap/tap_bpf_insns.c > > @@ -1830,7 +1830,7 @@ static int bpf_load(enum bpf_prog_type type, > > size_t insns_cnt, > > const char *license) > > { > > - union bpf_attr attr; > > + union bpf_attr attr = {}; > > Should be squashed in previous commit. > Done for v4 > > > > bzero(&attr, sizeof(attr)); > > attr.prog_type = type; > > @@ -1843,3 +1843,63 @@ static int bpf_load(enum bpf_prog_type type, > > > > return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); } > > + > > +/** > > + * Create BPF map for RSS rules > > + * > > + * @param[in] key_size > > + * map RSS key size > > + * > > + * @param[in] value_size > > + * Map RSS value size > > + * > > + * @param[in] max_entries > > + * Map max number of RSS entries (limit on max RSS rules) > > + * > > + * @return > > + * -1 if BPF map couldn't be created, map fd otherwise > > + */ > > +int tap_flow_bpf_rss_map_create(unsigned int key_size, > > + unsigned int value_size, > > + unsigned int max_entries) > > +{ > > + union bpf_attr attr = {}; > > + > > + bzero(&attr, sizeof(attr)); > > + attr.map_type = BPF_MAP_TYPE_HASH; > > + attr.key_size = key_size; > > + attr.value_size = value_size; > > + attr.max_entries = max_entries; > > + > > + return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } > > + > > +/** > > + * Update RSS entry in BPF map > > + * > > + * @param[in] fd > > + * RSS map fd > > + * > > + * @param[in] key > > + * Pointer to RSS key whose entry is updated > > + * > > + * @param[in] value > > + * Pointer to RSS new updated value > > + * > > + * @return > > + * -1 if RSS entry failed to be updated, 0 otherwise > > + */ > > +int tap_flow_bpf_update_rss_elem(int fd, void *key, void *value) { > > + union bpf_attr attr = {}; > > + > > + bzero(&attr, sizeof(attr)); > > + > > + attr.map_type = BPF_MAP_TYPE_HASH; > > + attr.map_fd = fd; > > + attr.key = ptr_to_u64(key); > > + attr.value = ptr_to_u64(value); > > + attr.flags = BPF_ANY; > > + > > + return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); } > > diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c > > index 90b2654..7c51614 100644 > > --- a/drivers/net/tap/tap_flow.c > > +++ b/drivers/net/tap/tap_flow.c > > @@ -33,7 +33,9 @@ > > > > #include <errno.h> > > #include <string.h> > > +#include <unistd.h> > > #include <sys/queue.h> > > +#include <sys/resource.h> > > > > #include <rte_byteorder.h> > > #include <rte_jhash.h> > > @@ -42,6 +44,7 @@ > > #include <tap_flow.h> > > #include <tap_autoconf.h> > > #include <tap_tcmsgs.h> > > +#include <tap_rss.h> > > > > #ifndef HAVE_TC_FLOWER > > /* > > @@ -81,12 +84,78 @@ enum { > > TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ > > }; > > #endif > > +/* > > + * For kernels < 4.2 BPF related enums may not be defined. > > + * Runtime checks will be carried out to gracefully report on TC > > +messages that > > + * are rejected by the kernel. Rejection reasons may be due to: > > + * 1. enum is not defined > > + * 2. enum is defined but kerenl is not configured to support BPF > > +system calls, > > Typo at "kerenl". Fixed for v4 > > > + * BPF classifications or BPF actions. > > + */ > > +#ifndef HAVE_TC_BPF > > +enum { > > + TCA_BPF_UNSPEC, > > + TCA_BPF_ACT, > > + TCA_BPF_POLICE, > > + TCA_BPF_CLASSID, > > + TCA_BPF_OPS_LEN, > > + TCA_BPF_OPS, > > +}; > > +#endif > > +#ifndef HAVE_TC_BPF_FD > > +enum { > > + TCA_BPF_FD = TCA_BPF_OPS + 1, > > + TCA_BPF_NAME, > > +}; > > +#endif > > +#ifndef HAVE_TC_ACT_BPF > > +#define tc_gen \ > > + __u32 index; \ > > + __u32 capab; \ > > + int action; \ > > + int refcnt; \ > > + int bindcnt > > + > > +struct tc_act_bpf { > > + tc_gen; > > +}; > > + > > +enum { > > + TCA_ACT_BPF_UNSPEC, > > + TCA_ACT_BPF_TM, > > + TCA_ACT_BPF_PARMS, > > + TCA_ACT_BPF_OPS_LEN, > > + TCA_ACT_BPF_OPS, > > +}; > > + > > +#endif > > +#ifndef HAVE_TC_ACT_BPF_FD > > +enum { > > + TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1, > > + TCA_ACT_BPF_NAME, > > +}; > > +#endif > > + > > +/* RSS key management */ > > +enum bpf_rss_key_e { > > + KEY_CMD_GET = 1, > > + KEY_CMD_RELEASE, > > + KEY_CMD_INIT, > > + KEY_CMD_DEINIT, > > +}; > > + > > +enum key_status_e { > > + KEY_STAT_UNSPEC, > > + KEY_STAT_USED, > > + KEY_STAT_AVAILABLE, > > +}; > > > > #define ISOLATE_HANDLE 1 > > > > struct rte_flow { > > LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure > */ > > struct rte_flow *remote_flow; /* associated remote flow */ > > + int bpf_fd[SEC_MAX]; /* list of bfs fds per ELF section */ > > struct nlmsg msg; > > }; > > > > @@ -104,6 +173,24 @@ struct remote_rule { > > int mirred; > > }; > > > > +struct action_data { > > + char id[16]; > > + > > + union { > > + struct tc_gact gact; > > + struct tc_mirred mirred; > > + struct skbedit { > > + struct tc_skbedit skbedit; > > + uint16_t queue; > > + } skbedit; > > + struct bpf { > > + struct tc_act_bpf bpf; > > + int bpf_fd; > > + const char *annotation; > > + } bpf; > > + }; > > +}; > > + > > static int tap_flow_create_eth(const struct rte_flow_item *item, void > > *data); static int tap_flow_create_vlan(const struct rte_flow_item > > *item, void *data); static int tap_flow_create_ipv4(const struct > > rte_flow_item *item, void *data); @@ -134,6 +221,14 @@ > tap_flow_isolate(struct rte_eth_dev *dev, > > int set, > > struct rte_flow_error *error); > > > > +static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx); > > +static int rss_enable(struct pmd_internals *pmd, > > + const struct rte_flow_attr *attr, > > + struct rte_flow_error *error); > > +static int rss_add_actions(struct rte_flow *flow, struct pmd_internals > *pmd, > > + const struct rte_flow_action_rss *rss, > > + struct rte_flow_error *error); > > + > > static const struct rte_flow_ops tap_flow_ops = { > > .validate = tap_flow_validate, > > .create = tap_flow_create, > > @@ -819,111 +914,97 @@ tap_flow_item_validate(const struct > > rte_flow_item *item, } > > > > /** > > - * Transform a DROP/PASSTHRU action item in the provided flow for TC. > > + * Configure the kernel with a TC action and its configured > > + parameters > > + * Handled actions: "gact", "mirred", "skbedit", "bpf" > > * > > - * @param[in, out] flow > > - * Flow to be filled. > > - * @param[in] action > > - * Appropriate action to be set in the TCA_GACT_PARMS structure. > > + * @param[in] flow > > + * Pointer to rte flow containing the netlink message > > * > > - * @return > > - * 0 if checks are alright, -1 otherwise. > > - */ > > -static int > > -add_action_gact(struct rte_flow *flow, int action) -{ > > - struct nlmsg *msg = &flow->msg; > > - size_t act_index = 1; > > - struct tc_gact p = { > > - .action = action > > - }; > > - > > - if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) > > - return -1; > > - if (tap_nlattr_nested_start(msg, act_index++) < 0) > > - return -1; > > - tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact"); > > - if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) > > - return -1; > > - tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p); > > - tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ > > - tap_nlattr_nested_finish(msg); /* nested act_index */ > > - tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ > > - return 0; > > -} > > - > > -/** > > - * Transform a MIRRED action item in the provided flow for TC. > > + * @param[in, out] act_index > > + * Pointer to action sequence number in the TC command > > * > > - * @param[in, out] flow > > - * Flow to be filled. > > - * @param[in] ifindex > > - * Netdevice ifindex, where to mirror/redirect packet to. > > - * @param[in] action_type > > - * Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for > mirroring. > > + * @param[in] adata > > + * Pointer to struct holding the action parameters > > * > > * @return > > - * 0 if checks are alright, -1 otherwise. > > + * -1 on failure, 0 on success > > */ > > static int > > -add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t > > action_type) > > +add_action(struct rte_flow *flow, size_t *act_index, struct > > +action_data *adata) > > { > > struct nlmsg *msg = &flow->msg; > > - size_t act_index = 1; > > - struct tc_mirred p = { > > - .eaction = action_type, > > - .ifindex = ifindex, > > - }; > > > > - if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) > > - return -1; > > - if (tap_nlattr_nested_start(msg, act_index++) < 0) > > + if (tap_nlattr_nested_start(msg, (*act_index)++) < 0) > > return -1; > > - tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), > "mirred"); > > + > > + tap_nlattr_add(&msg->nh, TCA_ACT_KIND, > > + strlen(adata->id) + 1, adata->id); > > if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) > > return -1; > > - if (action_type == TCA_EGRESS_MIRROR) > > - p.action = TC_ACT_PIPE; > > - else /* REDIRECT */ > > - p.action = TC_ACT_STOLEN; > > - tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p); > > + if (strcmp("gact", adata->id) == 0) { > > + tap_nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata- > >gact), > > + &adata->gact); > > + } else if (strcmp("mirred", adata->id) == 0) { > > + if (adata->mirred.eaction == TCA_EGRESS_MIRROR) > > + adata->mirred.action = TC_ACT_PIPE; > > + else /* REDIRECT */ > > + adata->mirred.action = TC_ACT_STOLEN; > > + tap_nlattr_add(&msg->nh, TCA_MIRRED_PARMS, > > + sizeof(adata->mirred), > > + &adata->mirred); > > + } else if (strcmp("skbedit", adata->id) == 0) { > > + tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, > > + sizeof(adata->skbedit.skbedit), > > + &adata->skbedit.skbedit); > > + tap_nlattr_add16(&msg->nh, > TCA_SKBEDIT_QUEUE_MAPPING, > > + adata->skbedit.queue); > > + } else if (strcmp("bpf", adata->id) == 0) { > > + tap_nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata- > >bpf.bpf_fd); > > + tap_nlattr_add(&msg->nh, TCA_ACT_BPF_NAME, > > + strlen(adata->bpf.annotation) + 1, > > + adata->bpf.annotation); > > + tap_nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS, > > + sizeof(adata->bpf.bpf), > > + &adata->bpf.bpf); > > + } else { > > + return -1; > > + } > > tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ > > tap_nlattr_nested_finish(msg); /* nested act_index */ > > - tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ > > return 0; > > } > > > > /** > > - * Transform a QUEUE action item in the provided flow for TC. > > + * Helper function to send a serie of TC actions to the kernel > > * > > - * @param[in, out] flow > > - * Flow to be filled. > > - * @param[in] queue > > - * Queue id to use. > > + * @param[in] flow > > + * Pointer to rte flow containing the netlink message > > + * > > + * @param[in] nb_actions > > + * Number of actions in an array of action structs > > + * > > + * @param[in] data > > + * Pointer to an array of action structs > > + * > > + * @param[in] classifier_actions > > + * The classifier on behave of which the actions are configured > > * > > * @return > > - * 0 if checks are alright, -1 otherwise. > > + * -1 on failure, 0 on success > > */ > > static int > > -add_action_skbedit(struct rte_flow *flow, uint16_t queue) > > +add_actions(struct rte_flow *flow, int nb_actions, struct action_data > *data, > > + int classifier_action) > > { > > struct nlmsg *msg = &flow->msg; > > size_t act_index = 1; > > - struct tc_skbedit p = { > > - .action = TC_ACT_PIPE > > - }; > > + int i; > > > > - if (tap_nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0) > > - return -1; > > - if (tap_nlattr_nested_start(msg, act_index++) < 0) > > + if (tap_nlattr_nested_start(msg, classifier_action) < 0) > > return -1; > > - tap_nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), > "skbedit"); > > - if (tap_nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0) > > - return -1; > > - tap_nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p); > > - tap_nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, > queue); > > - tap_nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */ > > - tap_nlattr_nested_finish(msg); /* nested act_index */ > > + for (i = 0; i < nb_actions; i++) > > + if (add_action(flow, &act_index, data + i) < 0) > > + return -1; > > tap_nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */ > > return 0; > > } > > @@ -987,7 +1068,8 @@ priv_flow_process(struct pmd_internals *pmd, > > return -rte_errno; > > } else if (flow) { > > uint16_t group = attr->group << GROUP_SHIFT; > > - uint16_t prio = group | (attr->priority + PRIORITY_OFFSET); > > + uint16_t prio = group | (attr->priority + > > + RSS_PRIORITY_OFFSET + PRIORITY_OFFSET); > > flow->msg.t.tcm_info = TC_H_MAKE(prio << 16, > > flow->msg.t.tcm_info); > > } > > @@ -1056,7 +1138,12 @@ priv_flow_process(struct pmd_internals *pmd, > > } > > } > > if (mirred && flow) { > > - uint16_t if_index = pmd->if_index; > > + struct action_data adata = { > > + .id = "mirred", > > + .mirred = { > > + .eaction = mirred, > > + }, > > + }; > > > > /* > > * If attr->egress && mirred, then this is a special @@ - > 1064,9 > > +1151,13 @@ priv_flow_process(struct pmd_internals *pmd, > > * redirect packets coming from the DPDK App, out > > * through the remote netdevice. > > */ > > - if (attr->egress) > > - if_index = pmd->remote_if_index; > > - if (add_action_mirred(flow, if_index, mirred) < 0) > > + adata.mirred.ifindex = attr->ingress ? pmd->if_index : > > + pmd->remote_if_index; > > + if (mirred == TCA_EGRESS_MIRROR) > > + adata.mirred.action = TC_ACT_PIPE; > > + else > > + adata.mirred.action = TC_ACT_STOLEN; > > + if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0) > > goto exit_action_not_supported; > > else > > goto end; > > @@ -1080,14 +1171,33 @@ priv_flow_process(struct pmd_internals > *pmd, > > if (action) > > goto exit_action_not_supported; > > action = 1; > > - if (flow) > > - err = add_action_gact(flow, TC_ACT_SHOT); > > + if (flow) { > > + struct action_data adata = { > > + .id = "gact", > > + .gact = { > > + .action = TC_ACT_SHOT, > > + }, > > + }; > > + > > + err = add_actions(flow, 1, &adata, > > + TCA_FLOWER_ACT); > > + } > > } else if (actions->type == > RTE_FLOW_ACTION_TYPE_PASSTHRU) { > > if (action) > > goto exit_action_not_supported; > > action = 1; > > - if (flow) > > - err = add_action_gact(flow, > TC_ACT_UNSPEC); > > + if (flow) { > > + struct action_data adata = { > > + .id = "gact", > > + .gact = { > > + /* continue */ > > + .action = TC_ACT_UNSPEC, > > + }, > > + }; > > + > > + err = add_actions(flow, 1, &adata, > > + TCA_FLOWER_ACT); > > + } > > } else if (actions->type == > RTE_FLOW_ACTION_TYPE_QUEUE) { > > const struct rte_flow_action_queue *queue = > > (const struct rte_flow_action_queue *) @@ - > 1099,22 +1209,35 @@ > > priv_flow_process(struct pmd_internals *pmd, > > if (!queue || > > (queue->index > pmd->dev->data->nb_rx_queues - > 1)) > > goto exit_action_not_supported; > > - if (flow) > > - err = add_action_skbedit(flow, queue- > >index); > > + if (flow) { > > + struct action_data adata = { > > + .id = "skbedit", > > + .skbedit = { > > + .skbedit = { > > + .action = > TC_ACT_PIPE, > > + }, > > + .queue = queue->index, > > + }, > > + }; > > + > > + err = add_actions(flow, 1, &adata, > > + TCA_FLOWER_ACT); > > + } > > } else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) { > > - /* Fake RSS support. */ > > const struct rte_flow_action_rss *rss = > > (const struct rte_flow_action_rss *) > > actions->conf; > > > > - if (action) > > - goto exit_action_not_supported; > > - action = 1; > > - if (!rss || rss->num < 1 || > > - (rss->queue[0] > pmd->dev->data->nb_rx_queues - > 1)) > > + if (action++) > > goto exit_action_not_supported; > > - if (flow) > > - err = add_action_skbedit(flow, rss- > >queue[0]); > > + > > + if (!pmd->rss_enabled) { > > + err = rss_enable(pmd, attr, error); > > + if (err) > > + goto exit_action_not_supported; > > + } > > + if (flow && rss) > > + err = rss_add_actions(flow, pmd, rss, error); > > } else { > > goto exit_action_not_supported; > > } > > @@ -1326,6 +1449,7 @@ tap_flow_destroy_pmd(struct pmd_internals > *pmd, > > struct rte_flow_error *error) > > { > > struct rte_flow *remote_flow = flow->remote_flow; > > + int i; > > int ret = 0; > > > > LIST_REMOVE(flow, next); > > @@ -1351,6 +1475,13 @@ tap_flow_destroy_pmd(struct pmd_internals > *pmd, > > "couldn't receive kernel ack to our request"); > > goto end; > > } > > + /* Close opened BPF file descriptors of this flow */ > > + for (i = 0; i < SEC_MAX; i++) > > + if (flow->bpf_fd[i] != 0) { > > + close(flow->bpf_fd[i]); > > + flow->bpf_fd[i] = 0; > > + } > > + > > if (remote_flow) { > > remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | > NLM_F_ACK; > > remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER; @@ > -1635,6 > > +1766,320 @@ tap_flow_implicit_flush(struct pmd_internals *pmd, struct > rte_flow_error *error) > > return 0; > > } > > > > +#define MAX_RSS_KEYS 256 > > +#define SEC_NAME_CLS_Q "cls_q" > > + > > +const char *sec_name[SEC_MAX] = { > > + [SEC_L3_L4] = "l3_l4", > > +}; > > + > > +/** > > + * Enable RSS on tap: create TC rules for queuing. > > + * > > + * @param[in, out] pmd > > + * Pointer to private structure. > > + * > > + * @param[in] attr > > + * Pointer to rte_flow to get flow group > > + * > > + * @param[out] error > > + * Pointer to error reporting if not NULL. > > + * > > + * @return 0 on success, negative value on failure. > > + */ > > +static int rss_enable(struct pmd_internals *pmd, > > + const struct rte_flow_attr *attr, > > + struct rte_flow_error *error) > > +{ > > + struct rte_flow *rss_flow = NULL; > > + struct nlmsg *msg = NULL; > > + /* 4096 is the maximum number of instructions for a BPF program */ > > + char annotation[64]; > > + int i; > > + int err = 0; > > + > > + /* unlimit locked memory */ > > + struct rlimit limit = { > > + .rlim_cur = RLIM_INFINITY, > > + .rlim_max = RLIM_INFINITY, > > + }; > > + setrlimit(RLIMIT_MEMLOCK, &limit); > > + > > + /* Get a new map key for a new RSS rule */ > > + err = bpf_rss_key(KEY_CMD_INIT, NULL); > > + if (err < 0) { > > + rte_flow_error_set( > > + error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, > NULL, > > + "Failed to initialize BPF RSS keys"); > > + > > + return -1; > > + } > > + > > + /* > > + * Create BPF RSS MAP > > + */ > > + pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key > size */ > > + sizeof(struct rss_key), > > + MAX_RSS_KEYS); > > + if (pmd->map_fd < 0) { > > + RTE_LOG(ERR, PMD, > > + "Failed to create BPF map (%d): %s\n", > > + errno, strerror(errno)); > > + rte_flow_error_set( > > + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, > NULL, > > + "Kernel too old or not configured " > > + "to support BPF maps"); > > + > > + return -ENOTSUP; > > + } > > + > > + /* > > + * Add a rule per queue to match reclassified packets and direct them > to > > + * the correct queue. > > + */ > > + for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) { > > + pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i); > > + if (pmd->bpf_fd[i] < 0) { > > + RTE_LOG(ERR, PMD, > > + "Failed to load BPF section %s for queue %d", > > + SEC_NAME_CLS_Q, i); > > + rte_flow_error_set( > > + error, ENOTSUP, > RTE_FLOW_ERROR_TYPE_HANDLE, > > + NULL, > > + "Kernel too old or not configured " > > + "to support BPF programs loading"); > > + > > + return -ENOTSUP; > > + } > > + > > + rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0); > > + if (!rss_flow) { > > + RTE_LOG(ERR, PMD, > > + "Cannot allocate memory for rte_flow"); > > + return -1; > > + } > > + msg = &rss_flow->msg; > > + tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, > NLM_F_REQUEST | > > + NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE); > > + msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL)); > > + tap_flow_set_handle(rss_flow); > > + uint16_t group = attr->group << GROUP_SHIFT; > > + uint16_t prio = group | (i + PRIORITY_OFFSET); > > + msg->t.tcm_info = TC_H_MAKE(prio << 16, msg- > >t.tcm_info); > > + msg->t.tcm_parent = > TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0); > > + > > + tap_nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf"); > > + if (tap_nlattr_nested_start(msg, TCA_OPTIONS) < 0) > > + return -1; > > + tap_nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]); > > + snprintf(annotation, sizeof(annotation), "[%s%d]", > > + SEC_NAME_CLS_Q, i); > > + tap_nlattr_add(&msg->nh, TCA_BPF_NAME, > strlen(annotation) + 1, > > + annotation); > > + /* Actions */ > > + { > > + struct action_data adata = { > > + .id = "skbedit", > > + .skbedit = { > > + .skbedit = { > > + .action = TC_ACT_PIPE, > > + }, > > + .queue = i, > > + }, > > + }; > > + if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < > 0) > > + return -1; > > + } > > + tap_nlattr_nested_finish(msg); /* nested TCA_OPTIONS */ > > + > > + /* Netlink message is now ready to be sent */ > > + if (tap_nl_send(pmd->nlsk_fd, &msg->nh) < 0) > > + return -1; > > + err = tap_nl_recv_ack(pmd->nlsk_fd); > > + if (err < 0) { > > + RTE_LOG(ERR, PMD, > > + "Kernel refused TC filter rule creation (%d): > %s\n", > > + errno, strerror(errno)); > > + return err; > > + } > > + LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next); > > + } > > + > > + pmd->rss_enabled = 1; > > + return err; > > +} > > + > > +/** > > + * Manage bpf RSS keys repository with operations: init, get, release > > + * > > + * @param[in] cmd > > + * Command on RSS keys: init, get, release > > + * > > + * @param[in, out] key_idx > > + * Pointer to RSS Key index (out for get command, in for release > command) > > + * > > + * @return -1 if couldn't get, release or init the RSS keys, 0 otherwise. > > + */ > > +static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx) { > > + __u32 i; > > + int err = -1; > > + static __u32 num_used_keys; > > + static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC}; > > + static __u32 rss_keys_initialized; > > + > > + switch (cmd) { > > + case KEY_CMD_GET: > > + if (!rss_keys_initialized) > > + break; > > + > > + if (num_used_keys == ARRAY_SIZE(rss_keys)) > > + break; > > + > > + *key_idx = num_used_keys % ARRAY_SIZE(rss_keys); > > + while (rss_keys[*key_idx] == KEY_STAT_USED) > > + *key_idx = (*key_idx + 1) % ARRAY_SIZE(rss_keys); > > + > > + rss_keys[*key_idx] = KEY_STAT_USED; > > + num_used_keys++; > > + err = 0; > > + break; > > + > > + case KEY_CMD_RELEASE: > > + if (!rss_keys_initialized) > > + break; > > + > > + if (rss_keys[*key_idx] == KEY_STAT_USED) { > > + rss_keys[*key_idx] = KEY_STAT_AVAILABLE; > > + num_used_keys--; > > + err = 0; > > + } > > + break; > > + > > + case KEY_CMD_INIT: > > + for (i = 0; i < ARRAY_SIZE(rss_keys); i++) > > + rss_keys[i] = KEY_STAT_AVAILABLE; > > + > > + rss_keys_initialized = 1; > > + num_used_keys = 0; > > + err = 0; > > + break; > > + > > + case KEY_CMD_DEINIT: > > + for (i = 0; i < ARRAY_SIZE(rss_keys); i++) > > + rss_keys[i] = KEY_STAT_UNSPEC; > > + > > + rss_keys_initialized = 0; > > + num_used_keys = 0; > > + err = 0; > > + break; > > + > > + default: > > + break; > > + } > > + > > + return err; > > +} > > + > > +/** > > + * Add RSS hash calculations and queue selection > > + * > > + * @param[in, out] pmd > > + * Pointer to internal structure. Used to set/get RSS map fd > > + * > > + * @param[in] rss > > + * Pointer to RSS flow actions > > + * > > + * @param[out] error > > + * Pointer to error reporting if not NULL. > > + * > > + * @return 0 on success, negative value on failure */ static int > > +rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd, > > + const struct rte_flow_action_rss *rss, > > + struct rte_flow_error *error) > > +{ > > + /* 4096 is the maximum number of instructions for a BPF program */ > > + int i; > > + __u32 key_idx; > > + int err; > > + struct rss_key rss_entry = { .hash_fields = 0, > > + .key_size = 0 }; > > + > > + /* Get a new map key for a new RSS rule */ > > + err = bpf_rss_key(KEY_CMD_GET, &key_idx); > > + if (err < 0) { > > + rte_flow_error_set( > > + error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, > NULL, > > + "Failed to get BPF RSS key"); > > + > > + return -1; > > + } > > + > > + /* Update RSS map entry with queues */ > > + rss_entry.nb_queues = rss->num; > > + for (i = 0; i < rss->num; i++) > > + rss_entry.queues[i] = rss->queue[i]; > > + rss_entry.hash_fields = > > + (1 << HASH_FIELD_IPV4_L3_L4) | (1 << > HASH_FIELD_IPV6_L3_L4); > > + > > + /* Add this RSS entry to map */ > > + err = tap_flow_bpf_update_rss_elem(pmd->map_fd, &key_idx, > > +&rss_entry); > > + > > + if (err) { > > + RTE_LOG(ERR, PMD, > > + "Failed to update BPF map entry #%u (%d): %s\n", > > + key_idx, errno, strerror(errno)); > > + rte_flow_error_set( > > + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, > NULL, > > + "Kernel too old or not configured " > > + "to support BPF maps updates"); > > + > > + return -ENOTSUP; > > + } > > + > > + > > + /* > > + * Load bpf rules to calculate hash for this key_idx > > + */ > > + > > + flow->bpf_fd[SEC_L3_L4] = > > + tap_flow_bpf_calc_l3_l4_hash(key_idx, pmd->map_fd); > > + if (flow->bpf_fd[SEC_L3_L4] < 0) { > > + RTE_LOG(ERR, PMD, > > + "Failed to load BPF section %s (%d): %s\n", > > + sec_name[SEC_L3_L4], errno, > strerror(errno)); > > + rte_flow_error_set( > > + error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, > NULL, > > + "Kernel too old or not configured " > > + "to support BPF program loading"); > > + > > + return -ENOTSUP; > > + } > > + > > + /* Actions */ > > + { > > + struct action_data adata[] = { > > + { > > + .id = "bpf", > > + .bpf = { > > + .bpf_fd = flow->bpf_fd[SEC_L3_L4], > > + .annotation = sec_name[SEC_L3_L4], > > + .bpf = { > > + .action = TC_ACT_PIPE, > > + }, > > + }, > > + }, > > + }; > > + > > + if (add_actions(flow, ARRAY_SIZE(adata), adata, > > + TCA_FLOWER_ACT) < 0) > > + return -1; > > + } > > + > > + return 0; > > +} > > + > > /** > > * Manage filter operations. > > * > > diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h > > index 6cc01b4..dbb7ff6 100644 > > --- a/drivers/net/tap/tap_flow.h > > +++ b/drivers/net/tap/tap_flow.h > > @@ -37,6 +37,7 @@ > > #include <rte_flow.h> > > #include <rte_flow_driver.h> > > #include <rte_eth_tap.h> > > +#include <tap_autoconf.h> > > > > /** > > * In TC, priority 0 means we require the kernel to allocate one for us. > > @@ -49,6 +50,7 @@ > > #define GROUP_MASK (0xf) > > #define GROUP_SHIFT 12 > > #define MAX_GROUP GROUP_MASK > > +#define RSS_PRIORITY_OFFSET 256 > > Shouldn't RSS_PRIORITY_OFFSET be equal to the number of queues, as > there's one implicit rule (skbedit action) per queue at start? > Done for v4 > > > > #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0])) > > > > @@ -69,6 +71,11 @@ enum implicit_rule_index { > > TAP_REMOTE_MAX_IDX, > > }; > > > > +enum bpf_fd_idx { > > + SEC_L3_L4, > > + SEC_MAX, > > +}; > > + > > int tap_dev_filter_ctrl(struct rte_eth_dev *dev, > > enum rte_filter_type filter_type, > > enum rte_filter_op filter_op, > > @@ -84,5 +91,8 @@ int tap_flow_implicit_flush(struct pmd_internals > > *pmd, > > > > int tap_flow_bpf_cls_q(__u32 queue_idx); int > > tap_flow_bpf_calc_l3_l4_hash(__u32 key_idx, int map_fd); > > +int tap_flow_bpf_rss_map_create(unsigned int key_size, unsigned int > value_size, > > + unsigned int max_entries); > > +int tap_flow_bpf_update_rss_elem(int fd, void *key, void *value); > > > > #endif /* _TAP_FLOW_H_ */ > > diff --git a/drivers/net/tap/tap_rss.h b/drivers/net/tap/tap_rss.h new > > file mode 100644 index 0000000..5e12531 > > --- /dev/null > > +++ b/drivers/net/tap/tap_rss.h > > @@ -0,0 +1,32 @@ > > +/* SPDX-License-Identifier: BSD-3-Clause > > + * Copyright 2017 Mellanox Technologies, Ltd. > > + */ > > + > > +#ifndef _TAP_RSS_H_ > > +#define _TAP_RSS_H_ > > + > > +#define TAP_MAX_QUEUES 16 > > + > > +/* hashed fields for RSS */ > > +enum hash_field { > > + HASH_FIELD_IPV4_L3, /* IPv4 src/dst addr */ > > + HASH_FIELD_IPV4_L3_L4, /* IPv4 src/dst addr + L4 src/dst ports > */ > > + HASH_FIELD_IPV6_L3, /* IPv6 src/dst addr */ > > + HASH_FIELD_IPV6_L3_L4, /* IPv6 src/dst addr + L4 src/dst ports > */ > > + HASH_FIELD_L2_SRC, /* Ethernet src addr */ > > + HASH_FIELD_L2_DST, /* Ethernet dst addr */ > > + HASH_FIELD_L3_SRC, /* L3 src addr */ > > + HASH_FIELD_L3_DST, /* L3 dst addr */ > > + HASH_FIELD_L4_SRC, /* TCP/UDP src ports */ > > + HASH_FIELD_L4_DST, /* TCP/UDP dst ports */ > > +}; > > + > > +struct rss_key { > > + __u8 key[128]; > > + __u32 hash_fields; > > + __u32 key_size; > > + __u32 queues[TAP_MAX_QUEUES]; > > + __u32 nb_queues; > > +} __attribute__((packed)); > > + > > +#endif /* _TAP_RSS_H_ */ > > diff --git a/drivers/net/tap/tap_tcmsgs.h > > b/drivers/net/tap/tap_tcmsgs.h index 7895957..07c5074 100644 > > --- a/drivers/net/tap/tap_tcmsgs.h > > +++ b/drivers/net/tap/tap_tcmsgs.h > > @@ -34,6 +34,7 @@ > > #ifndef _TAP_TCMSGS_H_ > > #define _TAP_TCMSGS_H_ > > > > +#include <tap_autoconf.h> > > #include <linux/if_ether.h> > > #include <linux/rtnetlink.h> > > #include <linux/pkt_sched.h> > > @@ -41,6 +42,9 @@ > > #include <linux/tc_act/tc_mirred.h> > > #include <linux/tc_act/tc_gact.h> > > #include <linux/tc_act/tc_skbedit.h> > > +#ifdef HAVE_TC_ACT_BPF > > +#include <linux/tc_act/tc_bpf.h> > > +#endif > > #include <inttypes.h> > > > > #include <rte_ether.h>