On Thu, Nov 01, 2018 at 05:19:34AM -0700, Slava Ovsiienko wrote:
> VXLAN encap rules are applied to the VF ingress traffic and have the
> VTEP as the actual redirection destination instead of the outer PF.
> The encapsulation rule should provide:
> - redirection action VF->PF
> - VF port ID
> - some inner network parameters (MACs/IP)
> - the tunnel outer source IP (v4/v6)
> - the tunnel outer destination IP (v4/v6)
> - VNI - Virtual Network Identifier
>
> There is no direct way found to provide the kernel with all required
> encapsulation header parameters. The encapsulation VTEP is created
> attached to the outer interface and assumed as the default path for
> egress encapsulated traffic. The outer tunnel IP addresses are
> assigned to the interface using Netlink, and the implicit route is
> created like this:
>
> ip addr add <src_ip> peer <dst_ip> dev <outer> scope link
>
> The peer address provides the implicit route, and scope link reduces
> the risk of conflicts. At initialization time all local scope link
> addresses are flushed from the device (see the next part of the patchset).
>
> The destination MAC address is provided via a permanent neigh rule:
>
> ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud permanent
>
> At initialization time all neigh rules of this type are flushed
> from the device (see the next part of the patchset).
>
> Suggested-by: Adrien Mazarguil <adrien.mazarg...@6wind.com>
> Signed-off-by: Viacheslav Ovsiienko <viachesl...@mellanox.com>
> ---

Acked-by: Yongseok Koh <ys...@mellanox.com>

Thanks
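As a reading aid for anyone reproducing the mechanism above outside the PMD:
the address assignment is a single RTM_NEWADDR request carrying IFA_LOCAL
(the tunnel source) and IFA_ADDRESS (the peer, which gives the implicit
route). A minimal standalone libmnl sketch, with a made-up interface name
and addresses, could look like the following; it is an illustration only
and not part of the patch:

/* Standalone illustration of "ip addr add <src> peer <dst> dev eth0
 * scope link" over rtnetlink, roughly what flow_tcf_rule_local() below
 * does through the PMD's shared libmnl context. Hypothetical values.
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/if_addr.h>
#include <linux/rtnetlink.h>
#include <libmnl/libmnl.h>

int
main(void)
{
        char buf[MNL_SOCKET_BUFFER_SIZE];
        struct mnl_socket *nl;
        struct nlmsghdr *nlh;
        struct ifaddrmsg *ifa;
        uint32_t src, dst;
        unsigned int seq = time(NULL);
        unsigned int portid;
        int ret;

        /* Made-up outer device and tunnel endpoints. */
        inet_pton(AF_INET, "192.168.100.1", &src);
        inet_pton(AF_INET, "192.168.100.2", &dst);
        nlh = mnl_nlmsg_put_header(buf);
        nlh->nlmsg_type = RTM_NEWADDR;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE |
                           NLM_F_ACK;
        nlh->nlmsg_seq = seq;
        ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
        ifa->ifa_family = AF_INET;
        ifa->ifa_prefixlen = 32;
        ifa->ifa_flags = IFA_F_PERMANENT;
        ifa->ifa_scope = RT_SCOPE_LINK;           /* link-local scope */
        ifa->ifa_index = if_nametoindex("eth0");  /* 0 if no such netdev */
        mnl_attr_put_u32(nlh, IFA_LOCAL, src);    /* tunnel source */
        mnl_attr_put_u32(nlh, IFA_ADDRESS, dst);  /* peer -> implicit route */
        nl = mnl_socket_open(NETLINK_ROUTE);
        if (!nl || mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) < 0) {
                perror("mnl_socket");
                return 1;
        }
        portid = mnl_socket_get_portid(nl);
        if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) {
                perror("sendto");
                return 1;
        }
        /* Wait for the kernel ACK; errors are reported via mnl_cb_run(). */
        ret = mnl_socket_recvfrom(nl, buf, sizeof(buf));
        if (ret > 0)
                ret = mnl_cb_run(buf, ret, seq, portid, NULL, NULL);
        if (ret < 0)
                perror("RTM_NEWADDR");
        mnl_socket_close(nl);
        return ret < 0;
}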
>  drivers/net/mlx5/mlx5_flow_tcf.c | 386 +++++++++++++++++++++++++++++++++++++++
>  1 file changed, 386 insertions(+)
>
> diff --git a/drivers/net/mlx5/mlx5_flow_tcf.c b/drivers/net/mlx5/mlx5_flow_tcf.c
> index c6e07f5..eae80ae 100644
> --- a/drivers/net/mlx5/mlx5_flow_tcf.c
> +++ b/drivers/net/mlx5/mlx5_flow_tcf.c
> @@ -3756,6 +3756,374 @@ struct pedit_parser {
>  #define MNL_REQUEST_SIZE RTE_MIN(RTE_MAX(sysconf(_SC_PAGESIZE), \
>                                    MNL_REQUEST_SIZE_MIN), MNL_REQUEST_SIZE_MAX)
>  
> +/**
> + * Emit Netlink message to add/remove a local address on the outer device.
> + * The address being added is visible within the link only (scope link).
> + *
> + * Note that an implicit route is maintained by the kernel due to the
> + * presence of a peer address (IFA_ADDRESS).
> + *
> + * These rules are used for encapsulation only and allow assigning
> + * the outer tunnel source IP address.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (source address and its peer).
> + * @param[in] ifindex
> + *   Network interface to apply the rule to.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_local(struct mlx5_flow_tcf_context *tcf,
> +                    const struct flow_tcf_vxlan_encap *encap,
> +                    unsigned int ifindex,
> +                    bool enable,
> +                    struct rte_flow_error *error)
> +{
> +        struct nlmsghdr *nlh;
> +        struct ifaddrmsg *ifa;
> +        alignas(struct nlmsghdr)
> +        uint8_t buf[mnl_nlmsg_size(sizeof(*ifa) + 128)];
> +
> +        nlh = mnl_nlmsg_put_header(buf);
> +        nlh->nlmsg_type = enable ? RTM_NEWADDR : RTM_DELADDR;
> +        nlh->nlmsg_flags =
> +                NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +        nlh->nlmsg_seq = 0;
> +        ifa = mnl_nlmsg_put_extra_header(nlh, sizeof(*ifa));
> +        ifa->ifa_flags = IFA_F_PERMANENT;
> +        ifa->ifa_scope = RT_SCOPE_LINK;
> +        ifa->ifa_index = ifindex;
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
> +                ifa->ifa_family = AF_INET;
> +                ifa->ifa_prefixlen = 32;
> +                mnl_attr_put_u32(nlh, IFA_LOCAL, encap->ipv4.src);
> +                if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST)
> +                        mnl_attr_put_u32(nlh, IFA_ADDRESS,
> +                                         encap->ipv4.dst);
> +        } else {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
> +                ifa->ifa_family = AF_INET6;
> +                ifa->ifa_prefixlen = 128;
> +                mnl_attr_put(nlh, IFA_LOCAL,
> +                             sizeof(encap->ipv6.src),
> +                             &encap->ipv6.src);
> +                if (encap->mask & FLOW_TCF_ENCAP_IPV6_DST)
> +                        mnl_attr_put(nlh, IFA_ADDRESS,
> +                                     sizeof(encap->ipv6.dst),
> +                                     &encap->ipv6.dst);
> +        }
> +        if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +                return 0;
> +        return rte_flow_error_set
> +                (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +                 "netlink: cannot complete IFA request (ip addr add)");
> +}
> +
> +/**
> + * Emit Netlink message to add/remove neighbor.
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] encap
> + *   Encapsulation properties (destination address).
> + * @param[in] ifindex
> + *   Network interface.
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_rule_neigh(struct mlx5_flow_tcf_context *tcf,
> +                    const struct flow_tcf_vxlan_encap *encap,
> +                    unsigned int ifindex,
> +                    bool enable,
> +                    struct rte_flow_error *error)
> +{
> +        struct nlmsghdr *nlh;
> +        struct ndmsg *ndm;
> +        alignas(struct nlmsghdr)
> +        uint8_t buf[mnl_nlmsg_size(sizeof(*ndm) + 128)];
> +
> +        nlh = mnl_nlmsg_put_header(buf);
> +        nlh->nlmsg_type = enable ? RTM_NEWNEIGH : RTM_DELNEIGH;
> +        nlh->nlmsg_flags =
> +                NLM_F_REQUEST | (enable ? NLM_F_CREATE | NLM_F_REPLACE : 0);
> +        nlh->nlmsg_seq = 0;
> +        ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
> +        ndm->ndm_ifindex = ifindex;
> +        ndm->ndm_state = NUD_PERMANENT;
> +        ndm->ndm_flags = 0;
> +        ndm->ndm_type = 0;
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
> +                ndm->ndm_family = AF_INET;
> +                mnl_attr_put_u32(nlh, NDA_DST, encap->ipv4.dst);
> +        } else {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
> +                ndm->ndm_family = AF_INET6;
> +                mnl_attr_put(nlh, NDA_DST, sizeof(encap->ipv6.dst),
> +                             &encap->ipv6.dst);
> +        }
> +        if (encap->mask & FLOW_TCF_ENCAP_ETH_SRC && enable)
> +                DRV_LOG(WARNING,
> +                        "Outer ethernet source address cannot be "
> +                        "forced for VXLAN encapsulation");
> +        if (encap->mask & FLOW_TCF_ENCAP_ETH_DST)
> +                mnl_attr_put(nlh, NDA_LLADDR, sizeof(encap->eth.dst),
> +                             &encap->eth.dst);
> +        if (!flow_tcf_nl_ack(tcf, nlh, 0, NULL, NULL))
> +                return 0;
> +        return rte_flow_error_set
> +                (error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
> +                 "netlink: cannot complete ND request (ip neigh)");
> +}
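For readers less familiar with rtnetlink: the request built above is the
Netlink form of "ip neigh add dev <outer> lladdr <dst_mac> to <dst_ip> nud
permanent". Reusing the socket/ACK boilerplate from the standalone sketch
near the top of this mail, the message construction alone could look
roughly like this; the interface, address and MAC are invented:

/* Sketch: build an RTM_NEWNEIGH request binding a made-up destination IP
 * to a made-up MAC on "eth0". NUD_PERMANENT keeps the entry from expiring,
 * so no ARP/ND resolution is needed for the encapsulated traffic.
 */
#include <stdint.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <linux/neighbour.h>
#include <linux/rtnetlink.h>
#include <libmnl/libmnl.h>

static struct nlmsghdr *
build_neigh_request(char *buf, unsigned int seq)
{
        static const uint8_t lladdr[6] = { 0x02, 0, 0, 0, 0, 0x01 };
        struct nlmsghdr *nlh;
        struct ndmsg *ndm;
        uint32_t dst;

        inet_pton(AF_INET, "192.168.100.2", &dst);
        nlh = mnl_nlmsg_put_header(buf);
        nlh->nlmsg_type = RTM_NEWNEIGH;
        nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_REPLACE |
                           NLM_F_ACK;
        nlh->nlmsg_seq = seq;
        ndm = mnl_nlmsg_put_extra_header(nlh, sizeof(*ndm));
        ndm->ndm_family = AF_INET;
        ndm->ndm_ifindex = if_nametoindex("eth0");
        ndm->ndm_state = NUD_PERMANENT;
        mnl_attr_put_u32(nlh, NDA_DST, dst);                   /* outer dst IP */
        mnl_attr_put(nlh, NDA_LLADDR, sizeof(lladdr), lladdr); /* dst MAC */
        return nlh;
}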
> +
> +/**
> + * Manage the local IP addresses and their peer IP addresses on the
> + * outer interface for encapsulation purposes. The kernel looks up the
> + * device for tunnel egress traffic using the outer source IP; this IP
> + * must be assigned to the outer network device, otherwise the kernel
> + * rejects the rule.
> + *
> + * Adds or removes the addresses using a Netlink command like this:
> + *   ip addr add <src_ip> peer <dst_ip> scope link dev <ifouter>
> + *
> + * The addresses are local to the netdev ("scope link"); this reduces
> + * the risk of conflicts. Note that an implicit route is maintained by
> + * the kernel due to the presence of a peer address (IFA_ADDRESS).
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_local(struct mlx5_flow_tcf_context *tcf,
> +                     struct tcf_vtep *vtep,
> +                     struct mlx5_flow *dev_flow,
> +                     bool enable,
> +                     struct rte_flow_error *error)
> +{
> +        const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
> +        struct tcf_local_rule *rule;
> +        bool found = false;
> +        int ret;
> +
> +        assert(encap);
> +        assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV4_DST);
> +                LIST_FOREACH(rule, &vtep->local, next) {
> +                        if (rule->mask & FLOW_TCF_ENCAP_IPV4_SRC &&
> +                            encap->ipv4.src == rule->ipv4.src &&
> +                            encap->ipv4.dst == rule->ipv4.dst) {
> +                                found = true;
> +                                break;
> +                        }
> +                }
> +        } else {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
> +                LIST_FOREACH(rule, &vtep->local, next) {
> +                        if (rule->mask & FLOW_TCF_ENCAP_IPV6_SRC &&
> +                            !memcmp(&encap->ipv6.src, &rule->ipv6.src,
> +                                    sizeof(encap->ipv6.src)) &&
> +                            !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +                                    sizeof(encap->ipv6.dst))) {
> +                                found = true;
> +                                break;
> +                        }
> +                }
> +        }
> +        if (found) {
> +                if (enable) {
> +                        rule->refcnt++;
> +                        return 0;
> +                }
> +                if (!rule->refcnt || !--rule->refcnt) {
> +                        LIST_REMOVE(rule, next);
> +                        return flow_tcf_rule_local(tcf, encap,
> +                                                   vtep->ifouter, false, error);
> +                }
> +                return 0;
> +        }
> +        if (!enable) {
> +                DRV_LOG(WARNING, "Disabling non-existent local rule");
> +                rte_flow_error_set
> +                        (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +                         NULL, "Disabling non-existent local rule");
> +                return -ENOENT;
> +        }
> +        rule = rte_zmalloc(__func__, sizeof(struct tcf_local_rule),
> +                           alignof(struct tcf_local_rule));
> +        if (!rule) {
> +                rte_flow_error_set
> +                        (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +                         NULL, "unable to allocate memory for local rule");
> +                return -rte_errno;
> +        }
> +        *rule = (struct tcf_local_rule){.refcnt = 0,
> +                                        .mask = 0,
> +                                        };
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_SRC) {
> +                rule->mask = FLOW_TCF_ENCAP_IPV4_SRC
> +                           | FLOW_TCF_ENCAP_IPV4_DST;
> +                rule->ipv4.src = encap->ipv4.src;
> +                rule->ipv4.dst = encap->ipv4.dst;
> +        } else {
> +                rule->mask = FLOW_TCF_ENCAP_IPV6_SRC
> +                           | FLOW_TCF_ENCAP_IPV6_DST;
> +                memcpy(&rule->ipv6.src, &encap->ipv6.src,
> +                       sizeof(rule->ipv6.src));
> +                memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +                       sizeof(rule->ipv6.dst));
> +        }
> +        ret = flow_tcf_rule_local(tcf, encap, vtep->ifouter, true, error);
> +        if (ret) {
> +                rte_free(rule);
> +                return ret;
> +        }
> +        rule->refcnt++;
> +        LIST_INSERT_HEAD(&vtep->local, rule, next);
> +        return 0;
> +}
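A side note on the rule databases used here and in flow_tcf_encap_neigh()
below: both follow the same acquire/release pattern, where the kernel rule
is only created for the first user and removed with the last one, so many
flows sharing the same outer addresses map to a single kernel entry.
Stripped of the driver specifics, the pattern is roughly the following
illustrative sketch (not driver code):

/* Miniature version of the sharing scheme: look up an existing entry,
 * bump its reference counter, or allocate and install a new one. The
 * kernel rule is only touched on the 0<->1 refcount transitions.
 */
#include <stdint.h>
#include <stdlib.h>
#include <sys/queue.h>

struct shared_rule {
        LIST_ENTRY(shared_rule) next;
        uint32_t key;     /* e.g. the outer destination IP */
        uint32_t refcnt;
};

LIST_HEAD(rule_list, shared_rule);

/* Acquire a rule for @key, creating the kernel state on first use. */
static struct shared_rule *
rule_acquire(struct rule_list *db, uint32_t key)
{
        struct shared_rule *rule;

        LIST_FOREACH(rule, db, next)
                if (rule->key == key) {
                        rule->refcnt++;
                        return rule;
                }
        rule = calloc(1, sizeof(*rule));
        if (!rule)
                return NULL;
        rule->key = key;
        rule->refcnt = 1;
        /* ... emit RTM_NEWADDR/RTM_NEWNEIGH here ... */
        LIST_INSERT_HEAD(db, rule, next);
        return rule;
}

/* Release one reference; the kernel rule goes away with the last user. */
static void
rule_release(struct rule_list *db, struct shared_rule *rule)
{
        (void)db;
        if (--rule->refcnt)
                return;
        LIST_REMOVE(rule, next);
        /* ... emit RTM_DELADDR/RTM_DELNEIGH here ... */
        free(rule);
}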
> +
> +/**
> + * Manage the destination MAC/IP address neigh database; the kernel
> + * uses it to determine the destination MAC address for the
> + * encapsulation header. Adds or removes the entries using a Netlink
> + * command like this:
> + *   ip neigh add dev <ifouter> lladdr <dst_mac> to <dst_ip> nud permanent
> + *
> + * @param[in] tcf
> + *   Libmnl socket context object.
> + * @param[in] vtep
> + *   VTEP object, contains rule database and ifouter index.
> + * @param[in] dev_flow
> + *   Flow object, contains the tunnel parameters (for encap only).
> + * @param[in] enable
> + *   Toggle between add and remove.
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +flow_tcf_encap_neigh(struct mlx5_flow_tcf_context *tcf,
> +                     struct tcf_vtep *vtep,
> +                     struct mlx5_flow *dev_flow,
> +                     bool enable,
> +                     struct rte_flow_error *error)
> +{
> +        const struct flow_tcf_vxlan_encap *encap = dev_flow->tcf.vxlan_encap;
> +        struct tcf_neigh_rule *rule;
> +        bool found = false;
> +        int ret;
> +
> +        assert(encap);
> +        assert(encap->hdr.type == FLOW_TCF_TUNACT_VXLAN_ENCAP);
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV4_SRC);
> +                LIST_FOREACH(rule, &vtep->neigh, next) {
> +                        if (rule->mask & FLOW_TCF_ENCAP_IPV4_DST &&
> +                            encap->ipv4.dst == rule->ipv4.dst) {
> +                                found = true;
> +                                break;
> +                        }
> +                }
> +        } else {
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_SRC);
> +                assert(encap->mask & FLOW_TCF_ENCAP_IPV6_DST);
> +                LIST_FOREACH(rule, &vtep->neigh, next) {
> +                        if (rule->mask & FLOW_TCF_ENCAP_IPV6_DST &&
> +                            !memcmp(&encap->ipv6.dst, &rule->ipv6.dst,
> +                                    sizeof(encap->ipv6.dst))) {
> +                                found = true;
> +                                break;
> +                        }
> +                }
> +        }
> +        if (found) {
> +                if (memcmp(&encap->eth.dst, &rule->eth,
> +                           sizeof(encap->eth.dst))) {
> +                        DRV_LOG(WARNING, "Destination MAC differs"
> +                                " in neigh rule");
> +                        rte_flow_error_set(error, EEXIST,
> +                                           RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +                                           NULL, "Different MAC address"
> +                                           " neigh rule for the same"
> +                                           " destination IP");
> +                        return -EEXIST;
> +                }
> +                if (enable) {
> +                        rule->refcnt++;
> +                        return 0;
> +                }
> +                if (!rule->refcnt || !--rule->refcnt) {
> +                        LIST_REMOVE(rule, next);
> +                        return flow_tcf_rule_neigh(tcf, encap,
> +                                                   vtep->ifouter,
> +                                                   false, error);
> +                }
> +                return 0;
> +        }
> +        if (!enable) {
> +                DRV_LOG(WARNING, "Disabling non-existent neigh rule");
> +                rte_flow_error_set
> +                        (error, ENOENT, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +                         NULL, "Disabling non-existent neigh rule");
> +                return -ENOENT;
> +        }
> +        rule = rte_zmalloc(__func__, sizeof(struct tcf_neigh_rule),
> +                           alignof(struct tcf_neigh_rule));
> +        if (!rule) {
> +                rte_flow_error_set
> +                        (error, ENOMEM, RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +                         NULL, "unable to allocate memory for neigh rule");
> +                return -rte_errno;
> +        }
> +        *rule = (struct tcf_neigh_rule){.refcnt = 0,
> +                                        .mask = 0,
> +                                        };
> +        if (encap->mask & FLOW_TCF_ENCAP_IPV4_DST) {
> +                rule->mask = FLOW_TCF_ENCAP_IPV4_DST;
> +                rule->ipv4.dst = encap->ipv4.dst;
> +        } else {
> +                rule->mask = FLOW_TCF_ENCAP_IPV6_DST;
> +                memcpy(&rule->ipv6.dst, &encap->ipv6.dst,
> +                       sizeof(rule->ipv6.dst));
> +        }
> +        memcpy(&rule->eth, &encap->eth.dst, sizeof(rule->eth));
> +        ret = flow_tcf_rule_neigh(tcf, encap, vtep->ifouter, true, error);
> +        if (ret) {
> +                rte_free(rule);
> +                return ret;
> +        }
> +        rule->refcnt++;
> +        LIST_INSERT_HEAD(&vtep->neigh, rule, next);
> +        return 0;
> +}
> +
>  /* VTEP device list is shared between PMD port instances. */
>  static LIST_HEAD(, tcf_vtep) vtep_list_vxlan = LIST_HEAD_INITIALIZER();
>  static pthread_mutex_t vtep_list_mutex = PTHREAD_MUTEX_INITIALIZER;
> @@ -4032,6 +4400,7 @@ struct pedit_parser {
>  {
>          static uint16_t encap_port = MLX5_VXLAN_PORT_MIN - 1;
>          struct tcf_vtep *vtep;
> +        int ret;
>  
>          assert(ifouter);
>          /* Look whether the attached VTEP for encap is created. */
> @@ -4077,6 +4446,20 @@ struct pedit_parser {
>          }
>          assert(vtep->ifouter == ifouter);
>          assert(vtep->ifindex);
> +        /* Create local ipaddr with peer to specify the outer IPs. */
> +        ret = flow_tcf_encap_local(tcf, vtep, dev_flow, true, error);
> +        if (!ret) {
> +                /* Create neigh rule to specify outer destination MAC. */
> +                ret = flow_tcf_encap_neigh(tcf, vtep, dev_flow, true, error);
> +                if (ret)
> +                        flow_tcf_encap_local(tcf, vtep,
> +                                             dev_flow, false, error);
> +        }
> +        if (ret) {
> +                if (--vtep->refcnt == 0)
> +                        flow_tcf_vtep_delete(tcf, vtep);
> +                return NULL;
> +        }
>          return vtep;
>  }
>  
> @@ -4146,6 +4529,9 @@ struct pedit_parser {
>          case FLOW_TCF_TUNACT_VXLAN_DECAP:
>                  break;
>          case FLOW_TCF_TUNACT_VXLAN_ENCAP:
> +                /* Remove the encap ancillary rules first. */
> +                flow_tcf_encap_neigh(tcf, vtep, dev_flow, false, NULL);
> +                flow_tcf_encap_local(tcf, vtep, dev_flow, false, NULL);
>                  break;
>          default:
>                  assert(false);
> --
> 1.8.3.1
>
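For completeness, the application-side rule that ends up exercising this
code (VF ingress traffic redirected to the PF with a VXLAN_ENCAP action, as
described in the commit log) would look roughly like the sketch below. The
port numbers, addresses, MAC and VNI are invented, and the field names are
my reading of the 18.11 rte_flow API, so please treat it as an illustration
rather than a tested snippet:

/* Rough sketch: VXLAN-encapsulate everything received from a VF
 * representor port and forward it to the PF (wire) port. The outer
 * destination MAC feeds the neigh rule and the outer IPs feed the local
 * address rule added by this patch. All values are hypothetical.
 */
#include <rte_flow.h>
#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_udp.h>
#include <rte_byteorder.h>

static struct rte_flow *
vf_to_wire_vxlan_encap(uint16_t vf_repr_port, uint32_t pf_port)
{
        static struct rte_flow_item_eth enc_eth = {
                .dst.addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 },
        };
        static struct rte_flow_item_ipv4 enc_ipv4 = {
                .hdr = {
                        .src_addr = RTE_BE32(0xc0a86401), /* 192.168.100.1 */
                        .dst_addr = RTE_BE32(0xc0a86402), /* 192.168.100.2 */
                },
        };
        static struct rte_flow_item_udp enc_udp = {
                .hdr.dst_port = RTE_BE16(4789),
        };
        static struct rte_flow_item_vxlan enc_vxlan = {
                .vni = { 0, 0, 42 },
        };
        static struct rte_flow_item enc_def[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH, .spec = &enc_eth },
                { .type = RTE_FLOW_ITEM_TYPE_IPV4, .spec = &enc_ipv4 },
                { .type = RTE_FLOW_ITEM_TYPE_UDP, .spec = &enc_udp },
                { .type = RTE_FLOW_ITEM_TYPE_VXLAN, .spec = &enc_vxlan },
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        struct rte_flow_action_vxlan_encap encap = { .definition = enc_def };
        struct rte_flow_action_port_id to_pf = { .id = pf_port };
        struct rte_flow_action actions[] = {
                { .type = RTE_FLOW_ACTION_TYPE_VXLAN_ENCAP, .conf = &encap },
                { .type = RTE_FLOW_ACTION_TYPE_PORT_ID, .conf = &to_pf },
                { .type = RTE_FLOW_ACTION_TYPE_END },
        };
        struct rte_flow_item pattern[] = {
                { .type = RTE_FLOW_ITEM_TYPE_ETH }, /* match all VF traffic */
                { .type = RTE_FLOW_ITEM_TYPE_END },
        };
        struct rte_flow_attr attr = { .ingress = 1, .transfer = 1 };
        struct rte_flow_error err;

        return rte_flow_create(vf_repr_port, &attr, pattern, actions, &err);
}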