Add basic recirculation infrastructure and user space data path support for it. The following bond mega flow patch will make use of this infrastructure.
Signed-off-by: Andy Zhou <az...@nicira.com> --- v1->v2: Rewritten based on having post recirculation rules stored in table 254. --- include/linux/openvswitch.h | 35 ++++++++++++++++- lib/dpif-netdev.c | 35 ++++++++++++++++- lib/dpif.c | 3 +- lib/odp-execute.c | 9 +++++ lib/odp-execute.h | 2 +- lib/odp-util.c | 91 ++++++++++++++++++++++++++++++++++++++++++- lib/packets.c | 16 +++++--- lib/packets.h | 16 +++++--- ofproto/ofproto-dpif.c | 2 +- ofproto/ofproto-dpif.h | 63 ++++++++++++++++++++++++++++-- 10 files changed, 251 insertions(+), 21 deletions(-) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index d1ff5ec..af951f5 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -307,7 +307,8 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ - + OVS_KEY_ATTR_DP_HASH, /* u32 hash value */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ #endif @@ -530,6 +531,36 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Recirculation ID needs to be unique per back-end. + * It can be any value except zero. Use the following + * defines to restrict the range further. + */ +#define RECIRC_ID_BASE 300 +#define RECIRC_ID_SIZE 65535 + +/* Data path hash algorithm for computing Datapath hash. + * + * The Algorithm type only specifies the fields in a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_recirc_hash_alg { + OVS_RECIRC_HASH_ALG_NONE, + OVS_RECIRC_HASH_ALG_L4, +}; +/* + * struct ovs_action_recirc - %OVS_ACTION_ATTR_RECIRC action argument. + * @recirc_id: The Recirculation label, Zero is invalid. + * @hash_alg: Algorithm used to compute hash prior to recirculation. 
+ * @hash_bias: Bias used when computing the hash prior to recirculation. + */ +struct ovs_action_recirc { + uint8_t hash_alg; /* One of enum ovs_recirc_hash_alg */ + __be32 hash_bias; + __be32 recirc_id; /* Recirculation label. */ +}; + /** * enum ovs_action_attr - Action types. * @@ -553,6 +584,7 @@ struct ovs_action_push_vlan { * indicate the new packet contents. This could potentially still be * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there * is no MPLS label stack, as determined by ethertype, no action is taken. + * @OVS_ACTION_ATTR_RECIRC: Recirculate within the data path. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -569,6 +601,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ + OVS_ACTION_ATTR_RECIRC, /* struct ovs_action_recirc. */ __OVS_ACTION_ATTR_MAX }; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 5897f8b..58bcfc1 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1157,6 +1157,19 @@ dpif_netdev_flow_get(const struct dpif *dpif, return error; } +/* To support recirculation, the user space datapath requires all rules + * to exact-match the metadata field, which stores the + * recirculation ID.
*/ +static void +dp_netdev_match_init(struct match *match, const struct flow *flow, + const struct flow_wildcards *wc) +{ + struct flow_wildcards netdev_wc = *wc; + + netdev_wc.masks.metadata = UINT64_MAX; + match_init(match, flow, &netdev_wc); +} + static int dp_netdev_flow_add(struct dp_netdev *dp, const struct flow *flow, const struct flow_wildcards *wc, @@ -1176,7 +1189,7 @@ dp_netdev_flow_add(struct dp_netdev *dp, const struct flow *flow, netdev_flow->actions = dp_netdev_actions_create(actions, actions_len); - match_init(&match, flow, wc); + dp_netdev_match_init(&match, flow, wc); cls_rule_init(CONST_CAST(struct cls_rule *, &netdev_flow->cr), &match, NETDEV_RULE_PRIORITY); fat_rwlock_wrlock(&dp->cls.rwlock); @@ -1808,7 +1821,7 @@ struct dp_netdev_execute_aux { static void dp_execute_cb(void *aux_, struct ofpbuf *packet, - const struct pkt_metadata *md OVS_UNUSED, + struct pkt_metadata *md, const struct nlattr *a, bool may_steal) OVS_NO_THREAD_SAFETY_ANALYSIS { @@ -1840,6 +1853,24 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet, } break; } + + case OVS_ACTION_ATTR_RECIRC: { + const struct ovs_action_recirc *act; + act = nl_attr_get(a); + md->recirc_id = ntohl(act->recirc_id); + md->dp_hash = 0; + + if (act->hash_alg == OVS_RECIRC_HASH_ALG_L4) { + struct flow flow; + + flow_extract(packet, md, &flow); + md->dp_hash = flow_hash_symmetric_l4(&flow, ntohl(act->hash_bias)); + } + + dp_netdev_port_input(aux->dp, packet, md); + break; + } + case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_MPLS: diff --git a/lib/dpif.c b/lib/dpif.c index 8cb2145..a56c612 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1099,7 +1099,7 @@ struct dpif_execute_helper_aux { * meaningful. 
*/ static void dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet, - const struct pkt_metadata *md, + struct pkt_metadata *md, const struct nlattr *action, bool may_steal OVS_UNUSED) { struct dpif_execute_helper_aux *aux = aux_; @@ -1124,6 +1124,7 @@ dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet, case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_RECIRC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 096c113..cde5bf4 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -125,6 +125,14 @@ odp_execute_set_action(struct ofpbuf *packet, const struct nlattr *a, set_arp(packet, nl_attr_get_unspec(a, sizeof(struct ovs_key_arp))); break; + case OVS_KEY_ATTR_DP_HASH: + md->dp_hash = nl_attr_get_u32(a); + break; + + case OVS_KEY_ATTR_RECIRC_ID: + md->recirc_id = nl_attr_get_u32(a); + break; + case OVS_KEY_ATTR_UNSPEC: case OVS_KEY_ATTR_ENCAP: case OVS_KEY_ATTR_ETHERTYPE: @@ -195,6 +203,7 @@ odp_execute_actions__(void *dp, struct ofpbuf *packet, struct pkt_metadata *md, /* These only make sense in the context of a datapath. */ case OVS_ACTION_ATTR_OUTPUT: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_RECIRC: if (dp_execute_action) { /* Allow 'dp_execute_action' to steal the packet data if we do * not need it any more. 
*/ diff --git a/lib/odp-execute.h b/lib/odp-execute.h index 670e8ea..18f50c3 100644 --- a/lib/odp-execute.h +++ b/lib/odp-execute.h @@ -28,7 +28,7 @@ struct ofpbuf; struct pkt_metadata; typedef void (*odp_execute_cb)(void *dp, struct ofpbuf *packet, - const struct pkt_metadata *, + struct pkt_metadata *, const struct nlattr *action, bool may_steal); /* Actions that need to be executed in the context of a datapath are handed diff --git a/lib/odp-util.c b/lib/odp-util.c index 463f008..6e46dff 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -79,6 +79,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_POP_VLAN: return 0; case OVS_ACTION_ATTR_PUSH_MPLS: return sizeof(struct ovs_action_push_mpls); case OVS_ACTION_ATTR_POP_MPLS: return sizeof(ovs_be16); + case OVS_ACTION_ATTR_RECIRC: return sizeof(struct ovs_action_recirc); case OVS_ACTION_ATTR_SET: return -2; case OVS_ACTION_ATTR_SAMPLE: return -2; @@ -118,6 +119,8 @@ ovs_key_attr_to_string(enum ovs_key_attr attr, char *namebuf, size_t bufsize) case OVS_KEY_ATTR_ARP: return "arp"; case OVS_KEY_ATTR_ND: return "nd"; case OVS_KEY_ATTR_MPLS: return "mpls"; + case OVS_KEY_ATTR_DP_HASH: return "dp_hash"; + case OVS_KEY_ATTR_RECIRC_ID: return "recirc_id"; case __OVS_KEY_ATTR_MAX: default: @@ -385,6 +388,27 @@ format_mpls(struct ds *ds, const struct ovs_key_mpls *mpls_key, } static void +format_odp_recirc_action(struct ds *ds, + const struct ovs_action_recirc *act) +{ + const char *hash_str = NULL; + + switch (act->hash_alg) { + case OVS_RECIRC_HASH_ALG_L4: + hash_str = "hash_l4"; + break; + case OVS_RECIRC_HASH_ALG_NONE: + default: + break; + } + + hash_str + ? 
ds_put_format(ds, "recirc(%s(%"PRIu32"), %"PRIu32")", + hash_str, ntohl(act->hash_bias), ntohl(act->recirc_id)) + : ds_put_format(ds, "recirc(%"PRIu32")", ntohl(act->recirc_id)); +} + +static void format_odp_action(struct ds *ds, const struct nlattr *a) { int expected_len; @@ -406,6 +430,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a) case OVS_ACTION_ATTR_USERSPACE: format_odp_userspace_action(ds, a); break; + case OVS_ACTION_ATTR_RECIRC: + format_odp_recirc_action(ds, nl_attr_get(a)); + break; case OVS_ACTION_ATTR_SET: ds_put_cstr(ds, "set("); format_odp_key_attr(nl_attr_get(a), NULL, NULL, ds, true); @@ -731,6 +758,8 @@ odp_flow_key_attr_len(uint16_t type) case OVS_KEY_ATTR_ENCAP: return -2; case OVS_KEY_ATTR_PRIORITY: return 4; case OVS_KEY_ATTR_SKB_MARK: return 4; + case OVS_KEY_ATTR_DP_HASH: return 4; + case OVS_KEY_ATTR_RECIRC_ID: return 4; case OVS_KEY_ATTR_TUNNEL: return -2; case OVS_KEY_ATTR_IN_PORT: return 4; case OVS_KEY_ATTR_ETHERNET: return sizeof(struct ovs_key_ethernet); @@ -1026,6 +1055,8 @@ format_odp_key_attr(const struct nlattr *a, const struct nlattr *ma, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_DP_HASH: + case OVS_KEY_ATTR_RECIRC_ID: ds_put_format(ds, "%#"PRIx32, nl_attr_get_u32(a)); if (!is_exact) { ds_put_format(ds, "/%#"PRIx32, nl_attr_get_u32(ma)); @@ -1387,7 +1418,6 @@ format_odp_key_attr(const struct nlattr *a, const struct nlattr *ma, } break; } - case OVS_KEY_ATTR_UNSPEC: case __OVS_KEY_ATTR_MAX: default: @@ -1620,6 +1650,36 @@ parse_odp_key_mask_attr(const char *s, const struct simap *port_names, } { + uint32_t recirc_id; + int n = -1; + + if (ovs_scan(s, "recirc_id(%"SCNi32")%n", &recirc_id, &n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_RECIRC_ID, recirc_id); + nl_msg_put_u32(mask, OVS_KEY_ATTR_RECIRC_ID, UINT32_MAX); + return n; + } + } + + { + uint32_t dp_hash; + uint32_t dp_hash_mask; + int n = -1; + + if (mask && ovs_scan(s, "dp_hash(%"SCNi32"/%"SCNi32")%n", &dp_hash, + &dp_hash_mask, 
&n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_DP_HASH, dp_hash); + nl_msg_put_u32(mask, OVS_KEY_ATTR_DP_HASH, dp_hash_mask); + return n; + } else if (ovs_scan(s, "dp_hash(%"SCNi32")%n", &dp_hash, &n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_DP_HASH, dp_hash); + if (mask) { + nl_msg_put_u32(mask, OVS_KEY_ATTR_DP_HASH, UINT32_MAX); + } + return n; + } + } + + { uint64_t tun_id, tun_id_mask; struct flow_tnl tun_key, tun_key_mask; int n = -1; @@ -2439,6 +2499,14 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *data, nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark); + if (!is_mask && data->recirc_id) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); + } + + if (data->dp_hash != 0) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); + } + /* Add an ingress port attribute if this is a mask or 'odp_in_port' * is not the magical value "ODPP_NONE". */ if (is_mask || odp_in_port != ODPP_NONE) { @@ -2674,6 +2742,14 @@ odp_key_to_pkt_metadata(const struct nlattr *key, size_t key_len, continue; } + if (type == OVS_KEY_ATTR_RECIRC_ID) { + md->recirc_id = nl_attr_get_u32(nla); + wanted_attrs &= ~(1u << OVS_KEY_ATTR_RECIRC_ID); + } + if (type == OVS_KEY_ATTR_DP_HASH) { + md->dp_hash = nl_attr_get_u32(nla); + wanted_attrs &= ~(1u << OVS_KEY_ATTR_DP_HASH); + } if (type == OVS_KEY_ATTR_PRIORITY) { md->skb_priority = nl_attr_get_u32(nla); wanted_attrs &= ~(1u << OVS_KEY_ATTR_PRIORITY); @@ -3231,6 +3307,19 @@ odp_flow_key_to_flow__(const struct nlattr *key, size_t key_len, expected_attrs = 0; /* Metadata. */ + if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_RECIRC_ID)) { + flow->recirc_id = nl_attr_get_u32(attrs[OVS_KEY_ATTR_RECIRC_ID]); + expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_RECIRC_ID; + } + /* Always exact match RECIRC_ID. 
*/ + if (is_mask) { + flow->recirc_id = UINT32_MAX; + } + + if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_DP_HASH)) { + flow->dp_hash = nl_attr_get_u32(attrs[OVS_KEY_ATTR_DP_HASH]); + expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_DP_HASH; + } if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) { flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]); expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_PRIORITY; diff --git a/lib/packets.c b/lib/packets.c index 3f7d6eb..9f8feea 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -993,15 +993,18 @@ packet_format_tcp_flags(struct ds *s, uint16_t tcp_flags) } } -void pkt_metadata_init(struct pkt_metadata *md, const struct flow_tnl *tnl, - const uint32_t skb_priority, - const uint32_t pkt_mark, - const union flow_in_port *in_port) +void pkt_metadata_init(struct pkt_metadata *md, + const uint32_t recirc_id, const uint32_t dp_hash, + const struct flow_tnl *tnl, const uint32_t skb_priority, + const uint32_t pkt_mark, + const union flow_in_port *in_port) { tnl ? memcpy(&md->tunnel, tnl, sizeof(md->tunnel)) : memset(&md->tunnel, 0, sizeof(md->tunnel)); + md->recirc_id = recirc_id; + md->dp_hash = dp_hash; md->skb_priority = skb_priority; md->pkt_mark = pkt_mark; md->in_port.odp_port = in_port ? in_port->odp_port : ODPP_NONE; @@ -1009,6 +1012,7 @@ void pkt_metadata_init(struct pkt_metadata *md, const struct flow_tnl *tnl, void pkt_metadata_from_flow(struct pkt_metadata *md, const struct flow *flow) { - pkt_metadata_init(md, &flow->tunnel, flow->skb_priority, - flow->pkt_mark, &flow->in_port); + pkt_metadata_init(md, flow->recirc_id, flow->dp_hash, + &flow->tunnel, flow->skb_priority, + flow->pkt_mark, &flow->in_port); } diff --git a/lib/packets.h b/lib/packets.h index e6b3303..f538218 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -33,6 +33,11 @@ struct ds; /* Datapath packet metadata */ struct pkt_metadata { + uint32_t recirc_id; /* Recirculation id carried with the + recirculating packets. 
0 for packets + received from the wire. */ + uint32_t dp_hash; /* hash value computed by the recirculation + action. */ struct flow_tnl tunnel; /* Encapsulating tunnel parameters. */ uint32_t skb_priority; /* Packet priority for QoS. */ uint32_t pkt_mark; /* Packet mark. */ @@ -40,12 +45,13 @@ struct pkt_metadata { }; #define PKT_METADATA_INITIALIZER(PORT) \ - (struct pkt_metadata){ { 0, 0, 0, 0, 0, 0}, 0, 0, {(PORT)} } + (struct pkt_metadata){ 0, 0, { 0, 0, 0, 0, 0, 0}, 0, 0, {(PORT)} } -void pkt_metadata_init(struct pkt_metadata *md, const struct flow_tnl *tnl, - const uint32_t skb_priority, - const uint32_t pkt_mark, - const union flow_in_port *in_port); +void pkt_metadata_init(struct pkt_metadata *md, + const uint32_t recirc_id, const uint32_t dp_hash, + const struct flow_tnl *tnl, const uint32_t skb_priority, + const uint32_t pkt_mark, + const union flow_in_port *in_port); void pkt_metadata_from_flow(struct pkt_metadata *md, const struct flow *flow); bool dpid_from_string(const char *s, uint64_t *dpidp); diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 252e6b4..4018129 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -3818,7 +3818,7 @@ parse_flow_and_packet(int argc, const char *argv[], /* Use the metadata from the flow and the packet argument * to reconstruct the flow. 
*/ - pkt_metadata_init(&md, NULL, flow->skb_priority, + pkt_metadata_init(&md, 0, 0, NULL, flow->skb_priority, flow->pkt_mark, &in_port); flow_extract(packet, &md, flow); diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index d57bf21..f020c7d 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -120,9 +120,66 @@ void ofproto_dpif_send_packet_in(struct ofproto_dpif *, int ofproto_dpif_send_packet(const struct ofport_dpif *, struct ofpbuf *); void ofproto_dpif_flow_mod(struct ofproto_dpif *, struct ofputil_flow_mod *); -uint32_t ofproto_dpif_alloc_recirc_id(struct ofproto *ofproto_); -void ofproto_dpif_free_recirc_id(struct ofproto *ofproto_, uint32_t recirc_id); - struct ofport_dpif *odp_port_to_ofport(const struct dpif_backer *, odp_port_t); +/* +Recirculation +============= + +Recirculation is a technique to allow a frame to re-enter the packet processing +path one or more times to achieve more flexible packet processing in the +data path. Examples are MPLS handling and selecting the slave port of a bond. + +Data path and user space interface +----------------------------------- +Two new fields, recirc_id and dp_hash, are added to the current flow data structure. +They are both of type uint32_t. In addition, a new action, RECIRC, is added. + +The value recirc_id is used to distinguish a packet from multiple iterations of +recirculation. A packet initially received is considered to have a recirc_id of 0. +recirc_id is managed by user space and is opaque to the data path. + +On the other hand, dp_hash can only be computed by the data path and is opaque to +user space. In fact, user space may not be able to recompute the hash value. +The dp_hash value should be wildcarded for a newly received packet. +The RECIRC action specifies whether the hash is computed and, if so, which +fields are included in the hash computation. The computed hash value is +stored into the dp_hash field prior to recirculation.
+ +The RECIRC action computes and sets the dp_hash field, sets the recirc_id field, +and then reprocesses the packet as if it were received on the same input port. +The RECIRC action works like a function call; actions listed after the RECIRC +action will be executed after it returns. RECIRC actions can be nested; +the data path implementation limits the number of recirculations executed +to prevent unreasonable nesting depth or infinite loops. + +Both flow fields and the RECIRC action are exposed to OpenFlow via +Nicira extensions. + +Post recirculation flow +------------------------ +At the OpenFlow level, post recirculation rules are always hidden from the +controller. They are installed in table 254, which is set up as a hidden table +at boot time. Those rules are managed by the local user space program only. + +To speed up the classifier lookup process, recirc_id is always reflected into +the metadata field, since recirc_id is required to be exactly matched. + +Classifier lookup always starts with table 254. A post recirculation flow +lookup should find its hidden rule within this table. On the other hand, a +newly received packet should miss all post recirculation rules because its +recirc_id is zero, and then hit a pre-installed lower priority rule that redirects +the classifier to start its lookup from table 0: + + * , actions=resubmit(,0) + +Post recirculation data path flows are managed like other data path flows. +They are created on demand. Miss handling, stats collection and revalidation +work the same way as for regular flows. +*/ + +uint32_t ofproto_dpif_alloc_recirc_id(struct ofproto *ofproto); +void ofproto_dpif_free_recirc_id(struct ofproto *ofproto, uint32_t recirc_id); + + #endif /* ofproto-dpif.h */ -- 1.7.9.5 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev