Add basic recirculation infrastructure and user space data path support for it. The following bond mega flow patch will make use of this infrastructure.
Signed-off-by: Andy Zhou <az...@nicira.com> --- v1->v2: Rewritten based on having post recirculation rules stored in table 254. v2->v3: Force recirc_id exact match only when data path does not provide a mask. Coding style improvements. --- include/linux/openvswitch.h | 29 ++++++++++++- lib/dpif-netdev.c | 20 ++++++++- lib/dpif.c | 3 +- lib/odp-execute.c | 9 ++++ lib/odp-execute.h | 2 +- lib/odp-util.c | 98 ++++++++++++++++++++++++++++++++++++++++--- lib/packets.c | 24 +++++++++++ lib/packets.h | 19 +++++++-- ofproto/ofproto-dpif.h | 58 +++++++++++++++++++++++++ 9 files changed, 250 insertions(+), 12 deletions(-) diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h index d1ff5ec..17dc5fb 100644 --- a/include/linux/openvswitch.h +++ b/include/linux/openvswitch.h @@ -307,7 +307,8 @@ enum ovs_key_attr { OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */ OVS_KEY_ATTR_SCTP, /* struct ovs_key_sctp */ OVS_KEY_ATTR_TCP_FLAGS, /* be16 TCP flags. */ - + OVS_KEY_ATTR_DP_HASH, /* u32 hash value */ + OVS_KEY_ATTR_RECIRC_ID, /* u32 recirc id */ #ifdef __KERNEL__ OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */ #endif @@ -530,6 +531,30 @@ struct ovs_action_push_vlan { __be16 vlan_tci; /* 802.1Q TCI (VLAN ID and priority). */ }; +/* Data path hash algorithm for computing Datapath hash. + * + * The algorithm type only specifies which fields of a flow + * will be used as part of the hash. Each datapath is free + * to use its own hash algorithm. The hash value will be + * opaque to the user space daemon. + */ +enum ovs_recirc_hash_alg { + OVS_RECIRC_HASH_ALG_NONE, + OVS_RECIRC_HASH_ALG_L4, +}; +/* + * struct ovs_action_recirc - %OVS_ACTION_ATTR_RECIRC action argument. + * @recirc_id: The recirculation label; zero is invalid. + * @hash_alg: Algorithm used to compute hash prior to recirculation. + * @hash_bias: Bias used when computing the hash prior to recirculation. 
+ */ +struct ovs_action_recirc { + uint8_t hash_alg; /* One of enum ovs_recirc_hash_alg. */ + uint8_t pad[3]; /* Always zero. */ + uint32_t hash_bias; + uint32_t recirc_id; /* Recirculation label. */ +}; + /** * enum ovs_action_attr - Action types. * @@ -553,6 +578,7 @@ struct ovs_action_push_vlan { * indicate the new packet contents. This could potentially still be * %ETH_P_MPLS if the resulting MPLS label stack is not empty. If there * is no MPLS label stack, as determined by ethertype, no action is taken. + * @OVS_ACTION_ATTR_RECIRC: Recirculate within the data path. * * Only a single header can be set with a single %OVS_ACTION_ATTR_SET. Not all * fields within a header are modifiable, e.g. the IPv4 protocol and fragment @@ -569,6 +595,7 @@ enum ovs_action_attr { OVS_ACTION_ATTR_SAMPLE, /* Nested OVS_SAMPLE_ATTR_*. */ OVS_ACTION_ATTR_PUSH_MPLS, /* struct ovs_action_push_mpls. */ OVS_ACTION_ATTR_POP_MPLS, /* __be16 ethertype. */ + OVS_ACTION_ATTR_RECIRC, /* struct ovs_action_recirc. */ __OVS_ACTION_ATTR_MAX }; diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 3d0c09f..baa2a8f 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1937,7 +1937,7 @@ struct dp_netdev_execute_aux { static void dp_execute_cb(void *aux_, struct ofpbuf *packet, - const struct pkt_metadata *md OVS_UNUSED, + struct pkt_metadata *md, const struct nlattr *a, bool may_steal) OVS_NO_THREAD_SAFETY_ANALYSIS { @@ -1972,6 +1972,24 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet, } break; } + + case OVS_ACTION_ATTR_RECIRC: { + const struct ovs_action_recirc *act; + act = nl_attr_get(a); + md->recirc_id =act->recirc_id; + md->dp_hash = 0; + + if (act->hash_alg == OVS_RECIRC_HASH_ALG_L4) { + struct flow flow; + + flow_extract(packet, md, &flow); + md->dp_hash = flow_hash_symmetric_l4(&flow, act->hash_bias); + } + + dp_netdev_port_input(aux->dp, packet, md); + break; + } + case OVS_ACTION_ATTR_PUSH_VLAN: case OVS_ACTION_ATTR_POP_VLAN: case OVS_ACTION_ATTR_PUSH_MPLS: diff --git a/lib/dpif.c 
b/lib/dpif.c index b33d13e..dcda0b6 100644 --- a/lib/dpif.c +++ b/lib/dpif.c @@ -1108,7 +1108,7 @@ struct dpif_execute_helper_aux { * meaningful. */ static void dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet, - const struct pkt_metadata *md, + struct pkt_metadata *md, const struct nlattr *action, bool may_steal OVS_UNUSED) { struct dpif_execute_helper_aux *aux = aux_; @@ -1133,6 +1133,7 @@ dpif_execute_helper_cb(void *aux_, struct ofpbuf *packet, case OVS_ACTION_ATTR_SET: case OVS_ACTION_ATTR_SAMPLE: case OVS_ACTION_ATTR_UNSPEC: + case OVS_ACTION_ATTR_RECIRC: case __OVS_ACTION_ATTR_MAX: OVS_NOT_REACHED(); } diff --git a/lib/odp-execute.c b/lib/odp-execute.c index 096c113..cde5bf4 100644 --- a/lib/odp-execute.c +++ b/lib/odp-execute.c @@ -125,6 +125,14 @@ odp_execute_set_action(struct ofpbuf *packet, const struct nlattr *a, set_arp(packet, nl_attr_get_unspec(a, sizeof(struct ovs_key_arp))); break; + case OVS_KEY_ATTR_DP_HASH: + md->dp_hash = nl_attr_get_u32(a); + break; + + case OVS_KEY_ATTR_RECIRC_ID: + md->recirc_id = nl_attr_get_u32(a); + break; + case OVS_KEY_ATTR_UNSPEC: case OVS_KEY_ATTR_ENCAP: case OVS_KEY_ATTR_ETHERTYPE: @@ -195,6 +203,7 @@ odp_execute_actions__(void *dp, struct ofpbuf *packet, struct pkt_metadata *md, /* These only make sense in the context of a datapath. */ case OVS_ACTION_ATTR_OUTPUT: case OVS_ACTION_ATTR_USERSPACE: + case OVS_ACTION_ATTR_RECIRC: if (dp_execute_action) { /* Allow 'dp_execute_action' to steal the packet data if we do * not need it any more. 
*/ diff --git a/lib/odp-execute.h b/lib/odp-execute.h index 670e8ea..18f50c3 100644 --- a/lib/odp-execute.h +++ b/lib/odp-execute.h @@ -28,7 +28,7 @@ struct ofpbuf; struct pkt_metadata; typedef void (*odp_execute_cb)(void *dp, struct ofpbuf *packet, - const struct pkt_metadata *, + struct pkt_metadata *, const struct nlattr *action, bool may_steal); /* Actions that need to be executed in the context of a datapath are handed diff --git a/lib/odp-util.c b/lib/odp-util.c index 7c6aad4..956fef1 100644 --- a/lib/odp-util.c +++ b/lib/odp-util.c @@ -79,6 +79,7 @@ odp_action_len(uint16_t type) case OVS_ACTION_ATTR_POP_VLAN: return 0; case OVS_ACTION_ATTR_PUSH_MPLS: return sizeof(struct ovs_action_push_mpls); case OVS_ACTION_ATTR_POP_MPLS: return sizeof(ovs_be16); + case OVS_ACTION_ATTR_RECIRC: return sizeof(struct ovs_action_recirc); case OVS_ACTION_ATTR_SET: return -2; case OVS_ACTION_ATTR_SAMPLE: return -2; @@ -118,6 +119,8 @@ ovs_key_attr_to_string(enum ovs_key_attr attr, char *namebuf, size_t bufsize) case OVS_KEY_ATTR_ARP: return "arp"; case OVS_KEY_ATTR_ND: return "nd"; case OVS_KEY_ATTR_MPLS: return "mpls"; + case OVS_KEY_ATTR_DP_HASH: return "dp_hash"; + case OVS_KEY_ATTR_RECIRC_ID: return "recirc_id"; case __OVS_KEY_ATTR_MAX: default: @@ -384,6 +387,19 @@ format_mpls(struct ds *ds, const struct ovs_key_mpls *mpls_key, } static void +format_odp_recirc_action(struct ds *ds, + const struct ovs_action_recirc *act) +{ + ds_put_format(ds, "recirc("); + + if (act->hash_alg == OVS_RECIRC_HASH_ALG_L4) { + ds_put_format(ds, "hash_l4(%"PRIu32"), ", act->hash_bias); + } + + ds_put_format(ds, "%"PRIu32")", act->recirc_id); +} + +static void format_odp_action(struct ds *ds, const struct nlattr *a) { int expected_len; @@ -405,6 +421,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a) case OVS_ACTION_ATTR_USERSPACE: format_odp_userspace_action(ds, a); break; + case OVS_ACTION_ATTR_RECIRC: + format_odp_recirc_action(ds, nl_attr_get(a)); + break; case OVS_ACTION_ATTR_SET: 
ds_put_cstr(ds, "set("); format_odp_key_attr(nl_attr_get(a), NULL, NULL, ds, true); @@ -730,6 +749,8 @@ odp_flow_key_attr_len(uint16_t type) case OVS_KEY_ATTR_ENCAP: return -2; case OVS_KEY_ATTR_PRIORITY: return 4; case OVS_KEY_ATTR_SKB_MARK: return 4; + case OVS_KEY_ATTR_DP_HASH: return 4; + case OVS_KEY_ATTR_RECIRC_ID: return 4; case OVS_KEY_ATTR_TUNNEL: return -2; case OVS_KEY_ATTR_IN_PORT: return 4; case OVS_KEY_ATTR_ETHERNET: return sizeof(struct ovs_key_ethernet); @@ -1025,6 +1046,8 @@ format_odp_key_attr(const struct nlattr *a, const struct nlattr *ma, case OVS_KEY_ATTR_PRIORITY: case OVS_KEY_ATTR_SKB_MARK: + case OVS_KEY_ATTR_DP_HASH: + case OVS_KEY_ATTR_RECIRC_ID: ds_put_format(ds, "%#"PRIx32, nl_attr_get_u32(a)); if (!is_exact) { ds_put_format(ds, "/%#"PRIx32, nl_attr_get_u32(ma)); @@ -1386,7 +1409,6 @@ format_odp_key_attr(const struct nlattr *a, const struct nlattr *ma, } break; } - case OVS_KEY_ATTR_UNSPEC: case __OVS_KEY_ATTR_MAX: default: @@ -1619,6 +1641,36 @@ parse_odp_key_mask_attr(const char *s, const struct simap *port_names, } { + uint32_t recirc_id; + int n = -1; + + if (ovs_scan(s, "recirc_id(%"SCNi32")%n", &recirc_id, &n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_RECIRC_ID, recirc_id); + nl_msg_put_u32(mask, OVS_KEY_ATTR_RECIRC_ID, UINT32_MAX); + return n; + } + } + + { + uint32_t dp_hash; + uint32_t dp_hash_mask; + int n = -1; + + if (mask && ovs_scan(s, "dp_hash(%"SCNi32"/%"SCNi32")%n", &dp_hash, + &dp_hash_mask, &n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_DP_HASH, dp_hash); + nl_msg_put_u32(mask, OVS_KEY_ATTR_DP_HASH, dp_hash_mask); + return n; + } else if (ovs_scan(s, "dp_hash(%"SCNi32")%n", &dp_hash, &n)) { + nl_msg_put_u32(key, OVS_KEY_ATTR_DP_HASH, dp_hash); + if (mask) { + nl_msg_put_u32(mask, OVS_KEY_ATTR_DP_HASH, UINT32_MAX); + } + return n; + } + } + + { uint64_t tun_id, tun_id_mask; struct flow_tnl tun_key, tun_key_mask; int n = -1; @@ -2438,6 +2490,14 @@ odp_flow_key_from_flow__(struct ofpbuf *buf, const struct flow *data, 
nl_msg_put_u32(buf, OVS_KEY_ATTR_SKB_MARK, data->pkt_mark); + if (flow->recirc_id) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_RECIRC_ID, data->recirc_id); + } + + if (flow->dp_hash) { + nl_msg_put_u32(buf, OVS_KEY_ATTR_DP_HASH, data->dp_hash); + } + /* Add an ingress port attribute if this is a mask or 'odp_in_port' * is not the magical value "ODPP_NONE". */ if (is_mask || odp_in_port != ODPP_NONE) { @@ -2673,13 +2733,24 @@ odp_key_to_pkt_metadata(const struct nlattr *key, size_t key_len, continue; } - if (type == OVS_KEY_ATTR_PRIORITY) { + switch (type) { + case OVS_KEY_ATTR_RECIRC_ID: + md->recirc_id = nl_attr_get_u32(nla); + wanted_attrs &= ~(1u << OVS_KEY_ATTR_RECIRC_ID); + break; + case OVS_KEY_ATTR_DP_HASH: + md->dp_hash = nl_attr_get_u32(nla); + wanted_attrs &= ~(1u << OVS_KEY_ATTR_DP_HASH); + break; + case OVS_KEY_ATTR_PRIORITY: md->skb_priority = nl_attr_get_u32(nla); wanted_attrs &= ~(1u << OVS_KEY_ATTR_PRIORITY); - } else if (type == OVS_KEY_ATTR_SKB_MARK) { + break; + case OVS_KEY_ATTR_SKB_MARK: md->pkt_mark = nl_attr_get_u32(nla); wanted_attrs &= ~(1u << OVS_KEY_ATTR_SKB_MARK); - } else if (type == OVS_KEY_ATTR_TUNNEL) { + break; + case OVS_KEY_ATTR_TUNNEL: { enum odp_key_fitness res; res = odp_tun_key_from_attr(nla, &md->tunnel); @@ -2688,9 +2759,14 @@ odp_key_to_pkt_metadata(const struct nlattr *key, size_t key_len, } else if (res == ODP_FIT_PERFECT) { wanted_attrs &= ~(1u << OVS_KEY_ATTR_TUNNEL); } - } else if (type == OVS_KEY_ATTR_IN_PORT) { + break; + } + case OVS_KEY_ATTR_IN_PORT: md->in_port.odp_port = nl_attr_get_odp_port(nla); wanted_attrs &= ~(1u << OVS_KEY_ATTR_IN_PORT); + break; + default: + break; } if (!wanted_attrs) { @@ -3226,6 +3302,18 @@ odp_flow_key_to_flow__(const struct nlattr *key, size_t key_len, expected_attrs = 0; /* Metadata. 
*/ + if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_RECIRC_ID)) { + flow->recirc_id = nl_attr_get_u32(attrs[OVS_KEY_ATTR_RECIRC_ID]); + expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_RECIRC_ID; + } else if (is_mask) { + /* Always exact match recirc_id when datapath does not specify it. */ + flow->recirc_id = UINT32_MAX; + } + + if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_DP_HASH)) { + flow->dp_hash = nl_attr_get_u32(attrs[OVS_KEY_ATTR_DP_HASH]); + expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_DP_HASH; + } if (present_attrs & (UINT64_C(1) << OVS_KEY_ATTR_PRIORITY)) { flow->skb_priority = nl_attr_get_u32(attrs[OVS_KEY_ATTR_PRIORITY]); expected_attrs |= UINT64_C(1) << OVS_KEY_ATTR_PRIORITY; diff --git a/lib/packets.c b/lib/packets.c index 65ba3f6..dce2e3b 100644 --- a/lib/packets.c +++ b/lib/packets.c @@ -976,3 +976,27 @@ packet_format_tcp_flags(struct ds *s, uint16_t tcp_flags) ds_put_cstr(s, "[800]"); } } + +void pkt_metadata_init(struct pkt_metadata *md, + const uint32_t recirc_id, const uint32_t dp_hash, + const struct flow_tnl *tnl, const uint32_t skb_priority, + const uint32_t pkt_mark, + const union flow_in_port *in_port) +{ + + tnl ? memcpy(&md->tunnel, tnl, sizeof(md->tunnel)) + : memset(&md->tunnel, 0, sizeof(md->tunnel)); + + md->recirc_id = recirc_id; + md->dp_hash = dp_hash; + md->skb_priority = skb_priority; + md->pkt_mark = pkt_mark; + md->in_port.odp_port = in_port ? in_port->odp_port : ODPP_NONE; +} + +void pkt_metadata_from_flow(struct pkt_metadata *md, const struct flow *flow) +{ + pkt_metadata_init(md, flow->recirc_id, flow->dp_hash, + &flow->tunnel, flow->skb_priority, + flow->pkt_mark, &flow->in_port); +} diff --git a/lib/packets.h b/lib/packets.h index 18a3b17..971d4f2 100644 --- a/lib/packets.h +++ b/lib/packets.h @@ -33,6 +33,11 @@ struct ds; /* Datapath packet metadata */ struct pkt_metadata { + uint32_t recirc_id; /* Recirculation id carried with the + recirculating packets. 0 for packets + received from the wire. 
*/ + uint32_t dp_hash; /* hash value computed by the recirculation + action. */ struct flow_tnl tunnel; /* Encapsulating tunnel parameters. */ uint32_t skb_priority; /* Packet priority for QoS. */ uint32_t pkt_mark; /* Packet mark. */ @@ -40,11 +45,19 @@ struct pkt_metadata { }; #define PKT_METADATA_INITIALIZER(PORT) \ - (struct pkt_metadata){ { 0, 0, 0, 0, 0, 0}, 0, 0, {(PORT)} } + (struct pkt_metadata){ 0, 0, { 0, 0, 0, 0, 0, 0}, 0, 0, {(PORT)} } #define PKT_METADATA_INITIALIZER_FLOW(FLOW) \ - (struct pkt_metadata){ (FLOW)->tunnel, (FLOW)->skb_priority, \ - (FLOW)->pkt_mark, (FLOW)->in_port } + (struct pkt_metadata){ (FLOW)->recirc_id, (FLOW)->dp_hash, (FLOW)->tunnel, \ + (FLOW)->skb_priority, (FLOW)->pkt_mark, (FLOW)->in_port } + +void pkt_metadata_init(struct pkt_metadata *md, + const uint32_t recirc_id, const uint32_t dp_hash, + const struct flow_tnl *tnl, const uint32_t skb_priority, + const uint32_t pkt_mark, + const union flow_in_port *in_port); + +void pkt_metadata_from_flow(struct pkt_metadata *md, const struct flow *flow); bool dpid_from_string(const char *s, uint64_t *dpidp); diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h index 6fbc672..93e6ec0 100644 --- a/ofproto/ofproto-dpif.h +++ b/ofproto/ofproto-dpif.h @@ -135,6 +135,64 @@ void ofproto_dpif_flow_mod(struct ofproto_dpif *, struct ofputil_flow_mod *); struct ofport_dpif *odp_port_to_ofport(const struct dpif_backer *, odp_port_t); +/* + * Recirculation + * ============= + * + * Recirculation is a technique to allow a frame to re-enter the packet processing + * path one or more times to achieve more flexible packet processing in the + * data path. Use cases include MPLS handling and selecting the slave port of a bond. + * + * Data path and user space interface + * ----------------------------------- + * + * Two new fields, recirc_id and dp_hash, are added to the current flow data structure. + * They are both of type uint32_t. In addition, a new action, RECIRC, is added. 
+ * + * The value recirc_id is used to distinguish a packet from multiple iterations of + * recirculation. A packet initially received is considered to have a recirc_id of 0. + * Recirc_id is managed by user space and is opaque to the data path. + * + * On the other hand, dp_hash can only be computed by the data path and is opaque to + * user space. In fact, user space may not be able to recompute the hash value. + * The dp_hash value should be wildcarded for a newly received packet. + * The RECIRC action specifies whether the hash is computed and, if so, which + * fields are included in the hash computation. The computed hash value is + * stored into the dp_hash field prior to recirculation. + * + * The RECIRC action computes and sets the dp_hash field, sets the recirc_id field + * and then reprocesses the packet as if it was received on the same input port. + * The RECIRC action works like a function call; actions listed after the RECIRC + * action will be executed after its execution. RECIRC actions can be nested; the + * data path implementation limits the number of recirculations executed + * to prevent unreasonable nesting depth or an infinite loop. + * + * Both flow fields and the RECIRC action are exposed as OpenFlow fields via + * Nicira extensions. + * + * Post recirculation flow + * ------------------------ + * + * At the OpenFlow level, post recirculation rules are always hidden from the + * controller. They are installed in table 254, which is set up as a hidden table + * at boot time. Those rules are managed by the local user space program only. + * + * To speed up the classifier lookup process, recirc_id is always reflected into + * the metadata field, since recirc_id is required to be exactly matched. + * + * Classifier lookup always starts with table 254. A post recirculation flow + * lookup should find its hidden rule within this table. 
On the other hand, a + * newly received packet should miss all post recirculation rules because its + * recirc_id is zero, and then hit a pre-installed lower priority rule that redirects + * the classifier to start its lookup from table 0: + * + * * , actions=resubmit(,0) + * + * Post recirculation data path flows are managed like other data path flows. + * They are created on demand. Miss handling, stats collection and revalidation + * work the same way as regular flows. + */ + uint32_t ofproto_dpif_alloc_recirc_id(struct ofproto_dpif *ofproto); void ofproto_dpif_free_recirc_id(struct ofproto_dpif *ofproto, uint32_t recirc_id); #endif /* ofproto-dpif.h */ -- 1.7.9.5 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev