I get test failures after this patch, seems to relate to slow path processing. Haven't looked at them, but I see failures on 656 and 657 come and go, so those are most likely unrelated test timing issues.
after 6/7: 632: ofproto-dpif - controller FAILED (ofproto-dpif.at:292) 633: ofproto-dpif - VLAN handling ok 634: ofproto-dpif - fragment handling ok 635: ofproto-dpif - exit ok 636: ofproto-dpif - mirroring, select_all ok 637: ofproto-dpif - mirroring, select_src ok 638: ofproto-dpif - mirroring, OFPP_NONE ingress port ok 639: ofproto-dpif - mirroring, select_dst ok 640: ofproto-dpif - mirroring, select_vlan ok 641: ofproto-dpif - mirroring, output_port ok 642: ofproto-dpif - mirroring, output_vlan ok 643: ofproto-dpif - ofproto/trace command 1 ok 644: ofproto-dpif - ofproto/trace command 2 ok 645: ofproto-dpif - MAC learning ok 646: ofproto-dpif - MAC table overflow ok 647: ofproto-dpif - sFlow packet sampling FAILED (ofproto-dpif.at:1586) 648: ofproto-dpif - NetFlow flow expiration ok 649: ofproto-dpif - NetFlow active expiration FAILED (ofproto-dpif.at:1952) 650: idle_age and hard_age increase over time ok 651: ofproto-dpif - fin_timeout FAILED (ofproto-dpif.at:2084) 652: ofproto-dpif - ovs-appctl dpif/dump-dps ok 653: ofproto-dpif - ovs-appctl dpif/show ok 654: ofproto-dpif - ovs-appctl dpif/dump-flows ok 655: ofproto-dpif - ovs-appctl dpif/del-flows ok 656: ofproto-dpif - patch ports FAILED (ofproto-dpif.at:2201) 657: ofproto-dpif - ovs-appctl dpif/show rates FAILED (ofproto-dpif.at:2250) after 7/7: 632: ofproto-dpif - controller FAILED (ofproto-dpif.at:292) 633: ofproto-dpif - VLAN handling ok 634: ofproto-dpif - fragment handling ok 635: ofproto-dpif - exit ok 636: ofproto-dpif - mirroring, select_all ok 637: ofproto-dpif - mirroring, select_src ok 638: ofproto-dpif - mirroring, OFPP_NONE ingress port ok 639: ofproto-dpif - mirroring, select_dst ok 640: ofproto-dpif - mirroring, select_vlan ok 641: ofproto-dpif - mirroring, output_port ok 642: ofproto-dpif - mirroring, output_vlan ok 643: ofproto-dpif - ofproto/trace command 1 ok 644: ofproto-dpif - ofproto/trace command 2 ok 645: ofproto-dpif - MAC learning ok 646: ofproto-dpif - MAC table overflow ok 647: ofproto-dpif - sFlow packet sampling FAILED (ofproto-dpif.at:1586) 648: ofproto-dpif - NetFlow flow expiration FAILED (ofproto-dpif.at:1867) 649: ofproto-dpif - NetFlow active expiration ok 650: idle_age and hard_age increase over time ok 651: ofproto-dpif - fin_timeout FAILED (ofproto-dpif.at:2084) 652: ofproto-dpif - ovs-appctl dpif/dump-dps ok 653: ofproto-dpif - ovs-appctl dpif/show ok 654: ofproto-dpif - ovs-appctl dpif/dump-flows ok 655: ofproto-dpif - ovs-appctl dpif/del-flows ok 656: ofproto-dpif - patch ports ok 657: ofproto-dpif - ovs-appctl dpif/show rates FAILED (ofproto-dpif.at:2250) Jarno On Sep 16, 2013, at 2:59 PM, Ben Pfaff <b...@nicira.com> wrote: > Until now, the code in ofproto-dpif-upcall (and the code that preceded it > in ofproto-dpif) obtained a batch of incoming packets, inserted them into > a hash table based on hashes of their flows, processed them, and then > forwarded them in hash order. Usually this maintains order within a single > network connection, but because OVS's notion of a flow is so fine-grained, > it can reorder packets within (e.g.) a TCP connection if two packets > handled in a single batch have (e.g.) different ECN values. > > This commit fixes the problem by making ofproto-dpif-upcall always forward > packets in the same order they were received. > > This is far from the minimal change necessary to avoid reordering packets. > I think that the code is easier to understand afterward. > > Reported-by: Dmitry Fleytman <dfley...@redhat.com> > Signed-off-by: Ben Pfaff <b...@nicira.com> > --- > ofproto/ofproto-dpif-upcall.c | 337 +++++++++++++++++++++-------------------- > ofproto/ofproto-dpif-upcall.h | 3 +- > 2 files changed, 176 insertions(+), 164 deletions(-) > > diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c > index b9d91b2..e13a5f8 100644 > --- a/ofproto/ofproto-dpif-upcall.c > +++ b/ofproto/ofproto-dpif-upcall.c > @@ -416,12 +416,6 @@ udpif_miss_handler(void *arg) > static void > miss_destroy(struct flow_miss *miss) > { > - struct upcall *upcall, *next; > - > - LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) { > - list_remove(&upcall->list_node); > - upcall_destroy(upcall); > - } > xlate_out_uninit(&miss->xout); > } > > @@ -598,105 +592,6 @@ flow_miss_find(struct hmap *todo, const struct > ofproto_dpif *ofproto, > return NULL; > } > > -/* Executes flow miss 'miss'. May add any required datapath operations > - * to 'ops', incrementing '*n_ops' for each new op. */ > -static void > -execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops) > -{ > - struct ofproto_dpif *ofproto = miss->ofproto; > - struct upcall *upcall; > - struct flow_wildcards wc; > - struct rule_dpif *rule; > - struct xlate_in xin; > - > - memset(&miss->stats, 0, sizeof miss->stats); > - miss->stats.used = time_msec(); > - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { > - struct ofpbuf *packet = upcall->dpif_upcall.packet; > - > - miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow); > - miss->stats.n_bytes += packet->size; > - miss->stats.n_packets++; > - } > - > - flow_wildcards_init_catchall(&wc); > - rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule); > - rule_dpif_credit_stats(rule, &miss->stats); > - xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags, > - NULL); > - xin.may_learn = true; > - xin.resubmit_stats = &miss->stats; > - xlate_actions(&xin, &miss->xout); > - flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); > - > - if (rule_dpif_fail_open(rule)) { > - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { > - struct ofpbuf *packet = upcall->dpif_upcall.packet; > - struct ofputil_packet_in *pin; > - > - /* Extra-special case for fail-open mode. > - * > - * We are in fail-open mode and the packet matched the fail-open > - * rule, but we are connected to a controller too. We should > send > - * the packet up to the controller in the hope that it will try > to > - * set up a flow and thereby allow us to exit fail-open. > - * > - * See the top-level comment in fail-open.c for more > information. */ > - pin = xmalloc(sizeof(*pin)); > - pin->packet = xmemdup(packet->data, packet->size); > - pin->packet_len = packet->size; > - pin->reason = OFPR_NO_MATCH; > - pin->controller_id = 0; > - pin->table_id = 0; > - pin->cookie = 0; > - pin->send_len = 0; /* Not used for flow table misses. */ > - flow_get_metadata(&miss->flow, &pin->fmd); > - ofproto_dpif_send_packet_in(ofproto, pin); > - } > - } > - > - if (miss->xout.slow) { > - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { > - struct ofpbuf *packet = upcall->dpif_upcall.packet; > - struct xlate_in xin; > - > - xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); > - xlate_actions_for_side_effects(&xin); > - } > - } > - rule_dpif_unref(rule); > - > - if (miss->xout.odp_actions.size) { > - LIST_FOR_EACH (upcall, list_node, &miss->upcalls) { > - struct ofpbuf *packet = upcall->dpif_upcall.packet; > - struct dpif_op *op = &ops[*n_ops]; > - struct dpif_execute *execute = &op->u.execute; > - > - if (miss->flow.in_port.ofp_port > - != vsp_realdev_to_vlandev(miss->ofproto, > - miss->flow.in_port.ofp_port, > - miss->flow.vlan_tci)) { > - /* This packet was received on a VLAN splinter port. We > - * added a VLAN to the packet to make the packet resemble > - * the flow, but the actions were composed assuming that > - * the packet contained no VLAN. So, we must remove the > - * VLAN header from the packet before trying to execute the > - * actions. */ > - eth_pop_vlan(packet); > - } > - > - op->type = DPIF_OP_EXECUTE; > - execute->key = miss->key; > - execute->key_len = miss->key_len; > - execute->packet = packet; > - execute->actions = miss->xout.odp_actions.data; > - execute->actions_len = miss->xout.odp_actions.size; > - > - (*n_ops)++; > - } > - } > -} > - > static void > handle_miss_upcalls(struct udpif *udpif, struct list *upcalls) > { > @@ -707,92 +602,184 @@ handle_miss_upcalls(struct udpif *udpif, struct list > *upcalls) > size_t n_upcalls, n_ops, i; > struct flow_miss *miss; > unsigned int reval_seq; > + bool fail_open; > > - /* Construct the to-do list. > + /* Extract the flow from each upcall. Construct in fmb->misses a hash > + * table that maps each unique flow to a 'struct flow_miss'. > + * > + * Most commonly there is a single packet per flow_miss, but there are > + * several reasons why there might be more than one, e.g.: > * > - * This just amounts to extracting the flow from each packet and sticking > - * the packets that have the same flow in the same "flow_miss" structure > so > - * that we can process them together. */ > + * - The dpif packet interface does not support TSO (or UFO, etc.), so > a > + * large packet sent to userspace is split into a sequence of smaller > + * ones. > + * > + * - A stream of quickly arriving packets in an established > "slow-pathed" > + * flow. > + * > + * - Rarely, a stream of quickly arriving packets in a flow not yet > + * established. (This is rare because most protocols do not send > + * multiple back-to-back packets before receiving a reply from the > + * other end of the connection, which gives OVS a chance to set up a > + * datapath flow.) > + */ > fmb = xmalloc(sizeof *fmb); > atomic_read(&udpif->reval_seq, &fmb->reval_seq); > hmap_init(&fmb->misses); > n_upcalls = 0; > LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { > struct dpif_upcall *dupcall = &upcall->dpif_upcall; > + struct ofpbuf *packet = dupcall->packet; > struct flow_miss *miss = &fmb->miss_buf[n_upcalls]; > struct flow_miss *existing_miss; > struct ofproto_dpif *ofproto; > odp_port_t odp_in_port; > struct flow flow; > - uint32_t hash; > int error; > > - error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key, > + error = xlate_receive(udpif->backer, packet, dupcall->key, > dupcall->key_len, &flow, &miss->key_fitness, > &ofproto, &odp_in_port); > > - if (error == ENODEV) { > - struct drop_key *drop_key; > - > - /* Received packet on datapath port for which we couldn't > - * associate an ofproto. This can happen if a port is removed > - * while traffic is being received. Print a rate-limited message > - * in case it happens frequently. Install a drop flow so > - * that future packets of the flow are inexpensively dropped > - * in the kernel. */ > - VLOG_INFO_RL(&rl, "received packet on unassociated datapath port > " > - "%"PRIu32, odp_in_port); > - > - drop_key = xmalloc(sizeof *drop_key); > - drop_key->key = xmemdup(dupcall->key, dupcall->key_len); > - drop_key->key_len = dupcall->key_len; > - > - if (guarded_list_push_back(&udpif->drop_keys, > &drop_key->list_node, > - MAX_QUEUE_LENGTH)) { > - seq_change(udpif->wait_seq); > + if (!error) { > + uint32_t hash; > + > + flow_extract(packet, flow.skb_priority, flow.pkt_mark, > + &flow.tunnel, &flow.in_port, &miss->flow); > + > + hash = flow_hash(&miss->flow, 0); > + existing_miss = flow_miss_find(&fmb->misses, ofproto, > &miss->flow, > + hash); > + if (!existing_miss) { > + hmap_insert(&fmb->misses, &miss->hmap_node, hash); > + miss->ofproto = ofproto; > + miss->key = dupcall->key; > + miss->key_len = dupcall->key_len; > + miss->upcall_type = dupcall->type; > + miss->stats.n_packets = 0; > + miss->stats.n_bytes = 0; > + miss->stats.used = time_msec(); > + miss->stats.tcp_flags = 0; > + > + n_upcalls++; > } else { > - COVERAGE_INC(drop_queue_overflow); > - drop_key_destroy(drop_key); > + miss = existing_miss; > } > - continue; > - } else if (error) { > - continue; > - } > + miss->stats.tcp_flags |= packet_get_tcp_flags(packet, > &miss->flow); > + miss->stats.n_bytes += packet->size; > + miss->stats.n_packets++; > > - flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark, > - &flow.tunnel, &flow.in_port, &miss->flow); > - > - /* Add other packets to a to-do list. */ > - hash = flow_hash(&miss->flow, 0); > - existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, > hash); > - if (!existing_miss) { > - hmap_insert(&fmb->misses, &miss->hmap_node, hash); > - miss->ofproto = ofproto; > - miss->key = dupcall->key; > - miss->key_len = dupcall->key_len; > - miss->upcall_type = dupcall->type; > - list_init(&miss->upcalls); > - > - n_upcalls++; > + upcall->flow_miss = miss; > } else { > - miss = existing_miss; > + if (error == ENODEV) { > + struct drop_key *drop_key; > + > + /* Received packet on datapath port for which we couldn't > + * associate an ofproto. This can happen if a port is > removed > + * while traffic is being received. Print a rate-limited > + * message in case it happens frequently. Install a drop > flow > + * so that future packets of the flow are inexpensively > dropped > + * in the kernel. */ > + VLOG_INFO_RL(&rl, "received packet on unassociated datapath " > + "port %"PRIu32, odp_in_port); > + > + drop_key = xmalloc(sizeof *drop_key); > + drop_key->key = xmemdup(dupcall->key, dupcall->key_len); > + drop_key->key_len = dupcall->key_len; > + > + if (guarded_list_push_back(&udpif->drop_keys, > + &drop_key->list_node, > + MAX_QUEUE_LENGTH)) { > + seq_change(udpif->wait_seq); > + } else { > + COVERAGE_INC(drop_queue_overflow); > + drop_key_destroy(drop_key); > + } > + } > + list_remove(&upcall->list_node); > + upcall_destroy(upcall); > } > - list_remove(&upcall->list_node); > - list_push_back(&miss->upcalls, &upcall->list_node); > } > > - LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) { > - list_remove(&upcall->list_node); > - upcall_destroy(upcall); > + /* Initialize each 'struct flow_miss's ->xout. > + * > + * We do this per-flow_miss rather than per-packet because, most > commonly, > + * all the packets in a flow can use the same translation. > + * > + * We can't do this in the previous loop because we need the TCP flags > for > + * all the packets in each miss. */ > + fail_open = false; > + HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { > + struct flow_wildcards wc; > + struct rule_dpif *rule; > + struct xlate_in xin; > + > + flow_wildcards_init_catchall(&wc); > + rule_dpif_lookup(miss->ofproto, &miss->flow, &wc, &rule); > + if (rule_dpif_fail_open(rule)) { > + fail_open = true; > + } > + rule_dpif_credit_stats(rule, &miss->stats); > + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, > + miss->stats.tcp_flags, NULL); > + xin.may_learn = true; > + xin.resubmit_stats = &miss->stats; > + xlate_actions(&xin, &miss->xout); > + flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc); > + rule_dpif_unref(rule); > } > > - /* Process each element in the to-do list, constructing the set of > - * operations to batch. */ > + /* Now handle each packet individually: > + * > + * - In the common case, the packet is a member of a flow that doesn't > + * need per-packet translation. We already did the translation in > the > + * previous loop, so reuse it. > + * > + * - Otherwise, we need to do another translation just for this > + * packet. > + * > + * The loop fills 'ops' with an array of operations to execute in the > + * datapath. */ > n_ops = 0; > - HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) { > - execute_flow_miss(miss, ops, &n_ops); > + LIST_FOR_EACH (upcall, list_node, upcalls) { > + struct flow_miss *miss = upcall->flow_miss; > + struct ofpbuf *packet = upcall->dpif_upcall.packet; > + > + if (miss->xout.slow) { > + struct rule_dpif *rule; > + struct xlate_in xin; > + > + rule_dpif_lookup(miss->ofproto, &miss->flow, NULL, &rule); > + xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet); > + xlate_actions_for_side_effects(&xin); > + rule_dpif_unref(rule); > + } > + > + if (miss->xout.odp_actions.size) { > + struct dpif_op *op; > + > + if (miss->flow.in_port.ofp_port > + != vsp_realdev_to_vlandev(miss->ofproto, > + miss->flow.in_port.ofp_port, > + miss->flow.vlan_tci)) { > + /* This packet was received on a VLAN splinter port. We > + * added a VLAN to the packet to make the packet resemble > + * the flow, but the actions were composed assuming that > + * the packet contained no VLAN. So, we must remove the > + * VLAN header from the packet before trying to execute the > + * actions. */ > + eth_pop_vlan(packet); > + } > + > + op = &ops[n_ops++]; > + op->type = DPIF_OP_EXECUTE; > + op->u.execute.key = miss->key; > + op->u.execute.key_len = miss->key_len; > + op->u.execute.packet = packet; > + op->u.execute.actions = miss->xout.odp_actions.data; > + op->u.execute.actions_len = miss->xout.odp_actions.size; > + } > } > - ovs_assert(n_ops <= ARRAY_SIZE(ops)); > > /* Execute batch. */ > for (i = 0; i < n_ops; i++) { > @@ -800,6 +787,32 @@ handle_miss_upcalls(struct udpif *udpif, struct list > *upcalls) > } > dpif_operate(udpif->dpif, opsp, n_ops); > > + /* Special case for fail-open mode. > + * > + * If we are in fail-open mode, but we are connected to a controller too, > + * then we should send the packet up to the controller in the hope that > it > + * will try to set up a flow and thereby allow us to exit fail-open. > + * > + * See the top-level comment in fail-open.c for more information. */ > + if (fail_open) { > + LIST_FOR_EACH (upcall, list_node, upcalls) { > + struct flow_miss *miss = upcall->flow_miss; > + struct ofpbuf *packet = upcall->dpif_upcall.packet; > + struct ofputil_packet_in *pin; > + > + pin = xmalloc(sizeof *pin); > + pin->packet = xmemdup(packet->data, packet->size); > + pin->packet_len = packet->size; > + pin->reason = OFPR_NO_MATCH; > + pin->controller_id = 0; > + pin->table_id = 0; > + pin->cookie = 0; > + pin->send_len = 0; /* Not used for flow table misses. */ > + flow_get_metadata(&miss->flow, &pin->fmd); > + ofproto_dpif_send_packet_in(miss->ofproto, pin); > + } > + } > + > atomic_read(&udpif->reval_seq, &reval_seq); > if (reval_seq != fmb->reval_seq) { > COVERAGE_INC(fmb_queue_revalidated); > diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h > index a23f7a0..9bd19ad 100644 > --- a/ofproto/ofproto-dpif-upcall.h > +++ b/ofproto/ofproto-dpif-upcall.h > @@ -59,6 +59,7 @@ enum upcall_type { > /* An upcall. */ > struct upcall { > struct list list_node; /* For queuing upcalls. */ > + struct flow_miss *flow_miss; /* This upcall's flow_miss. */ > > enum upcall_type type; /* Classification. */ > > @@ -94,8 +95,6 @@ struct flow_miss { > struct dpif_flow_stats stats; > > struct xlate_out xout; > - > - struct list upcalls; /* Contains "struct upcall"s. */ > }; > > struct flow_miss_batch { > -- > 1.7.10.4 > > _______________________________________________ > dev mailing list > dev@openvswitch.org > http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev