I get test failures after this patch; they seem to relate to slow-path processing.
I haven't looked into them yet, but I see the failures on 656 and 657 come and go,
so those are most likely unrelated test timing issues.

after 6/7:

632: ofproto-dpif - controller                       FAILED 
(ofproto-dpif.at:292)
633: ofproto-dpif - VLAN handling                    ok
634: ofproto-dpif - fragment handling                ok
635: ofproto-dpif - exit                             ok
636: ofproto-dpif - mirroring, select_all            ok
637: ofproto-dpif - mirroring, select_src            ok
638: ofproto-dpif - mirroring, OFPP_NONE ingress port ok
639: ofproto-dpif - mirroring, select_dst            ok
640: ofproto-dpif - mirroring, select_vlan           ok
641: ofproto-dpif - mirroring, output_port           ok
642: ofproto-dpif - mirroring, output_vlan           ok
643: ofproto-dpif - ofproto/trace command 1          ok
644: ofproto-dpif - ofproto/trace command 2          ok
645: ofproto-dpif - MAC learning                     ok
646: ofproto-dpif - MAC table overflow               ok
647: ofproto-dpif - sFlow packet sampling            FAILED 
(ofproto-dpif.at:1586)
648: ofproto-dpif - NetFlow flow expiration          ok
649: ofproto-dpif - NetFlow active expiration        FAILED 
(ofproto-dpif.at:1952)
650: idle_age and hard_age increase over time        ok
651: ofproto-dpif - fin_timeout                      FAILED 
(ofproto-dpif.at:2084)
652: ofproto-dpif - ovs-appctl dpif/dump-dps         ok
653: ofproto-dpif - ovs-appctl dpif/show             ok
654: ofproto-dpif - ovs-appctl dpif/dump-flows       ok
655: ofproto-dpif - ovs-appctl dpif/del-flows        ok
656: ofproto-dpif - patch ports                      FAILED 
(ofproto-dpif.at:2201)
657: ofproto-dpif - ovs-appctl dpif/show rates       FAILED 
(ofproto-dpif.at:2250)

after 7/7:

632: ofproto-dpif - controller                       FAILED 
(ofproto-dpif.at:292)
633: ofproto-dpif - VLAN handling                    ok
634: ofproto-dpif - fragment handling                ok
635: ofproto-dpif - exit                             ok
636: ofproto-dpif - mirroring, select_all            ok
637: ofproto-dpif - mirroring, select_src            ok
638: ofproto-dpif - mirroring, OFPP_NONE ingress port ok
639: ofproto-dpif - mirroring, select_dst            ok
640: ofproto-dpif - mirroring, select_vlan           ok
641: ofproto-dpif - mirroring, output_port           ok
642: ofproto-dpif - mirroring, output_vlan           ok
643: ofproto-dpif - ofproto/trace command 1          ok
644: ofproto-dpif - ofproto/trace command 2          ok
645: ofproto-dpif - MAC learning                     ok
646: ofproto-dpif - MAC table overflow               ok
647: ofproto-dpif - sFlow packet sampling            FAILED 
(ofproto-dpif.at:1586)
648: ofproto-dpif - NetFlow flow expiration          FAILED 
(ofproto-dpif.at:1867)
649: ofproto-dpif - NetFlow active expiration        ok
650: idle_age and hard_age increase over time        ok
651: ofproto-dpif - fin_timeout                      FAILED 
(ofproto-dpif.at:2084)
652: ofproto-dpif - ovs-appctl dpif/dump-dps         ok
653: ofproto-dpif - ovs-appctl dpif/show             ok
654: ofproto-dpif - ovs-appctl dpif/dump-flows       ok
655: ofproto-dpif - ovs-appctl dpif/del-flows        ok
656: ofproto-dpif - patch ports                      ok
657: ofproto-dpif - ovs-appctl dpif/show rates       FAILED 
(ofproto-dpif.at:2250)

  Jarno

On Sep 16, 2013, at 2:59 PM, Ben Pfaff <b...@nicira.com> wrote:

> Until now, the code in ofproto-dpif-upcall (and the code that preceded it
> in ofproto-dpif) obtained a batch of incoming packets, inserted them into
> a hash table based on hashes of their flows, processed them, and then
> forwarded them in hash order.  Usually this maintains order within a single
> network connection, but because OVS's notion of a flow is so fine-grained,
> it can reorder packets within (e.g.) a TCP connection if two packets
> handled in a single batch have (e.g.) different ECN values.
> 
> This commit fixes the problem by making ofproto-dpif-upcall always forward
> packets in the same order they were received.
> 
> This is far from the minimal change necessary to avoid reordering packets.
> I think that the code is easier to understand afterward.
> 
> Reported-by: Dmitry Fleytman <dfley...@redhat.com>
> Signed-off-by: Ben Pfaff <b...@nicira.com>
> ---
> ofproto/ofproto-dpif-upcall.c |  337 +++++++++++++++++++++--------------------
> ofproto/ofproto-dpif-upcall.h |    3 +-
> 2 files changed, 176 insertions(+), 164 deletions(-)
> 
> diff --git a/ofproto/ofproto-dpif-upcall.c b/ofproto/ofproto-dpif-upcall.c
> index b9d91b2..e13a5f8 100644
> --- a/ofproto/ofproto-dpif-upcall.c
> +++ b/ofproto/ofproto-dpif-upcall.c
> @@ -416,12 +416,6 @@ udpif_miss_handler(void *arg)
> static void
> miss_destroy(struct flow_miss *miss)
> {
> -    struct upcall *upcall, *next;
> -
> -    LIST_FOR_EACH_SAFE (upcall, next, list_node, &miss->upcalls) {
> -        list_remove(&upcall->list_node);
> -        upcall_destroy(upcall);
> -    }
>     xlate_out_uninit(&miss->xout);
> }
> 
> @@ -598,105 +592,6 @@ flow_miss_find(struct hmap *todo, const struct 
> ofproto_dpif *ofproto,
>     return NULL;
> }
> 
> -/* Executes flow miss 'miss'.  May add any required datapath operations
> - * to 'ops', incrementing '*n_ops' for each new op. */
> -static void
> -execute_flow_miss(struct flow_miss *miss, struct dpif_op *ops, size_t *n_ops)
> -{
> -    struct ofproto_dpif *ofproto = miss->ofproto;
> -    struct upcall *upcall;
> -    struct flow_wildcards wc;
> -    struct rule_dpif *rule;
> -    struct xlate_in xin;
> -
> -    memset(&miss->stats, 0, sizeof miss->stats);
> -    miss->stats.used = time_msec();
> -    LIST_FOR_EACH (upcall, list_node, &miss->upcalls) {
> -        struct ofpbuf *packet = upcall->dpif_upcall.packet;
> -
> -        miss->stats.tcp_flags |= packet_get_tcp_flags(packet, &miss->flow);
> -        miss->stats.n_bytes += packet->size;
> -        miss->stats.n_packets++;
> -    }
> -
> -    flow_wildcards_init_catchall(&wc);
> -    rule_dpif_lookup(ofproto, &miss->flow, &wc, &rule);
> -    rule_dpif_credit_stats(rule, &miss->stats);
> -    xlate_in_init(&xin, ofproto, &miss->flow, rule, miss->stats.tcp_flags,
> -                  NULL);
> -    xin.may_learn = true;
> -    xin.resubmit_stats = &miss->stats;
> -    xlate_actions(&xin, &miss->xout);
> -    flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc);
> -
> -    if (rule_dpif_fail_open(rule)) {
> -        LIST_FOR_EACH (upcall, list_node, &miss->upcalls) {
> -            struct ofpbuf *packet = upcall->dpif_upcall.packet;
> -            struct ofputil_packet_in *pin;
> -
> -            /* Extra-special case for fail-open mode.
> -             *
> -             * We are in fail-open mode and the packet matched the fail-open
> -             * rule, but we are connected to a controller too.  We should 
> send
> -             * the packet up to the controller in the hope that it will try 
> to
> -             * set up a flow and thereby allow us to exit fail-open.
> -             *
> -             * See the top-level comment in fail-open.c for more 
> information. */
> -            pin = xmalloc(sizeof(*pin));
> -            pin->packet = xmemdup(packet->data, packet->size);
> -            pin->packet_len = packet->size;
> -            pin->reason = OFPR_NO_MATCH;
> -            pin->controller_id = 0;
> -            pin->table_id = 0;
> -            pin->cookie = 0;
> -            pin->send_len = 0; /* Not used for flow table misses. */
> -            flow_get_metadata(&miss->flow, &pin->fmd);
> -            ofproto_dpif_send_packet_in(ofproto, pin);
> -        }
> -    }
> -
> -    if (miss->xout.slow) {
> -        LIST_FOR_EACH (upcall, list_node, &miss->upcalls) {
> -            struct ofpbuf *packet = upcall->dpif_upcall.packet;
> -            struct xlate_in xin;
> -
> -            xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet);
> -            xlate_actions_for_side_effects(&xin);
> -        }
> -    }
> -    rule_dpif_unref(rule);
> -
> -    if (miss->xout.odp_actions.size) {
> -        LIST_FOR_EACH (upcall, list_node, &miss->upcalls) {
> -            struct ofpbuf *packet = upcall->dpif_upcall.packet;
> -            struct dpif_op *op = &ops[*n_ops];
> -            struct dpif_execute *execute = &op->u.execute;
> -
> -            if (miss->flow.in_port.ofp_port
> -                != vsp_realdev_to_vlandev(miss->ofproto,
> -                                          miss->flow.in_port.ofp_port,
> -                                          miss->flow.vlan_tci)) {
> -                /* This packet was received on a VLAN splinter port.  We
> -                 * added a VLAN to the packet to make the packet resemble
> -                 * the flow, but the actions were composed assuming that
> -                 * the packet contained no VLAN.  So, we must remove the
> -                 * VLAN header from the packet before trying to execute the
> -                 * actions. */
> -                eth_pop_vlan(packet);
> -            }
> -
> -            op->type = DPIF_OP_EXECUTE;
> -            execute->key = miss->key;
> -            execute->key_len = miss->key_len;
> -            execute->packet = packet;
> -            execute->actions = miss->xout.odp_actions.data;
> -            execute->actions_len = miss->xout.odp_actions.size;
> -
> -            (*n_ops)++;
> -        }
> -    }
> -}
> -
> static void
> handle_miss_upcalls(struct udpif *udpif, struct list *upcalls)
> {
> @@ -707,92 +602,184 @@ handle_miss_upcalls(struct udpif *udpif, struct list 
> *upcalls)
>     size_t n_upcalls, n_ops, i;
>     struct flow_miss *miss;
>     unsigned int reval_seq;
> +    bool fail_open;
> 
> -    /* Construct the to-do list.
> +    /* Extract the flow from each upcall.  Construct in fmb->misses a hash
> +     * table that maps each unique flow to a 'struct flow_miss'.
> +     *
> +     * Most commonly there is a single packet per flow_miss, but there are
> +     * several reasons why there might be more than one, e.g.:
>      *
> -     * This just amounts to extracting the flow from each packet and sticking
> -     * the packets that have the same flow in the same "flow_miss" structure 
> so
> -     * that we can process them together. */
> +     *   - The dpif packet interface does not support TSO (or UFO, etc.), so 
> a
> +     *     large packet sent to userspace is split into a sequence of smaller
> +     *     ones.
> +     *
> +     *   - A stream of quickly arriving packets in an established 
> "slow-pathed"
> +     *     flow.
> +     *
> +     *   - Rarely, a stream of quickly arriving packets in a flow not yet
> +     *     established.  (This is rare because most protocols do not send
> +     *     multiple back-to-back packets before receiving a reply from the
> +     *     other end of the connection, which gives OVS a chance to set up a
> +     *     datapath flow.)
> +     */
>     fmb = xmalloc(sizeof *fmb);
>     atomic_read(&udpif->reval_seq, &fmb->reval_seq);
>     hmap_init(&fmb->misses);
>     n_upcalls = 0;
>     LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
>         struct dpif_upcall *dupcall = &upcall->dpif_upcall;
> +        struct ofpbuf *packet = dupcall->packet;
>         struct flow_miss *miss = &fmb->miss_buf[n_upcalls];
>         struct flow_miss *existing_miss;
>         struct ofproto_dpif *ofproto;
>         odp_port_t odp_in_port;
>         struct flow flow;
> -        uint32_t hash;
>         int error;
> 
> -        error = xlate_receive(udpif->backer, dupcall->packet, dupcall->key,
> +        error = xlate_receive(udpif->backer, packet, dupcall->key,
>                               dupcall->key_len, &flow, &miss->key_fitness,
>                               &ofproto, &odp_in_port);
> 
> -        if (error == ENODEV) {
> -            struct drop_key *drop_key;
> -
> -            /* Received packet on datapath port for which we couldn't
> -             * associate an ofproto.  This can happen if a port is removed
> -             * while traffic is being received.  Print a rate-limited message
> -             * in case it happens frequently.  Install a drop flow so
> -             * that future packets of the flow are inexpensively dropped
> -             * in the kernel. */
> -            VLOG_INFO_RL(&rl, "received packet on unassociated datapath port 
> "
> -                              "%"PRIu32, odp_in_port);
> -
> -            drop_key = xmalloc(sizeof *drop_key);
> -            drop_key->key = xmemdup(dupcall->key, dupcall->key_len);
> -            drop_key->key_len = dupcall->key_len;
> -
> -            if (guarded_list_push_back(&udpif->drop_keys, 
> &drop_key->list_node,
> -                                       MAX_QUEUE_LENGTH)) {
> -                seq_change(udpif->wait_seq);
> +        if (!error) {
> +            uint32_t hash;
> +
> +            flow_extract(packet, flow.skb_priority, flow.pkt_mark,
> +                         &flow.tunnel, &flow.in_port, &miss->flow);
> +
> +            hash = flow_hash(&miss->flow, 0);
> +            existing_miss = flow_miss_find(&fmb->misses, ofproto, 
> &miss->flow,
> +                                           hash);
> +            if (!existing_miss) {
> +                hmap_insert(&fmb->misses, &miss->hmap_node, hash);
> +                miss->ofproto = ofproto;
> +                miss->key = dupcall->key;
> +                miss->key_len = dupcall->key_len;
> +                miss->upcall_type = dupcall->type;
> +                miss->stats.n_packets = 0;
> +                miss->stats.n_bytes = 0;
> +                miss->stats.used = time_msec();
> +                miss->stats.tcp_flags = 0;
> +
> +                n_upcalls++;
>             } else {
> -                COVERAGE_INC(drop_queue_overflow);
> -                drop_key_destroy(drop_key);
> +                miss = existing_miss;
>             }
> -            continue;
> -        } else if (error) {
> -            continue;
> -        }
> +            miss->stats.tcp_flags |= packet_get_tcp_flags(packet, 
> &miss->flow);
> +            miss->stats.n_bytes += packet->size;
> +            miss->stats.n_packets++;
> 
> -        flow_extract(dupcall->packet, flow.skb_priority, flow.pkt_mark,
> -                     &flow.tunnel, &flow.in_port, &miss->flow);
> -
> -        /* Add other packets to a to-do list. */
> -        hash = flow_hash(&miss->flow, 0);
> -        existing_miss = flow_miss_find(&fmb->misses, ofproto, &miss->flow, 
> hash);
> -        if (!existing_miss) {
> -            hmap_insert(&fmb->misses, &miss->hmap_node, hash);
> -            miss->ofproto = ofproto;
> -            miss->key = dupcall->key;
> -            miss->key_len = dupcall->key_len;
> -            miss->upcall_type = dupcall->type;
> -            list_init(&miss->upcalls);
> -
> -            n_upcalls++;
> +            upcall->flow_miss = miss;
>         } else {
> -            miss = existing_miss;
> +            if (error == ENODEV) {
> +                struct drop_key *drop_key;
> +
> +                /* Received packet on datapath port for which we couldn't
> +                 * associate an ofproto.  This can happen if a port is 
> removed
> +                 * while traffic is being received.  Print a rate-limited
> +                 * message in case it happens frequently.  Install a drop 
> flow
> +                 * so that future packets of the flow are inexpensively 
> dropped
> +                 * in the kernel. */
> +                VLOG_INFO_RL(&rl, "received packet on unassociated datapath "
> +                             "port %"PRIu32, odp_in_port);
> +
> +                drop_key = xmalloc(sizeof *drop_key);
> +                drop_key->key = xmemdup(dupcall->key, dupcall->key_len);
> +                drop_key->key_len = dupcall->key_len;
> +
> +                if (guarded_list_push_back(&udpif->drop_keys,
> +                                           &drop_key->list_node,
> +                                           MAX_QUEUE_LENGTH)) {
> +                    seq_change(udpif->wait_seq);
> +                } else {
> +                    COVERAGE_INC(drop_queue_overflow);
> +                    drop_key_destroy(drop_key);
> +                }
> +            }
> +            list_remove(&upcall->list_node);
> +            upcall_destroy(upcall);
>         }
> -        list_remove(&upcall->list_node);
> -        list_push_back(&miss->upcalls, &upcall->list_node);
>     }
> 
> -    LIST_FOR_EACH_SAFE (upcall, next, list_node, upcalls) {
> -        list_remove(&upcall->list_node);
> -        upcall_destroy(upcall);
> +    /* Initialize each 'struct flow_miss's ->xout.
> +     *
> +     * We do this per-flow_miss rather than per-packet because, most 
> commonly,
> +     * all the packets in a flow can use the same translation.
> +     *
> +     * We can't do this in the previous loop because we need the TCP flags 
> for
> +     * all the packets in each miss. */
> +    fail_open = false;
> +    HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
> +        struct flow_wildcards wc;
> +        struct rule_dpif *rule;
> +        struct xlate_in xin;
> +
> +        flow_wildcards_init_catchall(&wc);
> +        rule_dpif_lookup(miss->ofproto, &miss->flow, &wc, &rule);
> +        if (rule_dpif_fail_open(rule)) {
> +            fail_open = true;
> +        }
> +        rule_dpif_credit_stats(rule, &miss->stats);
> +        xlate_in_init(&xin, miss->ofproto, &miss->flow, rule,
> +                      miss->stats.tcp_flags, NULL);
> +        xin.may_learn = true;
> +        xin.resubmit_stats = &miss->stats;
> +        xlate_actions(&xin, &miss->xout);
> +        flow_wildcards_or(&miss->xout.wc, &miss->xout.wc, &wc);
> +        rule_dpif_unref(rule);
>     }
> 
> -    /* Process each element in the to-do list, constructing the set of
> -     * operations to batch. */
> +    /* Now handle each packet individually:
> +     *
> +     *   - In the common case, the packet is a member of a flow that doesn't
> +     *     need per-packet translation.  We already did the translation in 
> the
> +     *     previous loop, so reuse it.
> +     *
> +     *   - Otherwise, we need to do another translation just for this
> +     *     packet.
> +     *
> +     * The loop fills 'ops' with an array of operations to execute in the
> +     * datapath. */
>     n_ops = 0;
> -    HMAP_FOR_EACH (miss, hmap_node, &fmb->misses) {
> -        execute_flow_miss(miss, ops, &n_ops);
> +    LIST_FOR_EACH (upcall, list_node, upcalls) {
> +        struct flow_miss *miss = upcall->flow_miss;
> +        struct ofpbuf *packet = upcall->dpif_upcall.packet;
> +
> +        if (miss->xout.slow) {
> +            struct rule_dpif *rule;
> +            struct xlate_in xin;
> +
> +            rule_dpif_lookup(miss->ofproto, &miss->flow, NULL, &rule);
> +            xlate_in_init(&xin, miss->ofproto, &miss->flow, rule, 0, packet);
> +            xlate_actions_for_side_effects(&xin);
> +            rule_dpif_unref(rule);
> +        }
> +
> +        if (miss->xout.odp_actions.size) {
> +            struct dpif_op *op;
> +
> +            if (miss->flow.in_port.ofp_port
> +                != vsp_realdev_to_vlandev(miss->ofproto,
> +                                          miss->flow.in_port.ofp_port,
> +                                          miss->flow.vlan_tci)) {
> +                /* This packet was received on a VLAN splinter port.  We
> +                 * added a VLAN to the packet to make the packet resemble
> +                 * the flow, but the actions were composed assuming that
> +                 * the packet contained no VLAN.  So, we must remove the
> +                 * VLAN header from the packet before trying to execute the
> +                 * actions. */
> +                eth_pop_vlan(packet);
> +            }
> +
> +            op = &ops[n_ops++];
> +            op->type = DPIF_OP_EXECUTE;
> +            op->u.execute.key = miss->key;
> +            op->u.execute.key_len = miss->key_len;
> +            op->u.execute.packet = packet;
> +            op->u.execute.actions = miss->xout.odp_actions.data;
> +            op->u.execute.actions_len = miss->xout.odp_actions.size;
> +        }
>     }
> -    ovs_assert(n_ops <= ARRAY_SIZE(ops));
> 
>     /* Execute batch. */
>     for (i = 0; i < n_ops; i++) {
> @@ -800,6 +787,32 @@ handle_miss_upcalls(struct udpif *udpif, struct list 
> *upcalls)
>     }
>     dpif_operate(udpif->dpif, opsp, n_ops);
> 
> +    /* Special case for fail-open mode.
> +     *
> +     * If we are in fail-open mode, but we are connected to a controller too,
> +     * then we should send the packet up to the controller in the hope that 
> it
> +     * will try to set up a flow and thereby allow us to exit fail-open.
> +     *
> +     * See the top-level comment in fail-open.c for more information. */
> +    if (fail_open) {
> +        LIST_FOR_EACH (upcall, list_node, upcalls) {
> +            struct flow_miss *miss = upcall->flow_miss;
> +            struct ofpbuf *packet = upcall->dpif_upcall.packet;
> +            struct ofputil_packet_in *pin;
> +
> +            pin = xmalloc(sizeof *pin);
> +            pin->packet = xmemdup(packet->data, packet->size);
> +            pin->packet_len = packet->size;
> +            pin->reason = OFPR_NO_MATCH;
> +            pin->controller_id = 0;
> +            pin->table_id = 0;
> +            pin->cookie = 0;
> +            pin->send_len = 0; /* Not used for flow table misses. */
> +            flow_get_metadata(&miss->flow, &pin->fmd);
> +            ofproto_dpif_send_packet_in(miss->ofproto, pin);
> +        }
> +    }
> +
>     atomic_read(&udpif->reval_seq, &reval_seq);
>     if (reval_seq != fmb->reval_seq) {
>         COVERAGE_INC(fmb_queue_revalidated);
> diff --git a/ofproto/ofproto-dpif-upcall.h b/ofproto/ofproto-dpif-upcall.h
> index a23f7a0..9bd19ad 100644
> --- a/ofproto/ofproto-dpif-upcall.h
> +++ b/ofproto/ofproto-dpif-upcall.h
> @@ -59,6 +59,7 @@ enum upcall_type {
> /* An upcall. */
> struct upcall {
>     struct list list_node;          /* For queuing upcalls. */
> +    struct flow_miss *flow_miss;    /* This upcall's flow_miss. */
> 
>     enum upcall_type type;          /* Classification. */
> 
> @@ -94,8 +95,6 @@ struct flow_miss {
>     struct dpif_flow_stats stats;
> 
>     struct xlate_out xout;
> -
> -    struct list upcalls;        /* Contains "struct upcall"s. */
> };
> 
> struct flow_miss_batch {
> -- 
> 1.7.10.4
> 
> _______________________________________________
> dev mailing list
> dev@openvswitch.org
> http://openvswitch.org/mailman/listinfo/dev

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to