The cost of creating and initializing facets and subfacets and installing, tracking, and uninstalling kernel flows is significant. When most flows have only one or a few packets, this overhead is higher than the cost of handling each packet individually. This commit introduces heuristics that cheaply count (approximately) the number of packets seen in a flow and skips most of this expensive bookkeeping when the packet count exceeds a threshold (currently 5 packets).
Signed-off-by: Ben Pfaff <b...@nicira.com> --- ofproto/automake.mk | 4 +- ofproto/ofproto-dpif-governor.c | 186 +++++++++++++++++++++++++ ofproto/ofproto-dpif-governor.h | 53 +++++++ ofproto/ofproto-dpif.c | 286 +++++++++++++++++++++++++++++---------- 4 files changed, 454 insertions(+), 75 deletions(-) create mode 100644 ofproto/ofproto-dpif-governor.c create mode 100644 ofproto/ofproto-dpif-governor.h diff --git a/ofproto/automake.mk b/ofproto/automake.mk index ae35b7f..6d98de7 100644 --- a/ofproto/automake.mk +++ b/ofproto/automake.mk @@ -1,4 +1,4 @@ -# Copyright (C) 2009, 2010, 2011 Nicira Networks, Inc. +# Copyright (C) 2009, 2010, 2011, 2012 Nicira Networks, Inc. # # Copying and distribution of this file, with or without modification, # are permitted in any medium without royalty provided the copyright @@ -21,6 +21,8 @@ ofproto_libofproto_a_SOURCES = \ ofproto/ofproto.c \ ofproto/ofproto.h \ ofproto/ofproto-dpif.c \ + ofproto/ofproto-dpif-governor.c \ + ofproto/ofproto-dpif-governor.h \ ofproto/ofproto-dpif-sflow.c \ ofproto/ofproto-dpif-sflow.h \ ofproto/ofproto-provider.h \ diff --git a/ofproto/ofproto-dpif-governor.c b/ofproto/ofproto-dpif-governor.c new file mode 100644 index 0000000..f674f95 --- /dev/null +++ b/ofproto/ofproto-dpif-governor.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2012 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <config.h> + +#include "ofproto-dpif-governor.h" + +#include <assert.h> +#include <stdlib.h> + +#include "coverage.h" +#include "poll-loop.h" +#include "random.h" +#include "timeval.h" +#include "util.h" +#include "valgrind.h" +#include "vlog.h" + +VLOG_DEFINE_THIS_MODULE(ofproto_dpif_governor); + +/* Minimum number of observed packets before setting up a flow. + * + * This value seems OK empirically. */ +#define FLOW_SETUP_THRESHOLD 5 +BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD > 1); +BUILD_ASSERT_DECL(FLOW_SETUP_THRESHOLD < 16); + +/* Minimum and maximum size of a governor, in bytes. */ +enum { MIN_SIZE = 16 * 1024 }; +enum { MAX_SIZE = 256 * 1024 }; +BUILD_ASSERT_DECL(IS_POW2(MIN_SIZE)); +BUILD_ASSERT_DECL(IS_POW2(MAX_SIZE)); + +/* Minimum and maximum time to process the number of packets that make up a + * given generation. If a generation completes faster than the minimum time, + * we double the table size (but no more than MAX_SIZE). If a generation take + * more than the maximum time to complete, we halve the table size (but no + * smaller than MIN_SIZE). */ +enum { MIN_ELAPSED = 1000 }; /* In milliseconds. */ +enum { MAX_ELAPSED = 5000 }; /* In milliseconds. */ + +static void governor_new_generation(struct governor *, unsigned int size); + +/* Creates and returns a new governor named 'name' (which is used only for log + * messages). */ +struct governor * +governor_create(const char *name) +{ + struct governor *g = xzalloc(sizeof *g); + g->name = xstrdup(name); + governor_new_generation(g, MIN_SIZE); + return g; +} + +/* Destroys 'g'. */ +void +governor_destroy(struct governor *g) +{ + VLOG_INFO("%s: disengaging", g->name); + free(g->table); + free(g); +} + +/* Performs periodic maintenance work on 'g'. */ +void +governor_run(struct governor *g) +{ + if (time_msec() - g->start > MAX_ELAPSED) { + if (g->size > MIN_SIZE) { + governor_new_generation(g, g->size / 2); + } else { + /* Don't start a new generation (we'd never go idle). */ + } + } +} + +/* Arranges for the poll loop to wake up when 'g' needs to do some work. */ +void +governor_wait(struct governor *g) +{ + poll_timer_wait_until(g->start + MAX_ELAPSED); +} + +/* Returns true if 'g' has been doing only a minimal amount of work and thus + * the client should consider getting rid of it entirely. */ +bool +governor_is_idle(struct governor *g) +{ + return g->size == MIN_SIZE && time_msec() - g->start > MAX_ELAPSED; +} + +/* Tests whether a flow whose hash is 'hash' and for which 'n' packets have + * just arrived should be set up in the datapath or just processed on a + * packet-by-packet basis. Returns true to set up a datapath flow, false to + * process the packets individually. + * + * One would expect 'n' to ordinarily be 1, if batching leads multiple packets + * to be processed at a time then it could be greater. */ +bool +governor_should_install_flow(struct governor *g, uint32_t hash, int n) +{ + bool install_flow; + uint8_t *e; + int count; + + assert(n > 0); + + /* Count these packets and begin a new generation if necessary. */ + g->n_packets += n; + if (g->n_packets >= g->size / 4) { + unsigned int new_size; + long long elapsed; + + elapsed = time_msec() - g->start; + new_size = (elapsed < MIN_ELAPSED && g->size < MAX_SIZE ? g->size * 2 + : elapsed > MAX_ELAPSED && g->size > MIN_SIZE ? g->size / 2 + : g->size); + governor_new_generation(g, new_size); + } + + /* Do hash table processing. + * + * Even-numbered hash values use high-order nibbles. + * Odd-numbered hash values use low-order nibbles. */ + e = &g->table[(hash >> 1) & (g->size - 1)]; + count = n + (hash & 1 ? *e >> 4 : *e & 0x0f); + if (count >= FLOW_SETUP_THRESHOLD) { + install_flow = true; + count = 0; + } else { + install_flow = false; + } + *e = hash & 1 ? (count << 4) | (*e & 0x0f) : (*e & 0xf0) | count; + + return install_flow; +} + +/* Starts a new generation in 'g' with a table size of 'size' bytes. 'size' + * must be a power of two between MIN_SIZE and MAX_SIZE, inclusive. */ +static void +governor_new_generation(struct governor *g, unsigned int size) +{ + assert(size >= MIN_SIZE && size <= MAX_SIZE); + assert(is_pow2(size)); + + /* Allocate new table, if necessary. */ + if (g->size != size) { + if (!g->size) { + VLOG_INFO("%s: engaging governor with %u kB hash table", + g->name, g->size / 1024); + } else { + VLOG_INFO("%s: processed %u packets in %.2f s, " + "%s hash table to %u kB", + g->name, g->n_packets, + (time_msec() - g->start) / 1000.0, + size > g->size ? "enlarging" : "shrinking", + size / 1024); + } + + free(g->table); + g->table = xmalloc(size * sizeof *g->table); + g->size = size; + } else { + VLOG_DBG("%s: processed %u packets in %.2f s with %u kB hash table", + g->name, g->n_packets, (time_msec() - g->start) / 1000.0, + size / 1024); + } + + /* Clear data for next generation. */ + memset(g->table, 0, size * sizeof *g->table); + g->start = time_msec(); + g->n_packets = 0; +} diff --git a/ofproto/ofproto-dpif-governor.h b/ofproto/ofproto-dpif-governor.h new file mode 100644 index 0000000..ad022d5 --- /dev/null +++ b/ofproto/ofproto-dpif-governor.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2012 Nicira Networks. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at: + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef OFPROTO_DPIF_GOVERNOR_H +#define OFPROTO_DPIF_GOVERNOR_H 1 + +/* Flow setup rate limiter. + * + * A governor in an engine limits a vehicle's speed. This governor limits the + * rate at which flows are set up in the datapath. The client provides as + * input the hashes of observed packets. The governor keeps track of hashes + * seen multiple times. When a given hash is seen often enough, the governor + * indicates to its client that it should set up a facet and a subfacet and a + * datapath flow for that flow. + * + * The same tracking could be done in terms of facets and subfacets directly, + * but the governor code uses much less time and space to do the same job. */ + +#include <stdbool.h> +#include <stdint.h> + +struct governor { + char *name; /* Name, for log messages. */ + uint8_t *table; /* Table of counters, two per byte. */ + unsigned int size; /* Table size in bytes. */ + long long int start; /* Time when the table was last cleared. */ + unsigned int n_packets; /* Number of packets processed. */ +}; + +struct governor *governor_create(const char *name); +void governor_destroy(struct governor *); + +void governor_run(struct governor *); +void governor_wait(struct governor *); + +bool governor_is_idle(struct governor *); + +bool governor_should_install_flow(struct governor *, uint32_t hash, int n); + +#endif /* ofproto/ofproto-dpif-governor.h */ diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index b74331e..80f5dc9 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -43,6 +43,7 @@ #include "ofp-util.h" #include "ofpbuf.h" #include "ofp-print.h" +#include "ofproto-dpif-governor.h" #include "ofproto-dpif-sflow.h" #include "poll-loop.h" #include "timer.h" @@ -61,6 +62,7 @@ COVERAGE_DEFINE(facet_changed_rule); COVERAGE_DEFINE(facet_invalidated); COVERAGE_DEFINE(facet_revalidate); COVERAGE_DEFINE(facet_unexpected); +COVERAGE_DEFINE(facet_suppress); /* Maximum depth of flow table recursion (due to resubmit actions) in a * flow translation. */ @@ -545,6 +547,7 @@ struct ofproto_dpif { /* Facets. */ struct hmap facets; struct hmap subfacets; + struct governor *governor; /* Revalidation. */ struct table_dpif tables[N_TABLES]; @@ -700,6 +703,7 @@ construct(struct ofproto *ofproto_) hmap_init(&ofproto->facets); hmap_init(&ofproto->subfacets); + ofproto->governor = NULL; for (i = 0; i < N_TABLES; i++) { struct table_dpif *table = &ofproto->tables[i]; @@ -772,6 +776,7 @@ destruct(struct ofproto *ofproto_) hmap_destroy(&ofproto->facets); hmap_destroy(&ofproto->subfacets); + governor_destroy(ofproto->governor); hmap_destroy(&ofproto->vlandev_map); hmap_destroy(&ofproto->realdev_vid_map); @@ -879,6 +884,24 @@ run(struct ofproto *ofproto_) } } + if (ofproto->governor) { + size_t n_subfacets; + + governor_run(ofproto->governor); + + /* If the governor has shrunk to its minimum size and the number of + * subfacets has dwindled, then drop the governor entirely. + * + * For hysteresis, the number of subfacets to drop the governor is + * smaller than the number needed to trigger its creation. */ + n_subfacets = hmap_count(&ofproto->subfacets); + if (n_subfacets * 4 < ofproto->up.flow_eviction_threshold + && governor_is_idle(ofproto->governor)) { + governor_destroy(ofproto->governor); + ofproto->governor = NULL; + } + } + return 0; } @@ -919,6 +942,9 @@ wait(struct ofproto *ofproto_) } else { timer_wait(&ofproto->next_expiration); } + if (ofproto->governor) { + governor_wait(ofproto->governor); + } } static void @@ -2558,43 +2584,137 @@ flow_miss_find(struct hmap *todo, const struct flow *flow, uint32_t hash) return NULL; } +/* Partially Initializes 'op' as an "execute" operation for 'miss' and + * 'packet'. The caller must initialize op->actions and op->actions_len. If + * 'miss' is associated with a subfacet the caller must also initialize the + * returned op->subfacet, and if anything needs to be freed after processing + * the op, the caller must initialize op->garbage also. */ static void -handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, - struct flow_miss_op *ops, size_t *n_ops) +init_flow_miss_execute_op(struct flow_miss *miss, struct ofpbuf *packet, + struct flow_miss_op *op) +{ + if (miss->flow.vlan_tci != miss->initial_tci) { + /* This packet was received on a VLAN splinter port. We + * added a VLAN to the packet to make the packet resemble + * the flow, but the actions were composed assuming that + * the packet contained no VLAN. So, we must remove the + * VLAN header from the packet before trying to execute the + * actions. */ + eth_pop_vlan(packet); + } + + op->subfacet = NULL; + op->garbage = NULL; + op->dpif_op.type = DPIF_OP_EXECUTE; + op->dpif_op.u.execute.key = miss->key; + op->dpif_op.u.execute.key_len = miss->key_len; + op->dpif_op.u.execute.packet = packet; +} + +/* Helper for handle_flow_miss_without_facet() and + * handle_flow_miss_with_facet(). */ +static void +handle_flow_miss_common(struct rule_dpif *rule, + struct ofpbuf *packet, const struct flow *flow) { - const struct flow *flow = &miss->flow; - struct subfacet *subfacet; + struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); + + ofproto->n_matches++; + + if (rule->up.cr.priority == FAIL_OPEN_PRIORITY) { + /* + * Extra-special case for fail-open mode. + * + * We are in fail-open mode and the packet matched the fail-open + * rule, but we are connected to a controller too. We should send + * the packet up to the controller in the hope that it will try to + * set up a flow and thereby allow us to exit fail-open. + * + * See the top-level comment in fail-open.c for more information. + */ + send_packet_in_miss(ofproto, packet, flow); + } +} + +/* Figures out whether a flow that missed in 'ofproto', whose details are in + * 'miss', is likely to be worth tracking in detail in userspace and (usually) + * installing a datapath flow. The answer is usually "yes" (a return value of + * true). However, for short flows the cost of bookkeeping is much higher than + * the benefits, so when the datapath holds a large number of flows we impose + * some heuristics to decide which flows are likely to be worth tracking. */ +static bool +flow_miss_should_make_facet(struct ofproto_dpif *ofproto, + struct flow_miss *miss) +{ + if (!ofproto->governor) { + size_t n_subfacets; + + n_subfacets = hmap_count(&ofproto->subfacets); + if (n_subfacets * 2 <= ofproto->up.flow_eviction_threshold) { + return true; + } + + ofproto->governor = governor_create(ofproto->up.name); + } + + return governor_should_install_flow(ofproto->governor, + miss->hmap_node.hash, + list_size(&miss->packets)); +} + +/* Handles 'miss', which matches 'rule', without creating a facet or subfacet + * or creating any datapath flow. May add an "execute" operation to 'ops' and + * increment '*n_ops'. */ +static void +handle_flow_miss_without_facet(struct flow_miss *miss, + struct rule_dpif *rule, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); + struct action_xlate_ctx ctx; struct ofpbuf *packet; - struct facet *facet; - facet = facet_lookup_valid(ofproto, flow, miss->hmap_node.hash); - if (!facet) { - struct rule_dpif *rule; + LIST_FOR_EACH (packet, list_node, &miss->packets) { + struct flow_miss_op *op = &ops[*n_ops]; + struct dpif_flow_stats stats; + struct ofpbuf odp_actions; - rule = rule_dpif_lookup(ofproto, flow, 0); - if (!rule) { - /* Don't send a packet-in if OFPUTIL_PC_NO_PACKET_IN asserted. */ - struct ofport_dpif *port = get_ofp_port(ofproto, flow->in_port); - if (port) { - if (port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN) { - COVERAGE_INC(ofproto_dpif_no_packet_in); - /* XXX install 'drop' flow entry */ - return; - } - } else { - VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, - flow->in_port); - } + COVERAGE_INC(facet_suppress); - LIST_FOR_EACH (packet, list_node, &miss->packets) { - send_packet_in_miss(ofproto, packet, flow); - } + ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub); - return; - } + dpif_flow_stats_extract(&miss->flow, packet, &stats); + rule_credit_stats(rule, &stats); + + action_xlate_ctx_init(&ctx, ofproto, &miss->flow, miss->initial_tci, + rule, 0, packet); + ctx.resubmit_stats = &stats; + xlate_actions(&ctx, rule->up.actions, rule->up.n_actions, + &odp_actions); + + if (odp_actions.size) { + struct dpif_execute *execute = &op->dpif_op.u.execute; + + init_flow_miss_execute_op(miss, packet, op); + execute->actions = odp_actions.data; + execute->actions_len = odp_actions.size; + op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); - facet = facet_create(rule, flow, miss->hmap_node.hash); + (*n_ops)++; + } else { + ofpbuf_uninit(&odp_actions); + } } +} + +/* Handles 'miss', which matches 'facet'. May add any required datapath + * operations to 'ops', incrementing '*n_ops' for each new op. */ +static void +handle_flow_miss_with_facet(struct flow_miss *miss, struct facet *facet, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct subfacet *subfacet; + struct ofpbuf *packet; subfacet = subfacet_create(facet, miss->key_fitness, miss->key, miss->key_len, @@ -2602,30 +2722,12 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, LIST_FOR_EACH (packet, list_node, &miss->packets) { struct flow_miss_op *op = &ops[*n_ops]; - struct dpif_execute *execute = &op->dpif_op.u.execute; struct dpif_flow_stats stats; - - uint64_t odp_actions_stub[1024 / 8]; struct ofpbuf odp_actions; - ofproto->n_matches++; + handle_flow_miss_common(facet->rule, packet, &miss->flow); - if (facet->rule->up.cr.priority == FAIL_OPEN_PRIORITY) { - /* - * Extra-special case for fail-open mode. - * - * We are in fail-open mode and the packet matched the fail-open - * rule, but we are connected to a controller too. We should send - * the packet up to the controller in the hope that it will try to - * set up a flow and thereby allow us to exit fail-open. - * - * See the top-level comment in fail-open.c for more information. - */ - send_packet_in_miss(ofproto, packet, flow); - } - - ofpbuf_use_stub(&odp_actions, - odp_actions_stub, sizeof odp_actions_stub); + ofpbuf_use_stub(&odp_actions, op->stub, sizeof op->stub); if (!facet->may_install || !subfacet->actions) { subfacet_make_actions(subfacet, packet, &odp_actions); } @@ -2633,36 +2735,24 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, dpif_flow_stats_extract(&facet->flow, packet, &stats); subfacet_update_stats(subfacet, &stats); - if (!subfacet->actions_len) { - /* No actions to execute, so skip talking to the dpif. */ - continue; - } + if (subfacet->actions_len) { + struct dpif_execute *execute = &op->dpif_op.u.execute; - if (flow->vlan_tci != subfacet->initial_tci) { - /* This packet was received on a VLAN splinter port. We added - * a VLAN to the packet to make the packet resemble the flow, - * but the actions were composed assuming that the packet - * contained no VLAN. So, we must remove the VLAN header from - * the packet before trying to execute the actions. */ - eth_pop_vlan(packet); - } + init_flow_miss_execute_op(miss, packet, op); + op->subfacet = subfacet; + if (facet->may_install) { + execute->actions = subfacet->actions; + execute->actions_len = subfacet->actions_len; + } else { + execute->actions = odp_actions.data; + execute->actions_len = odp_actions.size; + op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); + } - /* Set up operation. */ - op->dpif_op.type = DPIF_OP_EXECUTE; - execute->key = miss->key; - execute->key_len = miss->key_len; - if (facet->may_install) { - execute->actions = subfacet->actions; - execute->actions_len = subfacet->actions_len; - op->garbage = NULL; + (*n_ops)++; } else { - execute->actions = odp_actions.data; - execute->actions_len = odp_actions.size; - op->garbage = ofpbuf_get_uninit_pointer(&odp_actions); + ofpbuf_uninit(&odp_actions); } - execute->packet = packet; - - (*n_ops)++; } if (facet->may_install && subfacet->key_fitness != ODP_FIT_TOO_LITTLE) { @@ -2681,6 +2771,54 @@ handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, } } +/* Handles flow miss 'miss' on 'ofproto'. The flow does not match any flow in + * the OpenFlow flow table. */ +static void +handle_flow_miss_no_rule(struct ofproto_dpif *ofproto, struct flow_miss *miss) +{ + uint16_t in_port = miss->flow.in_port; + struct ofport_dpif *port = get_ofp_port(ofproto, in_port); + + if (!port) { + VLOG_WARN_RL(&rl, "packet-in on unknown port %"PRIu16, in_port); + } + + if (port && port->up.pp.config & OFPUTIL_PC_NO_PACKET_IN) { + /* XXX install 'drop' flow entry */ + COVERAGE_INC(ofproto_dpif_no_packet_in); + } else { + const struct ofpbuf *packet; + + LIST_FOR_EACH (packet, list_node, &miss->packets) { + send_packet_in_miss(ofproto, packet, &miss->flow); + } + } +} + +/* Handles flow miss 'miss' on 'ofproto'. May add any required datapath + * operations to 'ops', incrementing '*n_ops' for each new op. */ +static void +handle_flow_miss(struct ofproto_dpif *ofproto, struct flow_miss *miss, + struct flow_miss_op *ops, size_t *n_ops) +{ + struct facet *facet; + + facet = facet_lookup_valid(ofproto, &miss->flow, miss->hmap_node.hash); + if (!facet) { + struct rule_dpif *rule = rule_dpif_lookup(ofproto, &miss->flow, 0); + if (!rule) { + handle_flow_miss_no_rule(ofproto, miss); + return; + } else if (!flow_miss_should_make_facet(ofproto, miss)) { + handle_flow_miss_without_facet(miss, rule, ops, n_ops); + return; + } + + facet = facet_create(rule, &miss->flow, miss->hmap_node.hash); + } + handle_flow_miss_with_facet(miss, facet, ops, n_ops); +} + /* Like odp_flow_key_to_flow(), this function converts the 'key_len' bytes of * OVS_KEY_ATTR_* attributes in 'key' to a flow structure in 'flow' and returns * an ODP_FIT_* value that indicates how well 'key' fits our expectations for -- 1.7.9 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev