I'm doing some performance testing which heavily stresses the OVS DPDK slow path, and notice we spend a ridiculous amount of CPU time on the rule_dpif stats mutex. For my test case, by commenting out the mutex I literally doubled the performance. I'm in a bit of a rush, so as a short term hack, I simply copied over the per-thread stats code which dpif-netdev uses, but it got me thinking of a general plan for how we should handle stats in userspace which I'll outline here:
To start, I think the ovsthread_stats construction needs to be pulled into its own library and made higher level. Right now it's a general mechanism for storing arbitrary per thread data. I think we really want something which implements an API something like "get_stats", "credit_stats", "clear_stats", etc. That could be used directly by both ofproto-dpif and dpif-netdev very cleanly. You'll see in the hacked together patch I've included below, it's basically a direct copy of all the relevant dpif-netdev code. That done, I think we can actually get rid of the flow stats mutex by using a technique similar to what we do for cmaps. We're guaranteed that only one writer ever touches a particular bucket, so it should be sufficient to ensure all of the data fits on a particular cache line, and that we update a sequence number to notify readers of changes. Thoughts? Ethan --- ofproto/ofproto-dpif.c | 125 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 83 insertions(+), 42 deletions(-) diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c index 6a59098..e9f4f16 100644 --- a/ofproto/ofproto-dpif.c +++ b/ofproto/ofproto-dpif.c @@ -25,10 +25,10 @@ #include "bond.h" #include "bundle.h" #include "byte-order.h" +#include "cfm.h" #include "connectivity.h" #include "connmgr.h" #include "coverage.h" -#include "cfm.h" #include "dpif.h" #include "dynamic-string.h" #include "fail-open.h" @@ -44,13 +44,13 @@ #include "netdev.h" #include "netlink.h" #include "nx-match.h" -#include "odp-util.h" #include "odp-execute.h" -#include "ofp-util.h" -#include "ofpbuf.h" +#include "odp-util.h" #include "ofp-actions.h" #include "ofp-parse.h" #include "ofp-print.h" +#include "ofp-util.h" +#include "ofpbuf.h" #include "ofproto-dpif-ipfix.h" #include "ofproto-dpif-mirror.h" #include "ofproto-dpif-monitor.h" @@ -58,6 +58,7 @@ #include "ofproto-dpif-sflow.h" #include "ofproto-dpif-upcall.h" #include "ofproto-dpif-xlate.h" +#include "ovs-thread.h" #include "poll-loop.h" 
#include "seq.h" #include "simap.h" @@ -76,6 +77,13 @@ COVERAGE_DEFINE(packet_in_overflow); struct flow_miss; +struct rule_dpif_stats { + struct ovs_mutex mutex; + uint64_t n_packets; + uint64_t n_bytes; + long long int used; +}; + struct rule_dpif { struct rule up; @@ -83,8 +91,7 @@ struct rule_dpif { * * - Do include packets and bytes from datapath flows which have not * recently been processed by a revalidator. */ - struct ovs_mutex stats_mutex; - struct dpif_flow_stats stats OVS_GUARDED; + struct ovsthread_stats stats; /* If non-zero then the recirculation id that has * been allocated for use with this rule. @@ -96,8 +103,7 @@ struct rule_dpif { /* RULE_CAST() depends on this. */ BUILD_ASSERT_DECL(offsetof(struct rule_dpif, up) == 0); -static void rule_get_stats(struct rule *, uint64_t *packets, uint64_t *bytes, - long long int *used); +static void rule_get_stats(struct rule_dpif *, struct dpif_flow_stats *); static struct rule_dpif *rule_dpif_cast(const struct rule *); static void rule_expire(struct rule_dpif *); @@ -3425,13 +3431,10 @@ rule_expire(struct rule_dpif *rule) } if (reason < 0 && idle_timeout) { - long long int used; + struct dpif_flow_stats stats; - ovs_mutex_lock(&rule->stats_mutex); - used = rule->stats.used; - ovs_mutex_unlock(&rule->stats_mutex); - - if (now > used + idle_timeout * 1000) { + rule_get_stats(rule, &stats); + if (now > stats.used + idle_timeout * 1000) { reason = OFPRR_IDLE_TIMEOUT; } } @@ -3493,15 +3496,26 @@ ofproto_dpif_execute_actions(struct ofproto_dpif *ofproto, return error; } +static void * +rule_dpif_stats_new_cb(void) +{ + struct rule_dpif_stats *bucket = xzalloc_cacheline(sizeof *bucket); + ovs_mutex_init(&bucket->mutex); + return bucket; +} + void rule_dpif_credit_stats(struct rule_dpif *rule, const struct dpif_flow_stats *stats) { - ovs_mutex_lock(&rule->stats_mutex); - rule->stats.n_packets += stats->n_packets; - rule->stats.n_bytes += stats->n_bytes; - rule->stats.used = MAX(rule->stats.used, stats->used); - 
ovs_mutex_unlock(&rule->stats_mutex); + struct rule_dpif_stats *bucket; + + bucket = ovsthread_stats_bucket_get(&rule->stats, rule_dpif_stats_new_cb); + ovs_mutex_lock(&bucket->mutex); + bucket->n_packets += stats->n_packets; + bucket->n_bytes += stats->n_bytes; + bucket->used = MAX(bucket->used, stats->used); + ovs_mutex_unlock(&bucket->mutex); } ovs_be64 @@ -3814,16 +3828,9 @@ rule_dealloc(struct rule *rule_) } static enum ofperr -rule_construct(struct rule *rule_) - OVS_NO_THREAD_SAFETY_ANALYSIS +rule_construct(struct rule *rule) { - struct rule_dpif *rule = rule_dpif_cast(rule_); - ovs_mutex_init_adaptive(&rule->stats_mutex); - rule->stats.n_packets = 0; - rule->stats.n_bytes = 0; - rule->stats.used = rule->up.modified; - rule->recirc_id = 0; - + ovsthread_stats_init(&rule_dpif_cast(rule)->stats); return 0; } @@ -3848,8 +3855,15 @@ static void rule_destruct(struct rule *rule_) { struct rule_dpif *rule = rule_dpif_cast(rule_); + struct rule_dpif_stats *bucket; + size_t i; + + OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &rule->stats) { + ovs_mutex_destroy(&bucket->mutex); + free_cacheline(bucket); + } + ovsthread_stats_destroy(&rule->stats); - ovs_mutex_destroy(&rule->stats_mutex); if (rule->recirc_id) { struct ofproto_dpif *ofproto = ofproto_dpif_cast(rule->up.ofproto); @@ -3857,17 +3871,39 @@ rule_destruct(struct rule *rule_) } } +/* TODO make sure you uninit the stats. */ static void -rule_get_stats(struct rule *rule_, uint64_t *packets, uint64_t *bytes, - long long int *used) +rule_get_stats(struct rule_dpif *rule, struct dpif_flow_stats *stats) + OVS_NO_THREAD_SAFETY_ANALYSIS { - struct rule_dpif *rule = rule_dpif_cast(rule_); + struct rule_dpif_stats *bucket; + size_t i; - ovs_mutex_lock(&rule->stats_mutex); - *packets = rule->stats.n_packets; - *bytes = rule->stats.n_bytes; - *used = rule->stats.used; - ovs_mutex_unlock(&rule->stats_mutex); + memset(stats, 0, sizeof *stats); + /* We're supposed to hold a mutex for rule->up.modified. 
Not sure if + * that's really true as the only writer is probably ofproto? At any rate + * perhaps it should have its own mutex? For now I'm not dealing with it. + * Add back the thread safety analysis when fixed. */ + stats->used = rule->up.modified; + OVSTHREAD_STATS_FOR_EACH_BUCKET(bucket, i, &rule->stats) { + ovs_mutex_lock(&bucket->mutex); + stats->n_packets += bucket->n_packets; + stats->n_bytes += bucket->n_bytes; + stats->used = MAX(stats->used, bucket->used); + ovs_mutex_unlock(&bucket->mutex); + } +} + +static void +rule_get_stats__(struct rule *rule, uint64_t *packets, uint64_t *bytes, + long long int *used) +{ + struct dpif_flow_stats stats; + + rule_get_stats(rule_dpif_cast(rule), &stats); + *packets = stats.n_packets; + *bytes = stats.n_bytes; + *used = stats.used; } static void @@ -3895,10 +3931,15 @@ rule_modify_actions(struct rule *rule_, bool reset_counters) struct rule_dpif *rule = rule_dpif_cast(rule_); if (reset_counters) { - ovs_mutex_lock(&rule->stats_mutex); - rule->stats.n_packets = 0; - rule->stats.n_bytes = 0; - ovs_mutex_unlock(&rule->stats_mutex); + struct rule_dpif_stats *bucket; + size_t i; + + OVSTHREAD_STATS_FOR_EACH_BUCKET (bucket, i, &rule->stats) { + ovs_mutex_lock(&bucket->mutex); + bucket->n_packets = 0; + bucket->n_bytes = 0; + ovs_mutex_unlock(&bucket->mutex); + } } complete_operation(rule); @@ -5398,7 +5439,7 @@ const struct ofproto_class ofproto_dpif_class = { rule_delete, rule_destruct, rule_dealloc, - rule_get_stats, + rule_get_stats__, rule_execute, NULL, /* rule_premodify_actions */ rule_modify_actions, -- 1.8.1.2 _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev