Thanks for doing this!  Looks like it's on the right track.

I get the following errors when building this:

lib/cfm.c:324:1: warning: symbol 'cfm_get_rmp_health_stats' was not declared.
Should it be static?
lib/cfm.c:324:1: error: no previous prototype for ???cfm_get_rmp_health_stats???
[-Werror=missing-prototypes]
lib/cfm.c: In function ???cfm_print_details???:
lib/cfm.c:718:9: error: format ???%lu??? expects argument of type ???long 
unsigned
int???, but argument 3 has type ???int64_t (*)(struct remote_mp *)???
[-Werror=format]

Please surround operators with spaces (i.e 3 + 5 not 3+5).

+    int64_t health;           /* Average health over all remote_mps */

It's not clear to me why we're using an int64_t to calculate this value.  It
can only take on the range 0 - 100, an int seems simpler.

Also, it's a bit confusing that we're keeping track of a weighted moving
average of 'health' for the cfm module, but if its inverse, 'loss', for each
remote_mp.  Do you think it's possible/simpler to keep track of the data as
'health' in both places?
 
+/* Returns the health as a percentage. */
+int64_t
+cfm_get_rmp_health_stats(struct remote_mp *rmp)
+{
+    return 100 - rmp->loss;
+}

This function isn't globally available so it should be static.

         cfm->remote_opup = true;
+        health = 0;
         HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
 
+            /* Calculate the exponentially weighted moving average. */
+            lost = ((3 - rmp->num_ccm) <= 0) ? 0 : ((3 - rmp->num_ccm)*100)/3;
+            rmp->loss = (rmp->loss+lost)/2;
+            health += cfm_get_rmp_health_stats(rmp);

This code is complex enough that it probably deserves a more detailed comment
explaining the algorithm used to calculate the average.  In particular, it
wouldn't be obvious to someone reading the code where "3" came from.  Also I
think we can make the code a bit simpler by delaying the negativity check a
bit.  Perhaps something like the following?

new_loss = ((3 - rmp->num_ccm) * 100) / 3;
if (new_loss >= 0) {
    rmp->loss = (rmp->loss + new_loss) / 2;
}

+            rmp->num_ccm = 0;
This accesses free()'d memory when rmp failed to receive anything.


+        cfm->health = cfm->rmps_array_len > 0 ? health/cfm->rmps_array_len : 0;

I think the average is going to be messed up here if rmps_array_len changes as
we're doing the calculation.  Perhaps it makes sense to cache that value?
Alternatively, we could split the calculation into a separate loop before the
already existing one.

It may be worth asserting that cfm->health is between 0 and 100. Might make
sense for rmp->loss above as well.
 
         if (hmap_is_empty(&cfm->remote_mps)) {
             cfm->fault |= CFM_FAULT_RECV;
@@ -535,6 +555,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf 
*p)
         uint64_t ccm_mpid;
         uint32_t ccm_seq;
         bool ccm_opdown;
+        bool fault = false;
 
         if (cfm->extended) {
             ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +570,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf 
*p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
                          " (%"PRIu8") from RMP %"PRIu64, cfm->name,
                          ccm_interval, ccm_mpid);
+            fault = true;
         }
 
         if (cfm->extended && ccm_interval == 0
@@ -556,6 +578,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf 
*p)
             VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
                          " interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
                          ccm_interval_ms_x, ccm_mpid);
+            fault = true;
         }
 
         rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -563,12 +586,15 @@ cfm_process_heartbeat(struct cfm *cfm, const struct 
ofpbuf *p)
             if (hmap_count(&cfm->remote_mps) < CFM_MAX_RMPS) {
                 rmp = xzalloc(sizeof *rmp);
                 hmap_insert(&cfm->remote_mps, &rmp->node, hash_mpid(ccm_mpid));
+                rmp->num_ccm = 0;
+                rmp->loss = 0;
             } else {
                 cfm->recv_fault |= CFM_FAULT_OVERFLOW;
                 VLOG_WARN_RL(&rl,
                              "%s: dropped CCM with MPID %"PRIu64" from MAC "
                              ETH_ADDR_FMT, cfm->name, ccm_mpid,
                              ETH_ADDR_ARGS(eth->eth_src));
+                fault = true;
             }
         }
 
@@ -576,16 +602,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct 
ofpbuf *p)
                  " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
                  ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
 
+        if (ccm_rdi) {
+           fault = true;
+        }
         if (rmp) {
             if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
                 VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
                              " numbers which indicate possible connectivity"
                              " problems (previous %"PRIu32") (current %"PRIu32
                              ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+                fault = true;
             }
 
             rmp->mpid = ccm_mpid;
             rmp->recv = true;
+            if (!fault) {
+                rmp->num_ccm++;
+            }
             rmp->seq = ccm_seq;
             rmp->rdi = ccm_rdi;
             rmp->opup = !ccm_opdown;

I'm struggling a bit to figure out what we should do with this section of code.
We discussed offline using the approach you've presented here which seems fine
to me. But the more I think about it, things will be a lot cleaner if we use
the same heuristics to calculate the health percentage as we use to calculate
cfm->fault (excepting RDI of course).  Of course, with the current code this is
problematic because things like invalid ccm intervals, invalid extended mode,
and out of order sequence numbers aren't accounted for.  However, the more I
think about it, if any of these things are happening, something catastrophic is
happening on the link and a fault should probably be triggered anyways.

What do you think of the following approach?  We could insert a patch before
this one that causes the conditions you're flagging with 'fault' to actually
trigger a fault.  That done, this patch will have a much simpler time deciding
to increment num_ccm.

+        ds_put_format(ds, "\tlink health: (%"PRIu64")\n", 
cfm_get_rmp_health_stats);

Seems unlikely to me you want to print the address of this function.

 
diff --git a/ofproto/ofproto-provider.h b/ofproto/ofproto-provider.h
index e540850..6ae6c0e 100644
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -980,6 +980,15 @@ struct ofproto_class {
     int (*get_cfm_remote_mpids)(const struct ofport *ofport,
                                 const uint64_t **rmps, size_t *n_rmps);
 
+    /* Checks the health of CFM configured on 'ofport'.  Returns an integer
+     * to indicate the percentage health of the 'ofport' which is an average of
This would read easier as "health percentage".

+    health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+                                        iface->ofp_port);
+    if (health >= 0) {
+        assert (health >= 0 && health <= 100);
This assertion makes more sense in the CFM module IMO.
 
 {"name": "Open_vSwitch",
  "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "cksum": "834545060 16568",

Please update the version number as well.


+       "cfm_health": {
+         "type": {"key": "integer", "min": 0, "max": 100}},

Since the cfm_health column is meaningless when CFM is not configured, it may
make sense to represent it as an array of zero or one integers.  The array will
be empty when CFM is not configured, or populated with the health when CFM is
configured.  This is done quite often throughout the code.  The dscp column is
a good example.  

Ethan
_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to