Granular link health statistics for cfm.

The changes display the cfm_health of an interface. The cfm_health is the percentage of healthy CCM frames received from the interface's single remote_mpid over the last 21 cfm_intervals; it is undefined when the interface has more than one remote_mpid. The value can vary from 0 to 100, 100 being very healthy and 0 being unhealthy. Feature #10363 Requested-by: Ethan Jackson <ethan@nicira.com> Signed-off-by: Mehak Mahajan <mmahajan@nicira.com>
2025-08-30 13:58:14 +00:00 · 2012-03-29 14:34:51 -07:00
parent 7dc05f69ef
commit c75b7e39d9
10 changed files with 146 additions and 3 deletions
--- a/2
+++ b/2
@@ -6,6 +6,8 @@ post-v1.6.0
    - Added ability to configure dscp setting for manager and controller
      connections.  By default, these connections have a DSCP value of
      Internetwork Control (0xc0).
+    - Added the granular link health statistics, 'cfm_health', to an
+      interface.


 v1.6.0 - xx xxx xxxx
--- a/lib/cfm.c
+++ b/lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
 #define CCM_MAID_LEN 48
 #define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
 #define CCM_RDI_MASK 0x80
+#define CFM_HEALTH_INTERVAL 6
 struct ccm {
    uint8_t  mdlevel_version; /* MD Level and Version */
    uint8_t  opcode;
@@ -111,6 +112,12 @@ struct cfm {
     * avoid flapping. */
    uint64_t *rmps_array;     /* Cache of remote_mps. */
    size_t rmps_array_len;    /* Number of rmps in 'rmps_array'. */
+
+    int health;               /* Percentage of expected healthy CCM frames
+                                 received, or -1 if undefined. */
+    int health_interval;      /* Number of fault_intervals since health was
+                                 recomputed. */
+
 };

 /* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +131,9 @@ struct remote_mp {
                            receiving CCMs that it's expecting to. */
    bool opup;           /* Operational State. */
    uint32_t seq;        /* Most recently received sequence number. */
+    uint8_t num_health_ccm; /* Number of healthy ccm frames received every
+                               CFM_HEALTH_INTERVAL * 'fault_interval'. */
+
 };

 static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +300,7 @@ cfm_create(const char *name)
    hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
    cfm->remote_opup = true;
    cfm->fault_override = -1;
+    cfm->health = -1;
    return cfm;
 }

@@ -332,6 +343,37 @@ cfm_run(struct cfm *cfm)
                                  sizeof *cfm->rmps_array);

        cfm->remote_opup = true;
+        if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
+            /* Calculate the cfm health of the interface.  If the number of
+             * remote_mpids of a cfm interface is > 1, the cfm health is
+             * undefined. If the number of remote_mpids is 1, the cfm health is
+             * the percentage of the healthy ccm frames received in the last
+             * (CFM_HEALTH_INTERVAL * 3.5) cfm_intervals, else it is 0. */
+            if (hmap_count(&cfm->remote_mps) > 1) {
+                cfm->health = -1;
+            } else if (hmap_is_empty(&cfm->remote_mps)) {
+                cfm->health = 0;
+            } else {
+                int exp_ccm_recvd;
+
+                rmp = CONTAINER_OF(hmap_first(&cfm->remote_mps),
+                                   struct remote_mp, node);
+                exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 7) / 2;
+                /* Calculate the percentage of healthy ccm frames received.
+                 * Since the 'fault_interval' is (3.5 * cfm_interval), and
+                 * 1 CCM packet must be received every cfm_interval,
+                 * the 'remote_mpid' health reports the percentage of
+                 * healthy CCM frames received every
+                 * 'CFM_HEALTH_INTERVAL'th 'fault_interval'. */
+                cfm->health = (rmp->num_health_ccm * 100) / exp_ccm_recvd;
+                cfm->health = MIN(cfm->health, 100);
+                rmp->num_health_ccm = 0;
+                assert(cfm->health >= 0 && cfm->health <= 100);
+            }
+            cfm->health_interval = 0;
+        }
+        cfm->health_interval++;
+
        HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {

            if (!rmp->recv) {
@@ -535,6 +577,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
        uint64_t ccm_mpid;
        uint32_t ccm_seq;
        bool ccm_opdown;
+        bool fault = false;

        if (cfm->extended) {
            ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
            VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
                         " (%"PRIu8") from RMP %"PRIu64, cfm->name,
                         ccm_interval, ccm_mpid);
+            fault = true;
        }

        if (cfm->extended && ccm_interval == 0
@@ -556,6 +600,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
            VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
                         " interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
                         ccm_interval_ms_x, ccm_mpid);
+            fault = true;
        }

        rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -569,6 +614,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                             "%s: dropped CCM with MPID %"PRIu64" from MAC "
                             ETH_ADDR_FMT, cfm->name, ccm_mpid,
                             ETH_ADDR_ARGS(eth->eth_src));
+                fault = true;
            }
        }

@@ -576,16 +622,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
                 " (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
                 ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");

+        if (ccm_rdi) {
+            fault = true;
+        }
        if (rmp) {
            if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
                VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
                             " numbers which indicate possible connectivity"
                             " problems (previous %"PRIu32") (current %"PRIu32
                             ")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
+                fault = true;
            }

            rmp->mpid = ccm_mpid;
            rmp->recv = true;
+            if (!fault) {
+                rmp->num_health_ccm++;
+            }
            rmp->seq = ccm_seq;
            rmp->rdi = ccm_rdi;
            rmp->opup = !ccm_opdown;
@@ -605,6 +658,17 @@ cfm_get_fault(const struct cfm *cfm)
    return cfm->fault;
 }

+/* Gets the health of 'cfm'.  Returns the percentage (0 to 100) of healthy CCM
+ * frames received in CFM_HEALTH_INTERVAL * 'fault_interval' if there is
+ * exactly 1 remote_mpid, or 0 if there are no remote_mpids.  Returns -1 if
+ * there is more than 1 remote_mpid or if the health has not been computed
+ * yet. */
+int
+cfm_get_health(const struct cfm *cfm)
+{
+    return cfm->health;
+}
+
 /* Gets the operational state of 'cfm'.  'cfm' is considered operationally down
 * if it has received a CCM with the operationally down bit set from any of its
 * remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +720,11 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
        ds_put_cstr(ds, "\n");
    }

+    if (cfm->health == -1) {
+        ds_put_format(ds, "\taverage health: undefined\n");
+    } else {
+        ds_put_format(ds, "\taverage health: %d\n", cfm->health);
+    }
    ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
    ds_put_format(ds, "\tremote_opstate: %s\n",
                  cfm->remote_opup ? "up" : "down");
--- a/lib/cfm.h
+++ b/lib/cfm.h
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
 bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
 void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
 int cfm_get_fault(const struct cfm *);
+int cfm_get_health(const struct cfm *);
 bool cfm_get_opup(const struct cfm *);
 void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
                          size_t *n_rmps);
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
        return -1;
    }
 }
+
+static int
+get_cfm_health(const struct ofport *ofport_)
+{
+    struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
+
+    return ofport->cfm ? cfm_get_health(ofport->cfm) : -1;
+}

 /* Spanning Tree. */

@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
    set_cfm,
    get_cfm_fault,
    get_cfm_remote_mpids,
+    get_cfm_health,
    set_stp,
    get_stp_status,
    set_stp_port,
--- a/ofproto/ofproto-provider.h
+++ b/ofproto/ofproto-provider.h
@@ -980,6 +980,17 @@ struct ofproto_class {
    int (*get_cfm_remote_mpids)(const struct ofport *ofport,
                                const uint64_t **rmps, size_t *n_rmps);

+    /* Checks the health of CFM configured on 'ofport'.  Returns an integer
+     * between 0 and 100 to indicate the health percentage of the 'ofport',
+     * where 0 means that the 'ofport' is very unhealthy and 100 means the
+     * 'ofport' is perfectly healthy.  Returns -1 if CFM is not enabled on
+     * 'ofport' or if the number of remote_mpids is > 1, in which case the
+     * health is undefined.
+     *
+     * This function may be a null pointer if the ofproto implementation does
+     * not support CFM. */
+    int (*get_cfm_health)(const struct ofport *ofport);
+
    /* Configures spanning tree protocol (STP) on 'ofproto' using the
     * settings defined in 's'.
     *
--- a/ofproto/ofproto.c
+++ b/ofproto/ofproto.c
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
            : -1);
 }

+/* Checks the health of the CFM for 'ofp_port' within 'ofproto'.  Returns an
+ * integer between 0 and 100 giving the health of the port as a percentage, or
+ * -1 if 'ofp_port' does not exist, the ofproto class does not implement
+ * get_cfm_health, or the provider reports the health as unavailable. */
+int
+ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
+{
+    struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
+    return (ofport && ofproto->ofproto_class->get_cfm_health
+            ? ofproto->ofproto_class->get_cfm_health(ofport)
+            : -1);
+}
+
 static enum ofperr
 handle_aggregate_stats_request(struct ofconn *ofconn,
                               const struct ofp_stats_msg *osm)
--- a/ofproto/ofproto.h
+++ b/ofproto/ofproto.h
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
 int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
                                      uint16_t ofp_port, const uint64_t **rmps,
                                      size_t *n_rmps);
-
+int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
+                                uint16_t ofp_port);
 void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
 void ofproto_free_ofproto_controller_info(struct shash *);

--- a/vswitchd/bridge.c
+++ b/vswitchd/bridge.c
@@ -279,6 +279,7 @@ bridge_init(const char *remote)
    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault);
    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault_status);
    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_remote_mpids);
+    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_health);
    ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_lacp_current);
    ovsdb_idl_omit(idl, &ovsrec_interface_col_external_ids);

@@ -1547,6 +1548,7 @@ iface_refresh_cfm_stats(struct iface *iface)
    int fault, error;
    const uint64_t *rmps;
    size_t n_rmps;
+    int health;

    if (iface_is_synthetic(iface)) {
        return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
    } else {
        ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
    }
+
+    health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
+                                        iface->ofp_port);
+    if (health >= 0) {
+        int64_t cfm_health = health;
+        ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
+    } else {
+        ovsrec_interface_set_cfm_health(cfg, NULL, 0);
+    }
 }

 static void
--- a/vswitchd/vswitch.ovsschema
+++ b/vswitchd/vswitch.ovsschema
@@ -1,6 +1,6 @@
 {"name": "Open_vSwitch",
- "version": "6.8.0",
- "cksum": "4106006492 16485",
+ "version": "6.9.0",
+ "cksum": "617116616 16682",
 "tables": {
   "Open_vSwitch": {
     "columns": {
@@ -197,6 +197,11 @@
         "ephemeral": true},
       "cfm_fault_status": {
         "type": {"key": "string", "min": 0, "max": "unlimited"}},
+       "cfm_health": {
+         "type": {"key": {"type": "integer",
+                          "minInteger": 0,
+                          "maxInteger": 100},
+                  "min": 0, "max": 1}},
       "lacp_current": {
         "type": {"key": {"type": "boolean"},
                  "min": 0, "max": 1},
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1726,6 +1726,27 @@
        an <code>ovs-appctl</code> command.
      </column>

+      <column name="cfm_health">
+        <p>
+          Indicates the health of the interface as a percentage of CCM frames
+          received over 21 <ref column="other_config" key="cfm_interval"/>s.
+          The health of an interface is undefined if it is communicating with
+          more than one <ref column="cfm_remote_mpids"/>.  It reduces if
+          healthy heartbeats are not received at the expected rate, and
+          gradually improves as healthy heartbeats are received at the desired
+          rate. Every 21 <ref column="other_config" key="cfm_interval"/>s, the
+          health of the interface is refreshed.
+        </p>
+        <p>
+          As mentioned above, the faults can be triggered for several reasons.
+          The link health will deteriorate even if heartbeats are received but
+          they are reported to be unhealthy.  An unhealthy heartbeat in this
+          context is a heartbeat that has some fault set or that arrives out
+          of sequence.  The interface health can be 100 only on receiving
+          healthy heartbeats at the desired rate.
+        </p>
+      </column>
+
      <column name="cfm_remote_mpids">
        When CFM is properly configured, Open vSwitch will occasionally
        receive CCM broadcasts.  These broadcasts contain the MPID of the