2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-30 13:58:14 +00:00

Granular link health statistics for cfm.

The changes display the cfm_health of an interface.  The cfm_health
is an exponential weighted moving average of the health of all
remote_mpids.  The value can vary from 0 to 100, 100 being very healthy
and 0 being unhealthy.

Feature #10363
Requested-by: Ethan Jackson <ethan@nicira.com>
Signed-off-by: Mehak Mahajan <mmahajan@nicira.com>
This commit is contained in:
Mehak Mahajan
2012-03-29 14:34:51 -07:00
parent 7dc05f69ef
commit c75b7e39d9
10 changed files with 146 additions and 3 deletions

2
NEWS
View File

@@ -6,6 +6,8 @@ post-v1.6.0
- Added ability to configure dscp setting for manager and controller
connections. By default, these connections have a DSCP value of
Internetwork Control (0xc0).
- Added the granular link health statistics, 'cfm_health', to an
interface.
v1.6.0 - xx xxx xxxx

View File

@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
#define CCM_MAID_LEN 48
#define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
#define CCM_RDI_MASK 0x80
#define CFM_HEALTH_INTERVAL 6
struct ccm {
uint8_t mdlevel_version; /* MD Level and Version */
uint8_t opcode;
@@ -111,6 +112,12 @@ struct cfm {
* avoid flapping. */
uint64_t *rmps_array; /* Cache of remote_mps. */
size_t rmps_array_len; /* Number of rmps in 'rmps_array'. */
int health; /* Percentage of the number of CCM frames
received. */
int health_interval; /* Number of fault_intervals since health was
recomputed. */
};
/* Remote MPs represent foreign network entities that are configured to have
@@ -124,6 +131,9 @@ struct remote_mp {
receiving CCMs that it's expecting to. */
bool opup; /* Operational State. */
uint32_t seq; /* Most recently received sequence number. */
uint8_t num_health_ccm; /* Number of received ccm frames every
CFM_HEALTH_INTERVAL * 'fault_interval'. */
};
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
@@ -290,6 +300,7 @@ cfm_create(const char *name)
hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
cfm->remote_opup = true;
cfm->fault_override = -1;
cfm->health = -1;
return cfm;
}
@@ -332,6 +343,37 @@ cfm_run(struct cfm *cfm)
sizeof *cfm->rmps_array);
cfm->remote_opup = true;
if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
/* Calculate the cfm health of the interface. If the number of
* remote_mpids of a cfm interface is > 1, the cfm health is
* undefined. If the number of remote_mpids is 1, the cfm health is
* the percentage of the expected ccm frames received during the last
* CFM_HEALTH_INTERVAL fault intervals (that is, CFM_HEALTH_INTERVAL * 3.5
* ccm intervals). If there are no remote_mpids, it is 0. */
if (hmap_count(&cfm->remote_mps) > 1) {
cfm->health = -1;
} else if (hmap_is_empty(&cfm->remote_mps)) {
cfm->health = 0;
} else {
int exp_ccm_recvd;
rmp = CONTAINER_OF(hmap_first(&cfm->remote_mps),
struct remote_mp, node);
exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 7) / 2;
/* Calculate the percentage of healthy ccm frames received.
* Since the 'fault_interval' is (3.5 * cfm_interval), and
* 1 CCM packet must be received every cfm_interval,
* the 'remote_mpid' health reports the percentage of
* healthy CCM frames received every
* 'CFM_HEALTH_INTERVAL'th 'fault_interval'. */
cfm->health = (rmp->num_health_ccm * 100) / exp_ccm_recvd;
cfm->health = MIN(cfm->health, 100);
rmp->num_health_ccm = 0;
assert(cfm->health >= 0 && cfm->health <= 100);
}
cfm->health_interval = 0;
}
cfm->health_interval++;
HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
if (!rmp->recv) {
@@ -535,6 +577,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
uint64_t ccm_mpid;
uint32_t ccm_seq;
bool ccm_opdown;
bool fault = false;
if (cfm->extended) {
ccm_mpid = ntohll(ccm->mpid64);
@@ -549,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
" (%"PRIu8") from RMP %"PRIu64, cfm->name,
ccm_interval, ccm_mpid);
fault = true;
}
if (cfm->extended && ccm_interval == 0
@@ -556,6 +600,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
" interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
ccm_interval_ms_x, ccm_mpid);
fault = true;
}
rmp = lookup_remote_mp(cfm, ccm_mpid);
@@ -569,6 +614,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
"%s: dropped CCM with MPID %"PRIu64" from MAC "
ETH_ADDR_FMT, cfm->name, ccm_mpid,
ETH_ADDR_ARGS(eth->eth_src));
fault = true;
}
}
@@ -576,16 +622,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
" (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
if (ccm_rdi) {
fault = true;
}
if (rmp) {
if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
" numbers which indicate possible connectivity"
" problems (previous %"PRIu32") (current %"PRIu32
")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
fault = true;
}
rmp->mpid = ccm_mpid;
rmp->recv = true;
if (!fault) {
rmp->num_health_ccm++;
}
rmp->seq = ccm_seq;
rmp->rdi = ccm_rdi;
rmp->opup = !ccm_opdown;
@@ -605,6 +658,17 @@ cfm_get_fault(const struct cfm *cfm)
return cfm->fault;
}
/* Reports the most recently computed health of 'cfm'.
 *
 * With exactly one remote_mpid, the result is an integer between 0 and 100:
 * the percentage of ccm frames received during the last
 * CFM_HEALTH_INTERVAL * 'fault_interval'.  With no remote_mpids the result
 * is 0, and with more than one remote_mpid it is -1 (undefined).
 * cfm_create() also initializes the health to -1. */
int
cfm_get_health(const struct cfm *cfm)
{
    const int health = cfm->health;

    return health;
}
/* Gets the operational state of 'cfm'. 'cfm' is considered operationally down
* if it has received a CCM with the operationally down bit set from any of its
* remote maintenance points. Returns true if 'cfm' is operationally up. False
@@ -656,6 +720,11 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
ds_put_cstr(ds, "\n");
}
if (cfm->health == -1) {
ds_put_format(ds, "\taverage health: undefined\n");
} else {
ds_put_format(ds, "\taverage health: %d\n", cfm->health);
}
ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
ds_put_format(ds, "\tremote_opstate: %s\n",
cfm->remote_opup ? "up" : "down");

View File

@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
int cfm_get_fault(const struct cfm *);
int cfm_get_health(const struct cfm *);
bool cfm_get_opup(const struct cfm *);
void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
size_t *n_rmps);

View File

@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
return -1;
}
}
/* Returns the CFM health of 'ofport_' as reported by cfm_get_health(), or -1
 * if no CFM instance is attached to the port. */
static int
get_cfm_health(const struct ofport *ofport_)
{
    struct ofport_dpif *port = ofport_dpif_cast(ofport_);

    if (!port->cfm) {
        return -1;
    }
    return cfm_get_health(port->cfm);
}
/* Spanning Tree. */
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
set_cfm,
get_cfm_fault,
get_cfm_remote_mpids,
get_cfm_health,
set_stp,
get_stp_status,
set_stp_port,

View File

@@ -980,6 +980,17 @@ struct ofproto_class {
int (*get_cfm_remote_mpids)(const struct ofport *ofport,
const uint64_t **rmps, size_t *n_rmps);
/* Checks the health of CFM configured on 'ofport'. Returns an integer
* to indicate the health percentage of the 'ofport' which is an average of
* the health of all the remote_mps. Returns an integer between 0 and 100
* where 0 means that the 'ofport' is very unhealthy and 100 means the
* 'ofport' is perfectly healthy. Returns -1 if CFM is not enabled on
* 'ofport' or if the number of remote_mpids is > 1.
*
* This function may be a null pointer if the ofproto implementation does
* not support CFM. */
int (*get_cfm_health)(const struct ofport *ofport);
/* Configures spanning tree protocol (STP) on 'ofproto' using the
* settings defined in 's'.
*

View File

@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
: -1);
}
/* Queries the CFM health of 'ofp_port' within 'ofproto'.
 *
 * Returns whatever the ofproto class's 'get_cfm_health' hook reports for the
 * port: an integer between 0 and 100 giving the health of the port as a
 * percentage, or -1 when the health is undefined or CFM is not enabled on
 * the port.  Also returns -1 if ofproto_get_port() yields no port for
 * 'ofp_port' or if the ofproto class does not provide a 'get_cfm_health'
 * hook. */
int
ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
{
    struct ofport *port = ofproto_get_port(ofproto, ofp_port);

    if (!port || !ofproto->ofproto_class->get_cfm_health) {
        return -1;
    }
    return ofproto->ofproto_class->get_cfm_health(port);
}
static enum ofperr
handle_aggregate_stats_request(struct ofconn *ofconn,
const struct ofp_stats_msg *osm)

View File

@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
uint16_t ofp_port, const uint64_t **rmps,
size_t *n_rmps);
int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
uint16_t ofp_port);
void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
void ofproto_free_ofproto_controller_info(struct shash *);

View File

@@ -279,6 +279,7 @@ bridge_init(const char *remote)
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault);
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault_status);
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_remote_mpids);
/* cfm_health is a status column that iface_refresh_cfm_stats() writes back. */
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_health);
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_lacp_current);
ovsdb_idl_omit(idl, &ovsrec_interface_col_external_ids);
@@ -1547,6 +1548,7 @@ iface_refresh_cfm_stats(struct iface *iface)
int fault, error;
const uint64_t *rmps;
size_t n_rmps;
int health;
if (iface_is_synthetic(iface)) {
return;
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
} else {
ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
}
health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
iface->ofp_port);
if (health >= 0) {
int64_t cfm_health = health;
ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
} else {
ovsrec_interface_set_cfm_health(cfg, NULL, 0);
}
}
static void

View File

@@ -1,6 +1,6 @@
{"name": "Open_vSwitch",
"version": "6.8.0",
"cksum": "4106006492 16485",
"version": "6.9.0",
"cksum": "617116616 16682",
"tables": {
"Open_vSwitch": {
"columns": {
@@ -197,6 +197,11 @@
"ephemeral": true},
"cfm_fault_status": {
"type": {"key": "string", "min": 0, "max": "unlimited"}},
"cfm_health": {
"type": {"key": {"type": "integer",
"minInteger": 0,
"maxInteger": 100},
"min": 0, "max": 1}},
"lacp_current": {
"type": {"key": {"type": "boolean"},
"min": 0, "max": 1},

View File

@@ -1726,6 +1726,27 @@
an <code>ovs-appctl</code> command.
</column>
<column name="cfm_health">
<p>
Indicates the health of the interface as a percentage of CCM frames
received over 21 <ref column="other_config" key="cfm_interval"/>s.
The health of an interface is undefined if it is communicating with
more than one <ref column="cfm_remote_mpids"/>. It decreases if
healthy heartbeats are not received at the expected rate, and
gradually improves as healthy heartbeats are received at the desired
rate. Every 21 <ref column="other_config" key="cfm_interval"/>s, the
health of the interface is refreshed.
</p>
<p>
As mentioned above, the faults can be triggered for several reasons.
The link health will deteriorate even if heartbeats are received but
they are reported to be unhealthy. An unhealthy heartbeat in this
context is a heartbeat that either has some fault set or is out
of sequence. The interface health can be 100 only on receiving
healthy heartbeats at the desired rate.
</p>
</column>
<column name="cfm_remote_mpids">
When CFM is properly configured, Open vSwitch will occasionally
receive CCM broadcasts. These broadcasts contain the MPID of the