mirror of
https://github.com/openvswitch/ovs
synced 2025-08-30 13:58:14 +00:00
Granular link health statistics for cfm.
The changes display the cfm_health of an interface. The cfm_health is an exponential weighted moving average of the health of all remote_mpids. The value can vary from 0 to 100, 100 being very healthy and 0 being unhealthy. Feature #10363 Requested-by: Ethan Jackson <ethan@nicira.com> Signed-off-by: Mehak Mahajan <mmahajan@nicira.com>
This commit is contained in:
2
NEWS
2
NEWS
@@ -6,6 +6,8 @@ post-v1.6.0
|
||||
- Added ability to configure dscp setting for manager and controller
|
||||
connections. By default, these connections have a DSCP value of
|
||||
Internetwork Control (0xc0).
|
||||
- Added the granular link health statistics, 'cfm_health', to an
|
||||
interface.
|
||||
|
||||
|
||||
v1.6.0 - xx xxx xxxx
|
||||
|
69
lib/cfm.c
69
lib/cfm.c
@@ -60,6 +60,7 @@ static const uint8_t eth_addr_ccm_x[6] = {
|
||||
#define CCM_MAID_LEN 48
|
||||
#define CCM_OPCODE 1 /* CFM message opcode meaning CCM. */
|
||||
#define CCM_RDI_MASK 0x80
|
||||
#define CFM_HEALTH_INTERVAL 6
|
||||
struct ccm {
|
||||
uint8_t mdlevel_version; /* MD Level and Version */
|
||||
uint8_t opcode;
|
||||
@@ -111,6 +112,12 @@ struct cfm {
|
||||
* avoid flapping. */
|
||||
uint64_t *rmps_array; /* Cache of remote_mps. */
|
||||
size_t rmps_array_len; /* Number of rmps in 'rmps_array'. */
|
||||
|
||||
int health; /* Percentage of the number of CCM frames
|
||||
received. */
|
||||
int health_interval; /* Number of fault_intervals since health was
|
||||
recomputed. */
|
||||
|
||||
};
|
||||
|
||||
/* Remote MPs represent foreign network entities that are configured to have
|
||||
@@ -124,6 +131,9 @@ struct remote_mp {
|
||||
receiving CCMs that it's expecting to. */
|
||||
bool opup; /* Operational State. */
|
||||
uint32_t seq; /* Most recently received sequence number. */
|
||||
uint8_t num_health_ccm; /* Number of received ccm frames every
|
||||
CFM_HEALTH_INTERVAL * 'fault_interval'. */
|
||||
|
||||
};
|
||||
|
||||
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(20, 30);
|
||||
@@ -290,6 +300,7 @@ cfm_create(const char *name)
|
||||
hmap_insert(&all_cfms, &cfm->hmap_node, hash_string(cfm->name, 0));
|
||||
cfm->remote_opup = true;
|
||||
cfm->fault_override = -1;
|
||||
cfm->health = -1;
|
||||
return cfm;
|
||||
}
|
||||
|
||||
@@ -332,6 +343,37 @@ cfm_run(struct cfm *cfm)
|
||||
sizeof *cfm->rmps_array);
|
||||
|
||||
cfm->remote_opup = true;
|
||||
if (cfm->health_interval == CFM_HEALTH_INTERVAL) {
|
||||
/* Calculate the cfm health of the interface. If the number of
|
||||
* remote_mpids of a cfm interface is > 1, the cfm health is
|
||||
* undefined. If the number of remote_mpids is 1, the cfm health is
|
||||
* the percentage of the ccm frames received in the
|
||||
* (CFM_HEALTH_INTERVAL * 3.5 * cfm_interval)ms, else it is 0. */
|
||||
if (hmap_count(&cfm->remote_mps) > 1) {
|
||||
cfm->health = -1;
|
||||
} else if (hmap_is_empty(&cfm->remote_mps)) {
|
||||
cfm->health = 0;
|
||||
} else {
|
||||
int exp_ccm_recvd;
|
||||
|
||||
rmp = CONTAINER_OF(hmap_first(&cfm->remote_mps),
|
||||
struct remote_mp, node);
|
||||
exp_ccm_recvd = (CFM_HEALTH_INTERVAL * 7) / 2;
|
||||
/* Calculate the percentage of healthy ccm frames received.
|
||||
* Since the 'fault_interval' is (3.5 * cfm_interval), and
|
||||
* 1 CCM packet must be received every cfm_interval,
|
||||
* the 'remote_mpid' health reports the percentage of
|
||||
* healthy CCM frames received every
|
||||
* 'CFM_HEALTH_INTERVAL'th 'fault_interval'. */
|
||||
cfm->health = (rmp->num_health_ccm * 100) / exp_ccm_recvd;
|
||||
cfm->health = MIN(cfm->health, 100);
|
||||
rmp->num_health_ccm = 0;
|
||||
assert(cfm->health >= 0 && cfm->health <= 100);
|
||||
}
|
||||
cfm->health_interval = 0;
|
||||
}
|
||||
cfm->health_interval++;
|
||||
|
||||
HMAP_FOR_EACH_SAFE (rmp, rmp_next, node, &cfm->remote_mps) {
|
||||
|
||||
if (!rmp->recv) {
|
||||
@@ -535,6 +577,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
|
||||
uint64_t ccm_mpid;
|
||||
uint32_t ccm_seq;
|
||||
bool ccm_opdown;
|
||||
bool fault = false;
|
||||
|
||||
if (cfm->extended) {
|
||||
ccm_mpid = ntohll(ccm->mpid64);
|
||||
@@ -549,6 +592,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
|
||||
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid interval"
|
||||
" (%"PRIu8") from RMP %"PRIu64, cfm->name,
|
||||
ccm_interval, ccm_mpid);
|
||||
fault = true;
|
||||
}
|
||||
|
||||
if (cfm->extended && ccm_interval == 0
|
||||
@@ -556,6 +600,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
|
||||
VLOG_WARN_RL(&rl, "%s: received a CCM with an invalid extended"
|
||||
" interval (%"PRIu16"ms) from RMP %"PRIu64, cfm->name,
|
||||
ccm_interval_ms_x, ccm_mpid);
|
||||
fault = true;
|
||||
}
|
||||
|
||||
rmp = lookup_remote_mp(cfm, ccm_mpid);
|
||||
@@ -569,6 +614,7 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
|
||||
"%s: dropped CCM with MPID %"PRIu64" from MAC "
|
||||
ETH_ADDR_FMT, cfm->name, ccm_mpid,
|
||||
ETH_ADDR_ARGS(eth->eth_src));
|
||||
fault = true;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -576,16 +622,23 @@ cfm_process_heartbeat(struct cfm *cfm, const struct ofpbuf *p)
|
||||
" (interval %"PRIu8") (RDI %s)", cfm->name, ccm_seq,
|
||||
ccm_mpid, ccm_interval, ccm_rdi ? "true" : "false");
|
||||
|
||||
if (ccm_rdi) {
|
||||
fault = true;
|
||||
}
|
||||
if (rmp) {
|
||||
if (rmp->seq && ccm_seq != (rmp->seq + 1)) {
|
||||
VLOG_WARN_RL(&rl, "%s: (mpid %"PRIu64") detected sequence"
|
||||
" numbers which indicate possible connectivity"
|
||||
" problems (previous %"PRIu32") (current %"PRIu32
|
||||
")", cfm->name, ccm_mpid, rmp->seq, ccm_seq);
|
||||
fault = true;
|
||||
}
|
||||
|
||||
rmp->mpid = ccm_mpid;
|
||||
rmp->recv = true;
|
||||
if (!fault) {
|
||||
rmp->num_health_ccm++;
|
||||
}
|
||||
rmp->seq = ccm_seq;
|
||||
rmp->rdi = ccm_rdi;
|
||||
rmp->opup = !ccm_opdown;
|
||||
@@ -605,6 +658,17 @@ cfm_get_fault(const struct cfm *cfm)
|
||||
return cfm->fault;
|
||||
}
|
||||
|
||||
/* Gets the health of 'cfm'. Returns an integer between 0 and 100 indicating
|
||||
* the health of the link as a percentage of ccm frames received in
|
||||
* CFM_HEALTH_INTERVAL * 'fault_interval' if there is only 1 remote_mpid,
|
||||
* returns 0 if there are no remote_mpids, and returns -1 if there are more
|
||||
* than 1 remote_mpids. */
|
||||
int
|
||||
cfm_get_health(const struct cfm *cfm)
|
||||
{
|
||||
return cfm->health;
|
||||
}
|
||||
|
||||
/* Gets the operational state of 'cfm'. 'cfm' is considered operationally down
|
||||
* if it has received a CCM with the operationally down bit set from any of its
|
||||
* remote maintenance points. Returns true if 'cfm' is operationally up. False
|
||||
@@ -656,6 +720,11 @@ cfm_print_details(struct ds *ds, const struct cfm *cfm)
|
||||
ds_put_cstr(ds, "\n");
|
||||
}
|
||||
|
||||
if (cfm->health == -1) {
|
||||
ds_put_format(ds, "\taverage health: undefined\n");
|
||||
} else {
|
||||
ds_put_format(ds, "\taverage health: %d\n", cfm->health);
|
||||
}
|
||||
ds_put_format(ds, "\topstate: %s\n", cfm->opup ? "up" : "down");
|
||||
ds_put_format(ds, "\tremote_opstate: %s\n",
|
||||
cfm->remote_opup ? "up" : "down");
|
||||
|
@@ -69,6 +69,7 @@ bool cfm_configure(struct cfm *, const struct cfm_settings *);
|
||||
bool cfm_should_process_flow(const struct cfm *cfm, const struct flow *);
|
||||
void cfm_process_heartbeat(struct cfm *, const struct ofpbuf *packet);
|
||||
int cfm_get_fault(const struct cfm *);
|
||||
int cfm_get_health(const struct cfm *);
|
||||
bool cfm_get_opup(const struct cfm *);
|
||||
void cfm_get_remote_mpids(const struct cfm *, const uint64_t **rmps,
|
||||
size_t *n_rmps);
|
||||
|
@@ -1119,6 +1119,14 @@ get_cfm_remote_mpids(const struct ofport *ofport_, const uint64_t **rmps,
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
static int
|
||||
get_cfm_health(const struct ofport *ofport_)
|
||||
{
|
||||
struct ofport_dpif *ofport = ofport_dpif_cast(ofport_);
|
||||
|
||||
return ofport->cfm ? cfm_get_health(ofport->cfm) : -1;
|
||||
}
|
||||
|
||||
/* Spanning Tree. */
|
||||
|
||||
@@ -6491,6 +6499,7 @@ const struct ofproto_class ofproto_dpif_class = {
|
||||
set_cfm,
|
||||
get_cfm_fault,
|
||||
get_cfm_remote_mpids,
|
||||
get_cfm_health,
|
||||
set_stp,
|
||||
get_stp_status,
|
||||
set_stp_port,
|
||||
|
@@ -980,6 +980,17 @@ struct ofproto_class {
|
||||
int (*get_cfm_remote_mpids)(const struct ofport *ofport,
|
||||
const uint64_t **rmps, size_t *n_rmps);
|
||||
|
||||
/* Checks the health of CFM configured on 'ofport'. Returns an integer
|
||||
* to indicate the health percentage of the 'ofport' which is an average of
|
||||
* the health of all the remote_mps. Returns an integer between 0 and 100
|
||||
* where 0 means that the 'ofport' is very unhealthy and 100 means the
|
||||
* 'ofport' is perfectly healthy. Returns -1 if CFM is not enabled on
|
||||
* 'ofport' or if the number of remote_mpids is > 1.
|
||||
*
|
||||
* This function may be a null pointer if the ofproto implementation does
|
||||
* not support CFM. */
|
||||
int (*get_cfm_health)(const struct ofport *ofport);
|
||||
|
||||
/* Configures spanning tree protocol (STP) on 'ofproto' using the
|
||||
* settings defined in 's'.
|
||||
*
|
||||
|
@@ -2481,6 +2481,19 @@ ofproto_port_get_cfm_remote_mpids(const struct ofproto *ofproto,
|
||||
: -1);
|
||||
}
|
||||
|
||||
/* Checks the health of the CFM for 'ofp_port' within 'ofproto'. Returns an
|
||||
* integer value between 0 and 100 to indicate the health of the port as a
|
||||
* percentage which is the average of cfm health of all the remote_mpids or
|
||||
* returns -1 if CFM is not enabled on 'ofp_port' or if its health is undefined. */
|
||||
int
|
||||
ofproto_port_get_cfm_health(const struct ofproto *ofproto, uint16_t ofp_port)
|
||||
{
|
||||
struct ofport *ofport = ofproto_get_port(ofproto, ofp_port);
|
||||
return (ofport && ofproto->ofproto_class->get_cfm_health
|
||||
? ofproto->ofproto_class->get_cfm_health(ofport)
|
||||
: -1);
|
||||
}
|
||||
|
||||
static enum ofperr
|
||||
handle_aggregate_stats_request(struct ofconn *ofconn,
|
||||
const struct ofp_stats_msg *osm)
|
||||
|
@@ -348,7 +348,8 @@ int ofproto_port_get_cfm_fault(const struct ofproto *, uint16_t ofp_port);
|
||||
int ofproto_port_get_cfm_remote_mpids(const struct ofproto *,
|
||||
uint16_t ofp_port, const uint64_t **rmps,
|
||||
size_t *n_rmps);
|
||||
|
||||
int ofproto_port_get_cfm_health(const struct ofproto *ofproto,
|
||||
uint16_t ofp_port);
|
||||
void ofproto_get_ofproto_controller_info(const struct ofproto *, struct shash *);
|
||||
void ofproto_free_ofproto_controller_info(struct shash *);
|
||||
|
||||
|
@@ -279,6 +279,7 @@ bridge_init(const char *remote)
|
||||
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault);
|
||||
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_fault_status);
|
||||
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_remote_mpids);
|
||||
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_cfm_health);
|
||||
ovsdb_idl_omit_alert(idl, &ovsrec_interface_col_lacp_current);
|
||||
ovsdb_idl_omit(idl, &ovsrec_interface_col_external_ids);
|
||||
|
||||
@@ -1547,6 +1548,7 @@ iface_refresh_cfm_stats(struct iface *iface)
|
||||
int fault, error;
|
||||
const uint64_t *rmps;
|
||||
size_t n_rmps;
|
||||
int health;
|
||||
|
||||
if (iface_is_synthetic(iface)) {
|
||||
return;
|
||||
@@ -1582,6 +1584,15 @@ iface_refresh_cfm_stats(struct iface *iface)
|
||||
} else {
|
||||
ovsrec_interface_set_cfm_remote_mpids(cfg, NULL, 0);
|
||||
}
|
||||
|
||||
health = ofproto_port_get_cfm_health(iface->port->bridge->ofproto,
|
||||
iface->ofp_port);
|
||||
if (health >= 0) {
|
||||
int64_t cfm_health = health;
|
||||
ovsrec_interface_set_cfm_health(cfg, &cfm_health, 1);
|
||||
} else {
|
||||
ovsrec_interface_set_cfm_health(cfg, NULL, 0);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
@@ -1,6 +1,6 @@
|
||||
{"name": "Open_vSwitch",
|
||||
"version": "6.8.0",
|
||||
"cksum": "4106006492 16485",
|
||||
"version": "6.9.0",
|
||||
"cksum": "617116616 16682",
|
||||
"tables": {
|
||||
"Open_vSwitch": {
|
||||
"columns": {
|
||||
@@ -197,6 +197,11 @@
|
||||
"ephemeral": true},
|
||||
"cfm_fault_status": {
|
||||
"type": {"key": "string", "min": 0, "max": "unlimited"}},
|
||||
"cfm_health": {
|
||||
"type": {"key": {"type": "integer",
|
||||
"minInteger": 0,
|
||||
"maxInteger": 100},
|
||||
"min": 0, "max": 1}},
|
||||
"lacp_current": {
|
||||
"type": {"key": {"type": "boolean"},
|
||||
"min": 0, "max": 1},
|
||||
|
@@ -1726,6 +1726,27 @@
|
||||
an <code>ovs-appctl</code> command.
|
||||
</column>
|
||||
|
||||
<column name="cfm_health">
|
||||
<p>
|
||||
Indicates the health of the interface as a percentage of CCM frames
|
||||
received over 21 <ref column="other_config" key="cfm_interval"/>s.
|
||||
The health of an interface is undefined if it is communicating with
|
||||
more than one <ref column="cfm_remote_mpids"/>. It reduces if
|
||||
healthy heartbeats are not received at the expected rate, and
|
||||
gradually improves as healthy heartbeats are received at the desired
|
||||
rate. Every 21 <ref column="other_config" key="cfm_interval"/>s, the
|
||||
health of the interface is refreshed.
|
||||
</p>
|
||||
<p>
|
||||
As mentioned above, the faults can be triggered for several reasons.
|
||||
The link health will deteriorate even if heartbeats are received but
|
||||
they are reported to be unhealthy. An unhealthy heartbeat in this
|
||||
context is a heartbeat that either has some fault set or is out
|
||||
of sequence. The interface health can be 100 only on receiving
|
||||
healthy heartbeats at the desired rate.
|
||||
</p>
|
||||
</column>
|
||||
|
||||
<column name="cfm_remote_mpids">
|
||||
When CFM is properly configured, Open vSwitch will occasionally
|
||||
receive CCM broadcasts. These broadcasts contain the MPID of the
|
||||
|
Reference in New Issue
Block a user