diff --git a/NEWS b/NEWS index 535ff4eb4..39f42c5c6 100644 --- a/NEWS +++ b/NEWS @@ -31,6 +31,8 @@ Post-v2.9.0 * Commands ovs-appctl dpif-netdev/pmd-*-show can now work on a single PMD * Detailed PMD performance metrics available with new command ovs-appctl dpif-netdev/pmd-perf-show + * Supervision of PMD performance metrics and logging of suspicious + iterations v2.9.0 - 19 Feb 2018 -------------------- diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c index 55b4067fa..4c6c5cfff 100644 --- a/lib/dpif-netdev-perf.c +++ b/lib/dpif-netdev-perf.c @@ -25,6 +25,24 @@ VLOG_DEFINE_THIS_MODULE(pmd_perf); +#define ITER_US_THRESHOLD 250 /* Warning threshold for iteration duration + in microseconds. */ +#define VHOST_QUEUE_FULL 128 /* Size of the virtio TX queue. */ +#define LOG_IT_BEFORE 5 /* Number of iterations to log before + suspicious iteration. */ +#define LOG_IT_AFTER 5 /* Number of iterations to log after + suspicious iteration. */ + +bool log_enabled = false; +bool log_extend = false; +static uint32_t log_it_before = LOG_IT_BEFORE; +static uint32_t log_it_after = LOG_IT_AFTER; +static uint32_t log_us_thr = ITER_US_THRESHOLD; +uint32_t log_q_thr = VHOST_QUEUE_FULL; +uint64_t iter_cycle_threshold; + +static struct vlog_rate_limit latency_rl = VLOG_RATE_LIMIT_INIT(600, 600); + #ifdef DPDK_NETDEV static uint64_t get_tsc_hz(void) @@ -141,6 +159,10 @@ pmd_perf_stats_init(struct pmd_perf_stats *s) histogram_walls_set_log(&s->max_vhost_qfill, 0, 512); s->iteration_cnt = 0; s->start_ms = time_msec(); + s->log_susp_it = UINT32_MAX; + s->log_begin_it = UINT32_MAX; + s->log_end_it = UINT32_MAX; + s->log_reason = NULL; } void @@ -391,6 +413,10 @@ pmd_perf_stats_clear_lock(struct pmd_perf_stats *s) history_init(&s->milliseconds); s->start_ms = time_msec(); s->milliseconds.sample[0].timestamp = s->start_ms; + s->log_susp_it = UINT32_MAX; + s->log_begin_it = UINT32_MAX; + s->log_end_it = UINT32_MAX; + s->log_reason = NULL; /* Clearing finished. */ s->clear = false; ovs_mutex_unlock(&s->clear_mutex); @@ -443,6 +469,7 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, uint64_t now_tsc = cycles_counter_update(s); struct iter_stats *cum_ms; uint64_t cycles, cycles_per_pkt = 0; + char *reason = NULL; cycles = now_tsc - s->start_tsc; s->current.timestamp = s->iteration_cnt; @@ -492,9 +519,30 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, cum_ms->batches += s->current.batches; cum_ms->max_vhost_qfill += s->current.max_vhost_qfill; + if (log_enabled) { + /* Log suspicious iterations. */ + if (cycles > iter_cycle_threshold) { + reason = "Excessive total cycles"; + } else if (s->current.max_vhost_qfill >= log_q_thr) { + reason = "Vhost RX queue full"; + } + if (OVS_UNLIKELY(reason)) { + pmd_perf_set_log_susp_iteration(s, reason); + cycles_counter_update(s); + } + + /* Log iteration interval around suspicious iteration when reaching + * the end of the range to be logged. */ + if (OVS_UNLIKELY(s->log_end_it == s->iterations.idx)) { + pmd_perf_log_susp_iteration_neighborhood(s); + cycles_counter_update(s); + } + } + /* Store in iteration history. This advances the iteration idx and * clears the next slot in the iteration history. */ history_store(&s->iterations, &s->current); + if (now_tsc > s->next_check_tsc) { /* Check if ms is completed and store in milliseconds history. */ uint64_t now = time_msec(); @@ -515,3 +563,178 @@ pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets, s->next_check_tsc = cycles_counter_update(s) + 10000; } } + +/* Delay logging of the suspicious iteration and the range of iterations + * around it until after the last iteration in the range to be logged. + * This avoids any distortion of the measurements through the cost of + * logging itself. */ + +void +pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s, + char *reason) +{ + if (s->log_susp_it == UINT32_MAX) { + /* No logging scheduled yet. */ + s->log_susp_it = s->iterations.idx; + s->log_reason = reason; + s->log_begin_it = history_sub(s->iterations.idx, log_it_before); + s->log_end_it = history_add(s->iterations.idx, log_it_after + 1); + } else if (log_extend) { + /* Logging was initiated earlier, we log the previous suspicious + * iteration now and extend the logging interval, if possible. */ + struct iter_stats *susp = &s->iterations.sample[s->log_susp_it]; + uint32_t new_end_it, old_range, new_range; + + VLOG_WARN_RL(&latency_rl, + "Suspicious iteration (%s): iter=%"PRIu64 + " duration=%"PRIu64" us\n", + s->log_reason, + susp->timestamp, + (1000000L * susp->cycles) / get_tsc_hz()); + + new_end_it = history_add(s->iterations.idx, log_it_after + 1); + new_range = history_sub(new_end_it, s->log_begin_it); + old_range = history_sub(s->log_end_it, s->log_begin_it); + if (new_range < old_range) { + /* Extended range exceeds history length. */ + new_end_it = s->log_begin_it; + } + s->log_susp_it = s->iterations.idx; + s->log_reason = reason; + s->log_end_it = new_end_it; + } +} + +void +pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s) +{ + ovs_assert(s->log_reason != NULL); + ovs_assert(s->log_susp_it != UINT32_MAX); + + struct ds log = DS_EMPTY_INITIALIZER; + struct iter_stats *susp = &s->iterations.sample[s->log_susp_it]; + uint32_t range = history_sub(s->log_end_it, s->log_begin_it); + + VLOG_WARN_RL(&latency_rl, + "Suspicious iteration (%s): iter=%"PRIu64 + " duration=%"PRIu64" us\n", + s->log_reason, + susp->timestamp, + (1000000L * susp->cycles) / get_tsc_hz()); + + pmd_perf_format_iteration_history(&log, s, range); + VLOG_WARN_RL(&latency_rl, + "Neighborhood of suspicious iteration:\n" + "%s", ds_cstr(&log)); + ds_destroy(&log); + s->log_susp_it = s->log_end_it = s->log_begin_it = UINT32_MAX; + s->log_reason = NULL; + + if (range > 100) { + /* Reset to safe default values to avoid disturbance. */ + log_it_before = LOG_IT_BEFORE; + log_it_after = LOG_IT_AFTER; + log_extend = false; + } +} + +void +pmd_perf_log_set_cmd(struct unixctl_conn *conn, + int argc, const char *argv[], + void *aux OVS_UNUSED) +{ + unsigned int it_before, it_after, us_thr, q_thr; + bool on, extend; + bool usage = false; + + on = log_enabled; + extend = log_extend; + it_before = log_it_before; + it_after = log_it_after; + q_thr = log_q_thr; + us_thr = log_us_thr; + + while (argc > 1) { + if (!strcmp(argv[1], "on")) { + on = true; + argc--; + argv++; + } else if (!strcmp(argv[1], "off")) { + on = false; + argc--; + argv++; + } else if (!strcmp(argv[1], "-e")) { + extend = true; + argc--; + argv++; + } else if (!strcmp(argv[1], "-ne")) { + extend = false; + argc--; + argv++; + } else if (!strcmp(argv[1], "-a") && argc > 2) { + if (str_to_uint(argv[2], 10, &it_after)) { + if (it_after > HISTORY_LEN - 2) { + it_after = HISTORY_LEN - 2; + } + } else { + usage = true; + break; + } + argc -= 2; + argv += 2; + } else if (!strcmp(argv[1], "-b") && argc > 2) { + if (str_to_uint(argv[2], 10, &it_before)) { + if (it_before > HISTORY_LEN - 2) { + it_before = HISTORY_LEN - 2; + } + } else { + usage = true; + break; + } + argc -= 2; + argv += 2; + } else if (!strcmp(argv[1], "-q") && argc > 2) { + if (!str_to_uint(argv[2], 10, &q_thr)) { + usage = true; + break; + } + argc -= 2; + argv += 2; + } else if (!strcmp(argv[1], "-us") && argc > 2) { + if (!str_to_uint(argv[2], 10, &us_thr)) { + usage = true; + break; + } + argc -= 2; + argv += 2; + } else { + usage = true; + break; + } + } + if (it_before + it_after > HISTORY_LEN - 2) { + it_after = HISTORY_LEN - 2 - it_before; + } + + if (usage) { + unixctl_command_reply_error(conn, + "Usage: ovs-appctl dpif-netdev/pmd-perf-log-set " + "[on|off] [-b before] [-a after] [-e|-ne] " + "[-us usec] [-q qlen]"); + return; + } + + VLOG_INFO("pmd-perf-log-set: %s, before=%d, after=%d, extend=%s, " + "us_thr=%d, q_thr=%d\n", + on ? "on" : "off", it_before, it_after, + extend ? "true" : "false", us_thr, q_thr); + log_enabled = on; + log_extend = extend; + log_it_before = it_before; + log_it_after = it_after; + log_q_thr = q_thr; + log_us_thr = us_thr; + iter_cycle_threshold = (log_us_thr * get_tsc_hz()) / 1000000L; + + unixctl_command_reply(conn, ""); +} diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h index e225d1089..b8aa4e344 100644 --- a/lib/dpif-netdev-perf.h +++ b/lib/dpif-netdev-perf.h @@ -175,6 +175,14 @@ struct pmd_perf_stats { struct history iterations; /* Millisecond history buffer. */ struct history milliseconds; + /* Suspicious iteration log. */ + uint32_t log_susp_it; + /* Start of iteration range to log. */ + uint32_t log_begin_it; + /* End of iteration range to log. */ + uint32_t log_end_it; + /* Reason for logging suspicious iteration. */ + char *log_reason; }; /* Support for accurate timing of PMD execution on TSC clock cycle level. @@ -350,6 +358,16 @@ history_store(struct history *h, struct iter_stats *is) return history_next(h); } +/* Data and function related to logging of suspicious iterations. */ + +extern bool log_enabled; +extern bool log_extend; +extern uint32_t log_q_thr; +extern uint64_t iter_cycle_threshold; + +void pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s, char *reason); +void pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s); + /* Functions recording PMD metrics per iteration. */ void @@ -375,6 +393,9 @@ void pmd_perf_format_iteration_history(struct ds *str, int n_iter); void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int n_ms); +void pmd_perf_log_set_cmd(struct unixctl_conn *conn, + int argc, const char *argv[], + void *aux OVS_UNUSED); #ifdef __cplusplus } diff --git a/lib/dpif-netdev-unixctl.man b/lib/dpif-netdev-unixctl.man index 76c3e4efa..ab8619e41 100644 --- a/lib/dpif-netdev-unixctl.man +++ b/lib/dpif-netdev-unixctl.man @@ -149,6 +149,65 @@ reported by the \fBdpif-netdev/pmd-stats-show\fR command. To reset the counters and start a new measurement use \fBdpif-netdev/pmd-stats-clear\fR. . +.IP "\fBdpif-netdev/pmd-perf-log-set\fR \fBon\fR|\fBoff\fR \ +[\fB-b\fR \fIbefore\fR] [\fB-a\fR \fIafter\fR] [\fB-e\fR|\fB-ne\fR] \ +[\fB-us\fR \fIusec\fR] [\fB-q\fR \fIqlen\fR]" +. +The userspace "netdev" datapath is able to supervise the PMD performance +metrics and detect iterations with suspicious statistics according to the +following criteria: +.RS +.IP \(em +The iteration lasts longer than \fIusec\fR microseconds (default 250). +This can be used to capture events where a PMD is blocked or interrupted for +such a period of time that there is a risk for dropped packets on any of its Rx +queues. +.IP \(em +The max vhost qlen exceeds a threshold \fIqlen\fR (default 128). This can be +used to infer virtio queue overruns and dropped packets inside a VM, which are +not visible in OVS otherwise. +.RE +.IP +Such suspicious iterations can be logged together with their iteration +statistics in the \fBovs-vswitchd.log\fR to be able to correlate them to +packet drop or other events outside OVS. + +The above command enables (\fBon\fR) or disables (\fBoff\fR) supervision and +logging at run-time and can be used to adjust the above thresholds for +detecting suspicious iterations. By default supervision and logging is +disabled. + +The command options are: +.RS +.IP "\fB-b\fR \fIbefore\fR" +The number of iterations before the suspicious iteration to be logged +(default 5). +.IP "\fB-a\fR \fIafter\fR" +The number of iterations after the suspicious iteration to be logged +(default 5). +.IP "\fB-e\fR" +Extend logging interval if another suspicious iteration is detected +before logging occurs. +.IP "\fB-ne\fR" +Do not extend logging interval if another suspicious iteration is detected +before logging occurs (default). +.IP "\fB-q\fR \fIqlen\fR" +Suspicious vhost queue fill level threshold. Increase this to 512 if the Qemu +supports 1024 virtio queue length (default 128). +.IP "\fB-us\fR \fIusec\fR" +Change the duration threshold for a suspicious iteration (default 250 us). +.RE + +Note: Logging of suspicious iterations itself consumes a considerable amount +of processing cycles of a PMD which may be visible in the iteration history. +In the worst case this can lead OVS to detect another suspicious iteration +caused by logging. + +If more than 100 iterations around a suspicious iteration have been logged +once, OVS falls back to the safe default values (-b 5 -a 5 -ne) to avoid +that logging itself continuously causes logging of further suspicious +iterations. +. .IP "\fBdpif-netdev/pmd-rxq-show\fR [\fB-pmd\fR \fIcore\fR] [\fIdp\fR]" For one or all pmd threads of the datapath \fIdp\fR show the list of queue-ids with port names, which this thread polls. diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index 624296894..f86ed2aba 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -1235,6 +1235,11 @@ dpif_netdev_init(void) unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]", 0, 1, dpif_netdev_pmd_rebalance, NULL); + unixctl_command_register("dpif-netdev/pmd-perf-log-set", + "on|off [-b before] [-a after] [-e|-ne] " + "[-us usec] [-q qlen]", + 0, 10, pmd_perf_log_set_cmd, + NULL); return 0; }