openvswitch/lib/dpif-netdev-perf.c
Jan Scheurich 79f368756c dpif-netdev: Detailed performance stats for PMDs
This patch instruments the dpif-netdev datapath to record detailed
statistics of what is happening in every iteration of a PMD thread.

The collection of detailed statistics can be controlled by a new
Open_vSwitch configuration parameter "other_config:pmd-perf-metrics".
By default it is disabled. The run-time overhead, when enabled, is
on the order of 1%.
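
For example, the metrics collection can be switched on at run time with
the usual ovs-vsctl idiom for other_config parameters:

    ovs-vsctl set Open_vSwitch . other_config:pmd-perf-metrics=true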

The covered metrics per iteration are:
  - cycles
  - packets
  - (rx) batches
  - packets/batch
  - max. vhostuser qlen
  - upcalls
  - cycles spent in upcalls

This raw recorded data is used threefold:

1. In histograms for each of the following metrics:
   - cycles/iteration (log.)
   - packets/iteration (log.)
   - cycles/packet
   - packets/batch
   - max. vhostuser qlen (log.)
   - upcalls
   - cycles/upcall (log.)
   The histogram bins are divided linearly or logarithmically, as
   marked above.

2. A cyclic history of the above statistics for 999 iterations

3. A cyclic history of the cumulative/average values per millisecond
   wall clock for the last 1000 milliseconds:
   - number of iterations
   - avg. cycles/iteration
   - packets (Kpps)
   - avg. packets/batch
   - avg. max vhost qlen
   - upcalls
   - avg. cycles/upcall

The gathered performance metrics can be printed at any time with the
new CLI command

ovs-appctl dpif-netdev/pmd-perf-show [-nh] [-it iter_len] [-ms ms_len]
    [-pmd core] [dp]

The options are

-nh:            Suppress the histograms
-it iter_len:   Display the last iter_len iteration stats
-ms ms_len:     Display the last ms_len millisecond stats
-pmd core:      Display only the specified PMD
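
For example, to display the stats of the last 5 milliseconds for the
PMD on core 1, suppressing the histograms:

    ovs-appctl dpif-netdev/pmd-perf-show -nh -ms 5 -pmd 1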

The performance statistics are reset with the existing
dpif-netdev/pmd-stats-clear command.

The output always contains the following global PMD statistics,
similar to the pmd-stats-show command:

Time: 15:24:55.270
Measurement duration: 1.008 s

pmd thread numa_id 0 core_id 1:

  Cycles:            2419034712  (2.40 GHz)
  Iterations:            572817  (1.76 us/it)
  - idle:                486808  (15.9 % cycles)
  - busy:                 86009  (84.1 % cycles)
  Rx packets:           2399607  (2381 Kpps, 848 cycles/pkt)
  Datapath passes:      3599415  (1.50 passes/pkt)
  - EMC hits:            336472  ( 9.3 %)
  - Megaflow hits:      3262943  (90.7 %, 1.00 subtbl lookups/hit)
  - Upcalls:                  0  ( 0.0 %, 0.0 us/upcall)
  - Lost upcalls:             0  ( 0.0 %)
  Tx packets:           2399607  (2381 Kpps)
  Tx batches:            171400  (14.00 pkts/batch)

Signed-off-by: Jan Scheurich <jan.scheurich@ericsson.com>
Acked-by: Billy O'Mahony <billy.o.mahony@intel.com>
Signed-off-by: Ian Stokes <ian.stokes@intel.com>
2018-05-11 08:08:24 +01:00


/*
 * Copyright (c) 2017 Ericsson AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <config.h>

#include <math.h>    /* log() and exp() for the histogram walls. */
#include <stdint.h>
#include <string.h>  /* memset(). */

#include "dpif-netdev-perf.h"
#include "openvswitch/dynamic-string.h"
#include "openvswitch/vlog.h"
#include "ovs-thread.h"
#include "timeval.h"

VLOG_DEFINE_THIS_MODULE(pmd_perf);

#ifdef DPDK_NETDEV
static uint64_t
get_tsc_hz(void)
{
    return rte_get_tsc_hz();
}
#else
/* This function is only invoked from PMD threads which depend on DPDK.
 * A dummy function is sufficient when building without DPDK_NETDEV. */
static uint64_t
get_tsc_hz(void)
{
    return 1;
}
#endif
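
/* Note (editor): get_tsc_hz() is only used to convert raw TSC cycle counts
 * into wall-clock units for display, e.g. us_per_cycle = 1000000.0 /
 * get_tsc_hz() in pmd_perf_format_overall_stats() below. */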

/* Histogram functions. */

static void
histogram_walls_set_lin(struct histogram *hist, uint32_t min, uint32_t max)
{
    int i;

    ovs_assert(min < max);
    for (i = 0; i < NUM_BINS-1; i++) {
        hist->wall[i] = min + (i * (max - min)) / (NUM_BINS - 2);
    }
    hist->wall[NUM_BINS-1] = UINT32_MAX;
}

static void
histogram_walls_set_log(struct histogram *hist, uint32_t min, uint32_t max)
{
    int i, start, bins, wall;
    double log_min, log_max;

    ovs_assert(min < max);
    if (min > 0) {
        log_min = log(min);
        log_max = log(max);
        start = 0;
        bins = NUM_BINS - 1;
    } else {
        hist->wall[0] = 0;
        log_min = log(1);
        log_max = log(max);
        start = 1;
        bins = NUM_BINS - 2;
    }
    wall = start;
    for (i = 0; i < bins; i++) {
        /* Make sure each wall is monotonically increasing. */
        wall = MAX(wall, exp(log_min + (i * (log_max - log_min)) / (bins-1)));
        hist->wall[start + i] = wall++;
    }
    if (hist->wall[NUM_BINS-2] < max) {
        hist->wall[NUM_BINS-2] = max;
    }
    hist->wall[NUM_BINS-1] = UINT32_MAX;
}
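
/* Note (editor): histogram_add_sample() is declared in dpif-netdev-perf.h
 * and is not part of this file. A minimal sketch of how a sample would be
 * bucketed against the walls built above (assuming a plain linear scan,
 * which is cheap because NUM_BINS is small) could look like:
 *
 *     static inline void
 *     histogram_add_sample(struct histogram *hist, uint32_t val)
 *     {
 *         for (int i = 0; i < NUM_BINS; i++) {
 *             if (val <= hist->wall[i]) {  // wall[NUM_BINS-1] == UINT32_MAX,
 *                 hist->bin[i]++;          // so every sample is caught.
 *                 return;
 *             }
 *         }
 *     }
 */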

uint64_t
histogram_samples(const struct histogram *hist)
{
    uint64_t samples = 0;

    for (int i = 0; i < NUM_BINS; i++) {
        samples += hist->bin[i];
    }
    return samples;
}

static void
histogram_clear(struct histogram *hist)
{
    int i;

    for (i = 0; i < NUM_BINS; i++) {
        hist->bin[i] = 0;
    }
}

static void
history_init(struct history *h)
{
    memset(h, 0, sizeof(*h));
}
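
/* Note (editor): the cyclic history helpers used below (history_sub(),
 * history_current(), history_next(), history_store()) also live in
 * dpif-netdev-perf.h. Assuming a fixed ring of HISTORY_LEN samples, the
 * index handling is simple modular arithmetic, e.g.:
 *
 *     static inline uint32_t
 *     history_sub(uint32_t idx, uint32_t offset)
 *     {
 *         // Step backwards by 'offset' slots without going negative.
 *         return (idx + HISTORY_LEN - offset) % HISTORY_LEN;
 *     }
 */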

void
pmd_perf_stats_init(struct pmd_perf_stats *s)
{
    memset(s, 0, sizeof(*s));
    ovs_mutex_init(&s->stats_mutex);
    ovs_mutex_init(&s->clear_mutex);
    /* Logarithmic histogram for cycles/it ranging from 500 to 24M
     * (corresponding to 200 ns to 9.6 ms at 2.5 GHz TSC clock). */
    histogram_walls_set_log(&s->cycles, 500, 24000000);
    /* Logarithmic histogram for pkts/it ranging from 0 to 1000. */
    histogram_walls_set_log(&s->pkts, 0, 1000);
    /* Linear histogram for cycles/pkt ranging from 100 to 30K. */
    histogram_walls_set_lin(&s->cycles_per_pkt, 100, 30000);
    /* Linear histogram for pkts/rx batch ranging from 0 to 32,
     * the maximum rx batch size in OVS. */
    histogram_walls_set_lin(&s->pkts_per_batch, 0, 32);
    /* Linear histogram for upcalls/it ranging from 0 to 30. */
    histogram_walls_set_lin(&s->upcalls, 0, 30);
    /* Logarithmic histogram for cycles/upcall ranging from 1000 to 1M
     * (corresponding to 400 ns to 400 us at 2.5 GHz TSC clock). */
    histogram_walls_set_log(&s->cycles_per_upcall, 1000, 1000000);
    /* Log. histogram for max vhostuser queue fill level from 0 to 512.
     * 512 is the maximum fill level for a virtio queue with 1024
     * descriptors (maximum configurable length in Qemu), with the
     * DPDK 17.11 virtio PMD in the guest. */
    histogram_walls_set_log(&s->max_vhost_qfill, 0, 512);
    s->iteration_cnt = 0;
    s->start_ms = time_msec();
}
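
/* Note (editor): NUM_BINS is defined in dpif-netdev-perf.h. Assuming
 * 32 bins, the logarithmic cycles/it histogram above spans 500 to 24M
 * cycles with walls growing geometrically (by roughly a factor of 1.4
 * per bin), whereas histogram_walls_set_lin() spaces its walls evenly,
 * e.g. 100, ~1096, ~2093, ... for the cycles/pkt histogram. */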

void
pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
                              double duration)
{
    uint64_t stats[PMD_N_STATS];
    double us_per_cycle = 1000000.0 / get_tsc_hz();

    if (duration == 0) {
        return;
    }

    pmd_perf_read_counters(s, stats);
    uint64_t tot_cycles = stats[PMD_CYCLES_ITER_IDLE] +
                          stats[PMD_CYCLES_ITER_BUSY];
    uint64_t rx_packets = stats[PMD_STAT_RECV];
    uint64_t tx_packets = stats[PMD_STAT_SENT_PKTS];
    uint64_t tx_batches = stats[PMD_STAT_SENT_BATCHES];
    uint64_t passes = stats[PMD_STAT_RECV] +
                      stats[PMD_STAT_RECIRC];
    uint64_t upcalls = stats[PMD_STAT_MISS];
    uint64_t upcall_cycles = stats[PMD_CYCLES_UPCALL];
    uint64_t tot_iter = histogram_samples(&s->pkts);
    uint64_t idle_iter = s->pkts.bin[0];
    uint64_t busy_iter = tot_iter >= idle_iter ? tot_iter - idle_iter : 0;

    ds_put_format(str,
            "  Cycles:          %12"PRIu64"  (%.2f GHz)\n"
            "  Iterations:      %12"PRIu64"  (%.2f us/it)\n"
            "  - idle:          %12"PRIu64"  (%4.1f %% cycles)\n"
            "  - busy:          %12"PRIu64"  (%4.1f %% cycles)\n",
            tot_cycles, (tot_cycles / duration) / 1E9,
            tot_iter, tot_cycles * us_per_cycle / tot_iter,
            idle_iter,
            100.0 * stats[PMD_CYCLES_ITER_IDLE] / tot_cycles,
            busy_iter,
            100.0 * stats[PMD_CYCLES_ITER_BUSY] / tot_cycles);
    if (rx_packets > 0) {
        ds_put_format(str,
            "  Rx packets:      %12"PRIu64"  (%.0f Kpps, %.0f cycles/pkt)\n"
            "  Datapath passes: %12"PRIu64"  (%.2f passes/pkt)\n"
            "  - EMC hits:      %12"PRIu64"  (%4.1f %%)\n"
            "  - Megaflow hits: %12"PRIu64"  (%4.1f %%, %.2f subtbl lookups/"
            "hit)\n"
            "  - Upcalls:       %12"PRIu64"  (%4.1f %%, %.1f us/upcall)\n"
            "  - Lost upcalls:  %12"PRIu64"  (%4.1f %%)\n",
            rx_packets, (rx_packets / duration) / 1000,
            1.0 * stats[PMD_CYCLES_ITER_BUSY] / rx_packets,
            passes, rx_packets ? 1.0 * passes / rx_packets : 0,
            stats[PMD_STAT_EXACT_HIT],
            100.0 * stats[PMD_STAT_EXACT_HIT] / passes,
            stats[PMD_STAT_MASKED_HIT],
            100.0 * stats[PMD_STAT_MASKED_HIT] / passes,
            stats[PMD_STAT_MASKED_HIT]
            ? 1.0 * stats[PMD_STAT_MASKED_LOOKUP] / stats[PMD_STAT_MASKED_HIT]
            : 0,
            upcalls, 100.0 * upcalls / passes,
            upcalls ? (upcall_cycles * us_per_cycle) / upcalls : 0,
            stats[PMD_STAT_LOST],
            100.0 * stats[PMD_STAT_LOST] / passes);
    } else {
        ds_put_format(str, "  Rx packets:      %12d\n", 0);
    }
    if (tx_packets > 0) {
        ds_put_format(str,
            "  Tx packets:      %12"PRIu64"  (%.0f Kpps)\n"
            "  Tx batches:      %12"PRIu64"  (%.2f pkts/batch)\n",
            tx_packets, (tx_packets / duration) / 1000,
            tx_batches, 1.0 * tx_packets / tx_batches);
    } else {
        ds_put_format(str, "  Tx packets:      %12d\n\n", 0);
    }
}

void
pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s)
{
    int i;

    ds_put_cstr(str, "Histograms\n");
    ds_put_format(str,
                  " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n",
                  "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
                  "max vhost qlen", "upcalls/it", "cycles/upcall");
    for (i = 0; i < NUM_BINS-1; i++) {
        ds_put_format(str,
                      " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64
                      " %-9d %-11"PRIu64" %-9d %-11"PRIu64" %-9d %-11"PRIu64
                      " %-9d %-11"PRIu64"\n",
                      s->cycles.wall[i], s->cycles.bin[i],
                      s->pkts.wall[i], s->pkts.bin[i],
                      s->cycles_per_pkt.wall[i], s->cycles_per_pkt.bin[i],
                      s->pkts_per_batch.wall[i], s->pkts_per_batch.bin[i],
                      s->max_vhost_qfill.wall[i], s->max_vhost_qfill.bin[i],
                      s->upcalls.wall[i], s->upcalls.bin[i],
                      s->cycles_per_upcall.wall[i],
                      s->cycles_per_upcall.bin[i]);
    }
    ds_put_format(str,
                  " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64
                  " %-9s %-11"PRIu64" %-9s %-11"PRIu64" %-9s %-11"PRIu64
                  " %-9s %-11"PRIu64"\n",
                  ">", s->cycles.bin[i],
                  ">", s->pkts.bin[i],
                  ">", s->cycles_per_pkt.bin[i],
                  ">", s->pkts_per_batch.bin[i],
                  ">", s->max_vhost_qfill.bin[i],
                  ">", s->upcalls.bin[i],
                  ">", s->cycles_per_upcall.bin[i]);
    if (s->totals.iterations > 0) {
        ds_put_cstr(str,
                    "-----------------------------------------------------"
                    "-----------------------------------------------------"
                    "------------------------------------------------\n");
        ds_put_format(str,
                      " %-21s %-21s %-21s %-21s %-21s %-21s %-21s\n",
                      "cycles/it", "packets/it", "cycles/pkt", "pkts/batch",
                      "vhost qlen", "upcalls/it", "cycles/upcall");
        ds_put_format(str,
                      " %-21"PRIu64" %-21.5f %-21"PRIu64
                      " %-21.5f %-21.5f %-21.5f %-21"PRIu32"\n",
                      s->totals.cycles / s->totals.iterations,
                      1.0 * s->totals.pkts / s->totals.iterations,
                      s->totals.pkts
                      ? s->totals.busy_cycles / s->totals.pkts : 0,
                      s->totals.batches
                      ? 1.0 * s->totals.pkts / s->totals.batches : 0,
                      1.0 * s->totals.max_vhost_qfill / s->totals.iterations,
                      1.0 * s->totals.upcalls / s->totals.iterations,
                      s->totals.upcalls
                      ? s->totals.upcall_cycles / s->totals.upcalls : 0);
    }
}

void
pmd_perf_format_iteration_history(struct ds *str, struct pmd_perf_stats *s,
                                  int n_iter)
{
    struct iter_stats *is;
    size_t index;
    int i;

    if (n_iter == 0) {
        return;
    }
    ds_put_format(str, " %-17s %-10s %-10s %-10s %-10s "
                  "%-10s %-10s %-10s\n",
                  "iter", "cycles", "packets", "cycles/pkt", "pkts/batch",
                  "vhost qlen", "upcalls", "cycles/upcall");
    for (i = 1; i <= n_iter; i++) {
        index = history_sub(s->iterations.idx, i);
        is = &s->iterations.sample[index];
        ds_put_format(str,
                      " %-17"PRIu64" %-11"PRIu64" %-11"PRIu32
                      " %-11"PRIu64" %-11"PRIu32" %-11"PRIu32
                      " %-11"PRIu32" %-11"PRIu32"\n",
                      is->timestamp,
                      is->cycles,
                      is->pkts,
                      is->pkts ? is->cycles / is->pkts : 0,
                      is->batches ? is->pkts / is->batches : 0,
                      is->max_vhost_qfill,
                      is->upcalls,
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
    }
}

void
pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s, int n_ms)
{
    struct iter_stats *is;
    size_t index;
    int i;

    if (n_ms == 0) {
        return;
    }
    ds_put_format(str,
                  " %-12s %-10s %-10s %-10s %-10s"
                  " %-10s %-10s %-10s %-10s\n",
                  "ms", "iterations", "cycles/it", "Kpps", "cycles/pkt",
                  "pkts/batch", "vhost qlen", "upcalls", "cycles/upcall");
    for (i = 1; i <= n_ms; i++) {
        index = history_sub(s->milliseconds.idx, i);
        is = &s->milliseconds.sample[index];
        ds_put_format(str,
                      " %-12"PRIu64" %-11"PRIu32" %-11"PRIu64
                      " %-11"PRIu32" %-11"PRIu64" %-11"PRIu32
                      " %-11"PRIu32" %-11"PRIu32" %-11"PRIu32"\n",
                      is->timestamp,
                      is->iterations,
                      is->iterations ? is->cycles / is->iterations : 0,
                      is->pkts,
                      is->pkts ? is->busy_cycles / is->pkts : 0,
                      is->batches ? is->pkts / is->batches : 0,
                      is->iterations
                      ? is->max_vhost_qfill / is->iterations : 0,
                      is->upcalls,
                      is->upcalls ? is->upcall_cycles / is->upcalls : 0);
    }
}

void
pmd_perf_read_counters(struct pmd_perf_stats *s,
                       uint64_t stats[PMD_N_STATS])
{
    uint64_t val;

    /* This loop subtracts reference values (.zero[*]) from the counters.
     * Since loads and stores are relaxed, it might be possible for a .zero[*]
     * value to be more recent than the current value we're reading from the
     * counter. This is not a big problem, since these numbers are not
     * supposed to be 100% accurate, but we should at least make sure that
     * the result is not negative. */
    for (int i = 0; i < PMD_N_STATS; i++) {
        atomic_read_relaxed(&s->counters.n[i], &val);
        if (val > s->counters.zero[i]) {
            stats[i] = val - s->counters.zero[i];
        } else {
            stats[i] = 0;
        }
    }
}
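
/* Note (editor): the corresponding write side, pmd_perf_update_counter(),
 * is declared in dpif-netdev-perf.h. Under the same relaxed-atomics
 * assumption as above, a sketch of it would be a plain unordered
 * read-modify-write:
 *
 *     static inline void
 *     pmd_perf_update_counter(struct pmd_perf_stats *s,
 *                             enum pmd_stat_type counter, int delta)
 *     {
 *         uint64_t tmp;
 *         atomic_read_relaxed(&s->counters.n[counter], &tmp);
 *         tmp += delta;
 *         atomic_store_relaxed(&s->counters.n[counter], tmp);
 *     }
 */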

/* This function clears the PMD performance counters from within the PMD
 * thread or from another thread when the PMD thread is not executing its
 * poll loop. */
void
pmd_perf_stats_clear_lock(struct pmd_perf_stats *s)
    OVS_REQUIRES(s->stats_mutex)
{
    ovs_mutex_lock(&s->clear_mutex);
    for (int i = 0; i < PMD_N_STATS; i++) {
        atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
    }
    /* The following stats are only applicable in the PMD thread. */
    memset(&s->current, 0, sizeof(struct iter_stats));
    memset(&s->totals, 0, sizeof(struct iter_stats));
    histogram_clear(&s->cycles);
    histogram_clear(&s->pkts);
    histogram_clear(&s->cycles_per_pkt);
    histogram_clear(&s->upcalls);
    histogram_clear(&s->cycles_per_upcall);
    histogram_clear(&s->pkts_per_batch);
    histogram_clear(&s->max_vhost_qfill);
    history_init(&s->iterations);
    history_init(&s->milliseconds);
    s->start_ms = time_msec();
    s->milliseconds.sample[0].timestamp = s->start_ms;
    /* Clearing finished. */
    s->clear = false;
    ovs_mutex_unlock(&s->clear_mutex);
}

/* This function can be called from anywhere to clear the stats
 * of PMD and non-PMD threads. */
void
pmd_perf_stats_clear(struct pmd_perf_stats *s)
{
    if (ovs_mutex_trylock(&s->stats_mutex) == 0) {
        /* Locking successful. PMD not polling. */
        pmd_perf_stats_clear_lock(s);
        ovs_mutex_unlock(&s->stats_mutex);
    } else {
        /* Request the polling PMD to clear the stats. There is no need to
         * block here as stats retrieval is prevented during clearing. */
        s->clear = true;
    }
}

/* Functions recording PMD metrics per iteration. */

inline void
pmd_perf_start_iteration(struct pmd_perf_stats *s)
    OVS_REQUIRES(s->stats_mutex)
{
    if (s->clear) {
        /* Clear the PMD stats before starting the next iteration. */
        pmd_perf_stats_clear_lock(s);
    }
    s->iteration_cnt++;
    /* Initialize the current interval stats. */
    memset(&s->current, 0, sizeof(struct iter_stats));
    if (OVS_LIKELY(s->last_tsc)) {
        /* We assume here that last_tsc was updated immediately prior,
         * at the end of the previous iteration, or just before the
         * first iteration. */
        s->start_tsc = s->last_tsc;
    } else {
        /* In case last_tsc has never been set before. */
        s->start_tsc = cycles_counter_update(s);
    }
}

inline void
pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                       int tx_packets, bool full_metrics)
{
    uint64_t now_tsc = cycles_counter_update(s);
    struct iter_stats *cum_ms;
    uint64_t cycles, cycles_per_pkt = 0;

    cycles = now_tsc - s->start_tsc;
    s->current.timestamp = s->iteration_cnt;
    s->current.cycles = cycles;
    s->current.pkts = rx_packets;

    if (rx_packets + tx_packets > 0) {
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles);
    } else {
        pmd_perf_update_counter(s, PMD_CYCLES_ITER_IDLE, cycles);
    }
    /* Add iteration samples to histograms. */
    histogram_add_sample(&s->cycles, cycles);
    histogram_add_sample(&s->pkts, rx_packets);

    if (!full_metrics) {
        return;
    }

    s->counters.n[PMD_CYCLES_UPCALL] += s->current.upcall_cycles;

    if (rx_packets > 0) {
        cycles_per_pkt = cycles / rx_packets;
        histogram_add_sample(&s->cycles_per_pkt, cycles_per_pkt);
    }
    if (s->current.batches > 0) {
        histogram_add_sample(&s->pkts_per_batch,
                             rx_packets / s->current.batches);
    }
    histogram_add_sample(&s->upcalls, s->current.upcalls);
    if (s->current.upcalls > 0) {
        histogram_add_sample(&s->cycles_per_upcall,
                             s->current.upcall_cycles / s->current.upcalls);
    }
    histogram_add_sample(&s->max_vhost_qfill, s->current.max_vhost_qfill);

    /* Add iteration samples to millisecond stats. */
    cum_ms = history_current(&s->milliseconds);
    cum_ms->iterations++;
    cum_ms->cycles += cycles;
    if (rx_packets > 0) {
        cum_ms->busy_cycles += cycles;
    }
    cum_ms->pkts += s->current.pkts;
    cum_ms->upcalls += s->current.upcalls;
    cum_ms->upcall_cycles += s->current.upcall_cycles;
    cum_ms->batches += s->current.batches;
    cum_ms->max_vhost_qfill += s->current.max_vhost_qfill;

    /* Store in iteration history. This advances the iteration idx and
     * clears the next slot in the iteration history. */
    history_store(&s->iterations, &s->current);
    if (now_tsc > s->next_check_tsc) {
        /* Check if ms is completed and store in milliseconds history. */
        uint64_t now = time_msec();
        if (now != cum_ms->timestamp) {
            /* Add ms stats to totals. */
            s->totals.iterations += cum_ms->iterations;
            s->totals.cycles += cum_ms->cycles;
            s->totals.busy_cycles += cum_ms->busy_cycles;
            s->totals.pkts += cum_ms->pkts;
            s->totals.upcalls += cum_ms->upcalls;
            s->totals.upcall_cycles += cum_ms->upcall_cycles;
            s->totals.batches += cum_ms->batches;
            s->totals.max_vhost_qfill += cum_ms->max_vhost_qfill;
            cum_ms = history_next(&s->milliseconds);
            cum_ms->timestamp = now;
        }
        /* Do the next check after 10K cycles (4 us at 2.5 GHz TSC clock). */
        s->next_check_tsc = cycles_counter_update(s) + 10000;
    }
}
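
/* Note (editor): a minimal sketch of how the dpif-netdev PMD main loop is
 * expected to drive the two hooks above (heavily simplified; the real loop
 * in dpif-netdev.c also handles rxq polling, tx flushing and
 * reconfiguration; poll_rx_queues()/flush_tx_queues() are hypothetical
 * placeholders, and full_metrics reflects the pmd-perf-metrics setting):
 *
 *     for (;;) {
 *         pmd_perf_start_iteration(s);
 *         int rx_packets = poll_rx_queues();    // hypothetical helper
 *         int tx_packets = flush_tx_queues();   // hypothetical helper
 *         pmd_perf_end_iteration(s, rx_packets, tx_packets, full_metrics);
 *     }
 */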