ovs/lib/dpif-netdev-perf.h
Kevin Traynor de3bbdc479 dpif-netdev: Add PMD load based sleeping.
Sleep for an incremental amount of time if none of the Rx queues
assigned to a PMD have at least half a batch of packets (i.e. 16 pkts)
on a polling iteration of the PMD.

Upon detecting the threshold of >= 16 pkts on an Rxq, reset the
sleep time to zero (i.e. no sleep).

Sleep time is increased on each iteration where the low-load
condition persists, up to the maximum sleep time set by the user,
e.g.:
ovs-vsctl set Open_vSwitch . other_config:pmd-maxsleep=500

The default pmd-maxsleep value is 0, which means that no sleeps
will occur and the behaviour is unchanged from previous releases.
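
A simplified sketch of this control flow (the helper name, the 1 us
increment step, and the exact structure are illustrative assumptions,
not the actual dpif-netdev code):

    /* Illustrative sketch only; not the exact OVS implementation. */
    static uint64_t
    maybe_sleep(uint64_t cur_sleep_us, uint64_t max_sleep_us,
                int max_rxq_batch_pkts)
    {
        if (max_rxq_batch_pkts >= 16) {
            return 0;                        /* Enough load: no sleep. */
        }
        if (cur_sleep_us) {
            xnanosleep(cur_sleep_us * 1000); /* us -> ns. */
        }
        /* Grow the next sleep, capped at the user's pmd-maxsleep. */
        return MIN(cur_sleep_us + 1, max_sleep_us);
    }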

Also add new stats to pmd-perf-show to give visibility of the
operation, e.g.:
...
   - sleep iterations:       153994  ( 76.8 % of iterations)
   Sleep time (us):         9159399  ( 59 us/iteration avg.)
...

Reviewed-by: Robin Jarry <rjarry@redhat.com>
Reviewed-by: David Marchand <david.marchand@redhat.com>
Signed-off-by: Kevin Traynor <ktraynor@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2023-01-12 18:56:05 +01:00

/*
 * Copyright (c) 2017 Ericsson AB.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef DPIF_NETDEV_PERF_H
#define DPIF_NETDEV_PERF_H 1

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>
#include <time.h>
#include <math.h>

#ifdef DPDK_NETDEV
#include <rte_config.h>
#include <rte_cycles.h>
#endif

#include "openvswitch/vlog.h"
#include "ovs-atomic.h"
#include "timeval.h"
#include "unixctl.h"
#include "util.h"

#ifdef __cplusplus
extern "C" {
#endif

/* This module encapsulates data structures and functions to maintain basic
 * PMD performance metrics such as packet counters and execution cycles, as
 * well as histograms and time series recording for more detailed PMD
 * metrics.
 *
 * It provides a clean API for dpif-netdev to initialize, update, read and
 * reset these metrics.
 *
 * The basic set of PMD counters is implemented as atomic_uint64_t variables
 * to guarantee correct reads also on 32-bit systems.
 *
 * The detailed PMD performance metrics are only supported on 64-bit systems
 * with atomic 64-bit read and store semantics for plain uint64_t counters.
 */

/* Set of counter types maintained in pmd_perf_stats. */
enum pmd_stat_type {
    PMD_STAT_PHWOL_HIT,     /* Packets that had a partial HWOL hit (phwol). */
    PMD_STAT_MFEX_OPT_HIT,  /* Packets that had miniflow optimized match. */
    PMD_STAT_SIMPLE_HIT,    /* Packets that had a simple match hit. */
    PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
    PMD_STAT_SMC_HIT,       /* Packets that had a sig match hit (SMC). */
    PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
    PMD_STAT_MISS,          /* Packets that did not match and upcall was ok. */
    PMD_STAT_LOST,          /* Packets that did not match and upcall failed. */
                            /* The above statistics account for the total
                             * number of packet passes through the datapath
                             * pipeline and should not be overlapping with
                             * each other. */
    PMD_STAT_MASKED_LOOKUP, /* Number of subtable lookups for flow table
                               hits. Each MASKED_HIT hit will have >= 1
                               MASKED_LOOKUP(s). */
    PMD_STAT_RECV,          /* Packets entering the datapath pipeline from an
                             * interface. */
    PMD_STAT_RECIRC,        /* Packets reentering the datapath pipeline due
                             * to recirculation. */
    PMD_STAT_SENT_PKTS,     /* Packets that have been sent. */
    PMD_STAT_SENT_BATCHES,  /* Number of batches sent. */
    PMD_CYCLES_ITER_IDLE,   /* Cycles spent in idle iterations. */
    PMD_CYCLES_ITER_BUSY,   /* Cycles spent in busy iterations. */
    PMD_CYCLES_UPCALL,      /* Cycles spent processing upcalls. */
    PMD_SLEEP_ITER,         /* Iterations where a sleep has taken place. */
    PMD_CYCLES_SLEEP,       /* Total cycles slept to save power. */
    PMD_N_STATS
};

/* Array of PMD counters indexed by enum pmd_stat_type.
 * The n[] array contains the actual counter values since initialization
 * of the PMD. Counters are atomically updated from the PMD but are
 * read and cleared also from other processes. To clear the counters at
 * PMD run-time, the current counter values are copied over to the zero[]
 * array. To read counters we subtract zero[] value from n[]. */
struct pmd_counters {
    atomic_uint64_t n[PMD_N_STATS];   /* Value since _init(). */
    uint64_t zero[PMD_N_STATS];       /* Value at last _clear(). */
};
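
/* Illustrative sketch of the read-and-clear scheme described above; these
 * helpers are assumptions for illustration only, not the upstream API (the
 * real entry points are pmd_perf_read_counters() and
 * pmd_perf_stats_clear()). */
static inline uint64_t
counter_value_example(struct pmd_counters *c, enum pmd_stat_type st)
{
    uint64_t total;

    atomic_read_relaxed(&c->n[st], &total);
    return total - c->zero[st];       /* Value since the last clear. */
}

static inline void
counter_clear_example(struct pmd_counters *c, enum pmd_stat_type st)
{
    /* "Clearing" only snapshots the running total into zero[]. */
    atomic_read_relaxed(&c->n[st], &c->zero[st]);
}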

/* Data structure to collect the statistical distribution of an integer
 * measurement type in form of a histogram. The wall[] array contains the
 * inclusive upper boundaries of the bins, while the bin[] array contains
 * the actual counters per bin. The histogram walls are typically set
 * automatically using the functions provided below. */

#define NUM_BINS 32             /* Number of histogram bins. */

struct histogram {
    uint32_t wall[NUM_BINS];
    uint64_t bin[NUM_BINS];
};

/* Data structure to record detailed PMD execution metrics per iteration for
 * a history period of up to HISTORY_LEN iterations in a circular buffer.
 * Also used to record up to HISTORY_LEN millisecond averages/totals of these
 * metrics. */
struct iter_stats {
    uint64_t timestamp;         /* Iteration no. or millisecond. */
    uint64_t cycles;            /* Number of TSC cycles spent in it. or ms. */
    uint64_t busy_cycles;       /* Cycles spent in busy iterations or ms. */
    uint32_t iterations;        /* Iterations in ms. */
    uint32_t pkts;              /* Packets processed in iteration or ms. */
    uint32_t upcalls;           /* Number of upcalls in iteration or ms. */
    uint32_t upcall_cycles;     /* Cycles spent in upcalls in it. or ms. */
    uint32_t batches;           /* Number of rx batches in iteration or ms. */
    uint32_t max_vhost_qfill;   /* Maximum fill level in iteration or ms. */
};

#define HISTORY_LEN 1000        /* Length of recorded history
                                   (iterations and ms). */
#define DEF_HIST_SHOW 20        /* Default number of history samples to
                                   display. */

struct history {
    size_t idx;                 /* Slot to which next call to
                                   history_store() will write. */
    struct iter_stats sample[HISTORY_LEN];
};

/* Container for all performance metrics of a PMD within the struct
 * dp_netdev_pmd_thread. The metrics must be updated from within the PMD
 * thread but can be read from any thread. The basic PMD counters in
 * struct pmd_counters can be read without protection against concurrent
 * clearing. The other metrics may only be safely read with the clear_mutex
 * held to protect against concurrent clearing. */
struct pmd_perf_stats {
    /* Prevents interference between PMD polling and stats clearing. */
    struct ovs_mutex stats_mutex;
    /* Set by CLI thread to order clearing of PMD stats. */
    volatile bool clear;
    /* Prevents stats retrieval while clearing is in progress. */
    struct ovs_mutex clear_mutex;
    /* Start of the current performance measurement period. */
    uint64_t start_ms;
    /* Counter for PMD iterations. */
    uint64_t iteration_cnt;
    /* Start of the current iteration. */
    uint64_t start_tsc;
    /* Latest TSC time stamp taken in PMD. */
    uint64_t last_tsc;
    /* Used to space certain checks in time. */
    uint64_t next_check_tsc;
    /* If non-NULL, outermost cycle timer currently running in PMD. */
    struct cycle_timer *cur_timer;
    /* Set of PMD counters with their zero offsets. */
    struct pmd_counters counters;
    /* Statistics of the current iteration. */
    struct iter_stats current;
    /* Totals for the current millisecond. */
    struct iter_stats totals;
    /* Histograms for the PMD metrics. */
    struct histogram cycles;
    struct histogram pkts;
    struct histogram cycles_per_pkt;
    struct histogram upcalls;
    struct histogram cycles_per_upcall;
    struct histogram pkts_per_batch;
    struct histogram max_vhost_qfill;
    /* Iteration history buffer. */
    struct history iterations;
    /* Millisecond history buffer. */
    struct history milliseconds;
    /* Suspicious iteration log. */
    uint32_t log_susp_it;
    /* Start of iteration range to log. */
    uint32_t log_begin_it;
    /* End of iteration range to log. */
    uint32_t log_end_it;
    /* Reason for logging suspicious iteration. */
    char *log_reason;
};

#ifdef __linux__
static inline uint64_t
rdtsc_syscall(struct pmd_perf_stats *s)
{
    struct timespec val;
    uint64_t v;

    if (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) {
        return s->last_tsc;
    }

    v = val.tv_sec * UINT64_C(1000000000) + val.tv_nsec;
    return s->last_tsc = v;
}
#endif

/* Support for accurate timing of PMD execution on TSC clock cycle level.
 * These functions are intended to be invoked in the context of pmd threads. */

/* Read the TSC cycle register and cache it. Any function not requiring clock
 * cycle accuracy should read the cached value using cycles_counter_get() to
 * avoid the overhead of reading the TSC register. */
static inline uint64_t
cycles_counter_update(struct pmd_perf_stats *s)
{
#ifdef DPDK_NETDEV
    return s->last_tsc = rte_get_tsc_cycles();
#elif !defined(_MSC_VER) && defined(__x86_64__)
    uint32_t h, l;

    asm volatile("rdtsc" : "=a" (l), "=d" (h));
    return s->last_tsc = ((uint64_t) h << 32) | l;
#elif !defined(_MSC_VER) && defined(__aarch64__)
    asm volatile("mrs %0, cntvct_el0" : "=r" (s->last_tsc));
    return s->last_tsc;
#elif defined(__linux__)
    return rdtsc_syscall(s);
#else
    return s->last_tsc = 0;
#endif
}

static inline uint64_t
cycles_counter_get(struct pmd_perf_stats *s)
{
    return s->last_tsc;
}
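
/* Illustrative sketch (not part of the upstream API): within a PMD thread,
 * cycles_counter_get() simply returns the value cached by the most recent
 * cycles_counter_update() without touching the TSC again. */
static inline void
cycles_counter_usage_example(struct pmd_perf_stats *s)
{
    uint64_t fresh = cycles_counter_update(s);   /* Reads the TSC. */
    uint64_t cached = cycles_counter_get(s);     /* No TSC access. */

    ovs_assert(cached == fresh);
}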
void pmd_perf_estimate_tsc_frequency(void);

/* A nestable timer for measuring execution time in TSC cycles.
 *
 * Usage:
 * struct cycle_timer timer;
 *
 * cycle_timer_start(pmd, &timer);
 * <Timed execution>
 * uint64_t cycles = cycle_timer_stop(pmd, &timer);
 *
 * The caller must guarantee that a call to cycle_timer_start() is always
 * paired with a call to cycle_timer_stop().
 *
 * It is possible to have nested cycle timers within the timed code. The
 * execution time measured by the nested timers is excluded from the time
 * measured by the enclosing timer.
 */

struct cycle_timer {
    uint64_t start;
    uint64_t suspended;
    struct cycle_timer *interrupted;
};

static inline void
cycle_timer_start(struct pmd_perf_stats *s,
                  struct cycle_timer *timer)
{
    struct cycle_timer *cur_timer = s->cur_timer;
    uint64_t now = cycles_counter_update(s);

    if (cur_timer) {
        cur_timer->suspended = now;
    }

    timer->interrupted = cur_timer;
    timer->start = now;
    timer->suspended = 0;
    s->cur_timer = timer;
}

static inline uint64_t
cycle_timer_stop(struct pmd_perf_stats *s,
                 struct cycle_timer *timer)
{
    /* Assert that this is the current cycle timer. */
    ovs_assert(s->cur_timer == timer);
    uint64_t now = cycles_counter_update(s);
    struct cycle_timer *intr_timer = timer->interrupted;

    if (intr_timer) {
        /* Adjust the start offset by the suspended cycles. */
        intr_timer->start += now - intr_timer->suspended;
    }

    /* Restore suspended timer, if any. */
    s->cur_timer = intr_timer;
    return now - timer->start;
}
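
/* Illustrative sketch (not upstream code): with nested timers, the cycles
 * measured by the inner timer are excluded from the outer timer's result,
 * because stopping the inner timer shifts the outer timer's start offset. */
static inline uint64_t
nested_timer_example(struct pmd_perf_stats *s, uint64_t *inner_cycles)
{
    struct cycle_timer outer, inner;

    cycle_timer_start(s, &outer);
    /* ... outer work ... */
    cycle_timer_start(s, &inner);                  /* Suspends 'outer'. */
    /* ... e.g. upcall handling ... */
    *inner_cycles = cycle_timer_stop(s, &inner);   /* Resumes 'outer'. */
    /* ... more outer work ... */

    /* The returned outer cycles exclude '*inner_cycles'. */
    return cycle_timer_stop(s, &outer);
}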

/* Functions to initialize and reset the PMD performance metrics. */

void pmd_perf_stats_init(struct pmd_perf_stats *s);
void pmd_perf_stats_clear(struct pmd_perf_stats *s);
void pmd_perf_stats_clear_lock(struct pmd_perf_stats *s);

/* Functions to read and update PMD counters. */

void pmd_perf_read_counters(struct pmd_perf_stats *s,
                            uint64_t stats[PMD_N_STATS]);

/* PMD performance counters are updated lock-less. For real PMDs
 * they are only updated from the PMD thread itself. In the case of the
 * NON-PMD they might be updated from multiple threads, but we can live
 * with losing a rare update as 100% accuracy is not required.
 * However, as counters are read for display from outside the PMD thread
 * with e.g. pmd-stats-show, we make sure that the 64-bit read and store
 * operations are atomic also on 32-bit systems so that readers cannot
 * read garbage. On 64-bit systems this incurs no overhead. */
static inline void
pmd_perf_update_counter(struct pmd_perf_stats *s,
                        enum pmd_stat_type counter, int delta)
{
    uint64_t tmp;

    atomic_read_relaxed(&s->counters.n[counter], &tmp);
    tmp += delta;
    atomic_store_relaxed(&s->counters.n[counter], tmp);
}

/* Functions to manipulate a sample history. */

static inline void
histogram_add_sample(struct histogram *hist, uint32_t val)
{
    /* TODO: Can do better with binary search? */
    for (int i = 0; i < NUM_BINS - 1; i++) {
        if (val <= hist->wall[i]) {
            hist->bin[i]++;
            return;
        }
    }
    hist->bin[NUM_BINS - 1]++;
}
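
/* Illustrative only: the walls below are hand-set powers of two; in OVS the
 * walls are populated by helper functions in dpif-netdev-perf.c. A sample is
 * counted in the first bin whose (inclusive) wall it does not exceed. */
static inline void
histogram_usage_example(void)
{
    struct histogram h;

    memset(&h, 0, sizeof h);
    for (int i = 0; i < NUM_BINS; i++) {
        h.wall[i] = UINT32_C(1) << i;     /* 1, 2, 4, ..., 2^31. */
    }

    histogram_add_sample(&h, 3);          /* Bin 2: first wall >= 3 is 4. */
    histogram_add_sample(&h, 4);          /* Also bin 2: walls are inclusive. */
    histogram_add_sample(&h, UINT32_MAX); /* Overflow lands in the last bin. */

    ovs_assert(h.bin[2] == 2 && h.bin[NUM_BINS - 1] == 1);
}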
uint64_t histogram_samples(const struct histogram *hist);

/* This function is used to advance the given history index by positive
 * offset in the circular history buffer. */
static inline uint32_t
history_add(uint32_t idx, uint32_t offset)
{
    return (idx + offset) % HISTORY_LEN;
}

/* This function computes the difference between two indices into the
 * circular history buffer. The result is always positive in the range
 * 0 .. HISTORY_LEN-1 and specifies the number of steps to reach idx1
 * starting from idx2. It can also be used to retreat the history index
 * idx1 by idx2 steps. */
static inline uint32_t
history_sub(uint32_t idx1, uint32_t idx2)
{
    return (idx1 + HISTORY_LEN - idx2) % HISTORY_LEN;
}
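
/* Illustrative check of the circular index arithmetic above: with
 * HISTORY_LEN of 1000, slot 3 lies five steps ahead of slot 998. */
static inline void
history_index_example(void)
{
    ovs_assert(history_add(998, 5) == 3);
    ovs_assert(history_sub(3, 998) == 5);
}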

static inline struct iter_stats *
history_current(struct history *h)
{
    return &h->sample[h->idx];
}

static inline struct iter_stats *
history_next(struct history *h)
{
    size_t next_idx = history_add(h->idx, 1);
    struct iter_stats *next = &h->sample[next_idx];

    memset(next, 0, sizeof(*next));
    h->idx = next_idx;
    return next;
}

static inline struct iter_stats *
history_store(struct history *h, struct iter_stats *is)
{
    if (is) {
        h->sample[h->idx] = *is;
    }
    /* Advance the history pointer. */
    return history_next(h);
}

/* Data and functions related to logging of suspicious iterations. */
extern bool log_enabled;
extern bool log_extend;
extern uint32_t log_q_thr;
extern uint64_t iter_cycle_threshold;
void pmd_perf_set_log_susp_iteration(struct pmd_perf_stats *s, char *reason);
void pmd_perf_log_susp_iteration_neighborhood(struct pmd_perf_stats *s);

/* Functions recording PMD metrics per iteration. */

void
pmd_perf_start_iteration(struct pmd_perf_stats *s);
void
pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets,
                       int tx_packets, uint64_t sleep_cycles,
                       bool full_metrics);

/* Formatting the output of commands. */

struct pmd_perf_params {
    int command_type;
    bool histograms;
    size_t iter_hist_len;
    size_t ms_hist_len;
};
void pmd_perf_format_overall_stats(struct ds *str, struct pmd_perf_stats *s,
                                   double duration);
void pmd_perf_format_histograms(struct ds *str, struct pmd_perf_stats *s);
void pmd_perf_format_iteration_history(struct ds *str,
                                       struct pmd_perf_stats *s,
                                       int n_iter);
void pmd_perf_format_ms_history(struct ds *str, struct pmd_perf_stats *s,
                                int n_ms);
void pmd_perf_log_set_cmd(struct unixctl_conn *conn,
                          int argc, const char *argv[],
                          void *aux OVS_UNUSED);
#ifdef __cplusplus
}
#endif
#endif /* DPIF_NETDEV_PERF_H */