mirror of
https://github.com/openvswitch/ovs
synced 2025-10-25 15:07:05 +00:00
poll-loop: Automatically log reason for wakeup when CPU usage spikes.
For a long time, the poll-loop module has had the ability to log the reason for wakeups, which is valuable for debugging excessive use of CPU time. But I have to ask users to turn up the log level for the module, which wastes their time and mine. This commit improves the situation by automatically logging the reason for a wakeup whenever a process's estimated CPU usage rises above 50%. (ovs-vswitchd often uses less than 1% CPU; more than 5% CPU is uncommon.)
This commit is contained in:
@@ -146,23 +146,65 @@ poll_immediate_wake(const char *where)
|
||||
poll_timer_wait(0, where);
|
||||
}
|
||||
|
||||
static void PRINTF_FORMAT(2, 3)
|
||||
log_wakeup(const char *where, const char *format, ...)
|
||||
/* Logs, if appropriate, that the poll loop was awakened by an event
|
||||
* registered at 'where' (typically a source file and line number). The other
|
||||
* arguments have two possible interpretations:
|
||||
*
|
||||
* - If 'pollfd' is nonnull then it should be the "struct pollfd" that caused
|
||||
* the wakeup. In this case, 'timeout' is ignored.
|
||||
*
|
||||
* - If 'pollfd' is nonnull then 'timeout' is the number of milliseconds
|
||||
* after which the poll loop woke up.
|
||||
*/
|
||||
static void
|
||||
log_wakeup(const char *where, const struct pollfd *pollfd, int timeout)
|
||||
{
|
||||
struct ds ds;
|
||||
va_list args;
|
||||
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(120, 120);
|
||||
enum vlog_level level;
|
||||
int cpu_usage;
|
||||
struct ds s;
|
||||
|
||||
ds_init(&ds);
|
||||
va_start(args, format);
|
||||
ds_put_format_valist(&ds, format, args);
|
||||
va_end(args);
|
||||
|
||||
if (where) {
|
||||
ds_put_format(&ds, " at %s", where);
|
||||
cpu_usage = get_cpu_usage();
|
||||
if (VLOG_IS_DBG_ENABLED()) {
|
||||
level = VLL_DBG;
|
||||
} else if (cpu_usage > 50 && !VLOG_DROP_WARN(&rl)) {
|
||||
level = VLL_WARN;
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
VLOG_DBG("%s", ds_cstr(&ds));
|
||||
ds_destroy(&ds);
|
||||
ds_init(&s);
|
||||
ds_put_cstr(&s, "wakeup due to ");
|
||||
if (pollfd) {
|
||||
char *description = describe_fd(pollfd->fd);
|
||||
if (pollfd->revents & POLLIN) {
|
||||
ds_put_cstr(&s, "[POLLIN]");
|
||||
}
|
||||
if (pollfd->revents & POLLOUT) {
|
||||
ds_put_cstr(&s, "[POLLOUT]");
|
||||
}
|
||||
if (pollfd->revents & POLLERR) {
|
||||
ds_put_cstr(&s, "[POLLERR]");
|
||||
}
|
||||
if (pollfd->revents & POLLHUP) {
|
||||
ds_put_cstr(&s, "[POLLHUP]");
|
||||
}
|
||||
if (pollfd->revents & POLLNVAL) {
|
||||
ds_put_cstr(&s, "[POLLNVAL]");
|
||||
}
|
||||
ds_put_format(&s, " on fd %d (%s)", pollfd->fd, description);
|
||||
free(description);
|
||||
} else {
|
||||
ds_put_format(&s, "%d-ms timeout", timeout);
|
||||
}
|
||||
if (where) {
|
||||
ds_put_format(&s, " at %s", where);
|
||||
}
|
||||
if (cpu_usage >= 0) {
|
||||
ds_put_format(&s, " (%d%% CPU usage)", cpu_usage);
|
||||
}
|
||||
VLOG(level, "%s", ds_cstr(&s));
|
||||
ds_destroy(&s);
|
||||
}
|
||||
|
||||
/* Blocks until one or more of the events registered with poll_fd_wait()
|
||||
@@ -203,21 +245,13 @@ poll_block(void)
|
||||
if (retval < 0) {
|
||||
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
|
||||
VLOG_ERR_RL(&rl, "poll: %s", strerror(-retval));
|
||||
} else if (!retval && VLOG_IS_DBG_ENABLED()) {
|
||||
log_wakeup(timeout_where, "%d-ms timeout", timeout);
|
||||
} else if (!retval) {
|
||||
log_wakeup(timeout_where, NULL, timeout);
|
||||
}
|
||||
|
||||
LIST_FOR_EACH_SAFE (pw, next, node, &waiters) {
|
||||
if (pw->pollfd->revents && VLOG_IS_DBG_ENABLED()) {
|
||||
char *description = describe_fd(pw->fd);
|
||||
log_wakeup(pw->where, "%s%s%s%s%s on fd %d (%s)",
|
||||
pw->pollfd->revents & POLLIN ? "[POLLIN]" : "",
|
||||
pw->pollfd->revents & POLLOUT ? "[POLLOUT]" : "",
|
||||
pw->pollfd->revents & POLLERR ? "[POLLERR]" : "",
|
||||
pw->pollfd->revents & POLLHUP ? "[POLLHUP]" : "",
|
||||
pw->pollfd->revents & POLLNVAL ? "[POLLNVAL]" : "",
|
||||
pw->fd, description);
|
||||
free(description);
|
||||
if (pw->pollfd->revents) {
|
||||
log_wakeup(pw->where, pw->pollfd, 0);
|
||||
}
|
||||
poll_cancel(pw);
|
||||
}
|
||||
|
||||
@@ -59,8 +59,9 @@ static void refresh_monotonic_if_ticked(void);
|
||||
static time_t time_add(time_t, time_t);
|
||||
static void block_sigalrm(sigset_t *);
|
||||
static void unblock_sigalrm(const sigset_t *);
|
||||
static void log_poll_interval(long long int last_wakeup,
|
||||
const struct rusage *last_rusage);
|
||||
static void log_poll_interval(long long int last_wakeup);
|
||||
static struct rusage *get_recent_rusage(void);
|
||||
static void refresh_rusage(void);
|
||||
|
||||
/* Initializes the timetracking module.
|
||||
*
|
||||
@@ -280,14 +281,13 @@ int
|
||||
time_poll(struct pollfd *pollfds, int n_pollfds, int timeout)
|
||||
{
|
||||
static long long int last_wakeup;
|
||||
static struct rusage last_rusage;
|
||||
long long int start;
|
||||
sigset_t oldsigs;
|
||||
bool blocked;
|
||||
int retval;
|
||||
|
||||
time_refresh();
|
||||
log_poll_interval(last_wakeup, &last_rusage);
|
||||
log_poll_interval(last_wakeup);
|
||||
coverage_clear();
|
||||
start = time_msec();
|
||||
blocked = false;
|
||||
@@ -318,7 +318,7 @@ time_poll(struct pollfd *pollfds, int n_pollfds, int timeout)
|
||||
unblock_sigalrm(&oldsigs);
|
||||
}
|
||||
last_wakeup = time_msec();
|
||||
getrusage(RUSAGE_SELF, &last_rusage);
|
||||
refresh_rusage();
|
||||
return retval;
|
||||
}
|
||||
|
||||
@@ -399,7 +399,7 @@ timeval_diff_msec(const struct timeval *a, const struct timeval *b)
|
||||
}
|
||||
|
||||
static void
|
||||
log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage)
|
||||
log_poll_interval(long long int last_wakeup)
|
||||
{
|
||||
static unsigned int mean_interval; /* In 16ths of a millisecond. */
|
||||
static unsigned int n_samples;
|
||||
@@ -415,6 +415,7 @@ log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage)
|
||||
/* Warn if we took too much time between polls: at least 50 ms and at least
|
||||
* 8X the mean interval. */
|
||||
if (n_samples > 10 && interval > mean_interval * 8 && interval > 50 * 16) {
|
||||
const struct rusage *last_rusage = get_recent_rusage();
|
||||
struct rusage rusage;
|
||||
|
||||
getrusage(RUSAGE_SELF, &rusage);
|
||||
@@ -460,3 +461,55 @@ log_poll_interval(long long int last_wakeup, const struct rusage *last_rusage)
|
||||
mean_interval = interval;
|
||||
}
|
||||
}
|
||||
|
||||
/* CPU usage tracking. */
|
||||
|
||||
struct cpu_usage {
|
||||
long long int when; /* Time that this sample was taken. */
|
||||
unsigned long long int cpu; /* Total user+system CPU usage when sampled. */
|
||||
};
|
||||
|
||||
static struct rusage recent_rusage;
|
||||
static struct cpu_usage older = { LLONG_MIN, 0 };
|
||||
static struct cpu_usage newer = { LLONG_MIN, 0 };
|
||||
static int cpu_usage = -1;
|
||||
|
||||
static struct rusage *
|
||||
get_recent_rusage(void)
|
||||
{
|
||||
return &recent_rusage;
|
||||
}
|
||||
|
||||
static void
|
||||
refresh_rusage(void)
|
||||
{
|
||||
long long int now;
|
||||
|
||||
now = time_msec();
|
||||
getrusage(RUSAGE_SELF, &recent_rusage);
|
||||
|
||||
if (now >= newer.when + 3 * 1000) {
|
||||
older = newer;
|
||||
newer.when = now;
|
||||
newer.cpu = (timeval_to_msec(&recent_rusage.ru_utime) +
|
||||
timeval_to_msec(&recent_rusage.ru_stime));
|
||||
|
||||
if (older.when != LLONG_MIN && newer.cpu > older.cpu) {
|
||||
unsigned int dividend = newer.cpu - older.cpu;
|
||||
unsigned int divisor = (newer.when - older.when) / 100;
|
||||
cpu_usage = divisor > 0 ? dividend / divisor : -1;
|
||||
} else {
|
||||
cpu_usage = -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Returns an estimate of this process's CPU usage, as a percentage, over the
|
||||
* past few seconds of wall-clock time. Returns -1 if no estimate is available
|
||||
* (which will happen if the process has not been running long enough to have
|
||||
* an estimate, and can happen for other reasons as well). */
|
||||
int
|
||||
get_cpu_usage(void)
|
||||
{
|
||||
return cpu_usage;
|
||||
}
|
||||
|
||||
@@ -62,6 +62,8 @@ long long int timeval_to_msec(const struct timeval *);
|
||||
|
||||
void xgettimeofday(struct timeval *);
|
||||
|
||||
int get_cpu_usage(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user