mirror of
https://github.com/openvswitch/ovs
synced 2025-08-22 09:58:01 +00:00
utilities: Add upcall statistics to the kernel_delay.py script.
This patch installs a kernel return probe on ovs_dp_upcall() to record all successful and failed calls per CPU. The statistics are reported when the script exits, providing insights into the upcall behavior. This is an example output: # UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]): Total upcalls : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...] Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...] Failed upcalls : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...] 11, EAGAIN : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...] Tested-by: Adrian Moreno <amoreno@redhat.com> Reviewed-by: Adrian Moreno <amoreno@redhat.com> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
This commit is contained in:
parent
a48b3278f0
commit
638433bb92
@ -149,6 +149,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \
|
||||
utilities/ovs-pipegen.py \
|
||||
utilities/usdt-scripts/dpif_op_nl_monitor.py \
|
||||
utilities/usdt-scripts/flow_reval_monitor.py \
|
||||
utilities/usdt-scripts/kernel_delay.py \
|
||||
utilities/usdt-scripts/upcall_monitor.py \
|
||||
utilities/usdt-scripts/upcall_cost.py
|
||||
|
||||
|
@ -39,6 +39,7 @@
|
||||
#
|
||||
import argparse
|
||||
import datetime
|
||||
import errno
|
||||
import os
|
||||
import pytz
|
||||
import psutil
|
||||
@ -556,6 +557,36 @@ TRACEPOINT_PROBE(irq, softirq_exit)
|
||||
data->start_ns = 0;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* For measuring upcall statistics (per CPU).
|
||||
*/
|
||||
BPF_PERCPU_HASH(upcall_count);
|
||||
|
||||
#if <INSTALL_OVS_DP_UPCALL_PROBE>
|
||||
int kretprobe__ovs_dp_upcall(struct pt_regs *ctx)
|
||||
{
|
||||
int ret = PT_REGS_RC(ctx);
|
||||
u64 zero = 0;
|
||||
u64 *entry;
|
||||
u64 key;
|
||||
|
||||
if (!capture_enabled__())
|
||||
return 0;
|
||||
|
||||
if (ret >= 0)
|
||||
key = 0;
|
||||
else
|
||||
key = -ret;
|
||||
|
||||
entry = upcall_count.lookup_or_try_init(&key, &zero);
|
||||
if (entry)
|
||||
*entry += 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif /* For measuring upcall statistics/errors. */
|
||||
"""
|
||||
|
||||
|
||||
@ -887,6 +918,7 @@ def reset_capture():
|
||||
bpf["stack_traces"].clear()
|
||||
bpf["stop_start"].clear()
|
||||
bpf["stop_data"].clear()
|
||||
bpf["upcall_count"].clear()
|
||||
|
||||
|
||||
#
|
||||
@ -900,6 +932,97 @@ def print_timestamp(msg):
|
||||
print(time_string)
|
||||
|
||||
|
||||
#
|
||||
# Get errno short string
|
||||
#
|
||||
def get_errno_short(err):
|
||||
try:
|
||||
return errno.errorcode[err]
|
||||
except KeyError:
|
||||
return "_unknown_"
|
||||
|
||||
|
||||
#
|
||||
# Format a eBPF per-cpu hash entry (if the count is > 0)
|
||||
#
|
||||
def format_per_cpu_hash(cpu_hash, key=None, skip_key=None):
|
||||
per_cpu = ""
|
||||
|
||||
if key is not None:
|
||||
total = cpu_hash.sum(key).value
|
||||
if total > 0:
|
||||
for cpu, value in enumerate(cpu_hash.getvalue(key)):
|
||||
if value == 0:
|
||||
continue
|
||||
|
||||
per_cpu += " {}: {},".format(cpu, value)
|
||||
else:
|
||||
total = 0
|
||||
total_cpu = None
|
||||
|
||||
for key in cpu_hash.keys():
|
||||
if skip_key is not None and skip_key.value == key.value:
|
||||
continue
|
||||
|
||||
if total_cpu is None:
|
||||
total_cpu = [0] * len(cpu_hash.getvalue(key))
|
||||
|
||||
for cpu, value in enumerate(cpu_hash.getvalue(key)):
|
||||
total_cpu[cpu] += value
|
||||
total += value
|
||||
|
||||
if total >= 0 and total_cpu:
|
||||
for cpu, value in enumerate(total_cpu):
|
||||
if value == 0:
|
||||
continue
|
||||
|
||||
per_cpu += " {}: {},".format(cpu, value)
|
||||
|
||||
return total, per_cpu.strip(", ")
|
||||
|
||||
|
||||
#
|
||||
# Display kernel upcall statistics
|
||||
#
|
||||
def display_upcall_results():
|
||||
upcalls = bpf["upcall_count"]
|
||||
have_upcalls = False
|
||||
|
||||
for k in upcalls:
|
||||
if upcalls.sum(k).value == 0:
|
||||
continue
|
||||
have_upcalls = True
|
||||
break
|
||||
|
||||
if not have_upcalls:
|
||||
return
|
||||
|
||||
print("\n\n# UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):\n"
|
||||
" Total upcalls : {} [{}]".format(
|
||||
*format_per_cpu_hash(upcalls)))
|
||||
|
||||
for k in sorted(upcalls, key=lambda x: int(x.value)):
|
||||
error = k.value
|
||||
total, per_cpu = format_per_cpu_hash(upcalls, key=k)
|
||||
|
||||
if error != 0 and total == 0:
|
||||
continue
|
||||
|
||||
if error == 0:
|
||||
total_failed, per_cpu_failed = format_per_cpu_hash(upcalls,
|
||||
skip_key=k)
|
||||
if total_failed == 0:
|
||||
continue
|
||||
|
||||
print(" Successfull upcalls : {} [{}]".format(total, per_cpu))
|
||||
print(" Failed upcalls : {} [{}]".format(total_failed,
|
||||
per_cpu_failed))
|
||||
else:
|
||||
print(" {:3}, {:13}: {} [{}]".format(error,
|
||||
get_errno_short(error),
|
||||
total, per_cpu))
|
||||
|
||||
|
||||
#
|
||||
# process_results()
|
||||
#
|
||||
@ -1074,7 +1197,12 @@ def process_results(syscall_events=None, trigger_delta=None):
|
||||
indent, "TOTAL:", "", total_count, total_ns))
|
||||
|
||||
#
|
||||
# Print events
|
||||
# Print upcall statistics
|
||||
#
|
||||
display_upcall_results()
|
||||
|
||||
#
|
||||
# Print syscall events
|
||||
#
|
||||
lost_stack_traces = 0
|
||||
if syscall_events:
|
||||
@ -1194,6 +1322,9 @@ def main():
|
||||
parser.add_argument("--skip-syscall-poll-events",
|
||||
help="Skip poll() syscalls with --syscall-events",
|
||||
action="store_true")
|
||||
parser.add_argument("--skip-upcall-stats",
|
||||
help="Skip the collection of upcall statistics",
|
||||
action="store_true")
|
||||
parser.add_argument("--stack-trace-size",
|
||||
help="Number of unique stack traces that can be "
|
||||
"recorded, default 4096. 0 to disable",
|
||||
@ -1298,6 +1429,9 @@ def main():
|
||||
source = source.replace("<STACK_TRACE_ENABLED>", "true"
|
||||
if options.stack_trace_size > 0 else "false")
|
||||
|
||||
source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
|
||||
if options.skip_upcall_stats else "1")
|
||||
|
||||
#
|
||||
# Handle start/stop probes
|
||||
#
|
||||
|
@ -109,6 +109,7 @@ followed by resource-specific data. Which are:
|
||||
- ``THREAD STOPPED STATISTICS``
|
||||
- ``HARD IRQ STATISTICS``
|
||||
- ``SOFT IRQ STATISTICS``
|
||||
- ``UPCALL STATISTICS``
|
||||
|
||||
The following sections will describe in detail what statistics they report.
|
||||
|
||||
@ -187,6 +188,14 @@ number of interrupts (``COUNT``), the total time spent in the interrupt
|
||||
handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
|
||||
|
||||
|
||||
``UPCALL STATISTICS``
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
The ``UPCALL STATISTICS`` section provides information on the number of
|
||||
upcalls sent by the kernel to userspace. If any upcalls fail to be sent,
|
||||
the specific return codes are recorded. Statistics are presented both as
|
||||
a total and on a per-CPU basis.
|
||||
|
||||
|
||||
The ``--syscall-events`` option
|
||||
-------------------------------
|
||||
In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
|
||||
|
Loading…
x
Reference in New Issue
Block a user