2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 09:58:01 +00:00

utilities: Add upcall statistics to the kernel_delay.py script.

This patch installs a kernel return probe on ovs_dp_upcall() to record
all successful and failed calls per CPU. The statistics are reported
when the script exits, providing insights into the upcall behavior.

This is an example output:

  # UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):
    Total upcalls       : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...]
    Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...]
    Failed upcalls      : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]
       11, EAGAIN       : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...]

Tested-by: Adrian Moreno <amoreno@redhat.com>
Reviewed-by: Adrian Moreno <amoreno@redhat.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
This commit is contained in:
Eelco Chaudron 2024-12-19 15:29:43 +01:00
parent a48b3278f0
commit 638433bb92
3 changed files with 145 additions and 1 deletions

View File

@ -149,6 +149,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \
utilities/ovs-pipegen.py \
utilities/usdt-scripts/dpif_op_nl_monitor.py \
utilities/usdt-scripts/flow_reval_monitor.py \
utilities/usdt-scripts/kernel_delay.py \
utilities/usdt-scripts/upcall_monitor.py \
utilities/usdt-scripts/upcall_cost.py

View File

@ -39,6 +39,7 @@
#
import argparse
import datetime
import errno
import os
import pytz
import psutil
@ -556,6 +557,36 @@ TRACEPOINT_PROBE(irq, softirq_exit)
data->start_ns = 0;
return 0;
}
/*
* For measuring upcall statistics (per CPU).
*/
BPF_PERCPU_HASH(upcall_count);
#if <INSTALL_OVS_DP_UPCALL_PROBE>
int kretprobe__ovs_dp_upcall(struct pt_regs *ctx)
{
    /* Return code of ovs_dp_upcall() as seen by the return probe. */
    int rc = PT_REGS_RC(ctx);
    u64 initial = 0;
    u64 *counter;
    u64 key;

    if (!capture_enabled__())
        return 0;

    /*
     * Every non-negative return code is accounted under key 0; a negative
     * return code is stored under its positive value.
     */
    key = (rc >= 0) ? 0 : -rc;

    /* The lookup may fail to create the entry; only count when it exists. */
    counter = upcall_count.lookup_or_try_init(&key, &initial);
    if (counter)
        (*counter)++;

    return 0;
}
#endif /* For measuring upcall statistics/errors. */
"""
@ -887,6 +918,7 @@ def reset_capture():
bpf["stack_traces"].clear()
bpf["stop_start"].clear()
bpf["stop_data"].clear()
bpf["upcall_count"].clear()
#
@ -900,6 +932,97 @@ def print_timestamp(msg):
print(time_string)
#
# Get errno short string
#
def get_errno_short(err):
    """Return the symbolic name of errno value *err*.

    The name is looked up in ``errno.errorcode``; values that have no entry
    there are reported as ``"_unknown_"``.
    """
    return errno.errorcode.get(err, "_unknown_")
#
# Format an eBPF per-cpu hash entry (if the count is > 0)
#
def format_per_cpu_hash(cpu_hash, key=None, skip_key=None):
    """Summarise a per-CPU hash as ``(total, "cpu: count, cpu: count, ...")``.

    Args:
        cpu_hash: Per-CPU hash table. It must provide ``sum(key)`` (whose
            result exposes ``.value``), ``getvalue(key)`` (a sized iterable
            holding one count per CPU) and ``keys()``.
        key: When not None, only this entry is summarised.
        skip_key: Only honoured when ``key`` is None. All entries are summed,
            except the one whose ``.value`` equals ``skip_key.value``.

    Returns:
        A ``(total, per_cpu)`` tuple. ``per_cpu`` lists only the CPUs with a
        non-zero count and is the empty string when there is nothing to list.
    """
    if key is not None:
        total = cpu_hash.sum(key).value
        # An entry that sums to zero has no per-CPU detail to report, so the
        # per-CPU values are not even fetched in that case.
        counts = cpu_hash.getvalue(key) if total > 0 else ()
    else:
        total = 0
        counts = None

        for entry in cpu_hash.keys():
            if skip_key is not None and skip_key.value == entry.value:
                continue

            # Fetch the per-CPU values once per key; the same result is used
            # both to size the accumulator and to add up the counts.
            values = cpu_hash.getvalue(entry)
            if counts is None:
                counts = [0] * len(values)

            for cpu, value in enumerate(values):
                counts[cpu] += value
                total += value

        if counts is None:
            # No key contributed (empty table, or everything was skipped).
            counts = ()

    per_cpu = ", ".join("{}: {}".format(cpu, value)
                        for cpu, value in enumerate(counts)
                        if value != 0)
    return total, per_cpu
#
# Display kernel upcall statistics
#
def display_upcall_results():
    """Print the upcall statistics gathered in ``bpf["upcall_count"]``.

    The map is keyed by result code: key 0 holds the successful upcalls and
    every other key holds the upcalls that failed with that errno value.

    Nothing is printed when no upcall was counted. When there are no failed
    upcalls only the "Total" line is printed. Otherwise the success/failure
    summary is printed, followed by one line per errno value. The summary is
    printed even when not a single upcall succeeded, i.e. when key 0 is absent
    from the map.
    """
    upcalls = bpf["upcall_count"]

    # Stay silent unless at least one entry has a non-zero count.
    if not any(upcalls.sum(k).value for k in upcalls):
        return

    print("\n\n# UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):\n"
          " Total upcalls : {} [{}]".format(
              *format_per_cpu_hash(upcalls)))

    keys = sorted(upcalls, key=lambda x: int(x.value))

    # Key 0 only exists once an upcall succeeded; it may be missing.
    success_key = next((k for k in keys if k.value == 0), None)

    # With skip_key=None nothing is skipped, so this is still the number of
    # failed upcalls when there is no success entry at all.
    total_failed, per_cpu_failed = format_per_cpu_hash(upcalls,
                                                       skip_key=success_key)
    if total_failed == 0:
        # Everything succeeded; the "Total" line already tells the story.
        return

    if success_key is None:
        total_ok, per_cpu_ok = 0, ""
    else:
        total_ok, per_cpu_ok = format_per_cpu_hash(upcalls, key=success_key)

    print(" Successfull upcalls : {} [{}]".format(total_ok, per_cpu_ok))
    print(" Failed upcalls : {} [{}]".format(total_failed,
                                             per_cpu_failed))

    # Break the failures down per errno value, in ascending order.
    for k in keys:
        error = k.value
        if error == 0:
            continue

        total, per_cpu = format_per_cpu_hash(upcalls, key=k)
        if total == 0:
            continue

        print(" {:3}, {:13}: {} [{}]".format(error,
                                             get_errno_short(error),
                                             total, per_cpu))
#
# process_results()
#
@ -1074,7 +1197,12 @@ def process_results(syscall_events=None, trigger_delta=None):
indent, "TOTAL:", "", total_count, total_ns))
#
# Print events
# Print upcall statistics
#
display_upcall_results()
#
# Print syscall events
#
lost_stack_traces = 0
if syscall_events:
@ -1194,6 +1322,9 @@ def main():
parser.add_argument("--skip-syscall-poll-events",
help="Skip poll() syscalls with --syscall-events",
action="store_true")
parser.add_argument("--skip-upcall-stats",
help="Skip the collection of upcall statistics",
action="store_true")
parser.add_argument("--stack-trace-size",
help="Number of unique stack traces that can be "
"recorded, default 4096. 0 to disable",
@ -1298,6 +1429,9 @@ def main():
source = source.replace("<STACK_TRACE_ENABLED>", "true"
if options.stack_trace_size > 0 else "false")
source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
if options.skip_upcall_stats else "1")
#
# Handle start/stop probes
#

View File

@ -109,6 +109,7 @@ followed by resource-specific data. Which are:
- ``THREAD STOPPED STATISTICS``
- ``HARD IRQ STATISTICS``
- ``SOFT IRQ STATISTICS``
- ``UPCALL STATISTICS``
The following sections will describe in detail what statistics they report.
@ -187,6 +188,14 @@ number of interrupts (``COUNT``), the total time spent in the interrupt
handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
``UPCALL STATISTICS``
~~~~~~~~~~~~~~~~~~~~~
The ``UPCALL STATISTICS`` section provides information on the number of
upcalls sent by the kernel to userspace. If any upcalls fail to be sent,
the specific return codes are recorded. Statistics are presented both as
a total and on a per-CPU basis.
The ``--syscall-events`` option
-------------------------------
In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,