mirror of
https://github.com/openvswitch/ovs
synced 2025-08-25 11:27:46 +00:00
utilities: Add upcall statistics to the kernel_delay.py script.
This patch installs a kernel return probe on ovs_dp_upcall() to record all successful and failed calls per CPU. The statistics are reported when the script exits, providing insights into the upcall behavior. This is an example output: # UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]): Total upcalls : 183247 [0: 8937, 2: 14464, 4: 10372, 6: 4254, ...] Successfull upcalls : 120195 [0: 5945, 2: 6379, 4: 5966, 6: 4254, ...] Failed upcalls : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...] 11, EAGAIN : 63052 [0: 2992, 2: 8085, 4: 4406, 8: 2275, ...] Tested-by: Adrian Moreno <amoreno@redhat.com> Reviewed-by: Adrian Moreno <amoreno@redhat.com> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
This commit is contained in:
parent
a48b3278f0
commit
638433bb92
@ -149,6 +149,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \
|
|||||||
utilities/ovs-pipegen.py \
|
utilities/ovs-pipegen.py \
|
||||||
utilities/usdt-scripts/dpif_op_nl_monitor.py \
|
utilities/usdt-scripts/dpif_op_nl_monitor.py \
|
||||||
utilities/usdt-scripts/flow_reval_monitor.py \
|
utilities/usdt-scripts/flow_reval_monitor.py \
|
||||||
|
utilities/usdt-scripts/kernel_delay.py \
|
||||||
utilities/usdt-scripts/upcall_monitor.py \
|
utilities/usdt-scripts/upcall_monitor.py \
|
||||||
utilities/usdt-scripts/upcall_cost.py
|
utilities/usdt-scripts/upcall_cost.py
|
||||||
|
|
||||||
|
@ -39,6 +39,7 @@
|
|||||||
#
|
#
|
||||||
import argparse
|
import argparse
|
||||||
import datetime
|
import datetime
|
||||||
|
import errno
|
||||||
import os
|
import os
|
||||||
import pytz
|
import pytz
|
||||||
import psutil
|
import psutil
|
||||||
@ -556,6 +557,36 @@ TRACEPOINT_PROBE(irq, softirq_exit)
|
|||||||
data->start_ns = 0;
|
data->start_ns = 0;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For measuring upcall statistics (per CPU).
|
||||||
|
*/
|
||||||
|
BPF_PERCPU_HASH(upcall_count);
|
||||||
|
|
||||||
|
#if <INSTALL_OVS_DP_UPCALL_PROBE>
|
||||||
|
int kretprobe__ovs_dp_upcall(struct pt_regs *ctx)
|
||||||
|
{
|
||||||
|
int ret = PT_REGS_RC(ctx);
|
||||||
|
u64 zero = 0;
|
||||||
|
u64 *entry;
|
||||||
|
u64 key;
|
||||||
|
|
||||||
|
if (!capture_enabled__())
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (ret >= 0)
|
||||||
|
key = 0;
|
||||||
|
else
|
||||||
|
key = -ret;
|
||||||
|
|
||||||
|
entry = upcall_count.lookup_or_try_init(&key, &zero);
|
||||||
|
if (entry)
|
||||||
|
*entry += 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif /* For measuring upcall statistics/errors. */
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
@ -887,6 +918,7 @@ def reset_capture():
|
|||||||
bpf["stack_traces"].clear()
|
bpf["stack_traces"].clear()
|
||||||
bpf["stop_start"].clear()
|
bpf["stop_start"].clear()
|
||||||
bpf["stop_data"].clear()
|
bpf["stop_data"].clear()
|
||||||
|
bpf["upcall_count"].clear()
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
@ -900,6 +932,97 @@ def print_timestamp(msg):
|
|||||||
print(time_string)
|
print(time_string)
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Get errno short string
|
||||||
|
#
|
||||||
|
def get_errno_short(err):
|
||||||
|
try:
|
||||||
|
return errno.errorcode[err]
|
||||||
|
except KeyError:
|
||||||
|
return "_unknown_"
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Format a eBPF per-cpu hash entry (if the count is > 0)
|
||||||
|
#
|
||||||
|
def format_per_cpu_hash(cpu_hash, key=None, skip_key=None):
|
||||||
|
per_cpu = ""
|
||||||
|
|
||||||
|
if key is not None:
|
||||||
|
total = cpu_hash.sum(key).value
|
||||||
|
if total > 0:
|
||||||
|
for cpu, value in enumerate(cpu_hash.getvalue(key)):
|
||||||
|
if value == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
per_cpu += " {}: {},".format(cpu, value)
|
||||||
|
else:
|
||||||
|
total = 0
|
||||||
|
total_cpu = None
|
||||||
|
|
||||||
|
for key in cpu_hash.keys():
|
||||||
|
if skip_key is not None and skip_key.value == key.value:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if total_cpu is None:
|
||||||
|
total_cpu = [0] * len(cpu_hash.getvalue(key))
|
||||||
|
|
||||||
|
for cpu, value in enumerate(cpu_hash.getvalue(key)):
|
||||||
|
total_cpu[cpu] += value
|
||||||
|
total += value
|
||||||
|
|
||||||
|
if total >= 0 and total_cpu:
|
||||||
|
for cpu, value in enumerate(total_cpu):
|
||||||
|
if value == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
per_cpu += " {}: {},".format(cpu, value)
|
||||||
|
|
||||||
|
return total, per_cpu.strip(", ")
|
||||||
|
|
||||||
|
|
||||||
|
#
|
||||||
|
# Display kernel upcall statistics
|
||||||
|
#
|
||||||
|
def display_upcall_results():
|
||||||
|
upcalls = bpf["upcall_count"]
|
||||||
|
have_upcalls = False
|
||||||
|
|
||||||
|
for k in upcalls:
|
||||||
|
if upcalls.sum(k).value == 0:
|
||||||
|
continue
|
||||||
|
have_upcalls = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if not have_upcalls:
|
||||||
|
return
|
||||||
|
|
||||||
|
print("\n\n# UPCALL STATISTICS (TOTAL [CPU_ID: N_UPCALLS_PER_CPU, ...]):\n"
|
||||||
|
" Total upcalls : {} [{}]".format(
|
||||||
|
*format_per_cpu_hash(upcalls)))
|
||||||
|
|
||||||
|
for k in sorted(upcalls, key=lambda x: int(x.value)):
|
||||||
|
error = k.value
|
||||||
|
total, per_cpu = format_per_cpu_hash(upcalls, key=k)
|
||||||
|
|
||||||
|
if error != 0 and total == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if error == 0:
|
||||||
|
total_failed, per_cpu_failed = format_per_cpu_hash(upcalls,
|
||||||
|
skip_key=k)
|
||||||
|
if total_failed == 0:
|
||||||
|
continue
|
||||||
|
|
||||||
|
print(" Successfull upcalls : {} [{}]".format(total, per_cpu))
|
||||||
|
print(" Failed upcalls : {} [{}]".format(total_failed,
|
||||||
|
per_cpu_failed))
|
||||||
|
else:
|
||||||
|
print(" {:3}, {:13}: {} [{}]".format(error,
|
||||||
|
get_errno_short(error),
|
||||||
|
total, per_cpu))
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# process_results()
|
# process_results()
|
||||||
#
|
#
|
||||||
@ -1074,7 +1197,12 @@ def process_results(syscall_events=None, trigger_delta=None):
|
|||||||
indent, "TOTAL:", "", total_count, total_ns))
|
indent, "TOTAL:", "", total_count, total_ns))
|
||||||
|
|
||||||
#
|
#
|
||||||
# Print events
|
# Print upcall statistics
|
||||||
|
#
|
||||||
|
display_upcall_results()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Print syscall events
|
||||||
#
|
#
|
||||||
lost_stack_traces = 0
|
lost_stack_traces = 0
|
||||||
if syscall_events:
|
if syscall_events:
|
||||||
@ -1194,6 +1322,9 @@ def main():
|
|||||||
parser.add_argument("--skip-syscall-poll-events",
|
parser.add_argument("--skip-syscall-poll-events",
|
||||||
help="Skip poll() syscalls with --syscall-events",
|
help="Skip poll() syscalls with --syscall-events",
|
||||||
action="store_true")
|
action="store_true")
|
||||||
|
parser.add_argument("--skip-upcall-stats",
|
||||||
|
help="Skip the collection of upcall statistics",
|
||||||
|
action="store_true")
|
||||||
parser.add_argument("--stack-trace-size",
|
parser.add_argument("--stack-trace-size",
|
||||||
help="Number of unique stack traces that can be "
|
help="Number of unique stack traces that can be "
|
||||||
"recorded, default 4096. 0 to disable",
|
"recorded, default 4096. 0 to disable",
|
||||||
@ -1298,6 +1429,9 @@ def main():
|
|||||||
source = source.replace("<STACK_TRACE_ENABLED>", "true"
|
source = source.replace("<STACK_TRACE_ENABLED>", "true"
|
||||||
if options.stack_trace_size > 0 else "false")
|
if options.stack_trace_size > 0 else "false")
|
||||||
|
|
||||||
|
source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
|
||||||
|
if options.skip_upcall_stats else "1")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Handle start/stop probes
|
# Handle start/stop probes
|
||||||
#
|
#
|
||||||
|
@ -109,6 +109,7 @@ followed by resource-specific data. Which are:
|
|||||||
- ``THREAD STOPPED STATISTICS``
|
- ``THREAD STOPPED STATISTICS``
|
||||||
- ``HARD IRQ STATISTICS``
|
- ``HARD IRQ STATISTICS``
|
||||||
- ``SOFT IRQ STATISTICS``
|
- ``SOFT IRQ STATISTICS``
|
||||||
|
- ``UPCALL STATISTICS``
|
||||||
|
|
||||||
The following sections will describe in detail what statistics they report.
|
The following sections will describe in detail what statistics they report.
|
||||||
|
|
||||||
@ -187,6 +188,14 @@ number of interrupts (``COUNT``), the total time spent in the interrupt
|
|||||||
handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
|
handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
|
||||||
|
|
||||||
|
|
||||||
|
``UPCALL STATISTICS``
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
The ``UPCALL STATISTICS`` section provides information on the number of
|
||||||
|
upcalls sent by the kernel to userspace. If any upcalls fail to be sent,
|
||||||
|
the specific return codes are recorded. Statistics are presented both as
|
||||||
|
a total and on a per-CPU basis.
|
||||||
|
|
||||||
|
|
||||||
The ``--syscall-events`` option
|
The ``--syscall-events`` option
|
||||||
-------------------------------
|
-------------------------------
|
||||||
In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
|
In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user