/* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_MLOCKALL
#include <sys/mman.h>
#endif
#include "bridge.h"
#include "command-line.h"
#include "compiler.h"
#include "daemon.h"
#include "dirs.h"
#include "dpif.h"
#include "dummy.h"
#include "fatal-signal.h"
#include "memory.h"
#include "netdev.h"
#include "openflow/openflow.h"
#include "ovsdb-idl.h"
#include "ovs-rcu.h"
#include "ovs-router.h"
#include "ovs-thread.h"
#include "openvswitch/poll-loop.h"
#include "simap.h"
#include "stream-ssl.h"
#include "stream.h"
#include "svec.h"
#include "timeval.h"
#include "unixctl.h"
#include "util.h"
#include "openvswitch/usdt-probes.h"
#include "openvswitch/vconn.h"
#include "openvswitch/vlog.h"
#include "lib/vswitch-idl.h"
#include "lib/dns-resolve.h"
VLOG_DEFINE_THIS_MODULE(vswitchd);
/* --mlockall: If set, locks all of the process's currently resident
 * memory pages into physical RAM, plus each new page as soon as it is
 * faulted in, preventing the kernel from paging any of its memory to
 * disk. */
static bool want_mlockall;
/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */
static bool hw_rawio_access;
static unixctl_cb_func ovs_vswitchd_exit;
static char *parse_options(int argc, char *argv[], char **unixctl_path);
OVS_NO_RETURN static void usage(void);
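/* State shared between the "exit" unixctl handler below and the main
 * loop: each connection that requested the exit is remembered here so
 * that it can be replied to once shutdown has completed. */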
static struct ovs_vswitchd_exit_args {
struct unixctl_conn **conns;
size_t n_conns;
bool exiting;
bool cleanup;
} exit_args;
int
main(int argc, char *argv[])
{
struct unixctl_server *unixctl;
char *unixctl_path = NULL;
char *remote;
int retval;
set_program_name(argv[0]);
ovsthread_id_init();
dns_resolve_init(true);
ovs_cmdl_proctitle_init(argc, argv);
service_start(&argc, &argv);
remote = parse_options(argc, argv, &unixctl_path);
fatal_ignore_sigpipe();
daemonize_start(true, hw_rawio_access);
if (want_mlockall) {
#ifdef HAVE_MLOCKALL
/* MCL_ONFAULT introduced in Linux kernel 4.4. */
#ifndef MCL_ONFAULT
#define MCL_ONFAULT 4
#endif
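        /* Prefer locking only the pages that have actually been
         * faulted in; on pre-4.4 kernels that reject MCL_ONFAULT,
         * fall back to locking every current and future page and
         * record that with set_all_memory_locked(). */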
if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) {
if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
VLOG_ERR("mlockall failed: %s", ovs_strerror(errno));
} else {
set_all_memory_locked();
}
}
#else
VLOG_ERR("mlockall not supported on this system");
#endif
}
retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {
exit(EXIT_FAILURE);
}
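    /* "ovs-appctl exit [--cleanup]" ends up in ovs_vswitchd_exit(),
     * defined below. */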
unixctl_command_register("exit", "[--cleanup]", 0, 1,
ovs_vswitchd_exit, NULL);
bridge_init(remote);
free(remote);
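    /* Main loop, following the usual OVS poll-loop pattern: each
     * module's *_run() function does a unit of work, each *_wait()
     * function registers the events to wake up for, and poll_block()
     * sleeps until one of those events fires. */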
while (!exit_args.exiting) {
OVS_USDT_PROBE(main, run_start);
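        /* memory_run() tracks the daemon's memory usage and decides
         * when a fresh usage report is due; bridge_get_memory_usage()
         * then fills in the per-module numbers. */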
memory_run();
if (memory_should_report()) {
struct simap usage;
simap_init(&usage);
bridge_get_memory_usage(&usage);
memory_report(&usage);
simap_destroy(&usage);
}
bridge_run();
unixctl_server_run(unixctl);
netdev_run();
memory_wait();
bridge_wait();
unixctl_server_wait(unixctl);
netdev_wait();
if (exit_args.exiting) {
poll_immediate_wake();
}
OVS_USDT_PROBE(main, poll_block);
poll_block();
if (should_service_stop()) {
exit_args.exiting = true;
}
}
bridge_exit(exit_args.cleanup);
for (size_t i = 0; i < exit_args.n_conns; i++) {
unixctl_command_reply(exit_args.conns[i], NULL);
}
free(exit_args.conns);
unixctl_server_destroy(unixctl);
service_stop();
vlog_disable_async();
ovsrcu_exit();
dns_resolve_destroy();
return 0;
}
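
/* Parses the command line, returning the OVSDB remote to connect to as
 * a malloc'ed string that the caller must free.  If '--unixctl' is
 * given, stores the requested control socket path in '*unixctl_pathp'. */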
static char *
parse_options(int argc, char *argv[], char **unixctl_pathp)
{
enum {
OPT_PEER_CA_CERT = UCHAR_MAX + 1,
OPT_MLOCKALL,
OPT_UNIXCTL,
VLOG_OPTION_ENUMS,
OPT_BOOTSTRAP_CA_CERT,
OPT_ENABLE_DUMMY,
OPT_DISABLE_SYSTEM,
OPT_DISABLE_SYSTEM_ROUTE,
DAEMON_OPTION_ENUMS,
OPT_DPDK,
SSL_OPTION_ENUMS,
OPT_DUMMY_NUMA,
OPT_HW_RAWIO_ACCESS,
};
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h'},
{"version", no_argument, NULL, 'V'},
{"mlockall", no_argument, NULL, OPT_MLOCKALL},
{"unixctl", required_argument, NULL, OPT_UNIXCTL},
DAEMON_LONG_OPTIONS,
VLOG_LONG_OPTIONS,
STREAM_SSL_LONG_OPTIONS,
{"peer-ca-cert", required_argument, NULL, OPT_PEER_CA_CERT},
{"bootstrap-ca-cert", required_argument, NULL, OPT_BOOTSTRAP_CA_CERT},
{"enable-dummy", optional_argument, NULL, OPT_ENABLE_DUMMY},
{"disable-system", no_argument, NULL, OPT_DISABLE_SYSTEM},
{"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
{"dpdk", optional_argument, NULL, OPT_DPDK},
{"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
{"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS},
{NULL, 0, NULL, 0},
};
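    /* Derive the getopt() short-option string from the table above.
     * ovs_cmdl_long_options_to_short_options() returns a malloc'ed
     * string, freed below once option parsing is done. */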
char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
for (;;) {
int c;
c = getopt_long(argc, argv, short_options, long_options, NULL);
if (c == -1) {
break;
}
switch (c) {
case 'h':
usage();
case 'V':
ovs_print_version(0, 0);
print_dpdk_version();
exit(EXIT_SUCCESS);
case OPT_MLOCKALL:
want_mlockall = true;
break;
case OPT_UNIXCTL:
*unixctl_pathp = optarg;
break;
VLOG_OPTION_HANDLERS
DAEMON_OPTION_HANDLERS
STREAM_SSL_OPTION_HANDLERS
case OPT_PEER_CA_CERT:
stream_ssl_set_peer_ca_cert_file(optarg);
break;
case OPT_BOOTSTRAP_CA_CERT:
stream_ssl_set_ca_cert_file(optarg, true);
break;
case OPT_ENABLE_DUMMY:
dummy_enable(optarg);
break;
case OPT_DISABLE_SYSTEM:
dp_disallow_provider("system");
break;
case OPT_DISABLE_SYSTEM_ROUTE:
ovs_router_disable_system_routing_table();
break;
case '?':
exit(EXIT_FAILURE);
case OPT_DPDK:
ovs_fatal(0, "Using --dpdk to configure DPDK is not supported.");
break;
case OPT_DUMMY_NUMA:
ovs_numa_set_dummy(optarg);
break;
case OPT_HW_RAWIO_ACCESS:
hw_rawio_access = true;
break;
default:
abort();
}
}
free(short_options);
argc -= optind;
argv += optind;
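    /* With no non-option argument, connect to the default database
     * socket, e.g. "unix:/var/run/openvswitch/db.sock" on a typical
     * build (the exact prefix depends on how ovs_rundir() was
     * configured). */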
switch (argc) {
case 0:
return xasprintf("unix:%s/db.sock", ovs_rundir());
case 1:
return xstrdup(argv[0]);
default:
VLOG_FATAL("at most one non-option argument accepted; "
"use --help for usage");
}
}
static void
usage(void)
{
printf("%s: Open vSwitch daemon\n"
"usage: %s [OPTIONS] [DATABASE]\n"
"where DATABASE is a socket on which ovsdb-server is listening\n"
" (default: \"unix:%s/db.sock\").\n",
program_name, program_name, ovs_rundir());
stream_usage("DATABASE", true, false, true);
daemon_usage();
vlog_usage();
printf("\nDPDK options:\n"
"Configuration of DPDK via command-line is removed from this\n"
"version of Open vSwitch. DPDK is configured through ovsdb.\n"
);
printf("\nOther options:\n"
" --unixctl=SOCKET override default control socket name\n"
" -h, --help display this help message\n"
" -V, --version display version information\n");
exit(EXIT_SUCCESS);
}
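
/* unixctl handler for "exit": records the requesting connection and
 * whether "--cleanup" was passed, then lets the main loop wind down and
 * reply once bridge_exit() has finished.  Typically invoked through
 * "ovs-appctl exit [--cleanup]". */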
static void
ovs_vswitchd_exit(struct unixctl_conn *conn, int argc,
const char *argv[], void *args OVS_UNUSED)
{
exit_args.n_conns++;
exit_args.conns = xrealloc(exit_args.conns,
exit_args.n_conns * sizeof *exit_args.conns);
exit_args.conns[exit_args.n_conns - 1] = conn;
exit_args.exiting = true;
if (!exit_args.cleanup) {
exit_args.cleanup = argc == 2 && !strcmp(argv[1], "--cleanup");
}
}