2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 01:51:26 +00:00
ovs/vswitchd/ovs-vswitchd.c
Ilya Maximets 56e315937e vswitchd: Only lock pages that are faulted in.
The main purpose of locking the memory is to ensure that OVS can keep
doing what it did before in case of increased memory pressure, e.g.,
during VM ingest / migration.  Fulfilling this requirement can be
achieved without locking all the allocated memory, but only the pages
already accessed in the past (faulted in).  Processing of the new
traffic involves new memory allocations.  Latency on these operations
can't be guaranteed by the locking.  The main difference would be
the pre-faulting of the stack memory.  However, in order to revalidate
or process upcalls on the same traffic, the same amount of stack is
likely needed, so all the necessary memory will already be faulted in.

Switch 'mlockall' to MCL_ONFAULT to avoid consuming unnecessarily
large amounts of RAM on systems with high core counts.  For example,
in a densely populated OVN cluster this saves about 650 MB of RAM per
node on a system with 64 cores.  This equates to 320 GB of allocated
but unused RAM in a 500 node cluster.

This also makes OVS better suited by default for small systems with
a limited amount of memory.

The MCL_ONFAULT flag was introduced in Linux kernel 4.4 and wasn't
available at the time of '--mlockall' introduction, but we can use it
now.  Fall back to the old way of locking when running on an older
kernel, just in case.

Only locking the faulted in pages also makes locking compatible with
vhost post-copy live migration by default, because we'll no longer
pre-fault all the guest's memory.  Post-copy relies on userfaultfd
to work on shared huge pages, which is only available in 4.11+ kernels.
So, technically, it should not be possible for MCL_ONFAULT to fail and
the call without it to succeed.  But keep the check for now, just in
case.

Acked-by: Simon Horman <horms@ovn.org>
Acked-by: Eelco Chaudron <echaudro@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
2024-06-28 23:44:53 +02:00

329 lines
8.9 KiB
C

/* Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016 Nicira, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <config.h>
#include <errno.h>
#include <getopt.h>
#include <limits.h>
#include <signal.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_MLOCKALL
#include <sys/mman.h>
#endif
#include "bridge.h"
#include "command-line.h"
#include "compiler.h"
#include "daemon.h"
#include "dirs.h"
#include "dpif.h"
#include "dummy.h"
#include "fatal-signal.h"
#include "memory.h"
#include "netdev.h"
#include "openflow/openflow.h"
#include "ovsdb-idl.h"
#include "ovs-rcu.h"
#include "ovs-router.h"
#include "ovs-thread.h"
#include "openvswitch/poll-loop.h"
#include "simap.h"
#include "stream-ssl.h"
#include "stream.h"
#include "svec.h"
#include "timeval.h"
#include "unixctl.h"
#include "util.h"
#include "openvswitch/usdt-probes.h"
#include "openvswitch/vconn.h"
#include "openvswitch/vlog.h"
#include "lib/vswitch-idl.h"
#include "lib/dns-resolve.h"
VLOG_DEFINE_THIS_MODULE(vswitchd);
/* --mlockall: If set, locks all present process memory pages into physical
 * RAM and all the new pages the moment they are faulted in, preventing
 * the kernel from paging any of its memory to disk. */
static bool want_mlockall;
/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */
static bool hw_rawio_access;
/* Handler that main() registers for the "exit" unixctl command. */
static unixctl_cb_func ovs_vswitchd_exit;
/* Parses the command line.  The returned string is handed to bridge_init()
 * by main() and freed there; '*unixctl_path' receives the --unixctl
 * argument, if any. */
static char *parse_options(int argc, char *argv[], char **unixctl_path);
/* Prints the help text and exits. */
OVS_NO_RETURN static void usage(void);
/* Shutdown state shared between the "exit" command handler and main(). */
static struct ovs_vswitchd_exit_args {
/* Connections that issued "exit".  main() replies to each of them after
 * bridge_exit() has returned and then frees the array. */
struct unixctl_conn **conns;
/* Number of elements in 'conns'. */
size_t n_conns;
/* Set by the "exit" handler, or by main() when should_service_stop()
 * returns true; terminates the main loop. */
bool exiting;
/* Passed to bridge_exit().  Set once any "exit" request carried the
 * "--cleanup" argument. */
bool cleanup;
} exit_args;
int
main(int argc, char *argv[])
{
struct unixctl_server *unixctl;
char *unixctl_path = NULL;
char *remote;
int retval;
set_program_name(argv[0]);
ovsthread_id_init();
dns_resolve_init(true);
ovs_cmdl_proctitle_init(argc, argv);
service_start(&argc, &argv);
remote = parse_options(argc, argv, &unixctl_path);
fatal_ignore_sigpipe();
daemonize_start(true, hw_rawio_access);
if (want_mlockall) {
#ifdef HAVE_MLOCKALL
/* MCL_ONFAULT introduced in Linux kernel 4.4. */
#ifndef MCL_ONFAULT
#define MCL_ONFAULT 4
#endif
if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) {
if (mlockall(MCL_CURRENT | MCL_FUTURE)) {
VLOG_ERR("mlockall failed: %s", ovs_strerror(errno));
} else {
set_all_memory_locked();
}
}
#else
VLOG_ERR("mlockall not supported on this system");
#endif
}
retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {
exit(EXIT_FAILURE);
}
unixctl_command_register("exit", "[--cleanup]", 0, 1,
ovs_vswitchd_exit, NULL);
bridge_init(remote);
free(remote);
while (!exit_args.exiting) {
OVS_USDT_PROBE(main, run_start);
memory_run();
if (memory_should_report()) {
struct simap usage;
simap_init(&usage);
bridge_get_memory_usage(&usage);
memory_report(&usage);
simap_destroy(&usage);
}
bridge_run();
unixctl_server_run(unixctl);
netdev_run();
memory_wait();
bridge_wait();
unixctl_server_wait(unixctl);
netdev_wait();
if (exit_args.exiting) {
poll_immediate_wake();
}
OVS_USDT_PROBE(main, poll_block);
poll_block();
if (should_service_stop()) {
exit_args.exiting = true;
}
}
bridge_exit(exit_args.cleanup);
for (size_t i = 0; i < exit_args.n_conns; i++) {
unixctl_command_reply(exit_args.conns[i], NULL);
}
free(exit_args.conns);
unixctl_server_destroy(unixctl);
service_stop();
vlog_disable_async();
ovsrcu_exit();
dns_resolve_destroy();
return 0;
}
/* Parses the command line given by 'argc' and 'argv'.
 *
 * Options are acted upon as they are encountered: --mlockall and
 * --hw-rawio-access set the corresponding file-level flags, and --unixctl
 * stores its argument in '*unixctl_pathp' (which is left untouched when the
 * option is absent).  --help and --version print their output and exit
 * successfully; an unrecognized option exits with EXIT_FAILURE.
 *
 * Returns the database remote as a string that the caller must free: the
 * single non-option argument if there is one, otherwise
 * "unix:<rundir>/db.sock".  More than one non-option argument is reported
 * through VLOG_FATAL. */
static char *
parse_options(int argc, char *argv[], char **unixctl_pathp)
{
    enum {
        OPT_PEER_CA_CERT = UCHAR_MAX + 1,
        OPT_MLOCKALL,
        OPT_UNIXCTL,
        VLOG_OPTION_ENUMS,
        OPT_BOOTSTRAP_CA_CERT,
        OPT_ENABLE_DUMMY,
        OPT_DISABLE_SYSTEM,
        OPT_DISABLE_SYSTEM_ROUTE,
        DAEMON_OPTION_ENUMS,
        OPT_DPDK,
        SSL_OPTION_ENUMS,
        OPT_DUMMY_NUMA,
        OPT_HW_RAWIO_ACCESS,
    };
    static const struct option long_options[] = {
        {"help", no_argument, NULL, 'h'},
        {"version", no_argument, NULL, 'V'},
        {"mlockall", no_argument, NULL, OPT_MLOCKALL},
        {"unixctl", required_argument, NULL, OPT_UNIXCTL},
        DAEMON_LONG_OPTIONS,
        VLOG_LONG_OPTIONS,
        STREAM_SSL_LONG_OPTIONS,
        {"peer-ca-cert", required_argument, NULL, OPT_PEER_CA_CERT},
        {"bootstrap-ca-cert", required_argument, NULL, OPT_BOOTSTRAP_CA_CERT},
        {"enable-dummy", optional_argument, NULL, OPT_ENABLE_DUMMY},
        {"disable-system", no_argument, NULL, OPT_DISABLE_SYSTEM},
        {"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
        {"dpdk", optional_argument, NULL, OPT_DPDK},
        {"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
        {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS},
        {NULL, 0, NULL, 0},
    };
    /* Short option string derived from the table above; freed once the
     * option loop is done. */
    char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
    char **positional;
    int n_positional;
    int c;

    while ((c = getopt_long(argc, argv, short_options, long_options,
                            NULL)) != -1) {
        switch (c) {
        case 'h':
            /* usage() is declared OVS_NO_RETURN, so there is no
             * fall-through into the next case. */
            usage();

        case 'V':
            ovs_print_version(0, 0);
            print_dpdk_version();
            exit(EXIT_SUCCESS);

        case OPT_MLOCKALL:
            want_mlockall = true;
            break;

        case OPT_UNIXCTL:
            *unixctl_pathp = optarg;
            break;

        VLOG_OPTION_HANDLERS
        DAEMON_OPTION_HANDLERS
        STREAM_SSL_OPTION_HANDLERS

        case OPT_PEER_CA_CERT:
            stream_ssl_set_peer_ca_cert_file(optarg);
            break;

        case OPT_BOOTSTRAP_CA_CERT:
            stream_ssl_set_ca_cert_file(optarg, true);
            break;

        case OPT_ENABLE_DUMMY:
            dummy_enable(optarg);
            break;

        case OPT_DISABLE_SYSTEM:
            dp_disallow_provider("system");
            break;

        case OPT_DISABLE_SYSTEM_ROUTE:
            ovs_router_disable_system_routing_table();
            break;

        case '?':
            exit(EXIT_FAILURE);

        case OPT_DPDK:
            ovs_fatal(0, "Using --dpdk to configure DPDK is not supported.");
            break;

        case OPT_DUMMY_NUMA:
            ovs_numa_set_dummy(optarg);
            break;

        case OPT_HW_RAWIO_ACCESS:
            hw_rawio_access = true;
            break;

        default:
            abort();
        }
    }
    free(short_options);

    /* Whatever follows the options is the optional DATABASE argument. */
    n_positional = argc - optind;
    positional = argv + optind;

    if (n_positional == 0) {
        return xasprintf("unix:%s/db.sock", ovs_rundir());
    }
    if (n_positional == 1) {
        return xstrdup(positional[0]);
    }
    VLOG_FATAL("at most one non-option argument accepted; "
               "use --help for usage");
}
/* Writes the complete help text to stdout and exits with EXIT_SUCCESS.
 *
 * The output consists of a header naming the program and the default
 * database socket, the sections produced by stream_usage(), daemon_usage()
 * and vlog_usage(), a note about DPDK configuration, and the remaining
 * options. */
static void
usage(void)
{
    static const char dpdk_section[] =
        "\nDPDK options:\n"
        "Configuration of DPDK via command-line is removed from this\n"
        "version of Open vSwitch. DPDK is configured through ovsdb.\n";
    static const char other_section[] =
        "\nOther options:\n"
        " --unixctl=SOCKET override default control socket name\n"
        " -h, --help display this help message\n"
        " -V, --version display version information\n";

    printf("%s: Open vSwitch daemon\n"
           "usage: %s [OPTIONS] [DATABASE]\n"
           "where DATABASE is a socket on which ovsdb-server is listening\n"
           " (default: \"unix:%s/db.sock\").\n",
           program_name, program_name, ovs_rundir());

    /* Sections provided by the stream, daemon and vlog libraries. */
    stream_usage("DATABASE", true, false, true);
    daemon_usage();
    vlog_usage();

    /* Neither section contains a conversion specification, so the text is
     * written verbatim. */
    fputs(dpdk_section, stdout);
    fputs(other_section, stdout);

    exit(EXIT_SUCCESS);
}
/* unixctl handler for the "exit" command.
 *
 * Appends 'conn' to 'exit_args.conns' without replying to it and sets
 * 'exit_args.exiting'.  If the command was invoked with exactly one argument
 * equal to "--cleanup", 'exit_args.cleanup' is set as well; once set, the
 * flag is never cleared by a later request. */
static void
ovs_vswitchd_exit(struct unixctl_conn *conn, int argc,
                  const char *argv[], void *args OVS_UNUSED)
{
    /* Index of the slot that the array gains below. */
    size_t slot = exit_args.n_conns++;

    exit_args.conns = xrealloc(exit_args.conns,
                               exit_args.n_conns * sizeof *exit_args.conns);
    exit_args.conns[slot] = conn;

    exit_args.exiting = true;

    if (argc == 2 && !strcmp(argv[1], "--cleanup")) {
        exit_args.cleanup = true;
    }
}