2
0
mirror of https://github.com/openvswitch/ovs synced 2025-08-22 01:51:26 +00:00

dpdk: Allow retaining CAP_SYS_RAWIO privileges.

Open vSwitch generally tries to let the underlying operating system
managed the low level details of hardware, for example DMA mapping,
bus arbitration, etc.  However, when using DPDK, the underlying
operating system yields control of many of these details to userspace
for management.

In the case of some DPDK port drivers, configuring rte_flow or even
allocating resources may require access to iopl/ioperm calls, which
are guarded by the CAP_SYS_RAWIO privilege on linux systems.  These
calls are dangerous, and can allow a process to completely compromise
a system.  However, they are needed in the case of some userspace
driver code which manages the hardware (for example, the mlx
implementation of backend support for rte_flow).

Here, we create an opt-in flag passed to the command line to allow
this access.  We need to do this before ever accessing the database,
because we want to drop all privileges asap, and cannot wait for
a connection to the database to be established and functional before
dropping.  There may be distribution specific ways to do capability
management as well (using for example, systemd), but they are not
as universal to the vswitchd as a flag.

Reviewed-by: Simon Horman <simon.horman@corigine.com>
Signed-off-by: Aaron Conole <aconole@redhat.com>
Acked-by: Flavio Leitner <fbl@sysclose.org>
Acked-by: Gaetan Rivet <gaetanr@nvidia.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
This commit is contained in:
Aaron Conole 2023-03-16 08:00:39 -04:00 committed by Ilya Maximets
parent e90a0727f1
commit 07cf5810de
14 changed files with 67 additions and 26 deletions

4
NEWS
View File

@ -17,6 +17,10 @@ Post-v3.1.0
in order to create OVSDB sockets with access mode of 0770.
- QoS:
* Added new configuration option 'jitter' for a linux-netem QoS type.
- DPDK:
* ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
with the --hw-rawio-access command line option. This allows the
process extra privileges when mapping physical interconnect memory.
v3.1.0 - 16 Feb 2023

View File

@ -88,7 +88,8 @@ static bool switch_user = false;
static uid_t uid;
static gid_t gid;
static char *user = NULL;
static void daemon_become_new_user__(bool access_datapath);
static void daemon_become_new_user__(bool access_datapath,
bool access_hardware_ports);
static void check_already_running(void);
static int lock_pidfile(FILE *, int command);
@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid)
* daemonize_complete()) or that it failed to start up (by exiting with a
* nonzero exit code). */
void
daemonize_start(bool access_datapath)
daemonize_start(bool access_datapath, bool access_hardware_ports)
{
assert_single_threaded();
daemonize_fd = -1;
if (switch_user) {
daemon_become_new_user__(access_datapath);
daemon_become_new_user__(access_datapath, access_hardware_ports);
switch_user = false;
}
@ -807,7 +808,8 @@ daemon_become_new_user_unix(void)
/* Linux specific implementation of daemon_become_new_user()
* using libcap-ng. */
static void
daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
daemon_become_new_user_linux(bool access_datapath OVS_UNUSED,
bool access_hardware_ports OVS_UNUSED)
{
#if defined __linux__ && HAVE_LIBCAPNG
int ret;
@ -827,6 +829,20 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
|| capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
|| capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST);
#ifdef DPDK_NETDEV
if (access_hardware_ports && !ret) {
ret = capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO);
if (!ret) {
VLOG_INFO("The Linux capability CAP_SYS_RAWIO "
"is enabled.");
}
}
#else
if (access_hardware_ports) {
VLOG_WARN("No driver requires Linux capability "
"CAP_SYS_RAWIO, disabling it.");
}
#endif
}
} else {
ret = -1;
@ -854,7 +870,7 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
}
static void
daemon_become_new_user__(bool access_datapath)
daemon_become_new_user__(bool access_datapath, bool access_hardware_ports)
{
/* If vlog file has been created, change its owner to the non-root user
* as specifed by the --user option. */
@ -862,7 +878,8 @@ daemon_become_new_user__(bool access_datapath)
if (LINUX) {
if (LIBCAPNG) {
daemon_become_new_user_linux(access_datapath);
daemon_become_new_user_linux(access_datapath,
access_hardware_ports);
} else {
VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
"(libcap-ng is not configured at compile time), "
@ -877,11 +894,11 @@ daemon_become_new_user__(bool access_datapath)
* However, there in case the user switch needs to be done
* before daemonize_start(), the following API can be used. */
void
daemon_become_new_user(bool access_datapath)
daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
{
assert_single_threaded();
if (switch_user) {
daemon_become_new_user__(access_datapath);
daemon_become_new_user__(access_datapath, access_hardware_ports);
/* daemonize_start() should not switch user again. */
switch_user = false;
}

View File

@ -498,7 +498,8 @@ make_pidfile(void)
}
void
daemonize_start(bool access_datapath OVS_UNUSED)
daemonize_start(bool access_datapath OVS_UNUSED,
bool access_hardware_ports OVS_UNUSED)
{
if (pidfile) {
make_pidfile();
@ -526,7 +527,8 @@ daemonize_complete(void)
}
void
daemon_become_new_user(bool access_datapath OVS_UNUSED)
daemon_become_new_user(bool access_datapath OVS_UNUSED,
bool access_hardware_ports OVS_UNUSED)
{
}

View File

@ -48,7 +48,7 @@ get_detach(void)
void
daemonize(void)
{
daemonize_start(false);
daemonize_start(false, false);
daemonize_complete();
}

View File

@ -167,10 +167,10 @@ void set_detach(void);
bool get_detach(void);
void daemon_save_fd(int fd);
void daemonize(void);
void daemonize_start(bool access_datapath);
void daemonize_start(bool access_datapath, bool access_hardware_ports);
void daemonize_complete(void);
void daemon_set_new_user(const char * user_spec);
void daemon_become_new_user(bool access_datapath);
void daemon_become_new_user(bool access_datapath, bool access_hardware_ports);
void daemon_usage(void);
void daemon_disable_self_confinement(void);
bool daemon_should_self_confine(void);

View File

@ -250,7 +250,7 @@ main(int argc, char *argv[])
parse_options(argc, argv);
fatal_ignore_sigpipe();
daemon_become_new_user(false);
daemon_become_new_user(false, false);
if (optind >= argc) {
ovs_fatal(0, "missing command name; use --help for help");
}
@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database,
daemon_save_fd(STDOUT_FILENO);
daemon_save_fd(STDERR_FILENO);
daemonize_start(false);
daemonize_start(false, false);
if (get_detach()) {
int error;
@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const char *lock)
getting a reply of the previous
request. */
daemon_save_fd(STDOUT_FILENO);
daemonize_start(false);
daemonize_start(false, false);
lock_req_init(&lock_req, method, lock);
if (get_detach()) {

View File

@ -341,7 +341,7 @@ main(int argc, char *argv[])
&run_command, &sync_from, &sync_exclude, &active);
is_backup = sync_from && !active;
daemon_become_new_user(false);
daemon_become_new_user(false, false);
/* Create and initialize 'config_tmpfile' as a temporary file to hold
* ovsdb-server's most basic configuration, and then save our initial
@ -359,7 +359,7 @@ main(int argc, char *argv[])
save_config__(config_tmpfile, &remotes, &db_filenames, sync_from,
sync_exclude, is_backup);
daemonize_start(false);
daemonize_start(false, false);
/* Load the saved config. */
load_config(config_tmpfile, &remotes, &db_filenames, &sync_from,

View File

@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[])
}
daemon_save_fd(STDOUT_FILENO);
daemonize_start(false);
daemonize_start(false, false);
error = unixctl_server_create(NULL, &server);
if (error) {

View File

@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[])
}
daemon_save_fd(STDOUT_FILENO);
daemonize_start(false);
daemonize_start(false, false);
error = unixctl_server_create(NULL, &server);
if (error) {

View File

@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[])
fatal_ignore_sigpipe();
parse_options(&argc, &argv, &unixctl_path);
daemonize_start(false);
daemonize_start(false, false);
int retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {
exit(EXIT_FAILURE);

View File

@ -173,7 +173,7 @@ main(int argc, char *argv[])
ctx.argc = argc - optind;
ctx.argv = argv + optind;
daemon_become_new_user(false);
daemon_become_new_user(false, false);
if (read_only) {
ovs_cmdl_run_command_read_only(&ctx, get_all_commands());
} else {
@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests,
int error;
daemon_save_fd(STDERR_FILENO);
daemonize_start(false);
daemonize_start(false, false);
error = unixctl_server_create(unixctl_path, &server);
if (error) {
ovs_fatal(error, "failed to create unixctl server");

View File

@ -109,7 +109,7 @@ main(int argc, char *argv[])
parse_options(argc, argv);
fatal_ignore_sigpipe();
daemon_become_new_user(false);
daemon_become_new_user(false, false);
if (argc - optind < 1) {
ovs_fatal(0, "at least one vconn argument required; "
@ -148,7 +148,7 @@ main(int argc, char *argv[])
ovs_fatal(0, "no active or passive switch connections");
}
daemonize_start(false);
daemonize_start(false, false);
retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {

View File

@ -81,6 +81,15 @@ unavailable or unsuccessful.
.SS "DPDK Options"
For details on initializing \fBovs\-vswitchd\fR to use DPDK ports,
refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5).
.SS "DPDK HW Access Options"
.IP "\fB\-\-hw\-rawio\-access\fR"
Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability,
to allow userspace drivers access to raw hardware memory. This will
also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and
\fBioperm()\fR functions as well as access memory devices to set port
access. This is a \fBvery\fR powerful capability, so generally only
enable as needed for specific hardware (for example mlx5 with full
hardware offload via rte_flow).
.SS "Daemon Options"
.ds DD \
\fBovs\-vswitchd\fR detaches only after it has connected to the \

View File

@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
* the kernel from paging any of its memory to disk. */
static bool want_mlockall;
/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */
static bool hw_rawio_access;
static unixctl_cb_func ovs_vswitchd_exit;
static char *parse_options(int argc, char *argv[], char **unixctl_path);
@ -89,7 +92,7 @@ main(int argc, char *argv[])
remote = parse_options(argc, argv, &unixctl_path);
fatal_ignore_sigpipe();
daemonize_start(true);
daemonize_start(true, hw_rawio_access);
if (want_mlockall) {
#ifdef HAVE_MLOCKALL
@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
OPT_DPDK,
SSL_OPTION_ENUMS,
OPT_DUMMY_NUMA,
OPT_HW_RAWIO_ACCESS,
};
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h'},
@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
{"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
{"dpdk", optional_argument, NULL, OPT_DPDK},
{"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
{"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS},
{NULL, 0, NULL, 0},
};
char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
ovs_numa_set_dummy(optarg);
break;
case OPT_HW_RAWIO_ACCESS:
hw_rawio_access = true;
break;
default:
abort();
}