mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-31 22:35:33 +00:00
sysctl: move sysctl calls to usernsd
When in a userns, tasks can't write to certain sysctl files: (00.009653) 1: Error (sysctl.c:142): Can't open sysctl kernel/hostname: Permission denied See inline comments for details on affected namespaces. Mostly for my own education in what is required to port something to be userns restorable, I ported the sysctl stuff. A potential concern for this patch is that copying structures with pointers around is kind of gory. I did it ad-hoc here, but it may be worth inventing some mechanisms to make it easier, although I'm not sure what exactly that would look like (potentially re-using some of the protobuf bits; I'll investigate this more if it looks helpful when doing the cgroup user namespaces port?). Another issue is that there is not a great way to return non-fd stuff in memory right now from userns_call; one of the little hacks in this code would be "simplified" if we invented a way to do this. v2: coalesce the individual struct sysctl_req requests into one big sysctl_userns_req that is in a contiguous region of memory so that we can pass it via userns_call. Hopefully nobody finds my little ascii diagram too offensive :) v3: use the fork/setns trick to change the sysctl values in the right ns for IPC/UTS nses; see inline comment for details v4: only use sysctl_userns_req when actually doing a userns_call. Signed-off-by: Tycho Andersen <tycho.andersen@canonical.com> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
This commit is contained in:
committed by
Pavel Emelyanov
parent
6d52d6ee21
commit
f79f4546cf
@@ -103,7 +103,7 @@ typedef int (*uns_call_t)(void *arg, int fd, pid_t pid);
|
||||
*/
|
||||
#define UNS_FDOUT 0x2
|
||||
|
||||
#define MAX_UNSFD_MSG_SIZE 256
|
||||
#define MAX_UNSFD_MSG_SIZE 4096
|
||||
|
||||
/*
|
||||
* When we're restoring inside user namespace, some things are
|
||||
|
@@ -8,7 +8,7 @@ struct sysctl_req {
|
||||
int flags;
|
||||
};
|
||||
|
||||
extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op);
|
||||
extern int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns);
|
||||
|
||||
enum {
|
||||
CTL_READ,
|
||||
|
@@ -263,4 +263,5 @@ int fd_has_data(int lfd);
|
||||
|
||||
int make_yard(char *path);
|
||||
|
||||
const char *ns_to_string(unsigned int ns);
|
||||
#endif /* __CR_UTIL_H__ */
|
||||
|
12
ipc_ns.c
12
ipc_ns.c
@@ -181,7 +181,7 @@ static int dump_ipc_msg_queue_messages(struct cr_img *img, const IpcMsgEntry *ms
|
||||
{ "kernel/msgmax", &msgmax, CTL_U32 },
|
||||
};
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, CLONE_NEWIPC);
|
||||
if (ret < 0) {
|
||||
pr_err("Failed to read max IPC message size\n");
|
||||
goto err;
|
||||
@@ -313,7 +313,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
|
||||
|
||||
int ret;
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), op);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), op, CLONE_NEWIPC);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
@@ -322,7 +322,7 @@ static int ipc_sysctl_req(IpcVarEntry *e, int op)
|
||||
return 0;
|
||||
}
|
||||
|
||||
return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op);
|
||||
return sysctl_op(req_mq, ARRAY_SIZE(req_mq), op, CLONE_NEWIPC);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -555,7 +555,7 @@ static int prepare_ipc_sem_desc(struct cr_img *img, const IpcSemEntry *sem)
|
||||
};
|
||||
struct semid_ds semid;
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
|
||||
if (ret < 0) {
|
||||
pr_err("Failed to set desired IPC sem ID\n");
|
||||
return ret;
|
||||
@@ -691,7 +691,7 @@ static int prepare_ipc_msg_queue(struct cr_img *img, const IpcMsgEntry *msq)
|
||||
};
|
||||
struct msqid_ds msqid;
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
|
||||
if (ret < 0) {
|
||||
pr_err("Failed to set desired IPC msg ID\n");
|
||||
return ret;
|
||||
@@ -802,7 +802,7 @@ static int prepare_ipc_shm_seg(struct cr_img *img, const IpcShmEntry *shm)
|
||||
};
|
||||
struct shmid_ds shmid;
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWIPC);
|
||||
if (ret < 0) {
|
||||
pr_err("Failed to set desired IPC shm ID\n");
|
||||
return ret;
|
||||
|
@@ -221,7 +221,7 @@ static int tcp_read_sysctl_limits(void)
|
||||
* Lets figure out which exactly amount of memory is
|
||||
* available for send/read queues on restore.
|
||||
*/
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
|
||||
if (ret) {
|
||||
pr_warn("TCP mem sysctls are not available. Using defaults.\n");
|
||||
goto out;
|
||||
@@ -268,7 +268,7 @@ static int get_last_cap(void)
|
||||
{ "kernel/cap_last_cap", &kdat.last_cap, CTL_U32 },
|
||||
};
|
||||
|
||||
return sysctl_op(req, ARRAY_SIZE(req), CTL_READ);
|
||||
return sysctl_op(req, ARRAY_SIZE(req), CTL_READ, 0);
|
||||
}
|
||||
|
||||
static bool kerndat_has_memfd_create(void)
|
||||
|
2
net.c
2
net.c
@@ -127,7 +127,7 @@ static int ipv4_conf_op(char *tgt, int *conf, int n, int op, NetnsEntry **netns)
|
||||
ri++;
|
||||
}
|
||||
|
||||
ret = sysctl_op(req, ri, op);
|
||||
ret = sysctl_op(req, ri, op, CLONE_NEWNET);
|
||||
if (ret < 0) {
|
||||
pr_err("Failed to %s %s/<confs>\n", (op == CTL_READ)?"read":"write", tgt);
|
||||
return -1;
|
||||
|
295
sysctl.c
295
sysctl.c
@@ -3,11 +3,25 @@
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
|
||||
#include "asm/types.h"
|
||||
#include "namespaces.h"
|
||||
#include "sysctl.h"
|
||||
#include "util.h"
|
||||
|
||||
/* These are the namespaces we know how to restore in various ways.
|
||||
*/
|
||||
#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
|
||||
|
||||
/*
 * Header of the message handed to userns_call() for a batch of sysctl
 * requests. The packed struct sysctl_req records follow it directly in
 * the same buffer.
 */
struct sysctl_userns_req {
	/* operation applied to every request in the batch (CTL_READ or not) */
	int			op;
	/* CLONE_NEW* flag of the namespace the sysctls belong to */
	unsigned int		ns;
	/* number of packed requests that follow the header */
	size_t			nr_req;
	/*
	 * First packed request. The stored address is only meaningful to the
	 * process that wrote it; the receiver recomputes it from the header.
	 */
	struct sysctl_req	*reqs;
};
|
||||
|
||||
#define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op) \
|
||||
do { \
|
||||
if (__op == CTL_READ) \
|
||||
@@ -126,22 +140,32 @@ err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __sysctl_op(int dir, struct sysctl_req *req, int op)
|
||||
static int sysctl_userns_arg_size(int type)
|
||||
{
|
||||
int fd, ret = -1, nr = 1, flags;
|
||||
switch(CTL_TYPE(type)) {
|
||||
case __CTL_U32A:
|
||||
return sizeof(u32) * CTL_LEN(type);
|
||||
case CTL_U32:
|
||||
return sizeof(u32);
|
||||
case CTL_32:
|
||||
return sizeof(s32);
|
||||
case __CTL_U64A:
|
||||
return sizeof(u64) * CTL_LEN(type);
|
||||
case CTL_U64:
|
||||
return sizeof(u64);
|
||||
case __CTL_STR:
|
||||
return sizeof(char) * CTL_LEN(type) + 1;
|
||||
default:
|
||||
pr_err("unknown arg type %d\n", type);
|
||||
|
||||
if (op == CTL_READ)
|
||||
flags = O_RDONLY;
|
||||
else
|
||||
flags = O_WRONLY;
|
||||
|
||||
fd = openat(dir, req->name, flags);
|
||||
if (fd < 0) {
|
||||
if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
|
||||
return 0;
|
||||
pr_perror("Can't open sysctl %s", req->name);
|
||||
return -1;
|
||||
/* Ensure overflow to cause an error */
|
||||
return MAX_UNSFD_MSG_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
static int do_sysctl_op(int fd, struct sysctl_req *req, int op)
|
||||
{
|
||||
int ret = -1, nr = 1;
|
||||
|
||||
switch (CTL_TYPE(req->type)) {
|
||||
case __CTL_U32A:
|
||||
@@ -163,30 +187,257 @@ static int __sysctl_op(int dir, struct sysctl_req *req, int op)
|
||||
break;
|
||||
}
|
||||
|
||||
close_safe(&fd);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __userns_sysctl_op(void *arg, int unused, pid_t pid)
|
||||
{
|
||||
int fd, ret = -1, dir, i, status, *fds = NULL;
|
||||
struct sysctl_userns_req *userns_req = arg;
|
||||
int op = userns_req->op;
|
||||
struct sysctl_req *req, **reqs = NULL;
|
||||
pid_t worker;
|
||||
|
||||
// fix up the pointer
|
||||
req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
|
||||
|
||||
/* For files in the IPC/UTS namespaces, restoring is more complicated
|
||||
* than for net. Unprivileged users cannot even open these files, so
|
||||
* they must be opened by usernsd. However, the value in the kernel is
|
||||
* changed for the IPC/UTS namespace that write()s to the open sysctl
|
||||
* file (not who opened it). So, we must set the value from inside the
|
||||
* usernsd caller's namespace. We:
|
||||
*
|
||||
* 1. unsd opens the sysctl files
|
||||
* 2. forks a task
|
||||
* 3. setns()es to the UTS/IPC namespace of the caller
|
||||
* 4. write()s to the files and exits
|
||||
*/
|
||||
dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
|
||||
if (dir < 0) {
|
||||
pr_perror("Can't open sysctl dir");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fds = xmalloc(sizeof(int) * userns_req->nr_req);
|
||||
if (!fds)
|
||||
goto out;
|
||||
|
||||
reqs = xmalloc(sizeof(struct sysctl_req) * userns_req->nr_req);
|
||||
if (!fds)
|
||||
goto out;
|
||||
|
||||
memset(fds, -1, sizeof(int) * userns_req->nr_req);
|
||||
|
||||
for (i = 0; i < userns_req->nr_req; i++) {
|
||||
int arg_len = sysctl_userns_arg_size(req->type);
|
||||
int name_len = strlen((char *) &req[1]) + 1;
|
||||
int total_len = sizeof(*req) + arg_len + name_len;
|
||||
int flags;
|
||||
|
||||
/* fix up the pointers */
|
||||
req->name = (char *) &req[1];
|
||||
req->arg = req->name + name_len;
|
||||
|
||||
if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
|
||||
pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (op == CTL_READ)
|
||||
flags = O_RDONLY;
|
||||
else
|
||||
flags = O_WRONLY;
|
||||
|
||||
fd = openat(dir, req->name, flags);
|
||||
if (fd < 0) {
|
||||
if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
|
||||
continue;
|
||||
pr_perror("Can't open sysctl %s", req->name);
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* save a pointer to the req, so we don't need to recompute its
|
||||
* location
|
||||
*/
|
||||
reqs[i] = req;
|
||||
fds[i] = fd;
|
||||
|
||||
req = (struct sysctl_req *) (((char *) req) + total_len);
|
||||
}
|
||||
|
||||
worker = fork();
|
||||
if (worker < 0)
|
||||
goto out;
|
||||
|
||||
if (!worker) {
|
||||
int nsfd;
|
||||
const char *nsname = ns_to_string(userns_req->ns);
|
||||
|
||||
BUG_ON(!nsname);
|
||||
nsfd = open_proc(pid, "ns/%s", nsname);
|
||||
if (nsfd < 0) {
|
||||
pr_perror("failed to open pid %d's ns %s", pid, nsname);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
if (setns(nsfd, 0) < 0) {
|
||||
pr_perror("failed to setns to %d's ns %s", pid, nsname);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
close(nsfd);
|
||||
|
||||
for (i = 0; i < userns_req->nr_req; i++) {
|
||||
if (do_sysctl_op(fds[i], reqs[i], op) < 0)
|
||||
exit(1);
|
||||
}
|
||||
|
||||
exit(0);
|
||||
}
|
||||
|
||||
if (waitpid(worker, &status, 0) != worker) {
|
||||
pr_err("worker didn't die?");
|
||||
kill(worker, SIGKILL);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!WIFEXITED(status) || WEXITSTATUS(status)) {
|
||||
pr_err("worker failed: %d\n", status);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
if (fds) {
|
||||
for (i = 0; i < userns_req->nr_req; i++) {
|
||||
if (fds[i] < 0)
|
||||
break;
|
||||
close_safe(&fds[i]);
|
||||
}
|
||||
|
||||
xfree(fds);
|
||||
}
|
||||
|
||||
if (reqs)
|
||||
xfree(reqs);
|
||||
|
||||
close_safe(&dir);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
|
||||
static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
|
||||
{
|
||||
int ret = 0;
|
||||
int dir = -1;
|
||||
int dir, ret = -1;;
|
||||
|
||||
dir = open("/proc/sys", O_RDONLY);
|
||||
dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
|
||||
if (dir < 0) {
|
||||
pr_perror("Can't open sysctl dir");
|
||||
return -1;
|
||||
}
|
||||
|
||||
while (nr_req--) {
|
||||
ret = __sysctl_op(dir, req, op);
|
||||
if (ret < 0)
|
||||
break;
|
||||
int fd, flags;
|
||||
|
||||
if (op == CTL_READ)
|
||||
flags = O_RDONLY;
|
||||
else
|
||||
flags = O_WRONLY;
|
||||
|
||||
fd = openat(dir, req->name, flags);
|
||||
if (fd < 0) {
|
||||
if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
|
||||
continue;
|
||||
pr_perror("Can't open sysctl %s", req->name);
|
||||
goto out;
|
||||
}
|
||||
|
||||
ret = do_sysctl_op(fd, req, op);
|
||||
close(fd);
|
||||
req++;
|
||||
}
|
||||
|
||||
close_safe(&dir);
|
||||
ret = 0;
|
||||
|
||||
out:
|
||||
close(dir);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
|
||||
{
|
||||
int i;
|
||||
struct sysctl_userns_req *userns_req;
|
||||
struct sysctl_req *cur;
|
||||
|
||||
if (nr_req == 0)
|
||||
return 0;
|
||||
|
||||
if (ns & !KNOWN_NS_MASK) {
|
||||
pr_err("don't know how to restore some namespaces in %u\n", ns);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* The way sysctl files behave on open/write depends on the namespace
|
||||
* they correspond to. If we don't want to interact with something in a
|
||||
* namespace (e.g. kernel/cap_last_cap is global), we can do this from
|
||||
* the current process. Similarly, if we're accessing net namespaces,
|
||||
* we can just do the operation from our current process, since
|
||||
* anything with CAP_NET_ADMIN can write to the net/ sysctls, and we
|
||||
* still have that even when restoring in a user ns.
|
||||
*
|
||||
* For IPC/UTS, we restore them as described above.
|
||||
*
|
||||
* For read operations, we need to copy the values back to return.
|
||||
* Fortunately, we only do read on dump (or global reads on restore),
|
||||
* so we can do those in process as well.
|
||||
*/
|
||||
if (!ns || ns & CLONE_NEWNET || op == CTL_READ)
|
||||
return __nonuserns_sysctl_op(req, nr_req, op);
|
||||
|
||||
/*
|
||||
* In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
|
||||
* we encode each array of sysctl_reqs into one contiguous region of memory so
|
||||
* it can be passed via userns_call if necessary. It looks like this:
|
||||
*
|
||||
* struct sysctl_userns_req struct sysctl_req name arg
|
||||
* ---------------------------------------------------------------------------
|
||||
* | op | nr_req | reqs | <fields> | name | arg | "the name" | "the arg" ...
|
||||
* ---------------------------------------------------------------------------
|
||||
* |____^ |______|__^ ^
|
||||
* |_______________|
|
||||
*/
|
||||
userns_req = alloca(MAX_UNSFD_MSG_SIZE);
|
||||
userns_req->op = op;
|
||||
userns_req->nr_req = nr_req;
|
||||
userns_req->ns = ns;
|
||||
userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
|
||||
|
||||
cur = userns_req->reqs;
|
||||
for (i = 0; i < nr_req; i++) {
|
||||
int arg_len = sysctl_userns_arg_size(req[i].type);
|
||||
int name_len = strlen(req[i].name) + 1;
|
||||
int total_len = sizeof(*cur) + arg_len + name_len;
|
||||
|
||||
if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
|
||||
pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* copy over the non-pointer fields */
|
||||
cur->type = req[i].type;
|
||||
cur->flags = req[i].flags;
|
||||
|
||||
cur->name = (char *) &cur[1];
|
||||
strcpy(cur->name, req[i].name);
|
||||
|
||||
cur->arg = cur->name + name_len;
|
||||
memcpy(cur->arg, req[i].arg, arg_len);
|
||||
|
||||
cur = (struct sysctl_req *) (((char *) cur) + total_len);
|
||||
}
|
||||
|
||||
return userns_call(__userns_sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, -1);
|
||||
}
|
||||
|
20
util.c
20
util.c
@@ -860,3 +860,23 @@ int make_yard(char *path)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Map a single CLONE_NEW* namespace flag to the name of its entry under
 * /proc/<pid>/ns.
 *
 * Returns NULL when @ns is not exactly one of the flags listed below
 * (zero and combined masks included).
 */
const char *ns_to_string(unsigned int ns)
{
	static const struct {
		unsigned int	flag;
		const char	*name;
	} ns_names[] = {
		{ CLONE_NEWIPC,		"ipc" },
		{ CLONE_NEWNS,		"mnt" },
		{ CLONE_NEWNET,		"net" },
		{ CLONE_NEWPID,		"pid" },
		{ CLONE_NEWUSER,	"user" },
		{ CLONE_NEWUTS,		"uts" },
	};
	unsigned int idx;

	for (idx = 0; idx < sizeof(ns_names) / sizeof(ns_names[0]); idx++) {
		if (ns_names[idx].flag == ns)
			return ns_names[idx].name;
	}

	return NULL;
}
|
||||
|
2
uts_ns.c
2
uts_ns.c
@@ -61,7 +61,7 @@ int prepare_utsns(int pid)
|
||||
req[1].arg = ue->domainname;
|
||||
req[1].type = CTL_STR(strlen(ue->domainname));
|
||||
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE);
|
||||
ret = sysctl_op(req, ARRAY_SIZE(req), CTL_WRITE, CLONE_NEWUTS);
|
||||
utsns_entry__free_unpacked(ue, NULL);
|
||||
out:
|
||||
close_image(img);
|
||||
|
Reference in New Issue
Block a user