mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-23 02:17:22 +00:00
When we don't use userns, __userns_sysctl_op is called in the context of the current process. The mount namespace is restored last, so while the other namespaces are being restored we still see /proc from the host pid namespace. In this case we can't use a virtual pid to access /proc/pid. Let's open /proc/self/ns and use this descriptor to switch namespaces. Cc: Tycho Andersen <tycho.andersen@canonical.com> Fixes: f79f4546cfc0 ("sysctl: move sysctl calls to usernsd") Signed-off-by: Andrew Vagin <avagin@openvz.org> Acked-by: Tycho Andersen <tycho.andersen@canonical.com> Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
464 lines
11 KiB
C
464 lines
11 KiB
C
#include <unistd.h>
|
|
#include <fcntl.h>
|
|
#include <ctype.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
#include <sys/types.h>
|
|
#include <sys/wait.h>
|
|
|
|
#include "asm/types.h"
|
|
#include "namespaces.h"
|
|
#include "sysctl.h"
|
|
#include "util.h"
|
|
|
|
/*
 * Namespace flags that sysctl_op() accepts in its "ns" argument. Any bit
 * outside of this mask makes sysctl_op() fail, because there is no known
 * way to restore sysctls for that namespace type.
 */
#define KNOWN_NS_MASK (CLONE_NEWUTS | CLONE_NEWNET | CLONE_NEWIPC)
|
|
|
|
/*
 * Header of the flat message that sysctl_op() hands to
 * __userns_sysctl_op(). The packed sysctl_req records (each followed by
 * its name and argument bytes) start right after this structure.
 */
struct sysctl_userns_req {
	/* CTL_READ or CTL_WRITE */
	int			op;
	/* CLONE_NEW* flag of the namespace the worker has to enter */
	unsigned int		ns;
	/* number of packed sysctl_req records that follow */
	size_t			nr_req;
	/*
	 * Start of the packed records; the receiver recomputes this
	 * pointer instead of trusting the stored value.
	 */
	struct sysctl_req	*reqs;
};
|
|
|
|
/*
 * Dispatch one request to sysctl_read_<type>() or sysctl_write_<type>()
 * depending on __op and store the result in __ret. Any operation other
 * than CTL_READ / CTL_WRITE yields -1.
 */
#define __SYSCTL_OP(__ret, __fd, __req, __type, __nr, __op)		\
do {									\
	__ret = ((__op) == CTL_READ) ?					\
			sysctl_read_##__type(__fd, __req,		\
					     (__type *)(__req)->arg,	\
					     __nr) :			\
		((__op) == CTL_WRITE) ?					\
			sysctl_write_##__type(__fd, __req,		\
					      (__type *)(__req)->arg,	\
					      __nr) :			\
			-1;						\
} while (0)
|
|
|
|
/*
 * GEN_SYSCTL_READ_FUNC(__type, __conv) generates
 *
 *	static int sysctl_read_<__type>(int fd, struct sysctl_req *req,
 *					__type *arg, int nr);
 *
 * which reads the sysctl file behind fd with a single read() and parses
 * nr base-10 numbers out of it with __conv (a strto*()-style function),
 * storing them in arg[0..nr-1]. One separator character is skipped after
 * every parsed number.
 *
 * Returns 0 on success and -1 on a read error or when the buffer ran out
 * before nr values were parsed.
 */
#define GEN_SYSCTL_READ_FUNC(__type, __conv)				\
static int sysctl_read_##__type(int fd,					\
				struct sysctl_req *req,			\
				__type *arg,				\
				int nr)					\
{									\
	char buf[1024] = {0};						\
	int i, ret = -1;						\
	char *p = buf;							\
									\
	/* keep the last (zeroed) byte so __conv sees a terminator */	\
	ret = read(fd, buf, sizeof(buf) - 1);				\
	if (ret < 0) {							\
		pr_perror("Can't read %s", req->name);			\
		ret = -1;						\
		goto err;						\
	}								\
									\
	for (i = 0; i < nr && p < buf + sizeof(buf); p++, i++)		\
		((__type *)arg)[i] = __conv(p, &p, 10);			\
									\
	if (i != nr) {							\
		pr_err("Not enough params for %s (%d != %d)\n",		\
			req->name, i, nr);				\
		/* ret still holds the byte count from read() */	\
		ret = -1;						\
		goto err;						\
	}								\
									\
	ret = 0;							\
									\
err:									\
	return ret;							\
}
|
|
|
|
/*
 * GEN_SYSCTL_WRITE_FUNC(__type, __fmt) generates
 *
 *	static int sysctl_write_<__type>(int fd, struct sysctl_req *req,
 *					 __type *arg, int nr);
 *
 * which formats arg[0..nr-1] with __fmt into one line, strips trailing
 * whitespace, appends a newline and sends the line to fd with a single
 * write(). Returns 0 on success, -1 if the values do not fit into the
 * line buffer or the write() fails.
 */
#define GEN_SYSCTL_WRITE_FUNC(__type, __fmt)				\
static int sysctl_write_##__type(int fd,				\
				 struct sysctl_req *req,		\
				 __type *arg,				\
				 int nr)				\
{									\
	char line[1024];						\
	int pos = 0;							\
	int idx = 0;							\
									\
	while (idx < nr && pos < sizeof(line) - 1) {			\
		snprintf(line + pos, sizeof(line) - pos, __fmt,		\
			 arg[idx]);					\
		pos += strlen(line + pos);				\
		idx++;							\
	}								\
									\
	if (idx != nr) {						\
		pr_err("Not enough space for %s (%d != %d)\n",		\
			req->name, idx, nr);				\
		return -1;						\
	}								\
									\
	/* the format leaves a separator after the last value */	\
	for (; pos > 0; pos--)						\
		if (!isspace(line[pos - 1]))				\
			break;						\
	line[pos] = '\n';						\
									\
	if (write(fd, line, pos + 1) < 0) {				\
		pr_perror("Can't write %s", req->name);			\
		return -1;						\
	}								\
									\
	return 0;							\
}
|
|
|
|
GEN_SYSCTL_READ_FUNC(u32, strtoul);
|
|
GEN_SYSCTL_READ_FUNC(u64, strtoull);
|
|
GEN_SYSCTL_READ_FUNC(s32, strtol);
|
|
|
|
GEN_SYSCTL_WRITE_FUNC(u32, "%u ");
|
|
GEN_SYSCTL_WRITE_FUNC(u64, "%"PRIu64" ");
|
|
GEN_SYSCTL_WRITE_FUNC(s32, "%d ");
|
|
|
|
static int
|
|
sysctl_write_char(int fd, struct sysctl_req *req, char *arg, int nr)
|
|
{
|
|
pr_debug("%s nr %d\n", req->name, nr);
|
|
if (dprintf(fd, "%s\n", arg) < 0)
|
|
return -1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int
|
|
sysctl_read_char(int fd, struct sysctl_req *req, char *arg, int nr)
|
|
{
|
|
int ret = -1;
|
|
|
|
pr_debug("%s nr %d\n", req->name, nr);
|
|
ret = read(fd, arg, nr);
|
|
if (ret < 0) {
|
|
pr_perror("Can't read %s", req->name);
|
|
goto err;
|
|
}
|
|
ret = 0;
|
|
|
|
err:
|
|
return ret;
|
|
}
|
|
|
|
static int sysctl_userns_arg_size(int type)
|
|
{
|
|
switch(CTL_TYPE(type)) {
|
|
case __CTL_U32A:
|
|
return sizeof(u32) * CTL_LEN(type);
|
|
case CTL_U32:
|
|
return sizeof(u32);
|
|
case CTL_32:
|
|
return sizeof(s32);
|
|
case __CTL_U64A:
|
|
return sizeof(u64) * CTL_LEN(type);
|
|
case CTL_U64:
|
|
return sizeof(u64);
|
|
case __CTL_STR:
|
|
return sizeof(char) * CTL_LEN(type) + 1;
|
|
default:
|
|
pr_err("unknown arg type %d\n", type);
|
|
|
|
/* Ensure overflow to cause an error */
|
|
return MAX_UNSFD_MSG_SIZE;
|
|
}
|
|
}
|
|
|
|
static int do_sysctl_op(int fd, struct sysctl_req *req, int op)
|
|
{
|
|
int ret = -1, nr = 1;
|
|
|
|
switch (CTL_TYPE(req->type)) {
|
|
case __CTL_U32A:
|
|
nr = CTL_LEN(req->type);
|
|
/* fallthrough */
|
|
case CTL_U32:
|
|
__SYSCTL_OP(ret, fd, req, u32, nr, op);
|
|
break;
|
|
case CTL_32:
|
|
__SYSCTL_OP(ret, fd, req, s32, nr, op);
|
|
break;
|
|
case __CTL_U64A:
|
|
nr = CTL_LEN(req->type);
|
|
/* fallthrough */
|
|
case CTL_U64:
|
|
__SYSCTL_OP(ret, fd, req, u64, nr, op);
|
|
break;
|
|
case __CTL_STR:
|
|
nr = CTL_LEN(req->type);
|
|
__SYSCTL_OP(ret, fd, req, char, nr, op);
|
|
break;
|
|
}
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int __userns_sysctl_op(void *arg, int proc_fd, pid_t pid)
|
|
{
|
|
int fd, ret = -1, dir, i, status, *fds = NULL;
|
|
struct sysctl_userns_req *userns_req = arg;
|
|
int op = userns_req->op;
|
|
struct sysctl_req *req, **reqs = NULL;
|
|
sigset_t blockmask, oldmask;
|
|
pid_t worker;
|
|
|
|
// fix up the pointer
|
|
req = userns_req->reqs = (struct sysctl_req *) &userns_req[1];
|
|
|
|
/* For files in the IPC/UTS namespaces, restoring is more complicated
|
|
* than for net. Unprivileged users cannot even open these files, so
|
|
* they must be opened by usernsd. However, the value in the kernel is
|
|
* changed for the IPC/UTS namespace that write()s to the open sysctl
|
|
* file (not who opened it). So, we must set the value from inside the
|
|
* usernsd caller's namespace. We:
|
|
*
|
|
* 1. unsd opens the sysctl files
|
|
* 2. forks a task
|
|
* 3. setns()es to the UTS/IPC namespace of the caller
|
|
* 4. write()s to the files and exits
|
|
*/
|
|
dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
|
|
if (dir < 0) {
|
|
pr_perror("Can't open sysctl dir");
|
|
return -1;
|
|
}
|
|
|
|
fds = xmalloc(sizeof(int) * userns_req->nr_req);
|
|
if (!fds)
|
|
goto out;
|
|
|
|
reqs = xmalloc(sizeof(struct sysctl_req) * userns_req->nr_req);
|
|
if (!reqs)
|
|
goto out;
|
|
|
|
memset(fds, -1, sizeof(int) * userns_req->nr_req);
|
|
|
|
for (i = 0; i < userns_req->nr_req; i++) {
|
|
int arg_len = sysctl_userns_arg_size(req->type);
|
|
int name_len = strlen((char *) &req[1]) + 1;
|
|
int total_len = sizeof(*req) + arg_len + name_len;
|
|
int flags;
|
|
|
|
/* fix up the pointers */
|
|
req->name = (char *) &req[1];
|
|
req->arg = req->name + name_len;
|
|
|
|
if (((char *) req) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
|
|
pr_err("bad sysctl req %s, too big: %d\n", req->name, total_len);
|
|
goto out;
|
|
}
|
|
|
|
if (op == CTL_READ)
|
|
flags = O_RDONLY;
|
|
else
|
|
flags = O_WRONLY;
|
|
|
|
fd = openat(dir, req->name, flags);
|
|
if (fd < 0) {
|
|
if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
|
|
continue;
|
|
pr_perror("Can't open sysctl %s", req->name);
|
|
goto out;
|
|
}
|
|
|
|
/* save a pointer to the req, so we don't need to recompute its
|
|
* location
|
|
*/
|
|
reqs[i] = req;
|
|
fds[i] = fd;
|
|
|
|
req = (struct sysctl_req *) (((char *) req) + total_len);
|
|
}
|
|
|
|
/*
|
|
* Don't let the sigchld_handler() mess with us
|
|
* calling waitpid() on the exited worker. The
|
|
* same is done in cr_system().
|
|
*/
|
|
|
|
sigemptyset(&blockmask);
|
|
sigaddset(&blockmask, SIGCHLD);
|
|
sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
|
|
|
|
worker = fork();
|
|
if (worker < 0)
|
|
goto out;
|
|
|
|
if (!worker) {
|
|
int nsfd;
|
|
const char *nsname = ns_to_string(userns_req->ns);
|
|
|
|
BUG_ON(!nsname);
|
|
nsfd = openat(proc_fd, nsname, O_RDONLY);
|
|
if (nsfd < 0) {
|
|
pr_perror("failed to open pid %d's ns %s", pid, nsname);
|
|
exit(1);
|
|
}
|
|
|
|
if (setns(nsfd, 0) < 0) {
|
|
pr_perror("failed to setns to %d's ns %s", pid, nsname);
|
|
exit(1);
|
|
}
|
|
|
|
close(nsfd);
|
|
|
|
for (i = 0; i < userns_req->nr_req; i++) {
|
|
if (do_sysctl_op(fds[i], reqs[i], op) < 0)
|
|
exit(1);
|
|
}
|
|
|
|
exit(0);
|
|
}
|
|
|
|
if (waitpid(worker, &status, 0) != worker) {
|
|
pr_perror("worker didn't die?");
|
|
kill(worker, SIGKILL);
|
|
goto out;
|
|
}
|
|
sigprocmask(SIG_BLOCK, &oldmask, NULL);
|
|
|
|
if (!WIFEXITED(status) || WEXITSTATUS(status)) {
|
|
pr_err("worker failed: %d\n", status);
|
|
goto out;
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
if (fds) {
|
|
for (i = 0; i < userns_req->nr_req; i++) {
|
|
if (fds[i] < 0)
|
|
break;
|
|
close_safe(&fds[i]);
|
|
}
|
|
|
|
xfree(fds);
|
|
}
|
|
|
|
if (reqs)
|
|
xfree(reqs);
|
|
|
|
close_safe(&dir);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int __nonuserns_sysctl_op(struct sysctl_req *req, size_t nr_req, int op)
|
|
{
|
|
int dir, ret = -1;;
|
|
|
|
dir = open("/proc/sys", O_RDONLY, O_DIRECTORY);
|
|
if (dir < 0) {
|
|
pr_perror("Can't open sysctl dir");
|
|
return -1;
|
|
}
|
|
|
|
while (nr_req--) {
|
|
int fd, flags;
|
|
|
|
if (op == CTL_READ)
|
|
flags = O_RDONLY;
|
|
else
|
|
flags = O_WRONLY;
|
|
|
|
fd = openat(dir, req->name, flags);
|
|
if (fd < 0) {
|
|
if (errno == ENOENT && (req->flags & CTL_FLAGS_OPTIONAL))
|
|
continue;
|
|
pr_perror("Can't open sysctl %s", req->name);
|
|
goto out;
|
|
}
|
|
|
|
ret = do_sysctl_op(fd, req, op);
|
|
close(fd);
|
|
req++;
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
|
close(dir);
|
|
return ret;
|
|
}
|
|
|
|
int sysctl_op(struct sysctl_req *req, size_t nr_req, int op, unsigned int ns)
|
|
{
|
|
int i, fd, ret;
|
|
struct sysctl_userns_req *userns_req;
|
|
struct sysctl_req *cur;
|
|
|
|
if (nr_req == 0)
|
|
return 0;
|
|
|
|
if (ns & ~KNOWN_NS_MASK) {
|
|
pr_err("don't know how to restore some namespaces in %u\n", ns);
|
|
return -1;
|
|
}
|
|
|
|
/* The way sysctl files behave on open/write depends on the namespace
|
|
* they correspond to. If we don't want to interact with something in a
|
|
* namespace (e.g. kernel/cap_last_cap is global), we can do this from
|
|
* the current process. Similarly, if we're accessing net namespaces,
|
|
* we can just do the operation from our current process, since
|
|
* anything with CAP_NET_ADMIN can write to the net/ sysctls, and we
|
|
* still have that even when restoring in a user ns.
|
|
*
|
|
* For IPC/UTS, we restore them as described above.
|
|
*
|
|
* For read operations, we need to copy the values back to return.
|
|
* Fortunately, we only do read on dump (or global reads on restore),
|
|
* so we can do those in process as well.
|
|
*/
|
|
if (!ns || ns & CLONE_NEWNET || op == CTL_READ)
|
|
return __nonuserns_sysctl_op(req, nr_req, op);
|
|
|
|
/*
|
|
* In order to avoid lots of opening of /proc/sys for each struct sysctl_req,
|
|
* we encode each array of sysctl_reqs into one contiguous region of memory so
|
|
* it can be passed via userns_call if necessary. It looks like this:
|
|
*
|
|
* struct sysctl_userns_req struct sysctl_req name arg
|
|
* ---------------------------------------------------------------------------
|
|
* | op | nr_req | reqs | <fields> | name | arg | "the name" | "the arg" ...
|
|
* ---------------------------------------------------------------------------
|
|
* |____^ |______|__^ ^
|
|
* |_______________|
|
|
*/
|
|
userns_req = alloca(MAX_UNSFD_MSG_SIZE);
|
|
userns_req->op = op;
|
|
userns_req->nr_req = nr_req;
|
|
userns_req->ns = ns;
|
|
userns_req->reqs = (struct sysctl_req *) (&userns_req[1]);
|
|
|
|
cur = userns_req->reqs;
|
|
for (i = 0; i < nr_req; i++) {
|
|
int arg_len = sysctl_userns_arg_size(req[i].type);
|
|
int name_len = strlen(req[i].name) + 1;
|
|
int total_len = sizeof(*cur) + arg_len + name_len;
|
|
|
|
if (((char *) cur) + total_len >= ((char *) userns_req) + MAX_UNSFD_MSG_SIZE) {
|
|
pr_err("sysctl msg %s too big: %d\n", req[i].name, total_len);
|
|
return -1;
|
|
}
|
|
|
|
/* copy over the non-pointer fields */
|
|
cur->type = req[i].type;
|
|
cur->flags = req[i].flags;
|
|
|
|
cur->name = (char *) &cur[1];
|
|
strcpy(cur->name, req[i].name);
|
|
|
|
cur->arg = cur->name + name_len;
|
|
memcpy(cur->arg, req[i].arg, arg_len);
|
|
|
|
cur = (struct sysctl_req *) (((char *) cur) + total_len);
|
|
}
|
|
|
|
fd = open_proc(PROC_SELF, "ns");
|
|
if (fd < 0)
|
|
return -1;
|
|
|
|
ret = userns_call(__userns_sysctl_op, UNS_ASYNC, userns_req, MAX_UNSFD_MSG_SIZE, fd);
|
|
close(fd);
|
|
return ret;
|
|
}
|