2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00
criu/mount.c
Cyrill Gorcunov 275740a71e mnt: Export __open_mountpoint
We gonna need it for inotify handle testing.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
2015-10-22 17:13:31 +04:00

2944 lines
64 KiB
C

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "cr_options.h"
#include "asm/types.h"
#include "util.h"
#include "util-pie.h"
#include "log.h"
#include "plugin.h"
#include "mount.h"
#include "pstree.h"
#include "proc_parse.h"
#include "image.h"
#include "namespaces.h"
#include "protobuf.h"
#include "kerndat.h"
#include "fs-magic.h"
#include "sysfs_parse.h"
#include "protobuf/mnt.pb-c.h"
#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED"
int ext_mount_add(char *key, char *val)
{
struct ext_mount *em;
em = xmalloc(sizeof(*em));
if (!em)
return -1;
em->key = key;
em->val = val;
list_add_tail(&em->list, &opts.ext_mounts);
pr_info("Added %s:%s ext mount mapping\n", key, val);
return 0;
}
/* Lookup ext_mount by key field */
static struct ext_mount *ext_mount_lookup(char *key)
{
struct ext_mount *em;
list_for_each_entry(em, &opts.ext_mounts, list)
if (!strcmp(em->key, key))
return em;
return NULL;
}
/*
 * Head of the singly linked list (chained through ->next) of mount
 * points obtained from proc or from images.
 */
struct mount_info *mntinfo;
/* Append @new to the end of the global mntinfo list */
static void mntinfo_add_list(struct mount_info *new)
{
	struct mount_info *tail;

	if (mntinfo == NULL) {
		mntinfo = new;
		return;
	}

	/* Walk to the last element. (FIXME -- make O(1) ) */
	tail = mntinfo;
	while (tail->next != NULL)
		tail = tail->next;

	tail->next = new;
}
/* Forward declarations of static helpers defined further down in this file */
static int open_mountpoint(struct mount_info *pm);
static struct mount_info *mnt_build_tree(struct mount_info *list);
static int validate_mounts(struct mount_info *info, bool for_dump);
/*
 * Absolute paths are used on dump and relative paths are used on restore.
 *
 * Returns 1 when @p is exactly "/", 0 otherwise.
 */
static inline int is_root(char *p)
{
	return p[0] == '/' && p[1] == '\0';
}
/* True for the root mount (the topmost one) */
static inline int is_root_mount(struct mount_info *mi)
{
	/* The first character of the stored mountpoint is skipped before the check */
	char *path = &mi->mountpoint[1];

	return is_root(path);
}
/*
 * True if the mountpoint target is root on its FS, i.e. mi->root is "/".
 *
 * This is used to determine whether we need to postpone
 * mounting. E.g. one can bind mount some subdir from a
 * disk, and in this case we'll have to get the root disk
 * mount first, then bind-mount it. See do_mount_one().
 */
static inline int fsroot_mounted(struct mount_info *mi)
{
	char *fs_root = mi->root;

	return is_root(fs_root);
}
/*
 * Goes through all entries in the mountinfo table looking for an
 * overlayfs mount point that contains the file specified in rpath.
 * Uses the device number st_dev and the inode number st_ino to make
 * sure the file is correct.
 *
 * Returns the last matching mount in @list, NULL when nothing matches,
 * or an ERR_PTR() value on failure. @mnt_id is not used here.
 */
static struct mount_info *__lookup_overlayfs(struct mount_info *list, char *rpath,
					     unsigned int st_dev, unsigned int st_ino,
					     unsigned int mnt_id)
{
	struct mount_info *found = NULL;
	struct mount_info *cur;
	int root_fd = -1;

	for (cur = list; cur != NULL; cur = cur->next) {
		char full_path[PATH_MAX];
		const char *target;
		struct stat sb;

		if (cur->fstype->code != FSTYPE__OVERLAYFS)
			continue;

		/*
		 * We need the mntns root fd of the process to be dumped,
		 * to make sure we stat the correct file. It is fetched
		 * once, on the first overlayfs entry.
		 */
		if (root_fd == -1) {
			root_fd = __mntns_get_root_fd(root_item->pid.real);
			if (root_fd < 0) {
				pr_err("Unable to get the root file descriptor of pid %d\n", root_item->pid.real);
				return ERR_PTR(-ENOENT);
			}
		}

		/* For the root mount rpath is used as is, otherwise it is appended to the mountpoint */
		if (is_root_mount(cur)) {
			target = rpath;
		} else {
			int len = snprintf(full_path, PATH_MAX, "%s/%s", cur->mountpoint, rpath);

			if (len >= PATH_MAX) {
				pr_err("Not enough space to concatenate %s and %s\n", cur->mountpoint, rpath);
				return ERR_PTR(-ENOSPC);
			}
			target = full_path;
		}

		if (fstatat(root_fd, target, &sb, 0) != 0)
			continue;

		if (st_dev == sb.st_dev && st_ino == sb.st_ino)
			found = cur;
	}

	return found;
}
/*
 * Looks up the mount of a file that lives in an overlayFS directory.
 *
 * This is useful in order to fix the OverlayFS bug present in the
 * Linux Kernel before version 4.2. See fixup_overlayfs for details.
 *
 * We first check whether the mnt_id and st_dev numbers already match
 * some entry in the mountinfo table. If so, the mnt_id is correct,
 * no fixup is needed and NULL is returned.
 *
 * Otherwise the overlayFS entries of the mountinfo table are probed:
 * the mountpoint is concatenated with the name of the file and the
 * resulting path is stat-ed to check the device id and inode number.
 * The result of that probing (see __lookup_overlayfs) is returned.
 */
struct mount_info *lookup_overlayfs(char *rpath, unsigned int st_dev,
				    unsigned int st_ino, unsigned int mnt_id)
{
	struct mount_info *cur = mntinfo;

	/* If the mnt_id and device number match for some entry, no fixup is needed */
	while (cur != NULL) {
		if (mnt_id == cur->mnt_id && st_dev == cur->s_dev)
			return NULL;
		cur = cur->next;
	}

	return __lookup_overlayfs(mntinfo, rpath, st_dev, st_ino, mnt_id);
}
/* Return the first entry of @list whose mnt_id equals @id, or NULL */
static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
{
	struct mount_info *cur = list;

	while (cur != NULL) {
		if (cur->mnt_id == id)
			break;
		cur = cur->next;
	}

	return cur;
}
/* Look @id up in the global mntinfo list; NULL when it is not there */
struct mount_info *lookup_mnt_id(unsigned int id)
{
	struct mount_info *found;

	found = __lookup_mnt_id(mntinfo, id);
	return found;
}
/* Return the first mount of the global mntinfo list with device @s_dev, or NULL */
struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
{
	struct mount_info *cur = mntinfo;

	while (cur != NULL) {
		if (cur->s_dev == s_dev)
			break;
		cur = cur->next;
	}

	return cur;
}
/*
 * Walk the mount tree from @mntinfo_tree down to the deepest mount whose
 * mountpoint (with its first character skipped) is a path-wise prefix of
 * @path and return it. When no child covers the path, the node the walk
 * is currently at (initially the tree root) is returned.
 */
static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
{
size_t pathlen = strlen(path);
struct mount_info *m = mntinfo_tree, *c;
while (1) {
/* Look for a child of m that covers the path */
list_for_each_entry(c, &m->children, siblings) {
size_t n;
n = strlen(c->mountpoint + 1);
/* A mountpoint longer than the path can't contain it */
if (n > pathlen)
continue;
if (strncmp(c->mountpoint + 1, path, min(n, pathlen)))
continue;
/* The prefix must end on a path component boundary */
if (n < pathlen && path[n] != '/')
continue;
/* Descend into this child and rescan from there */
m = c;
break;
}
/*
 * The cursor refers back to the list head only when the scan above
 * ran to completion without a match, i.e. no child of m covers the
 * path, so m is the answer. After a match m == c, and c's siblings
 * link never equals its own children head.
 */
if (&c->siblings == &m->children)
break;
}
pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint);
return m;
}
/*
 * Translate the device number @st_dev reported by stat() for @path into
 * the form used in mount_info->s_dev, resolving @path in the mount tree
 * of namespace @ns.
 */
dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
{
	struct mount_info *mi = mount_resolve_path(ns->mnt.mntinfo_tree, path);
	bool is_btrfs = strcmp(mi->fstype->name, "btrfs") == 0;

	/*
	 * BTRFS returns subvolume dev-id instead of
	 * superblock dev-id, in such case return device
	 * obtained from mountinfo (ie subvolume0).
	 */
	return is_btrfs ? mi->s_dev : MKKDEV(major(st_dev), minor(st_dev));
}
/*
 * Check whether @st_dev (as reported by stat()) denotes the same device
 * as @phys_dev: either directly after kdev_to_odev() conversion, or after
 * resolving @st_dev through the mount that covers @path in @ns.
 */
bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
			 struct ns_id *ns, const char *path)
{
	if (kdev_to_odev(phys_dev) == st_dev)
		return true;

	return phys_stat_resolve_dev(ns, st_dev, path) == phys_dev;
}
/*
 * Compare two mounts. Both must agree on device, fstype, source and
 * options. With @bind set that is enough; otherwise the roots and the
 * last components of the mountpoints have to be the same as well.
 */
static bool mounts_equal(struct mount_info *mi, struct mount_info *c, bool bind)
{
	if (mi->s_dev != c->s_dev)
		return false;
	if (c->fstype != mi->fstype)
		return false;
	if (strcmp(c->source, mi->source) != 0)
		return false;
	if (strcmp(c->options, mi->options) != 0)
		return false;

	/* Bind-mount comparison doesn't care about root and mountpoint */
	if (bind)
		return true;

	if (strcmp(c->root, mi->root) != 0)
		return false;

	return strcmp(basename(c->mountpoint), basename(mi->mountpoint)) == 0;
}
/*
 * mnt_roots is a temporary directory for restoring sub-trees of
 * non-root namespaces. It stays NULL until such a directory is set up.
 */
static char *mnt_roots;
/*
 * Link the flat @list of mounts into a tree using the
 * mnt_id:parent_mnt_id relations and return the root mount.
 *
 * When mnt_roots is set, a synthetic mount for that directory is
 * created and attached under the root; roots of nested mount
 * namespaces are hung off it.
 *
 * Returns NULL on allocation failure, when a mount has no resolvable
 * parent, or when no root is found.
 */
static struct mount_info *mnt_build_ids_tree(struct mount_info *list)
{
	struct mount_info *m, *root = NULL;
	struct mount_info *tmp_root_mount = NULL;

	if (mnt_roots) {
		/* mnt_roots is a tmpfs mount and it's private */
		tmp_root_mount = mnt_entry_alloc();
		if (!tmp_root_mount)
			return NULL;

		tmp_root_mount->mountpoint = mnt_roots;
		tmp_root_mount->mounted = true;
	}

	/*
	 * Just resolve the mnt_id:parent_mnt_id relations
	 */
	pr_debug("\tBuilding plain mount tree\n");
	for (m = list; m != NULL; m = m->next) {
		struct mount_info *p;

		pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);

		if (m->mnt_id != m->parent_mnt_id)
			p = __lookup_mnt_id(list, m->parent_mnt_id);
		else /* a circular mount reference. It's rootfs or smth like it. */
			p = NULL;

		if (!p) {
			/* This should be / */
			if (root == NULL && is_root_mount(m)) {
				root = m;
				continue;
			}

			pr_err("Mountpoint %d w/o parent %d found @%s (root %s)\n",
			       m->mnt_id, m->parent_mnt_id, m->mountpoint,
			       root ? "found" : "not found");

			if (!root || !m->is_ns_root)
				return NULL;

			if (!mounts_equal(root, m, true) ||
			    strcmp(root->root, m->root)) {
				pr_err("Nested mount namespaces with different roots are not supported yet\n");
				return NULL;
			}

			/*
			 * Without mnt_roots there is no temporary mount to
			 * attach this namespace root to; fail instead of
			 * dereferencing a NULL parent below.
			 */
			if (!tmp_root_mount) {
				pr_err("No temporary root mount to attach ns root %d to\n",
				       m->mnt_id);
				return NULL;
			}

			/*
			 * A root of a sub mount namespace is
			 * mounted in a temporary directory in the
			 * root mount namespace, so its parent is
			 * the main root.
			 */
			p = tmp_root_mount;
		}

		m->parent = p;
		list_add_tail(&m->siblings, &p->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	if (mnt_roots) {
		tmp_root_mount->parent = root;
		list_add_tail(&tmp_root_mount->siblings, &root->children);
	}

	return root;
}
/* Depth of a mount: the number of '/' characters in its mountpoint */
static int mnt_depth(struct mount_info *m)
{
	const char *pos = m->mountpoint;
	int slashes = 0;

	while ((pos = strchr(pos, '/')) != NULL) {
		slashes++;
		pos++;
	}

	return slashes;
}
/*
 * Recursively reorder the children of every node of @tree so that
 * deeper mountpoints come first.
 */
static void mnt_resort_siblings(struct mount_info *tree)
{
	struct mount_info *m, *p;
	LIST_HEAD(list);

	/*
	 * Put siblings of each node in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
	 */

	pr_info("\tResorting siblings on %d\n", tree->mnt_id);
	while (!list_empty(&tree->children)) {
		int depth;

		m = list_first_entry(&tree->children, struct mount_info, siblings);
		list_del(&m->siblings);

		depth = mnt_depth(m);

		/* Find the first already sorted entry that is shallower than m */
		list_for_each_entry(p, &list, siblings)
			if (mnt_depth(p) < depth)
				break;

		/*
		 * Insert m right BEFORE that entry. If none was found, p
		 * refers to the list head and m lands at the tail. Inserting
		 * after p (list_add) would break the descending order, and
		 * the strict comparison keeps equal-depth mounts in their
		 * original relative order.
		 */
		list_add_tail(&m->siblings, &p->siblings);
		mnt_resort_siblings(m);
	}

	list_splice(&list, &tree->children);
}
static void mnt_tree_show(struct mount_info *tree, int off)
{
struct mount_info *m;
pr_info("%*s[%s](%d->%d)\n", off, "",
tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);
list_for_each_entry(m, &tree->children, siblings)
mnt_tree_show(m, off + 1);
pr_info("%*s<--\n", off, "");
}
/*
 * Check whether the user supplied an external mapping for the mountpoint
 * of @info and remember it in info->external if so.
 *
 * Returns 0 when a mapping is found, -ENOTSUP otherwise.
 */
static int try_resolve_ext_mount(struct mount_info *info)
{
	/* trim the leading . */
	char *key = info->mountpoint + 1;
	struct ext_mount *mapping = ext_mount_lookup(key);

	if (!mapping)
		return -ENOTSUP;

	pr_info("Found %s mapping for %s mountpoint\n",
		mapping->val, info->mountpoint);
	info->external = mapping;
	return 0;
}
static struct mount_info *find_widest_shared(struct mount_info *m)
{
struct mount_info *p;
/*
* Try to find a mount, which is wider or equal.
* A is wider than B, if A->root is a subpath of B->root.
*/
list_for_each_entry(p, &m->mnt_share, mnt_share)
if (issubpath(m->root, p->root))
return p;
return NULL;
}
/*
 * Among the children of @m find the one whose mountpoint, taken from
 * offset @m_mpnt_l, equals @ct_mountpoint. Only the first child with a
 * matching path is considered: it is returned when it is equal to @ct
 * (non-bind comparison), otherwise the result is NULL.
 */
static struct mount_info *find_shared_peer(struct mount_info *m,
		struct mount_info *ct, char *ct_mountpoint, int m_mpnt_l)
{
	struct mount_info *child;

	list_for_each_entry(child, &m->children, siblings) {
		if (strcmp(ct_mountpoint, child->mountpoint + m_mpnt_l) != 0)
			continue;

		return mounts_equal(child, ct, false) ? child : NULL;
	}

	return NULL;
}
/*
 * Length of @path for use as a sub-path offset: strlen() with one
 * trailing slash, if any, not counted.
 *
 * If we're pure / then the length is zero so that adding this
 * value as sub-path offset would produce the correct result.
 * E.g. the tail path of the "/foo/bar" relative to the "/foo"
 * will be the "/foo/bar" + len("/foo") == "/bar", while the
 * same relative to the "/" should be +0 to be the "/foo/bar",
 * not +1 and the "foo/bar".
 *
 * An empty string yields 0.
 */
static inline int path_length(char *path)
{
	int off = strlen(path);

	/* Guard against reading path[-1] when the string is empty */
	if (off > 0 && path[off - 1] == '/')
		off--;

	return off;
}
/*
 * Check that mount @m and a wider peer from its shared group see the
 * same set of child mounts.
 *
 * Returns 0 when m is the widest mount of its group or the sets match,
 * -1 (with an error logged) otherwise. The children list of m is put
 * back together before returning, though its order may change.
 */
static int validate_shared(struct mount_info *m)
{
struct mount_info *t, *ct;
int t_root_l, m_root_l, t_mpnt_l, m_mpnt_l;
char *m_root_rpath;
/* Children of m that were matched against a child of t are parked here */
LIST_HEAD(children);
/*
 * Check that all mounts in one shared group have the same set of
 * children. Only visible children are accounted. A non-root bind-mount
 * doesn't see children out of its root and that is an expected case.
 *
 * Here are a few conditions:
 * 1. t is wider than m
 * 2. We search a wider mount in the same direction, so when we
 *    enumerate all mounts, we can't be sure that all of them
 *    have the same set of children.
 */
t = find_widest_shared(m);
if (!t)
/*
 * The current mount is the widest one in its shared group,
 * all others will be compared to it or with some other,
 * which will be compared to it.
 */
return 0;
/* The set of children which are visible for both should be the same */
t_root_l = path_length(t->root);
m_root_l = path_length(m->root);
t_mpnt_l = path_length(t->mountpoint);
m_mpnt_l = path_length(m->mountpoint);
/* For example:
 * t->root = /        t->mp = ./zdtm/live/static/mntns_root_bind.test
 * m->root = /test    m->mp = ./zdtm/live/static/mntns_root_bind.test/test.bind
 * t_root_l = 0       t_mpnt_l = 39
 * m_root_l = 5       m_mpnt_l = 49
 * ct->root = /       ct->mp = ./zdtm/live/static/mntns_root_bind.test/test/sub
 * tp = /test/sub     mp = /test     len=5
 */
/*
 * ct: | t->root | child mount point |
 * cm: | m->root | child mount point |
 * ct: |         | /test/sub         |
 * cm: | /test   | /sub              |
 *     | A       | B                 |
 *               | ct->mountpoint + t_mpnt_l
 *     | m->root + strlen(t->root)
 */
m_root_rpath = m->root + t_root_l; /* path from t->root to m->root */
/* Search for children which are visible in both mounts. */
list_for_each_entry(ct, &t->children, siblings) {
char *ct_mpnt_rpath;
struct mount_info *cm;
/* Roots of mount namespaces are not compared */
if (ct->is_ns_root)
continue;
ct_mpnt_rpath = ct->mountpoint + t_mpnt_l; /* path from t->mountpoint to ct->mountpoint */
/*
 * Check whether ct can be visible at m, i.e. the
 * ct's rpath starts (as path) with m's rpath.
 */
if (!issubpath(ct_mpnt_rpath, m_root_rpath))
continue;
/*
 * The ct has a peer in m but with the mount path deeper according
 * to m's depth relative to t. Thus -- trim this difference (the
 * length of m_root_rpath) from ct's mountpoint path.
 */
ct_mpnt_rpath += m_root_l - t_root_l;
/*
 * Find in m the mountpoint that fully matches with ct (with the
 * described above path corrections).
 */
cm = find_shared_peer(m, ct, ct_mpnt_rpath, m_mpnt_l);
if (!cm)
goto err;
/*
 * Keep this one aside. At the end of t's children scan we should
 * move _all_ m's children here (the list_empty check below).
 */
list_move(&cm->siblings, &children);
}
/* Anything left on m's list has no counterpart under t */
if (!list_empty(&m->children))
goto err;
/* Give m its children back */
list_splice(&children, &m->children);
return 0;
err:
/* Return the parked children to m before reporting the mismatch */
list_splice(&children, &m->children);
pr_err("%d:%s and %d:%s have different set of mounts\n",
m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint);
return -1;
}
/*
* Find the mount_info from which the respective bind-mount
* can be created. It can be either an FS-root mount, or the
* root of the tree (the latter only if its root path is the
* sub-path of the bind mount's root).
*/
static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
{
struct mount_info *sm;
list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
if (fsroot_mounted(sm) ||
(sm->parent == NULL &&
strstartswith(bm->root, sm->root)))
return sm;
return NULL;
}
/*
 * Check that every mount in the @info list can be handled: shared
 * groups are consistent, the filesystem is supported or a source for a
 * bind-mount exists (or a plugin / external mapping covers it), and the
 * mount is not overmounted by a sibling.
 *
 * @for_dump selects the dump-time behaviour, where plugins are asked
 * about mounts without a proper root mount; otherwise the need_plugin
 * and external marks already set on the mount are trusted.
 *
 * Returns 0 when all mounts pass, -1 on the first failure.
 */
static int validate_mounts(struct mount_info *info, bool for_dump)
{
struct mount_info *m, *t;
for (m = info; m; m = m->next) {
if (m->parent == NULL || m->is_ns_root)
/* root mount can be any */
continue;
if (m->shared_id && validate_shared(m))
return -1;
/*
 * Mountpoint can point to / of an FS. In that case this FS
 * should be of some known type so that we can just mount one.
 *
 * Otherwise it's a bindmount mountpoint and we try to find
 * what fsroot mountpoint it's bound to. If this point is the
 * root mount, the path to bindmount root should be accessible
 * from the rootmount path (see the strstartswith check in
 * find_fsroot_mount_for).
 */
if (fsroot_mounted(m)) {
if (m->fstype->code == FSTYPE__UNSUPPORTED) {
pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
m->mountpoint, m->s_dev, m->root, m->mnt_id);
return -1;
}
} else {
t = find_fsroot_mount_for(m);
if (!t) {
int ret;
if (for_dump) {
/*
 * We've already resolved the mount
 * and it is external.
 */
if (m->external) {
ret = 0;
} else {
/* Ask plugins; success marks the mount as plugin-handled */
ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
if (ret == 0)
m->need_plugin = true;
}
} else {
if (m->need_plugin || m->external)
/*
 * plugin should take care of this one
 * in restore_ext_mount, or do_bind_mount
 * will mount it as external
 */
ret = 0;
else
ret = -ENOTSUP;
}
if (ret < 0) {
/* Other negative codes fail without this message */
if (ret == -ENOTSUP)
pr_err("%d:%s doesn't have a proper root mount\n",
m->mnt_id, m->mountpoint);
return -1;
}
}
}
/* Refuse mounts that are hidden under a sibling mount */
list_for_each_entry(t, &m->parent->children, siblings) {
if (m == t)
continue;
if (!issubpath(m->mountpoint, t->mountpoint))
continue;
pr_err("%d:%s is overmounted\n", m->mnt_id, m->mountpoint);
return -1;
}
}
return 0;
}
/*
 * Return the part of @target_root that follows its common prefix with
 * @source_root. The result points into @target_root.
 */
static char *cut_root_for_bind(char *target_root, char *source_root)
{
	int pos = 0;

	/*
	 * Cut common part of root.
	 * For non-root binds the source is always "/" (checked)
	 * so this will result in this slash removal only.
	 */
	for (;;) {
		if (target_root[pos] != source_root[pos])
			break;

		pos++;
		if (source_root[pos] == '\0')
			break;

		/* The target must not end before the source does */
		BUG_ON(target_root[pos] == '\0');
	}

	return target_root + pos;
}
/*
 * Pick from @list (mounts outside of the dumped namespace) the mount
 * that @info can be bind-mounted from. An entry whose sharing matches
 * exactly is returned right away; otherwise the last suitable entry is
 * returned, or NULL when nothing suits.
 */
static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *cur, *best = NULL;

	for (cur = list; cur != NULL; cur = cur->next) {
		bool same_group, is_master;

		if (!mounts_equal(info, cur, true))
			continue;

		/*
		 * This means we have a situation like:
		 *
		 * root@criu:~# mount --bind bind1/subdir/ bind2
		 * root@criu:~# mount --bind bind1/ bind3
		 *
		 * outside the container, and bind1 is directly bind mounted
		 * inside the container. mounts_equal() considers these mounts
		 * equal for bind purposes, but their roots are different, and
		 * we want to match the one with the right root.
		 */
		if (!issubpath(info->root, cur->root))
			continue;

		best = cur;

		/*
		 * Consider the case of:
		 *
		 * mount /xxx
		 * mount --bind /xxx /yyy
		 * mount --make-shared /yyy
		 * mount --bind /xxx /zzz
		 * mount --make-shared /zzz
		 * bind mount a shared mount into the namespace
		 *
		 * Here, we want to return the /right/ mount, not just a mount
		 * that's equal. However, in the case:
		 *
		 * bind mount a shared mount into the namespace
		 * inside the namespace, remount MS_PRIVATE
		 * inside the namespace, remount MS_SHARED
		 *
		 * there will be no external mount with matching sharing
		 * because the sharing is only internal; we still want to bind
		 * mount from this mountinfo so we should return it, but we
		 * should make the sharing namespace private after that bind
		 * mount.
		 *
		 * Below are the cases where we found an exact match.
		 */
		same_group = (info->flags & MS_SHARED) &&
			     info->shared_id == cur->shared_id;
		is_master = (info->flags & MS_SLAVE) &&
			    info->master_id == cur->shared_id;

		if (same_group || is_master)
			return best;
	}

	return best;
}
/*
 * Find the mount namespace entry that belongs to the criu process
 * itself (matched by pid) and make sure its mountinfo is collected.
 *
 * Returns the entry, or NULL (with an error logged) when there is no
 * such entry or its mountinfo can't be collected.
 */
static struct ns_id *find_ext_ns_id(void)
{
	int pid = getpid();
	struct ns_id *ns;

	/*
	 * Walk the whole list, including its last element; this also
	 * copes with an empty list.
	 */
	for (ns = ns_ids; ns != NULL; ns = ns->next) {
		if (ns->pid != pid || ns->nd != &mnt_ns_desc)
			continue;

		/* Collect the mountinfo lazily, on first use */
		if (!ns->mnt.mntinfo_list &&
		    !collect_mntinfo(ns, true))
			break;

		return ns;
	}

	pr_err("Failed to find criu pid's mount ns!\n");
	return NULL;
}
/*
 * Mark the mounts of the @info list that come from outside of the
 * dumped namespace as external.
 *
 * A mount is external when the user supplied a mapping for it (see
 * try_resolve_ext_mount) or, with opts.autodetect_ext_mounts, when a
 * matching mount is found in criu's own mount namespace. For
 * autodetected mounts the source is replaced with the path of the
 * matching outside mount.
 *
 * Returns 0 on success, -1 on error.
 */
static int resolve_external_mounts(struct mount_info *info)
{
struct ns_id *ext_ns = NULL;
struct mount_info *m;
if (opts.autodetect_ext_mounts) {
ext_ns = find_ext_ns_id();
if (!ext_ns)
return -1;
}
for (m = info; m; m = m->next) {
int ret, size;
char *p, *cut_root;
struct ext_mount *em;
struct mount_info *match;
/* Root mounts are never treated as external */
if (m->parent == NULL || m->is_ns_root)
continue;
ret = try_resolve_ext_mount(m);
if (ret < 0 && ret != -ENOTSUP) {
return -1;
} else if (ret == -ENOTSUP && !ext_ns) {
/* No explicit mapping and autodetection is off */
continue;
} else if (ret == 0) {
/* Explicit mapping found, nothing to detect */
continue;
}
/* No explicit mapping: try to autodetect one */
match = find_best_external_match(ext_ns->mnt.mntinfo_list, m);
if (!match)
continue;
if (m->flags & MS_SHARED) {
if (!opts.enable_external_sharing)
continue;
/* The sharing group differs from the outside one */
if (m->shared_id != match->shared_id)
m->internal_sharing = true;
}
if (m->flags & MS_SLAVE) {
if (!opts.enable_external_masters)
continue;
/*
 * In order to support something like internal slavery,
 * we need to teach can_mount_now and do_mount_one
 * about slavery relationships in external mounts. This
 * seems like an uncommon case, so we punt for now.
 */
if (m->master_id != match->shared_id)
continue;
}
cut_root = cut_root_for_bind(m->root, match->root);
/* +2 for the NULL byte and the extra / in the sprintf below,
 * which we cut off in cut_root_for_bind(). */
size = strlen(match->mountpoint + 1) + strlen(cut_root) + 2;
p = xmalloc(sizeof(char) * size);
if (!p)
return -1;
ret = snprintf(p, size, "%s/%s", match->mountpoint + 1, cut_root);
if (ret < 0 || ret >= size) {
free(p);
return -1;
}
em = xmalloc(sizeof(struct ext_mount));
if (!em) {
free(p);
return -1;
}
em->val = AUTODETECTED_MOUNT;
em->key = p;
m->external = em;
/* NOTE: the same buffer p is referenced by both em->key and m->source */
xfree(m->source);
m->source = p;
pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
}
return 0;
}
/*
 * Link the mounts of the @info list with their sharing peers, masters
 * and bind-mount twins:
 *  - mounts with the same shared_id are put on the mnt_share list of
 *    the first mount of the group that gets processed;
 *  - a slave is put on the mnt_slave_list of a mount whose shared_id is
 *    the slave's master_id, and mnt_master is set accordingly;
 *  - mounts equal for bind purposes are put on the mnt_bind list of the
 *    earliest of them.
 *
 * Returns 0 on success, -1 when a slave mount has no master in the list
 * and is neither a root nor external. @for_dump is not used here.
 */
static int collect_shared(struct mount_info *info, bool for_dump)
{
struct mount_info *m, *t;
/*
 * If we have shared mounts, both master and
 * slave targets are to be present in the mount
 * list, otherwise we can't be sure if we can
 * recreate the scheme later on restore.
 */
for (m = info; m; m = m->next) {
bool need_share, need_master;
/* Skip group collection if m was already linked into a group */
need_share = m->shared_id && list_empty(&m->mnt_share);
need_master = m->master_id;
for (t = info; t && (need_share || need_master); t = t->next) {
if (t == m)
continue;
if (need_master && t->shared_id == m->master_id) {
pr_debug("The mount %d is slave for %d\n", m->mnt_id, t->mnt_id);
list_add(&m->mnt_slave, &t->mnt_slave_list);
m->mnt_master = t;
/* Only the first master found is used */
need_master = false;
}
/* Collect all mounts from this group */
if (need_share && t->shared_id == m->shared_id) {
pr_debug("Mount %d is shared with %d group %d\n",
m->mnt_id, t->mnt_id, m->shared_id);
list_add(&t->mnt_share, &m->mnt_share);
}
}
/*
 * If we haven't already determined this mount is external,
 * then we don't know where it came from.
 */
if (need_master && m->parent && !m->external) {
pr_err("Mount %d %s (master_id: %d shared_id: %d) "
"has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
m->mountpoint, m->master_id, m->shared_id);
return -1;
}
/* Search bind-mounts */
if (list_empty(&m->mnt_bind)) {
/*
 * A first mounted point will be set up as a source point
 * for others. Look at propagate_mount()
 */
for (t = m->next; t; t = t->next) {
if (mounts_equal(m, t, true))
list_add(&t->mnt_bind, &m->mnt_bind);
}
}
}
return 0;
}
/*
 * Turn the flat @list of mounts into a tree, sort the siblings of every
 * node and log the result. Returns the tree root or NULL on failure.
 */
static struct mount_info *mnt_build_tree(struct mount_info *list)
{
	struct mount_info *root;

	/*
	 * Organize them in a sequence in which they can be mounted/umounted.
	 */
	pr_info("Building mountpoints tree\n");

	root = mnt_build_ids_tree(list);
	if (root == NULL)
		return NULL;

	mnt_resort_siblings(root);

	pr_info("Done:\n");
	mnt_tree_show(root, 0);

	return root;
}
/*
 * Open the mountpoint of @pm and verify that the opened directory
 * really lives on the device recorded in pm->s_dev.
 *
 * mnt_fd is a file descriptor on the mountpoint, which is closed in an
 * error case. If mnt_fd is -1, the mountpoint will be opened by this
 * function.
 *
 * Returns the descriptor on success, -1 on failure.
 */
int __open_mountpoint(struct mount_info *pm, int mnt_fd)
{
	struct stat st;
	dev_t dev;

	if (mnt_fd == -1) {
		int root_fd = mntns_get_root_fd(pm->nsid);

		if (root_fd < 0)
			return -1;

		mnt_fd = openat(root_fd, pm->ns_mountpoint, O_RDONLY);
		if (mnt_fd < 0) {
			pr_perror("Can't open %s", pm->ns_mountpoint);
			return -1;
		}
	}

	if (fstat(mnt_fd, &st) < 0) {
		pr_perror("fstat(%s) failed", pm->ns_mountpoint);
		close(mnt_fd);
		return -1;
	}

	dev = phys_stat_resolve_dev(pm->nsid, st.st_dev, pm->ns_mountpoint + 1);
	if (dev == pm->s_dev)
		return mnt_fd;

	/* Something else is visible at this path */
	pr_err("The file system %#x (%#x) %s %s is inaccessible\n",
	       pm->s_dev, (int)dev, pm->fstype->name, pm->ns_mountpoint);
	close(mnt_fd);
	return -1;
}
/*
 * Open the mountpoint of the first known mount with device @s_dev.
 * Returns the descriptor, -ENOENT when there is no such mount, or -1
 * when opening fails.
 */
int open_mount(unsigned int s_dev)
{
	struct mount_info *mi = lookup_mnt_sdev(s_dev);

	if (mi == NULL)
		return -ENOENT;

	return __open_mountpoint(mi, -1);
}
/*
 * Get a file descriptor through which the contents of mount @pm itself
 * can be read, even when other mounts sit on top of it.
 *
 * Returns the descriptor on success, -1 on failure.
 */
static int open_mountpoint(struct mount_info *pm)
{
int fd = -1, ns_old = -1;
/* mkdtemp() templates: /tmp is tried first, then the root directory */
char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
char *mnt_path = mnt_path_tmp;
int cwd_fd;
/*
 * If a mount doesn't have children, we can open a mount point,
 * otherwise we need to create a "private" copy.
 */
if (list_empty(&pm->children))
return __open_mountpoint(pm, -1);
pr_info("Something is mounted on top of %s\n", pm->mountpoint);
/*
 * To create a "private" copy, the target mount is bind-mounted
 * in a temporary place w/o MS_REC (non-recursively).
 * A mount point can't be bind-mounted in criu's namespace, it will be
 * mounted in a target namespace. The sequence of actions is
 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
 */
/* Remember the current directory so it can be restored afterwards */
cwd_fd = open(".", O_DIRECTORY);
if (cwd_fd < 0) {
pr_perror("Unable to open cwd");
return -1;
}
if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0)
goto out;
mnt_path = mkdtemp(mnt_path_tmp);
/* Fall back to the root directory only when /tmp doesn't exist */
if (mnt_path == NULL && errno == ENOENT)
mnt_path = mkdtemp(mnt_path_root);
if (mnt_path == NULL) {
pr_perror("Can't create a temporary directory");
goto out;
}
if (mount(pm->mountpoint, mnt_path, NULL, MS_BIND, NULL)) {
pr_perror("Can't bind-mount %d:%s to %s",
pm->mnt_id, pm->mountpoint, mnt_path);
rmdir(mnt_path);
goto out;
}
fd = open_detach_mount(mnt_path);
if (fd < 0)
goto out;
if (restore_ns(ns_old, &mnt_ns_desc)) {
/* Don't try to restore the namespace again on the error path */
ns_old = -1;
goto out;
}
if (fchdir(cwd_fd)) {
pr_perror("Unable to restore cwd");
close(cwd_fd);
close(fd);
return -1;
}
close(cwd_fd);
/* Validate the descriptor; it is closed by the callee on failure */
return __open_mountpoint(pm, fd);
out:
/* Error path: go back to the original namespace and directory */
if (ns_old >= 0)
restore_ns(ns_old, &mnt_ns_desc);
close_safe(&fd);
if (fchdir(cwd_fd))
pr_perror("Unable to restore cwd");
close(cwd_fd);
return -1;
}
/*
 * Append @opt to the comma separated option string of @pm, growing the
 * string as needed. A comma is inserted unless the current string is
 * empty or already ends with one.
 *
 * Returns 0 on success, -1 when the string can't be reallocated (in
 * which case pm->options is left as it was).
 */
static int attach_option(struct mount_info *pm, char *opt)
{
	int cur_len = strlen(pm->options);
	int opt_len = strlen(opt);
	char *grown;

	/* Room for the separator and the terminating NUL byte */
	grown = xrealloc(pm->options, cur_len + opt_len + 2);
	if (!grown)
		return -1;

	if (cur_len != 0 && grown[cur_len - 1] != ',')
		grown[cur_len++] = ',';

	/* Copies the terminating NUL as well */
	memcpy(grown + cur_len, opt, opt_len + 1);
	pm->options = grown;

	return 0;
}
/*
 * Is it mounted w or w/o the newinstance option?
 *
 * Returns a non-positive kerndat_fs_virtualized() result as is;
 * otherwise the result of appending "newinstance" to the options.
 */
static int devpts_parse(struct mount_info *pm)
{
	int virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev);

	if (virt > 0) {
		/*
		 * Kernel hides this option, but if the fs instance
		 * is new (virtualized) we know that it was created
		 * with -o newinstance.
		 */
		return attach_option(pm, "newinstance");
	}

	return virt;
}
/*
 * Dump the contents of tmpfs mount @pm by running tar on its
 * mountpoint, with the tmpfs image passed as the second descriptor
 * argument of cr_system_userns().
 *
 * Returns 0 on success, non-zero on failure.
 */
static int tmpfs_dump(struct mount_info *pm)
{
	int ret = -1, fd = -1, userns_pid = -1;
	char tmpfs_path[PSFDS];
	char *tar_argv[] = {
		"tar", "--create",
		"--gzip",
		"--no-unquote",
		"--no-wildcards",
		"--one-file-system",
		"--check-links",
		"--preserve-permissions",
		"--sparse",
		"--numeric-owner",
		"--directory", tmpfs_path, ".", NULL
	};
	struct cr_img *img;
	int fd_flags;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	/* tar reaches the mount via /proc/self/fd, so the fd must survive exec */
	fd_flags = fcntl(fd, F_GETFD);
	if (fcntl(fd, F_SETFD, fd_flags & ~FD_CLOEXEC) == -1) {
		pr_perror("Can not drop FD_CLOEXEC");
		goto out;
	}

	img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev);
	if (!img)
		goto out;

	sprintf(tmpfs_path, "/proc/self/fd/%d", fd);

	if (root_ns_mask & CLONE_NEWUSER)
		userns_pid = root_item->pid.real;

	ret = cr_system_userns(-1, img_raw_fd(img), -1, "tar", tar_argv, userns_pid);
	if (ret)
		pr_err("Can't dump tmpfs content\n");

	close_image(img);
out:
	close_safe(&fd);
	return ret;
}
/*
 * Virtualized devtmpfs on any side (dump or restore)
 * means, that we should try to handle it as a plain
 * tmpfs.
 *
 * Interesting case -- shared on dump and virtual on
 * restore -- will fail, since no tarball with the fs
 * contents will be found.
 *
 * Returns what kerndat_fs_virtualized() reports for the device of @pm.
 */
static int devtmpfs_virtual(struct mount_info *pm)
{
	int virt;

	virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev);
	return virt;
}
/*
 * Dump a devtmpfs mount: a virtualized instance (devtmpfs_virtual()
 * returned 1) is dumped as a plain tmpfs, any other result of the
 * check is returned as is.
 */
static int devtmpfs_dump(struct mount_info *pm)
{
	int virt = devtmpfs_virtual(pm);

	if (virt != 1)
		return virt;

	return tmpfs_dump(pm);
}
/*
 * Restore the contents of tmpfs mount @pm by extracting the saved
 * tarball into its mountpoint.
 *
 * The image keyed by the device number is tried first; if it is empty,
 * the image keyed by the mount id is used instead.
 *
 * Returns 0 on success, -1 when no usable image exists or tar fails.
 */
static int tmpfs_restore(struct mount_info *pm)
{
	int ret;
	struct cr_img *img;

	img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev);
	if (empty_image(img)) {
		close_image(img);
		img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id);
	}
	if (!img)
		return -1;
	if (empty_image(img)) {
		/* An empty image is still an open handle, don't leak it */
		close_image(img);
		return -1;
	}

	ret = cr_system(img_raw_fd(img), -1, -1, "tar",
			(char *[]) {"tar", "--extract", "--gzip",
				    "--no-unquote", "--no-wildcards",
				    "--directory", pm->mountpoint, NULL});
	close_image(img);

	if (ret) {
		pr_err("Can't restore tmpfs content\n");
		return -1;
	}

	return 0;
}
/*
 * Restore a devtmpfs mount: a virtualized instance (devtmpfs_virtual()
 * returned 1) is restored as a plain tmpfs, any other result of the
 * check is returned as is.
 */
static int devtmpfs_restore(struct mount_info *pm)
{
	int virt = devtmpfs_virtual(pm);

	if (virt != 1)
		return virt;

	return tmpfs_restore(pm);
}
/*
 * "Dump" a binfmt_misc mount: nothing is saved, the mount is only
 * checked to contain nothing but the "register" and "status" entries.
 *
 * Returns 0 when that holds, -1 otherwise or on error.
 */
static int binfmt_misc_dump(struct mount_info *pm)
{
	struct dirent *ent;
	DIR *dir;
	int fd, ret = 0;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		return -1;
	}

	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
		if (dir_dots(ent))
			continue;
		if (strcmp(ent->d_name, "register") == 0)
			continue;
		if (strcmp(ent->d_name, "status") == 0)
			continue;

		pr_err("binfmt_misc isn't empty: %s\n", ent->d_name);
		ret = -1;
		break;
	}

	/* Closes the underlying descriptor too */
	closedir(dir);
	return ret;
}
/*
 * "Dump" a fusectl mount: nothing is saved. Every entry name in the
 * mount is parsed as a number, and the dump fails if a FUSE mount from
 * the mntinfo list whose device minor equals that number is not marked
 * as external.
 *
 * Returns 0 when all entries pass, -1 otherwise or on error.
 */
static int fusectl_dump(struct mount_info *pm)
{
	struct dirent *ent;
	DIR *dir;
	int fd, ret = -1;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		return -1;
	}

	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
		struct mount_info *mi;
		int entry_id;

		if (dir_dots(ent))
			continue;

		if (sscanf(ent->d_name, "%d", &entry_id) != 1) {
			pr_err("wrong number of items scanned in fusectl dump\n");
			goto out;
		}

		for (mi = mntinfo; mi != NULL; mi = mi->next) {
			if (mi->fstype->code != FSTYPE__FUSE)
				continue;
			if (entry_id != minor(mi->s_dev))
				continue;
			if (mi->external)
				continue;

			pr_err("%s is a fuse mount but not external\n", mi->mountpoint);
			goto out;
		}
	}

	ret = 0;
out:
	/* Closes the underlying descriptor too */
	closedir(dir);
	return ret;
}
/*
 * "Dump" a filesystem that is expected to be empty: nothing is saved,
 * the mount is only checked to contain no entries except the dot ones.
 *
 * Returns 0 when the mount is empty, -1 otherwise or on error.
 */
static int dump_empty_fs(struct mount_info *pm)
{
	struct dirent *ent;
	DIR *dir;
	int fd, ret = 0;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		return -1;
	}

	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
		if (dir_dots(ent))
			continue;

		pr_err("%s isn't empty: %s\n", pm->fstype->name, ent->d_name);
		ret = -1;
		break;
	}

	/* Closes the underlying descriptor too */
	closedir(dir);
	return ret;
}
/*
 * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore
 * of these fses. The mount argument is ignored; the result is always -1.
 */
static int always_fail(struct mount_info *pm)
{
	(void)pm;

	return -1;
}
/*
 * Table of known filesystem types.
 *
 * Slot 0 ("unsupported") is the fallback returned by the lookup helpers
 * below when nothing else fits. The slots left zeroed after the last
 * initializer are claimed at run time, first free one first, for
 * auto-detected filesystems (see __find_fstype_by_name).
 *
 * Optional callbacks: .parse, .dump and .restore.
 */
static struct fstype fstypes[32] = {
{
.name = "unsupported",
.code = FSTYPE__UNSUPPORTED,
}, {
.name = "proc",
.code = FSTYPE__PROC,
}, {
.name = "sysfs",
.code = FSTYPE__SYSFS,
}, {
.name = "devtmpfs",
.code = FSTYPE__DEVTMPFS,
.dump = devtmpfs_dump,
.restore = devtmpfs_restore,
}, {
.name = "binfmt_misc",
.code = FSTYPE__BINFMT_MISC,
.dump = binfmt_misc_dump,
}, {
.name = "tmpfs",
.code = FSTYPE__TMPFS,
.dump = tmpfs_dump,
.restore = tmpfs_restore,
}, {
.name = "devpts",
.parse = devpts_parse,
.code = FSTYPE__DEVPTS,
}, {
.name = "simfs",
.code = FSTYPE__SIMFS,
}, {
/* Known by name (phys_stat_resolve_dev checks for it) but coded as unsupported */
.name = "btrfs",
.code = FSTYPE__UNSUPPORTED,
}, {
.name = "pstore",
.dump = dump_empty_fs,
.code = FSTYPE__PSTORE,
}, {
.name = "mqueue",
.dump = dump_empty_fs,
.code = FSTYPE__MQUEUE,
}, {
.name = "securityfs",
.code = FSTYPE__SECURITYFS,
}, {
.name = "fusectl",
.dump = fusectl_dump,
.code = FSTYPE__FUSECTL,
}, {
.name = "debugfs",
.code = FSTYPE__DEBUGFS,
}, {
.name = "cgroup",
.code = FSTYPE__CGROUP,
}, {
.name = "aufs",
.code = FSTYPE__AUFS,
.parse = aufs_parse,
}, {
/* Both callbacks fail unconditionally */
.name = "fuse",
.code = FSTYPE__FUSE,
.dump = always_fail,
.restore = always_fail,
}, {
.name = "overlay",
.code = FSTYPE__OVERLAYFS,
.parse = overlayfs_parse,
},
};
/*
 * Comma separated list of filesystem names that may be handled
 * automatically. NULL means none; pointing at fsauto_all (compared by
 * address) means every filesystem name qualifies.
 */
static char fsauto_all[] = "all";
static char *fsauto_names;
/*
 * Check whether the comma separated string @css has an item that is
 * exactly @str. An empty @str is never found.
 */
static bool css_contains(const char *css, const char *str)
{
	const char *hit;
	int len = strlen(str);

	if (len == 0)
		return false;

	hit = strstr(css, str);
	while (hit != NULL) {
		/* The occurrence must span a whole item, not a part of one */
		bool starts_item = (hit == css) || (hit[-1] == ',');
		bool ends_item = (hit[len] == '\0') || (hit[len] == ',');

		if (starts_item && ends_item)
			return true;

		hit = strstr(hit + len, str);
	}

	return false;
}
/* Tell whether the filesystem called @name is enabled for automatic handling */
static bool fsname_is_auto(const char *name)
{
	/* Nothing was enabled */
	if (fsauto_names == NULL)
		return false;

	/* Everything was enabled */
	if (fsauto_names == fsauto_all)
		return true;

	return css_contains(fsauto_names, name);
}
/*
 * Add the comma separated @names to the set of filesystems enabled for
 * automatic handling. Once "all" was given, the set stays "all".
 *
 * Returns false when the resulting list can't be allocated.
 */
bool add_fsname_auto(const char *names)
{
	char *prev = fsauto_names;

	if (prev == fsauto_all)
		return true;

	if (css_contains(names, fsauto_all)) {
		fsauto_names = fsauto_all;
	} else if (prev == NULL) {
		fsauto_names = xstrdup(names);
	} else if (asprintf(&fsauto_names, "%s,%s", prev, names) < 0) {
		fsauto_names = NULL;
	}

	/* The previous list is released in every case */
	xfree(prev);
	return fsauto_names != NULL;
}
/*
 * Look @fst up in fstypes[] (entry 0 is skipped by the scan and is
 * what gets returned when nothing suitable is found).
 *
 * The first entry without a name ends the list of known filesystems.
 * If @force_auto is set or @fst is registered as auto, that entry is
 * claimed for @fst with the FSTYPE__AUTO code and returned.
 */
static struct fstype *__find_fstype_by_name(char *fst, bool force_auto)
{
	int idx = 1;

	/*
	 * This fn is required for two things.
	 * 1st -- to check supported filesystems (as just mounting
	 * anything is wrong, almost every fs has its own features)
	 * 2nd -- save some space in the image (since we scan all
	 * names anyway)
	 */
	while (idx < ARRAY_SIZE(fstypes)) {
		struct fstype *slot = &fstypes[idx];

		if (slot->name == NULL) {
			if (!force_auto && !fsname_is_auto(fst))
				break;

			slot->name = xstrdup(fst);
			slot->code = FSTYPE__AUTO;
			return slot;
		}

		if (strcmp(slot->name, fst) == 0)
			return slot;

		idx++;
	}

	/* ensure we have a room for auto */
	if (idx == ARRAY_SIZE(fstypes))
		pr_err_once("fstypes[] overflow!\n");

	return &fstypes[0];
}
/*
 * Public lookup of a filesystem type by name. An unknown name is only
 * registered as auto if it was enabled via add_fsname_auto().
 */
struct fstype *find_fstype_by_name(char *fst)
{
	struct fstype *found = __find_fstype_by_name(fst, false);

	return found;
}
/*
 * Map the fs type code @fst read from an image back to its fstypes[]
 * entry. FSTYPE__AUTO entries are resolved (and registered if needed)
 * by @fsname. Unknown and unsupported codes yield &fstypes[0].
 */
static struct fstype *decode_fstype(u32 fst, char *fsname)
{
	struct fstype *cur;
	struct fstype *end = fstypes + ARRAY_SIZE(fstypes);

	if (fst == FSTYPE__AUTO)
		return __find_fstype_by_name(fsname, true);

	if (fst != FSTYPE__UNSUPPORTED) {
		/* The first nameless entry terminates the known types */
		for (cur = fstypes + 1; cur < end && cur->name; cur++)
			if (cur->code == fst)
				return cur;
	}

	return &fstypes[0];
}
static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
{
MntEntry me = MNT_ENTRY__INIT;
pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
pm->root, pm->mountpoint);
me.fstype = pm->fstype->code;
if (me.fstype == FSTYPE__AUTO)
me.fsname = pm->fstype->name;
if (pm->parent && !pm->dumped && !pm->need_plugin &&
pm->fstype->dump && fsroot_mounted(pm)) {
struct mount_info *t;
if (pm->fstype->dump(pm))
return -1;
list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
t->dumped = true;
}
me.mnt_id = pm->mnt_id;
me.root_dev = pm->s_dev;
me.parent_mnt_id = pm->parent_mnt_id;
me.flags = pm->flags;
me.mountpoint = pm->mountpoint + 1;
me.source = pm->source;
me.options = pm->options;
me.shared_id = pm->shared_id;
me.has_shared_id = true;
me.master_id = pm->master_id;
me.has_master_id = true;
if (pm->need_plugin) {
me.has_with_plugin = true;
me.with_plugin = true;
}
if (pm->deleted) {
me.has_deleted = true;
me.deleted = true;
}
if (pm->internal_sharing) {
me.has_internal_sharing = true;
me.internal_sharing = true;
}
if (pm->external) {
/*
* For external mount points dump the mapping's
* value instead of root. See collect_mnt_from_image
* for reverse mapping details.
*/
me.root = pm->external->val;
me.has_ext_mount = true;
me.ext_mount = true;
} else
me.root = pm->root;
if (pb_write_one(img, &me, PB_MNT))
return -1;
return 0;
}
/* Release every element of the ->next linked list starting at @pms. */
static void free_mntinfo(struct mount_info *pms)
{
	struct mount_info *following;

	for (; pms != NULL; pms = following) {
		/* Remember the successor before the element goes away */
		following = pms->next;
		mnt_entry_free(pms);
	}
}
/*
 * Parse the mountinfo of the @ns namespace, build the mount tree from
 * it and attach both the tree and the flat list to @ns.
 *
 * Returns the list head, or NULL on error (the list is freed then).
 */
struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
{
	struct mount_info *list, *tree;

	list = parse_mountinfo(ns->pid, ns, for_dump);
	if (list == NULL) {
		pr_err("Can't parse %d's mountinfo\n", ns->pid);
		return NULL;
	}

	tree = mnt_build_tree(list);
	ns->mnt.mntinfo_tree = tree;
	if (tree == NULL) {
		free_mntinfo(list);
		return NULL;
	}

	ns->mnt.mntinfo_list = list;
	return list;
}
/*
 * Dump the mount points of the @ns namespace into its own image.
 * @pms is walked while the elements still belong to @ns.
 *
 * Returns 0 on success, -1 on error.
 */
static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
{
	struct cr_img *img;
	struct mount_info *cur;
	int exit_code = 0;

	pr_info("Dumping mountpoints\n");

	img = open_image(CR_FD_MNTS, O_DUMP, ns->id);
	if (img == NULL)
		return -1;

	for (cur = pms; cur != NULL && cur->nsid == ns; cur = cur->next) {
		if (dump_one_mountpoint(cur, img)) {
			exit_code = -1;
			break;
		}
	}

	close_image(img);
	return exit_code;
}
/*
 * Depth-first walk over the mount tree rooted at _r.
 *
 * _r - the root of the (sub)tree to walk
 * _el - the list_head field (next or prev) used to pick the first
 * child of an element and to step to its sibling
 * _fn_f - pre-order traversal function
 * _fn_r - post-order traversal function
 * _plist - a postpone list. An element is added to this list if _fn_f
 * returns a positive value, and all lower elements are not
 * enumerated.
 * _prgs - a counter, incremented for each element on which _fn_f
 * returned zero
 *
 * A negative value from _fn_f or a non-zero value from _fn_r makes the
 * function that uses this macro return -1.
 */
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do { \
struct mount_info *_mi = _r; \
\
while (1) { \
int ret; \
\
list_del_init(&_mi->postpone); \
\
ret = _fn_f(_mi); \
if (ret < 0) \
return -1; \
else if (ret > 0) { \
list_add_tail(&_mi->postpone, _plist); \
goto up; \
} \
\
_prgs++; \
\
if (!list_empty(&_mi->children)) { \
_mi = list_entry(_mi->children._el, \
struct mount_info, siblings); \
continue; \
} \
up: \
if (_fn_r(_mi)) \
return -1; \
if (_mi == _r) \
break; \
if (_mi->siblings._el == &_mi->parent->children) { \
_mi = _mi->parent; \
goto up; \
} \
_mi = list_entry(_mi->siblings._el, \
struct mount_info, siblings); \
} \
} while (0)
/*
 * "No callback" placeholder for MNT_TREE_WALK: MNT_WALK_NONE(x) expands
 * to "0 && (x)", which is always 0 and calls nothing.
 */
#define MNT_WALK_NONE 0 &&
/*
 * Call fn on every element of the tree rooted at start, in pre-order.
 *
 * A subtree whose top element gets a positive value from fn is
 * postponed and walked again in a later pass. Passes are repeated until
 * nothing is left postponed. If a pass makes no progress at all, the
 * postponed mounts are reported and -1 is returned.
 *
 * Returns 0 on success and -1 on error. Note that MNT_TREE_WALK itself
 * returns -1 from this function as soon as fn returns a negative value.
 */
static int mnt_tree_for_each(struct mount_info *start,
int (*fn)(struct mount_info *))
{
struct mount_info *tmp;
/* Subtrees to walk in the current pass */
LIST_HEAD(postpone);
/* Subtrees postponed by the current pass */
LIST_HEAD(postpone2);
int progress;
pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
list_add(&start->postpone, &postpone);
again:
progress = 0;
/* The walk unlinks each element from the list, hence the _safe variant */
list_for_each_entry_safe(start, tmp, &postpone, postpone)
MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);
if (!progress) {
struct mount_info *m;
pr_err("A few mount points can't be mounted\n");
list_for_each_entry(m, &postpone2, postpone) {
pr_err("%d:%d %s %s %s\n", m->mnt_id,
m->parent_mnt_id, m->root,
m->mountpoint, m->source);
}
return -1;
}
/* What was postponed becomes the work of the next pass */
list_splice_init(&postpone2, &postpone);
if (!list_empty(&postpone))
goto again;
return 0;
}
/*
 * Call @fn on every element of the tree rooted at @m in post-order,
 * stepping through siblings via the prev links. MNT_TREE_WALK returns
 * -1 from this function as soon as @fn returns non-zero.
 */
static int mnt_tree_for_each_reverse(struct mount_info *m,
		int (*fn)(struct mount_info *))
{
	/* Required by the macro, the value itself is not used here */
	int visited = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, visited);

	return 0;
}
/*
 * Pick the source to pass to mount() for @mi.
 *
 * The saved source is used as is for anonymous devices (major 0). For
 * auto filesystems it is used only if it names a block device with the
 * same major:minor as the mount. Otherwise NULL is returned.
 */
static char *resolve_source(struct mount_info *mi)
{
	struct stat st;

	/*
	 * Anonymous block device. Kernel creates them for
	 * diskless mounts.
	 */
	if (kdev_major(mi->s_dev) == 0)
		return mi->source;

	if (mi->fstype->code != FSTYPE__AUTO)
		goto no_dev;

	if (stat(mi->source, &st) != 0 || !S_ISBLK(st.st_mode))
		goto no_dev;

	if (major(st.st_rdev) == kdev_major(mi->s_dev) &&
	    minor(st.st_rdev) == kdev_minor(mi->s_dev))
		return mi->source;

no_dev:
	pr_err("No device for %s mount\n", mi->mountpoint);
	return NULL;
}
/*
 * Apply the propagation type to the mounted @mi.
 *
 * An unbindable mount that is neither shared nor slave is only made
 * unbindable. Otherwise the requested private, slave and shared types
 * are applied in this order.
 *
 * Returns 0 on success, -1 on error.
 */
static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
{
	const char *where = mi->mountpoint;

	pr_debug("%d:%s private %d shared %d slave %d\n",
			mi->mnt_id, mi->mountpoint, private, shared, slave);

	if (mi->flags & MS_UNBINDABLE) {
		if (!shared && !slave)
			return mount(NULL, where, NULL, MS_UNBINDABLE, NULL);

		pr_warn("%s has both unbindable and sharing, ignoring unbindable\n", mi->mountpoint);
	}

	if (private) {
		if (mount(NULL, where, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mi->mountpoint);
			return -1;
		}
	}

	if (slave) {
		if (mount(NULL, where, NULL, MS_SLAVE, NULL)) {
			pr_perror("Unable to make %s slave", mi->mountpoint);
			return -1;
		}
	}

	if (shared) {
		if (mount(NULL, where, NULL, MS_SHARED, NULL)) {
			pr_perror("Unable to make %s shared", mi->mountpoint);
			return -1;
		}
	}

	return 0;
}
/*
 * Umount the copies of @mi which were propagated into the already
 * mounted slaves of its parent, because we can't be sure that they
 * were inherited in real life.
 *
 * Returns 0 on success, -1 if some umount failed.
 */
static int umount_from_slaves(struct mount_info *mi)
{
	struct mount_info *slave;
	char target[PATH_MAX];

	list_for_each_entry(slave, &mi->parent->mnt_slave_list, mnt_slave) {
		if (slave->mounted) {
			/* The same name, but under the slave's mountpoint */
			snprintf(target, sizeof(target), "%s/%s",
					slave->mountpoint, basename(mi->mountpoint));
			pr_debug("\t\tUmount %s\n", target);

			if (umount(target) == -1) {
				pr_perror("Can't umount %s", target);
				return -1;
			}
		}
	}

	return 0;
}
/*
 * If something is mounted in one shared point, it will be spread in
 * all other points from this shared group.
 *
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 *
 * Here every not yet mounted member of @mi's shared group and every
 * not yet mounted slave of @mi is told to be bind-mounted from @mi,
 * so that it inherits the shared group or the master id.
 *
 * Always returns 0.
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *peer;

	list_for_each_entry(peer, &mi->mnt_share, mnt_share) {
		if (!peer->mounted) {
			pr_debug("\t\tBind %s\n", peer->mountpoint);
			peer->bind = mi;
		}
	}

	list_for_each_entry(peer, &mi->mnt_slave_list, mnt_slave) {
		if (!peer->mounted) {
			pr_debug("\t\tBind %s\n", peer->mountpoint);
			peer->bind = mi;
		}
	}

	return 0;
}
static int propagate_mount(struct mount_info *mi)
{
struct mount_info *t;
propagate_siblings(mi);
if (!mi->parent)
goto skip_parent;
umount_from_slaves(mi);
/* Propagate this mount to everyone from a parent group */
list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
struct mount_info *c;
list_for_each_entry(c, &t->children, siblings) {
if (mounts_equal(mi, c, false)) {
pr_debug("\t\tPropogate %s\n", c->mountpoint);
c->mounted = true;
propagate_siblings(c);
umount_from_slaves(c);
}
}
}
skip_parent:
/*
* FIXME Currently non-root mounts can be restored
* only if a proper root mount exists
*/
if (fsroot_mounted(mi) || mi->parent == NULL) {
list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
if (t->mounted)
continue;
if (t->bind)
continue;
if (t->master_id)
continue;
t->bind = mi;
}
}
return 0;
}
/*
 * Mount @mi as a new filesystem instance (not a bind-mount), restore
 * its propagation type and call the fs-specific ->restore callback.
 *
 * A read-only fs with a ->restore callback is first mounted without
 * MS_RDONLY and remounted read-only once the callback has finished.
 *
 * Returns 0 on success, non-zero on error.
 */
static int do_new_mount(struct mount_info *mi)
{
	struct fstype *fst = mi->fstype;
	/* Propagation bits are applied separately, never passed to mount() */
	unsigned long skip_flags = MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE;
	bool remount_ro = (fst->restore && mi->flags & MS_RDONLY);
	char *source;

	source = resolve_source(mi);
	if (source == NULL)
		return -1;

	if (remount_ro)
		skip_flags |= MS_RDONLY;

	if (mount(source, mi->mountpoint, fst->name,
			mi->flags & ~skip_flags, mi->options) < 0) {
		pr_perror("Can't mount at %s", mi->mountpoint);
		return -1;
	}

	if (restore_shared_options(mi, !mi->shared_id && !mi->master_id,
					mi->shared_id,
					mi->master_id))
		return -1;

	mi->mounted = true;

	if (fst->restore && fst->restore(mi))
		return -1;

	if (!remount_ro)
		return 0;

	return mount(NULL, mi->mountpoint, fst->name,
			MS_REMOUNT | MS_RDONLY, NULL);
}
/*
 * Ask the plugins to restore the external mount @mi.
 * Returns whatever run_plugins() returned, i.e. 0 on success.
 */
static int restore_ext_mount(struct mount_info *mi)
{
	int rc;

	pr_debug("Restoring external bind mount %s\n", mi->mountpoint);

	rc = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL);
	if (rc != 0)
		pr_err("Can't restore ext mount (%d)\n", rc);

	return rc;
}
/*
 * Restore @mi as a bind-mount (or via a plugin when mi->need_plugin
 * is set) and then restore its propagation type.
 *
 * The bind source is either the external mapping's path or the
 * matching path inside the already mounted mi->bind. When the source
 * was deleted at dump time, a placeholder directory or file is created
 * for the duration of the mount() call and removed right after it.
 *
 * Returns 0 on success, -1 on error.
 */
static int do_bind_mount(struct mount_info *mi)
{
	bool shared = 0;
	bool force_private_remount = false;
	struct stat st;

	if (!mi->need_plugin) {
		char *root, *cut_root, rpath[PATH_MAX];

		if (mi->external) {
			/*
			 * We have / pointing to criu's ns root still,
			 * so just use the mapping's path. The mountpoint
			 * is tuned in collect_mnt_from_image to refer
			 * to proper location in the namespace we restore.
			 */
			root = mi->root;
			force_private_remount = mi->internal_sharing;
		} else {
			shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;
			cut_root = cut_root_for_bind(mi->root, mi->bind->root);
			snprintf(rpath, sizeof(rpath), "%s/%s",
					mi->bind->mountpoint, cut_root);
			root = rpath;
		}

		pr_info("\tBind %s to %s\n", root, mi->mountpoint);

		if (unlikely(mi->deleted)) {
			/* The placeholder gets the type and mode of the mountpoint */
			if (stat(mi->mountpoint, &st)) {
				pr_perror("Can't fetch stat on %s", mi->mountpoint);
				return -1;
			}

			if (S_ISDIR(st.st_mode)) {
				if (mkdir(root, (st.st_mode & ~S_IFMT))) {
					pr_perror("Can't re-create deleted directory %s", root);
					return -1;
				}
			} else if (S_ISREG(st.st_mode)) {
				int fd = open(root, O_WRONLY | O_CREAT | O_EXCL,
					      st.st_mode & ~S_IFMT);
				if (fd < 0) {
					pr_perror("Can't re-create deleted file %s", root);
					return -1;
				}
				close(fd);
			} else {
				pr_err("Unsupported st_mode 0%o deleted root %s\n",
				       (int)st.st_mode, root);
				return -1;
			}
		}

		if (mount(root, mi->mountpoint, NULL, MS_BIND, NULL) < 0) {
			pr_perror("Can't mount at %s", mi->mountpoint);
			return -1;
		}

		if (unlikely(mi->deleted)) {
			/* Drop the placeholder again, the source stays deleted */
			if (S_ISDIR(st.st_mode)) {
				if (rmdir(root)) {
					pr_perror("Can't remove deleted directory %s", root);
					return -1;
				}
			} else if (S_ISREG(st.st_mode)) {
				if (unlink(root)) {
					pr_perror("Can't unlink deleted file %s", root);
					return -1;
				}
			}
		}
	} else {
		if (restore_ext_mount(mi))
			return -1;
	}

	/*
	 * shared - the mount is in the same shared group with mi->bind
	 * mi->shared_id && !shared - create a new shared group
	 */
	if (restore_shared_options(mi, force_private_remount || (!shared && !mi->master_id),
					mi->shared_id && !shared,
					mi->master_id))
		return -1;

	mi->mounted = true;

	return 0;
}
/*
 * Decide whether @mi can be mounted right now or has to be postponed
 * until other mounts it depends on are in place.
 */
static bool can_mount_now(struct mount_info *mi)
{
	struct mount_info *parent = mi->parent;
	struct mount_info *it;

	/* The root mount */
	if (parent == NULL)
		return true;

	if (mi->external)
		return true;

	/*
	 * We're the slave peer:
	 *   - Make sure the master peer is already mounted
	 *   - Make sure all children is mounted as well to
	 *     eliminame mounts duplications
	 */
	if (mi->master_id) {
		if (mi->bind == NULL)
			return false;

		list_for_each_entry(it, &mi->bind->children, siblings)
			if (!it->mounted)
				return false;
	}

	/* Not the fs root and nothing to get the content from yet */
	if (!fsroot_mounted(mi) && mi->bind == NULL &&
	    !mi->need_plugin && !mi->external)
		return false;

	if (!parent->shared_id)
		return true;

	if (parent->shared_id == mi->shared_id) {
		int rlen = strlen(mi->root);

		/* Only the peers with a shorter root have to be mounted first */
		list_for_each_entry(it, &parent->mnt_share, mnt_share)
			if (strlen(it->root) < rlen && !it->mounted)
				return false;
	} else {
		list_for_each_entry(it, &parent->mnt_share, mnt_share)
			if (!it->mounted)
				return false;
	}

	return true;
}
/*
 * The root is already mounted, so only its propagation type has to be
 * restored. Returns 0 on success, -1 on error.
 */
static int do_mount_root(struct mount_info *mi)
{
	bool make_private = !mi->shared_id && !mi->master_id;

	return restore_shared_options(mi, make_private,
			mi->shared_id, mi->master_id) ? -1 : 0;
}
/*
 * Mount one mount point, the callback for mnt_tree_for_each().
 *
 * Returns 0 if @mi is mounted, a positive value if it has to be
 * postponed and a negative value on error.
 */
static int do_mount_one(struct mount_info *mi)
{
	struct statfs fs_stat;
	int rc = 0;

	if (mi->mounted)
		return 0;

	if (!can_mount_now(mi)) {
		pr_debug("Postpone slave %s\n", mi->mountpoint);
		return 1;
	}

	pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin);

	if (mi->parent == NULL)
		/* do_mount_root() is called from populate_mnt_ns() */
		mi->mounted = true;
	else if (mi->bind || mi->need_plugin || mi->external)
		rc = do_bind_mount(mi);
	else
		rc = do_new_mount(mi);

	if (rc == 0 && propagate_mount(mi))
		return -1;

	if (mi->fstype->code != FSTYPE__UNSUPPORTED)
		return rc;

	/* An unsupported fs may turn out to be btrfs, check the magic */
	if (statfs(mi->mountpoint, &fs_stat)) {
		pr_perror("Unable to statfs %s", mi->mountpoint);
		return -1;
	}

	if (fs_stat.f_type == BTRFS_SUPER_MAGIC)
		mi->fstype = find_fstype_by_name("btrfs");

	return rc;
}
/*
 * Umount @mi, the callback for mnt_tree_for_each_reverse(). The root
 * mount is left alone. The parent is recursively made private before
 * the umount.
 *
 * Returns 0 on success, -1 on error.
 */
static int do_umount_one(struct mount_info *mi)
{
	struct mount_info *parent = mi->parent;

	if (parent == NULL)
		return 0;

	if (mount("none", parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
		pr_perror("Can't mark %s as private", mi->parent->mountpoint);
		return -1;
	}

	if (umount(mi->mountpoint) != 0) {
		pr_perror("Can't umount at %s", mi->mountpoint);
		return -1;
	}

	pr_info("Umounted at %s\n", mi->mountpoint);
	return 0;
}
/*
 * Umount everything from the @mntinfo_tree tree except its root,
 * children before their parents.
 */
static int clean_mnt_ns(struct mount_info *mntinfo_tree)
{
	int rc;

	pr_info("Cleaning mount namespace\n");

	/*
	 * Mountinfos were collected at prepare stage
	 */
	rc = mnt_tree_for_each_reverse(mntinfo_tree, do_umount_one);

	return rc;
}
/*
 * Make root (or the current working directory if root is NULL) the new
 * root of the mount namespace with pivot_root().
 *
 * The old root is put on a temporary directory created inside the new
 * root, then detached, and the directory is removed.
 *
 * Returns 0 on success and -1 on error. Some failures return directly
 * and leave the temporary directory (and its mounts) behind.
 */
static int cr_pivot_root(char *root)
{
/* Relative template, so the directory is created in the cwd */
char put_root[] = "crtools-put-root.XXXXXX";
int exit_code = -1;
pr_info("Move the root to %s\n", root ? : ".");
if (root) {
if (chdir(root)) {
pr_perror("chdir(%s) failed", root);
return -1;
}
}
if (mkdtemp(put_root) == NULL) {
pr_perror("Can't create a temporary directory");
return -1;
}
/* NOTE(review): this is a bind-mount onto itself, though the message says tmpfs */
if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
pr_perror("Unable to mount tmpfs in %s", put_root);
goto err_root;
}
if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
pr_perror("Can't remount %s with MS_PRIVATE", put_root);
goto err_tmpfs;
}
if (pivot_root(".", put_root)) {
pr_perror("pivot_root(., %s) failed", put_root);
goto err_tmpfs;
}
/* Recursively make everything under put_root private before detaching it */
if (mount("none", put_root, "none", MS_REC|MS_PRIVATE, NULL)) {
pr_perror("Can't remount root with MS_PRIVATE");
return -1;
}
exit_code = 0;
/*
 * NOTE(review): the success path falls through into err_tmpfs, so
 * put_root is detached twice -- presumably the old root put there by
 * pivot_root() first and the bind-mount made above second. Confirm.
 */
if (umount2(put_root, MNT_DETACH)) {
pr_perror("Can't umount %s", put_root);
return -1;
}
err_tmpfs:
if (umount2(put_root, MNT_DETACH)) {
pr_perror("Can't umount %s", put_root);
return -1;
}
err_root:
if (rmdir(put_root)) {
pr_perror("Can't remove the directory %s", put_root);
return -1;
}
return exit_code;
}
/*
 * Allocate a zeroed mount_info with all its list heads initialized.
 * Returns NULL if the allocation failed.
 */
struct mount_info *mnt_entry_alloc()
{
	struct mount_info *mi = xzalloc(sizeof(struct mount_info));

	if (mi == NULL)
		return NULL;

	INIT_LIST_HEAD(&mi->children);
	INIT_LIST_HEAD(&mi->siblings);
	INIT_LIST_HEAD(&mi->mnt_slave_list);
	INIT_LIST_HEAD(&mi->mnt_share);
	INIT_LIST_HEAD(&mi->mnt_bind);
	INIT_LIST_HEAD(&mi->postpone);

	return mi;
}
/*
 * Free @mi together with its root, mountpoint, source and options
 * strings. A NULL @mi is accepted.
 */
void mnt_entry_free(struct mount_info *mi)
{
	if (mi == NULL)
		return;

	xfree(mi->root);
	xfree(mi->mountpoint);
	xfree(mi->source);
	xfree(mi->options);
	xfree(mi);
}
/*
 * Print the path where the root of the @ns namespace is re-constructed,
 * "<mnt_roots>/<ns id>", into @buf of @bs bytes.
 *
 * Returns what snprintf() returned.
 */
static inline int print_ns_root(struct ns_id *ns, char *buf, int bs)
{
	int printed = snprintf(buf, bs, "%s/%d", mnt_roots, ns->id);

	return printed;
}
/*
 * Create the roots yard: a temporary directory under opts.root (or
 * under "/" if it is not set) where the roots of nested mount
 * namespaces are re-constructed. Its name is kept in mnt_roots.
 * Does nothing if the yard exists already.
 *
 * Note: changes the current working directory.
 *
 * Returns 0 on success, -1 on error.
 */
static int create_mnt_roots(void)
{
	const char *base = opts.root ? : "/";

	if (mnt_roots)
		return 0;

	if (chdir(base)) {
		/* Report the directory we actually tried, opts.root may be NULL */
		pr_perror("Unable to change working directory on %s", base);
		return -1;
	}

	mnt_roots = strdup(".criu.mntns.XXXXXX");
	if (mnt_roots == NULL) {
		pr_perror("Can't allocate memory");
		return -1;
	}

	if (mkdtemp(mnt_roots) == NULL) {
		/* Log first, free() is allowed to change errno */
		pr_perror("Unable to create a temporary directory");
		free(mnt_roots);
		mnt_roots = NULL;
		return -1;
	}

	return 0;
}
/*
 * Restore without a new mount namespace: register criu's own mount
 * namespace, collect its mountinfo into the global mntinfo list and
 * mark the namespace as created.
 *
 * Returns 0 on success, -1 on error.
 */
static int rst_collect_local_mntns(void)
{
	struct ns_id *self_ns = rst_new_ns_id(0, getpid(), &mnt_ns_desc);

	if (self_ns == NULL)
		return -1;

	mntinfo = collect_mntinfo(self_ns, false);
	if (mntinfo == NULL)
		return -1;

	futex_set(&self_ns->ns_created, 1);
	return 0;
}
static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid)
{
MntEntry *me = NULL;
int ret, root_len = 1;
struct cr_img *img;
char root[PATH_MAX] = ".";
img = open_image(CR_FD_MNTS, O_RSTR, nsid->id);
if (!img)
return -1;
if (nsid->id != root_item->ids->mnt_ns_id)
root_len = print_ns_root(nsid, root, sizeof(root));
pr_debug("Reading mountpoint images\n");
while (1) {
struct mount_info *pm;
int len;
ret = pb_read_one_eof(img, &me, PB_MNT);
if (ret <= 0)
break;
pm = mnt_entry_alloc();
if (!pm)
goto err;
pm->nsid = nsid;
pm->next = *pms;
*pms = pm;
pm->mnt_id = me->mnt_id;
pm->parent_mnt_id = me->parent_mnt_id;
pm->s_dev = me->root_dev;
pm->flags = me->flags;
pm->shared_id = me->shared_id;
pm->master_id = me->master_id;
pm->need_plugin = me->with_plugin;
pm->deleted = me->deleted;
pm->is_ns_root = is_root(me->mountpoint);
pr_debug("\t\tGetting source for %d\n", pm->mnt_id);
pm->source = xstrdup(me->source);
if (!pm->source)
goto err;
if (me->has_internal_sharing)
pm->internal_sharing = me->internal_sharing;
/* FIXME: abort unsupported early */
pm->fstype = decode_fstype(me->fstype, me->fsname);
if (me->ext_mount) {
struct ext_mount *em;
/*
* External mount point -- get the reverse mapping
* from the command line and put into root's place
*/
em = ext_mount_lookup(me->root);
if (!em) {
if (!opts.autodetect_ext_mounts) {
pr_err("No mapping for %s mountpoint\n", me->mountpoint);
goto err;
}
/*
* Make up an external mount entry for this
* mount point, since we couldn't find a user
* supplied one.
*/
em = xmalloc(sizeof(struct ext_mount));
if (!em)
goto err;
em->val = pm->source;
/*
* Put a : in here since those are invalid on
* the cli, so we know it's autogenerated in
* debugging.
*/
em->key = AUTODETECTED_MOUNT;
}
pm->external = em;
pm->root = em->val;
pr_debug("Mountpoint %s will have root from %s\n",
me->mountpoint, pm->root);
} else {
pr_debug("\t\tGetting root for %d\n", pm->mnt_id);
pm->root = xstrdup(me->root);
if (!pm->root)
goto err;
}
len = strlen(me->mountpoint) + root_len + 1;
pm->mountpoint = xmalloc(len);
if (!pm->mountpoint)
goto err;
pm->ns_mountpoint = pm->mountpoint + root_len;
/*
* For bind-mounts we would also fix the root here
* too, but bind-mounts restore merges mountpoint
* and root paths together, so there's no need in
* that.
*/
strcpy(pm->mountpoint, root);
strcpy(pm->mountpoint + root_len, me->mountpoint);
pr_debug("\t\tGetting mpt for %d %s\n", pm->mnt_id, pm->mountpoint);
pr_debug("\t\tGetting opts for %d\n", pm->mnt_id);
pm->options = xstrdup(me->options);
if (!pm->options)
goto err;
pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint);
}
if (me)
mnt_entry__free_unpacked(me, NULL);
close_image(img);
return 0;
err:
close_image(img);
return -1;
}
/*
 * Read the mount points of all mount namespaces from the images into
 * one list, which is also stored in the global mntinfo.
 *
 * Returns the list, or NULL on error.
 */
static struct mount_info *read_mnt_ns_img(void)
{
	struct mount_info *collected = NULL;
	struct ns_id *nsid;

	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
		bool nested;

		if (nsid->nd != &mnt_ns_desc)
			continue;

		/*
		 * If we have more than one (root) namespace,
		 * then we'll need the roots yard.
		 */
		nested = nsid->id != root_item->ids->mnt_ns_id;
		if (nested && create_mnt_roots())
			return NULL;

		if (collect_mnt_from_image(&collected, nsid))
			return NULL;
	}

	/* Here it doesn't matter where the mount list is saved */
	mntinfo = collected;

	return collected;
}
/*
 * Get the path to the root of the namespace the @mnt_id mount lives
 * in. The result points to a static buffer which is overwritten by a
 * later call. Returns NULL if the mount is unknown.
 */
char *rst_get_mnt_root(int mnt_id)
{
	static char path[PATH_MAX] = "/";
	struct mount_info *mi;

	/* Mount namespaces are not restored or the mount is not known */
	if (!(root_ns_mask & CLONE_NEWNS) || mnt_id == -1)
		return path;

	mi = lookup_mnt_id(mnt_id);
	if (mi == NULL)
		return NULL;

	if (mi->nsid->pid != getpid())
		print_ns_root(mi->nsid, path, sizeof(path));

	return path;
}
/*
 * Move the calling task into the @nsid mount namespace.
 *
 * A task which is not the one recorded in nsid->pid waits until the
 * namespace is created and enters it with setns(). The recorded task
 * unshares a new namespace, pivots its root into the namespace's
 * directory in the roots yard and wakes the waiters up.
 *
 * Returns 0 on success, -1 on error.
 */
static int do_restore_task_mnt_ns(struct ns_id *nsid)
{
	char path[PATH_MAX];

	if (nsid->pid != getpid()) {
		int fd, ret;

		futex_wait_while_eq(&nsid->ns_created, 0);
		fd = open_proc(nsid->pid, "ns/mnt");
		if (fd < 0)
			return -1;

		ret = setns(fd, CLONE_NEWNS);
		if (ret)
			/* Log before close(), which may change errno */
			pr_perror("Unable to change mount namespace");

		/* The descriptor is of no use any more, whatever setns() said */
		close(fd);

		return ret ? -1 : 0;
	}

	if (unshare(CLONE_NEWNS)) {
		pr_perror("Unable to unshare mount namespace");
		return -1;
	}

	/* print_ns_root() gives a relative path, make it absolute */
	path[0] = '/';
	print_ns_root(nsid, path + 1, sizeof(path) - 1);
	if (cr_pivot_root(path))
		return -1;

	futex_set_and_wake(&nsid->ns_created, 1);

	return 0;
}
/*
 * Put the @current task into its mount namespace, if it has one
 * recorded and it differs from the root task's one.
 *
 * Returns 0 on success, -1 on error.
 */
int restore_task_mnt_ns(struct pstree_item *current)
{
	struct ns_id *nsid;
	unsigned int id;

	if (!current->ids || !current->ids->has_mnt_ns_id)
		return 0;

	id = current->ids->mnt_ns_id;

	/*
	 * Regardless of the namespace a task wants to
	 * live in, by that point they all will live in
	 * root's one (see prepare_pstree_kobj_ids() +
	 * get_clone_mask()). So if the current task's
	 * target namespace is the root's one -- it's
	 * already there, otherwise it will have to do
	 * setns().
	 */
	if (root_item->ids->mnt_ns_id == id)
		return 0;

	nsid = lookup_ns_by_id(id, &mnt_ns_desc);
	if (nsid == NULL) {
		pr_err("Can't find mount namespace %d\n", id);
		return -1;
	}

	return do_restore_task_mnt_ns(nsid) ? -1 : 0;
}
/*
* All nested mount namespaces are restore as sub-trees of the root namespace.
*/
static int prepare_roots_yard(void)
{
char path[PATH_MAX];
struct ns_id *nsid;
if (mnt_roots == NULL)
return 0;
if (mount("none", mnt_roots, "tmpfs", 0, NULL)) {
pr_perror("Unable to mount tmpfs in %s", mnt_roots);
return -1;
}
if (mount("none", mnt_roots, NULL, MS_PRIVATE, NULL))
return -1;
for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
if (nsid->nd != &mnt_ns_desc)
continue;
print_ns_root(nsid, path, sizeof(path));
if (mkdir(path, 0600)) {
pr_perror("Unable to create %s", path);
return -1;
}
}
return 0;
}
/*
 * Build the mount tree from the @mis list, validate it and mount
 * everything from it.
 *
 * Returns 0 on success, -1 on error.
 */
static int populate_mnt_ns(struct mount_info *mis)
{
	struct mount_info *tree;
	struct ns_id *nsid;

	tree = mnt_build_tree(mis);
	if (tree == NULL)
		return -1;

	if (collect_shared(mis, false))
		return -1;

	/*
	 * Make trees of all namespaces look the
	 * same, so that manual paths resolution
	 * works on them.
	 */
	for (nsid = ns_ids; nsid; nsid = nsid->next)
		if (nsid->nd == &mnt_ns_desc)
			nsid->mnt.mntinfo_tree = tree;

	if (validate_mounts(mis, false))
		return -1;

	/*
	 * Set properties for the root before mounting a root yard,
	 * otherwise the root yard can be propagated into the host
	 * mntns and remain there.
	 */
	if (do_mount_root(tree) || prepare_roots_yard())
		return -1;

	return mnt_tree_for_each(tree, do_mount_one);
}
/*
 * Tear the roots yard down: make it private, detach it and remove its
 * directory. Does nothing if the yard was not created.
 *
 * Returns 0 on success and 1 if any of the steps failed.
 */
int fini_mnt_ns(void)
{
	int failed = 0;

	if (!mnt_roots)
		return 0;

	if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
		pr_perror("Can't remount root with MS_PRIVATE");
		failed = 1;
	}

	/*
	 * Don't exit after a first error, because this function
	 * can be used to rollback in an error case.
	 * Don't worry about MNT_DETACH, because files are restored after
	 * this and nobody will be restored from a wrong mount namespace.
	 */
	if (umount2(mnt_roots, MNT_DETACH) != 0) {
		pr_perror("Can't unmount %s", mnt_roots);
		failed = 1;
	}

	if (rmdir(mnt_roots) != 0) {
		pr_perror("Can't remove the directory %s", mnt_roots);
		failed = 1;
	}

	return failed;
}
/*
 * Restore the mount namespace(s) of the tasks being restored.
 *
 * If no mount namespace has to be created (CLONE_NEWNS is not set in
 * root_ns_mask), only the mounts of criu's own namespace are collected.
 * Otherwise the mounts are read from the images, the current namespace
 * is prepared for them (see below) and the mount tree is re-created.
 *
 * Returns 0 on success and a non-zero value on error.
 *
 * NOTE(review): the list in "old" is freed only right before
 * populate_mnt_ns(), the earlier error paths return without freeing it.
 */
int prepare_mnt_ns(void)
{
int ret = -1;
struct mount_info *mis, *old;
struct ns_id ns = { .pid = PROC_SELF, .nd = &mnt_ns_desc };
if (!(root_ns_mask & CLONE_NEWNS))
return rst_collect_local_mntns();
pr_info("Restoring mount namespace\n");
/* The mounts we have now, before anything is restored */
old = collect_mntinfo(&ns, false);
if (old == NULL)
return -1;
close_proc();
mis = read_mnt_ns_img();
if (!mis)
goto out;
/*
 * The new mount namespace is filled with the mountpoint
 * clones from the original one. We have to umount them
 * prior to recreating new ones.
 */
if (!opts.root) {
if (chdir("/")) {
pr_perror("chdir(\"/\") failed");
return -1;
}
if (clean_mnt_ns(ns.mnt.mntinfo_tree))
return -1;
} else {
/* A new root is given: nothing is umounted, we pivot into it at the end */
struct mount_info *mi;
/* moving a mount residing under a shared mount is invalid. */
mi = mount_resolve_path(ns.mnt.mntinfo_tree, opts.root);
if (mi == NULL) {
pr_err("Unable to find mount point for %s\n", opts.root);
return -1;
}
if (mi->parent == NULL) {
pr_err("New root and old root are the same\n");
return -1;
}
/* Our root is mounted over the parent (in the same directory) */
if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) {
pr_err("The parent of the new root is unreachable\n");
return -1;
}
/* "+ 1" skips the first character of the collected mountpoint */
if (mount("none", mi->parent->mountpoint + 1, "none", MS_SLAVE, NULL)) {
pr_perror("Can't remount the parent of the new root with MS_SLAVE");
return -1;
}
/* Unprivileged users can't reveal what is under a mount */
if (root_ns_mask & CLONE_NEWUSER) {
if (mount(opts.root, opts.root, NULL, MS_BIND | MS_REC, NULL)) {
pr_perror("Can't remount bind-mount %s into itself", opts.root);
return -1;
}
}
if (chdir(opts.root)) {
pr_perror("chdir(%s) failed", opts.root ? : "/");
return -1;
}
}
free_mntinfo(old);
ret = populate_mnt_ns(mis);
if (ret)
goto out;
/* The cwd is opts.root here, so make it the root of the namespace */
if (opts.root)
ret = cr_pivot_root(NULL);
out:
return ret;
}
/*
 * Get a descriptor for the root of the mount namespace of the task
 * @pid. The descriptor is installed as the ROOT_FD_OFF service fd and
 * is reused as long as the same @pid is asked for.
 *
 * Returns the service fd on success and a negative value on error.
 */
int __mntns_get_root_fd(pid_t pid)
{
	static int mntns_root_pid = -1;

	char link[PATH_MAX + 1];
	int root_fd, proc_fd, rc;

	/* The required root is already opened */
	if (mntns_root_pid == pid)
		return get_service_fd(ROOT_FD_OFF);

	close_service_fd(ROOT_FD_OFF);

	if (root_ns_mask & CLONE_NEWNS) {
		/*
		 * If /proc/pid/root links on '/', it signs that a root of the task
		 * and a root of mntns is the same.
		 */
		proc_fd = open_pid_proc(pid);
		rc = readlinkat(proc_fd, "root", link, sizeof(link) - 1);
		if (rc < 0) {
			close_pid_proc();
			return rc;
		}

		link[rc] = '\0';

		if (rc != 1 || link[0] != '/') {
			pr_err("The root task has another root than mntns: %s\n", link);
			close_pid_proc();
			return -1;
		}

		root_fd = openat(proc_fd, "root", O_RDONLY | O_DIRECTORY, 0);
		close_pid_proc();
		if (root_fd < 0) {
			pr_perror("Can't open the task root");
			return -1;
		}
	} else {
		/*
		 * If criu and tasks we dump live in the same mount
		 * namespace, we can just open the root directory.
		 * All paths resolution would occur relative to criu's
		 * root. Even if it is not namespace's root, provided
		 * file paths are resolved, we'd get consistent dump.
		 */
		root_fd = open("/", O_RDONLY | O_DIRECTORY);
		if (root_fd < 0) {
			pr_perror("Can't open root");
			return -1;
		}
	}

	rc = install_service_fd(ROOT_FD_OFF, root_fd);
	if (rc >= 0)
		mntns_root_pid = pid;
	/* The service fd is what gets used from now on, drop the original */
	close(root_fd);

	return rc;
}
/* Get the root descriptor of the @mntns namespace by the pid kept in it. */
int mntns_get_root_fd(struct ns_id *mntns)
{
	pid_t owner = mntns->pid;

	return __mntns_get_root_fd(owner);
}
/*
 * Find the mount namespace the @mnt_id mount belongs to.
 * Returns NULL if there is no such mount.
 */
struct ns_id *lookup_nsid_by_mnt_id(int mnt_id)
{
	struct mount_info *mi;

	/*
	 * Kernel before 3.15 doesn't show mnt_id for file descriptors.
	 * mnt_id isn't saved for files, if mntns isn't dumped.
	 * In both these cases we have only one root, so here
	 * it doesn't matter which mount will be returned.
	 */
	mi = (mnt_id == -1) ? mntinfo : lookup_mnt_id(mnt_id);
	if (mi == NULL)
		return NULL;

	return mi->nsid;
}
/*
 * Get the root descriptor of the namespace the @mnt_id mount lives in.
 * It is a BUG if the namespace can't be found.
 */
int mntns_get_root_by_mnt_id(int mnt_id)
{
	struct ns_id *owner_ns = lookup_nsid_by_mnt_id(mnt_id);

	BUG_ON(owner_ns == NULL);

	return mntns_get_root_fd(owner_ns);
}
/* The argument passed to collect_mntns() through walk_namespaces(). */
struct collect_mntns_arg {
/* out: set by collect_mntns() on dump for a namespace which is not criu's own */
bool need_to_validate;
/* in: handed over to collect_mntinfo() */
bool for_dump;
};
/*
 * The walk_namespaces() callback: collect the mounts of the @ns
 * namespace and append them to the global mntinfo list.
 *
 * Returns 0 on success, -1 on error.
 */
static int collect_mntns(struct ns_id *ns, void *__arg)
{
	struct collect_mntns_arg *opt = __arg;
	struct mount_info *collected;

	collected = collect_mntinfo(ns, opt->for_dump);
	if (collected == NULL)
		return -1;

	/* Mounts of a foreign namespace have to be validated on dump */
	if (opt->for_dump && ns->pid != getpid())
		opt->need_to_validate = true;

	mntinfo_add_list(collected);

	return 0;
}
int collect_mnt_namespaces(bool for_dump)
{
struct collect_mntns_arg arg;
int ret;
arg.for_dump = for_dump;
arg.need_to_validate = false;
ret = walk_namespaces(&mnt_ns_desc, collect_mntns, &arg);
if (ret)
goto err;
ret = resolve_external_mounts(mntinfo);
if (ret)
goto err;
if (arg.need_to_validate) {
ret = -1;
if (collect_shared(mntinfo, true))
goto err;
if (validate_mounts(mntinfo, true))
goto err;
}
ret = 0;
err:
return ret;
}
/*
 * Dump the mounts of every mount namespace except criu's own one.
 * Nothing is dumped if the tasks live in criu's mount namespace.
 *
 * Returns 0 on success, -1 on error.
 */
int dump_mnt_namespaces(void)
{
	struct ns_id *nsid;
	int seen = 0;

	if (!(root_ns_mask & CLONE_NEWNS))
		return 0;

	for (nsid = ns_ids; nsid; nsid = nsid->next) {
		if (nsid->nd != &mnt_ns_desc || nsid->pid == getpid())
			continue;

		/* The check is done once, when the second namespace shows up */
		seen++;
		if (seen == 2 && check_mnt_id()) {
			pr_err("Nested mount namespaces are not supported "
				"without mnt_id in fdinfo\n");
			return -1;
		}

		if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list))
			return -1;
	}

	return 0;
}
/* The descriptor of mount namespaces: the CLONE_NEWNS flag and the "mnt" name. */
struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");