From b35c842d0f9778bcb13991071b97913d27fce6d2 Mon Sep 17 00:00:00 2001 From: Pavel Tikhomirov Date: Mon, 25 May 2020 18:12:49 +0300 Subject: [PATCH] mount: add new mounts-v2 engine Design of mounts-v2: As a preparation step we classify mounts in groups by (shared_id, master_id) in new resolve_shared_mounts_v2 (just after reading images). New function prepare_mnt_ns_v2 is our main entry point when switching from old mount engine to new one actually happens. First we pre-create each mount namespace nearly empty, only with root yard in place (pre_create_mount_namespaces). We walk the mount tree and mount each mount similar to old mount engine but not in mount tree but as a sub-directory of root yard (plain mountpoint) in service (criu) mount namespace. Also we bind this mount from service mntns to real mntns just after creation. (do_mount_in_right_mntns) Note: this way we initially have the final mount which would be visible to restored container user with right mnt_id for the sake of e.g. creating unix sockets on it (for unix socket bindmounts), and we also have a copy of the mount in service mntns so that old code which accesses files on mounts through service mntns can still access them. New can_mount_now_v2 is now free from heuristics we had for restoring shared groups, we will restore them later via MOVE_MOUNT_SET_GROUP, for now everything is private. Now when all plain mounts are created in real mount namespaces, we can move them to the tree for each namespace. Also we open fds on the mountpoint: one mp_fd_id before moving and another mnt_fd_id after, so that we can access each file later from final mntns via those fds. (assemble_mount_namespaces) New restore_mount_sharing_options walks each root sharing group and their descendants with a DFS tree walk. It creates sharing for the first mount in the sharing group and then sets the same sharing on all other mounts in this group. 
Sharing creation for the first mount is two-step: a) If mount has master_id we either copy shared_id from parent sharing group or from external source and then make mount slave thus converting it to right master_id. b) Next if mount has shared_id we just make us shared, creating right shared_id. Cherry-picked from Virtuozzo criu: https://src.openvz.org/projects/OVZ/repos/criu/commits/596651d02 Changes: - Split all "exporting" to separate preparatory patches - Rework cr_time - Switch to MOVE_MOUNT_SET_GROUP - Use resolve_mountpoint for external mounts (for MOVE_MOUNT_SET_GROUP) - Mounting plain mounts both in service and in restored-final mntns - Call MOVE_MOUNT_SET_GROUP from usernsd - Rework can_mount_now_v2 to handle bind of both root and external. - Use sys_move_mount for mount assembling. Signed-off-by: Pavel Tikhomirov --- criu/include/mount-v2.h | 35 ++ criu/include/mount.h | 10 + criu/mount-v2.c | 1246 +++++++++++++++++++++++++++++++++++++++ criu/mount.c | 16 + 4 files changed, 1307 insertions(+) diff --git a/criu/include/mount-v2.h b/criu/include/mount-v2.h index 61f62966d..2f2812221 100644 --- a/criu/include/mount-v2.h +++ b/criu/include/mount-v2.h @@ -4,6 +4,8 @@ #include "linux/mount.h" #include "linux/openat2.h" +#include "common/list.h" + #include #ifndef MOVE_MOUNT_SET_GROUP @@ -57,4 +59,37 @@ static inline long sys_openat2(int dirfd, const char *pathname, struct open_how extern int check_mount_v2(void); +struct sharing_group { + /* This pair identifies the group */ + int shared_id; + int master_id; + + /* List of shared groups */ + struct list_head list; + + /* List of mounts in this group */ + struct list_head mnt_list; + + /* + * List of dependant shared groups: + * - all siblings have equal master_id + * - the parent has shared_id equal to children's master_id + * + * This is a bit tricky: parent pointer indicates if there is one + * parent sharing_group in list or only siblings. 
+ * So for traversal if parent pointer is set we can do: + * list_for_each_entry(t, &sg->parent->children, siblings) + * and overvise we can do: + * list_for_each_entry(t, &sg->siblings, siblings) + */ + struct list_head children; + struct list_head siblings; + struct sharing_group *parent; + + char *source; +}; + +extern int resolve_shared_mounts_v2(void); +extern int prepare_mnt_ns_v2(void); + #endif /* __CR_MOUNT_V2_H__ */ diff --git a/criu/include/mount.h b/criu/include/mount.h index c16a5c293..4d2d64470 100644 --- a/criu/include/mount.h +++ b/criu/include/mount.h @@ -62,7 +62,15 @@ struct mount_info { */ char *mountpoint; char *ns_mountpoint; + + /* Mount-v2 specific */ char *plain_mountpoint; + int is_dir; + int mp_fd_id; + int mnt_fd_id; + struct sharing_group *sg; + struct list_head mnt_sharing; + int fd; unsigned flags; unsigned sb_flags; @@ -79,6 +87,8 @@ struct mount_info { bool need_plugin; bool is_ns_root; bool deleted; + int deleted_level; + struct list_head deleted_list; struct mount_info *next; struct ns_id *nsid; diff --git a/criu/mount-v2.c b/criu/mount-v2.c index 801ade748..ec3965140 100644 --- a/criu/mount-v2.c +++ b/criu/mount-v2.c @@ -1,9 +1,35 @@ +#include +#include +#include +#include +#include +#include +#include + #include "kerndat.h" #include "log.h" +#include "cr_options.h" +#include "xmalloc.h" +#include "util.h" +#include "filesystems.h" +#include "mount.h" +#include "mount-v2.h" +#include "namespaces.h" +#include "fs-magic.h" +#include "path.h" +#include "files-reg.h" +#include "fdstore.h" +#include "common/list.h" +#include "common/bug.h" +#include "common/compiler.h" + +#include "images/mnt.pb-c.h" #undef LOG_PREFIX #define LOG_PREFIX "mnt-v2: " +LIST_HEAD(sharing_groups); + int check_mount_v2(void) { if (!kdat.has_move_mount_set_group) { @@ -18,3 +44,1223 @@ int check_mount_v2(void) return 0; } + +static struct sharing_group *get_sharing_group(int shared_id, int master_id) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, 
&sharing_groups, list) { + if (sg->shared_id == shared_id && sg->master_id == master_id) + return sg; + } + + return NULL; +} + +static struct sharing_group *alloc_sharing_group(int shared_id, int master_id) +{ + struct sharing_group *sg; + + sg = xzalloc(sizeof(struct sharing_group)); + if (!sg) + return NULL; + + sg->shared_id = shared_id; + sg->master_id = master_id; + + INIT_LIST_HEAD(&sg->list); + INIT_LIST_HEAD(&sg->mnt_list); + INIT_LIST_HEAD(&sg->children); + INIT_LIST_HEAD(&sg->siblings); + + list_add(&sg->list, &sharing_groups); + + return sg; +} + +int resolve_shared_mounts_v2(void) +{ + struct sharing_group *sg; + struct mount_info *mi; + + /* + * Create sharing groups for each unique shared_id+master_id pair and + * link each mount to the corresponding sharing group. + */ + for (mi = mntinfo; mi; mi = mi->next) { + if (!mi->shared_id && !mi->master_id) + continue; + + pr_debug("Inspecting sharing on %2d shared_id %d master_id %d (@%s)\n", mi->mnt_id, mi->shared_id, + mi->master_id, mi->ns_mountpoint); + + sg = get_sharing_group(mi->shared_id, mi->master_id); + if (!sg) { + sg = alloc_sharing_group(mi->shared_id, mi->master_id); + if (!sg) + return -1; + } + + list_add(&mi->mnt_sharing, &sg->mnt_list); + mi->sg = sg; + } + + /* + * Collect sharing groups tree. Mount propagation between sharing + * groups only goes down this tree, meaning that only mounts of same or + * descendant sharing groups recieve mount propagation. + */ + list_for_each_entry(sg, &sharing_groups, list) { + if (sg->master_id) { + struct sharing_group *p; + + /* + * Lookup parent sharing group. If one sharing group + * has master_id equal to shared_id of another sharing + * group than the former is a child (slave) of the + * latter. Also sharing groups should not have two + * parents so we check this here too. 
+ */ + list_for_each_entry(p, &sharing_groups, list) { + if (p->shared_id != sg->master_id) + continue; + + if (sg->parent) { + pr_err("Sharing group (%d, %d) parent collision (%d, %d) (%d, %d)\n", + sg->shared_id, sg->master_id, p->shared_id, p->master_id, + sg->parent->shared_id, sg->parent->master_id); + return -1; + } + sg->parent = p; + + if (!list_empty(&sg->siblings)) { + pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", + sg->shared_id, sg->master_id, p->shared_id, p->master_id); + return -1; + } + list_add(&sg->siblings, &p->children); + /* Don't break to check for parent collision */ + } + + /* + * If sharing group has master_id but we did't find + * parent for it inside the dumped container yet, this + * means that the master_id is external and a mount on + * host should exist with corresponding shared_id. + */ + if (!sg->parent && list_empty(&sg->siblings)) { + struct mount_info *ext; + struct sharing_group *s; + char *source = NULL; + + /* + * Though we don't have parent sharing group + * (inaccessible sharing), we can still have + * siblings, sharing groups with same master_id + * but different shared_id, let's collect them + * to the list. + */ + list_for_each_entry(s, &sharing_groups, list) { + if (s->master_id != sg->master_id) + continue; + + if (s->parent) { + pr_err("External slavery sharing group (%d, %d) has parent (%d, %d)\n", + sg->shared_id, sg->master_id, s->parent->shared_id, + s->parent->master_id); + return -1; + } + + if (!list_empty(&s->siblings)) { + pr_err("External slavery sharing group collision (%d, %d) (%d, %d)\n", + sg->shared_id, sg->master_id, s->shared_id, s->master_id); + return -1; + } + list_add(&s->siblings, &sg->siblings); + } + + BUG_ON(list_empty(&sg->mnt_list)); + mi = list_entry(sg->mnt_list.next, struct mount_info, mnt_sharing); + + /* + * We need to know from which mount on host we + * can get this external master_id. 
There are + * two options: mountpoint external mount or + * root mount of container. + */ + if ((ext = mnt_get_external_bind_nodev(mi))) + source = ext->external; + else if (mnt_is_root_bind(mi)) + source = opts.root; + + if (!source) { + pr_err("Sharing group (%d, %d) " + "has unreachable sharing. Try --enable-external-masters.\n", + sg->shared_id, sg->master_id); + return -1; + } + + sg->source = source; + list_for_each_entry(s, &sg->siblings, siblings) + s->source = sg->source; + + pr_debug("Detected external slavery for shared group (%d, %d) with source %s\n", + sg->shared_id, sg->master_id, source); + } + } + } + + return 0; +} + +/* + * When first mount from superblock is mounted, give other mounts + * a hint that they can now just bindmount from the first one. + */ +static int propagate_mount_v2(struct mount_info *mi) +{ + struct mount_info *t; + + list_for_each_entry(t, &mi->mnt_bind, mnt_bind) { + if (t->mounted) + continue; + if (t->bind) + continue; + if (!issubpath(t->root, mi->root)) + continue; + pr_debug("\t\tPropagate %d to %d\n", mi->mnt_id, t->mnt_id); + t->bind = mi; + t->s_dev_rt = mi->s_dev_rt; + } + + return 0; +} + +/* + * Mounts first mount of superblock + */ +static int do_new_mount_v2(struct mount_info *mi) +{ + unsigned long sflags = mi->sb_flags; + unsigned long mflags = mi->flags & (~MS_PROPAGATE); + char *src; + struct fstype *tp = mi->fstype; + bool remount_ro = (tp->restore && mi->sb_flags & MS_RDONLY); + mount_fn_t do_mount = (tp->mount) ? 
tp->mount : do_simple_mount; + + src = resolve_source(mi); + if (!src) + return -1; + + /* Merge superblock and mount flags if it's possible */ + if (!(mflags & ~MS_MNT_KNOWN_FLAGS) && !((sflags ^ mflags) & MS_RDONLY)) { + sflags |= mflags; + mflags = 0; + } + + if (remount_ro) + sflags &= ~MS_RDONLY; + + if (do_mount(mi, src, mnt_fsname(mi), sflags) < 0) { + pr_perror("Can't mount at %s", mi->plain_mountpoint); + return -1; + } + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as posible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + return -1; + } + + if (tp->restore && tp->restore(mi)) + return -1; + + if (remount_ro) { + int fd; + + fd = open(mi->plain_mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->plain_mountpoint); + return -1; + } + sflags |= MS_RDONLY | MS_REMOUNT; + if (userns_call(apply_sb_flags, 0, &sflags, sizeof(sflags), fd)) { + pr_perror("Unable to apply mount flags %d for %s", mi->sb_flags, mi->plain_mountpoint); + close(fd); + return -1; + } + close(fd); + } + + if (mflags && mount(NULL, mi->plain_mountpoint, NULL, MS_REMOUNT | MS_BIND | mflags, NULL)) { + pr_perror("Unable to apply bind-mount options"); + return -1; + } + + mi->mounted = true; + return 0; +} + +/* + * Does simple bindmount, but via new kernel mount api, + * which also handles autofs and symlink without resolving. 
+ */ +static int __do_bind_mount_v2(char *from, char *to) +{ + int detached_fd; + + detached_fd = sys_open_tree(AT_FDCWD, from, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); + if (detached_fd == -1) { + pr_perror("Failed to open_tree %s", from); + return -1; + } + + if (sys_move_mount(detached_fd, "", AT_FDCWD, to, MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Failed to move_mount from %s to %s", from, to); + close(detached_fd); + return -1; + } + close(detached_fd); + + return 0; +} + +LIST_HEAD(deleted_mounts); + +/* + * Bind-mounts all later mounts of superblock from first one, + * also handles first mounts of mountpoint external mounts. + */ +static int do_bind_mount_v2(struct mount_info *mi) +{ + char *root = NULL, *cut_root, rpath[PATH_MAX]; + unsigned long mflags; + int exit_code = -1; + char *mnt_path = NULL; + int level = 0; + + if (mi->need_plugin) { + if (restore_ext_mount(mi)) + return -1; + goto out; + } + + if (mnt_is_nodev_external(mi)) { + root = mi->external; + goto do_bind; + } + + cut_root = get_relative_path(mi->root, mi->bind->root); + if (!cut_root) { + pr_err("Failed to find root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); + return -1; + } + + /* + * Mount ->private can be initialized on fstype->mount() callback, + * which is called for first mount of superblock in do_new_mount(). + * Also ->private have to be copied to all other mounts of superblock + * to provide users of it with actual data. 
+ */ + mi->private = mi->bind->private; + + mnt_path = mi->bind->plain_mountpoint; + + if (cut_root[0]) { + snprintf(rpath, sizeof(rpath), "%s/%s", mnt_path, cut_root); + root = rpath; + } else { + root = mnt_path; + } +do_bind: + pr_info("\tBind %s to %s\n", root, mi->plain_mountpoint); + + if (unlikely(mi->deleted)) { + level = make_parent_dirs_if_need(-1, root); + if (level < 0) + goto err; + + if (mi->is_dir) { + if (mkdir(root, 0600)) { + pr_perror("Can't re-create deleted directory %s", root); + goto err; + } + } else { + int fd = open(root, O_WRONLY | O_CREAT | O_EXCL, 0600); + if (fd < 0) { + pr_perror("Can't re-create deleted file %s", root); + goto err; + } + close(fd); + } + } + + if (__do_bind_mount_v2(root, mi->plain_mountpoint)) + goto err; + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as posible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + goto err; + } + + mflags = mi->flags & (~MS_PROPAGATE); + if (!mi->bind || mflags != (mi->bind->flags & (~MS_PROPAGATE))) + if (mount(NULL, mi->plain_mountpoint, NULL, MS_BIND | MS_REMOUNT | mflags, NULL)) { + pr_perror("Can't bind remount 0x%lx at %s", mflags, mi->plain_mountpoint); + goto err; + } + + if (mi->deleted) { + /* + * Deleted mounts can't be moved, will delete source after + * moving to proper position in the mount tree FIXME. + */ + mi->deleted_level = level; + level = 0; + list_add(&mi->deleted_list, &deleted_mounts); + } +out: + mi->mounted = true; + exit_code = 0; +err: + if (level) + rm_parent_dirs(-1, root, level); + + return exit_code; +} + +/* Mounts root container mount. 
*/ +static int do_mount_root_v2(struct mount_info *mi) +{ + unsigned long flags = MS_BIND; + int fd; + + if (root_ns_mask & CLONE_NEWUSER) { + fd = open(mi->plain_mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Unable to open %s", mi->plain_mountpoint); + return -1; + } + + if (userns_call(mount_root, 0, &flags, sizeof(flags), fd)) { + pr_err("Unable to mount %s\n", mi->plain_mountpoint); + close(fd); + return -1; + } + close(fd); + } else { + if (mount(opts.root, mi->plain_mountpoint, NULL, flags, NULL)) { + pr_perror("Unable to mount %s %s (id=%d)", opts.root, mi->plain_mountpoint, mi->mnt_id); + return -1; + } + } + + /* + * Mount-v2 relies that before mount tree is constructed all mounts + * should remain private. Newly created mounts can become non-private + * initially depending on parent/source sharing, let's be as explicit + * as posible here and make it obvious that mount becomes private. + */ + if (mount(NULL, mi->plain_mountpoint, NULL, MS_PRIVATE, NULL)) { + pr_perror("Can't remount %s with MS_PRIVATE", mi->plain_mountpoint); + return -1; + } + + mi->mounted = true; + + return 0; +} + +/* Check if mount is ready to be mounted. */ +static bool can_mount_now_v2(struct mount_info *mi) +{ + struct mount_info *root, *ext; + + /* Parent should be mounted already, that's how mnt_tree_for_each works */ + BUG_ON(mi->parent && !mi->parent->mounted); + + /* Root mounts can be mounted at any moment */ + if (rst_mnt_is_root(mi)) { + pr_debug("%s: true as %d is global root\n", __func__, mi->mnt_id); + return true; + } + + /* External mounts can be mounted at any moment */ + if (mi->external) { + pr_debug("%s: true as %d is external\n", __func__, mi->mnt_id); + return true; + } + + /* + * Container root and external mounts should go before + * anything which should be bindmounted from them. 
+ */ + if (!mi->bind) { + root = mnt_get_root_bind(mi); + if (root) { + pr_debug("%s: false as %d is bind of not mounted global root %d\n", __func__, mi->mnt_id, + root->mnt_id); + return false; + } + + ext = mnt_get_external_bind(mi); + if (ext) { + pr_debug("%s: false as %d is a bind of not mounted external %d\n", __func__, mi->mnt_id, + ext->mnt_id); + return false; + } + } + + /* Non fsroot mounts can not be mounted without bind-mount */ + if (!fsroot_mounted(mi) && !mi->bind && !mi->need_plugin) { + pr_debug("%s: false as %d is non-root without bind or plugin\n", __func__, mi->mnt_id); + return false; + } + + return true; +} + +static int __set_unbindable_v2(struct mount_info *mi) +{ + if (mi->flags & MS_UNBINDABLE) { + if (mount(NULL, service_mountpoint(mi), NULL, MS_UNBINDABLE, NULL)) { + pr_perror("Failed to set mount %d unbindable", mi->mnt_id); + return -1; + } + } + return 0; +} + +/* + * Setting MS_UNBINDABLE flag is slightly delayed, + * obviousely until we finish bind-mounting everything. + */ +static int set_unbindable_v2(void) +{ + int orig_nsfd = -1, nsfd = -1, exit_code = -1; + struct mount_info *mi; + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + bool ns_has_unbindable = false; + + if (nsid->nd != &mnt_ns_desc) + continue; + + for (mi = mntinfo; mi != NULL; mi = mi->next) + if (mi->nsid == nsid && mi->flags & MS_UNBINDABLE) + ns_has_unbindable = true; + + if (!ns_has_unbindable) + continue; + + nsfd = fdstore_get(nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) + goto err; + close_safe(&nsfd); + + if (mnt_tree_for_each(nsid->mnt.mntinfo_tree, __set_unbindable_v2)) + goto err; + } + + exit_code = 0; +err: + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + close_safe(&nsfd); + return exit_code; +} + +/* + * Detects if mount is a directory mount or file mount based on stat on + * its mountpoint inside already mounted parent mount. This is deeply + * integrated in plain mount creation process because before mounting + * something plain we need to create right type of mountpoint for it. + */ +static int detect_is_dir(struct mount_info *mi) +{ + static char mountpoint[PATH_MAX]; + char *rel_path; + struct stat st; + + if (mi->is_dir != -1) + return 0; + + if (mi->mnt_id == HELPER_MNT_ID) { + pr_err("Helper %s should have is_dir pre-set\n", mi->ns_mountpoint); + return -1; + } + + if (!mi->parent || mi->parent == root_yard_mp) { + pr_err("Mount namespace root mount %d should have is_dir pre-set\n", mi->mnt_id); + return -1; + } + + if (!mi->parent->mounted) { + pr_err("Parent mount %d of %d should be mounted\n", mi->parent->mnt_id, mi->mnt_id); + return -1; + } + + rel_path = get_relative_path(mi->ns_mountpoint, mi->parent->ns_mountpoint); + if (!rel_path) { + pr_err("Child-parent mountpoint missmatch %d:%s %d:%s\n", mi->mnt_id, mi->ns_mountpoint, + mi->parent->mnt_id, mi->parent->ns_mountpoint); + return -1; + } + + snprintf(mountpoint, sizeof(mountpoint), "%s%s%s", mi->parent->plain_mountpoint, rel_path[0] ? "/" : "", + rel_path); + if (stat(mountpoint, &st)) { + pr_perror("Can't stat mountpoint %s", mountpoint); + return -1; + } + + if (S_ISDIR(st.st_mode)) + mi->is_dir = true; + else + mi->is_dir = false; + + pr_debug("Mount %d is detected as %s-mount\n", mi->mnt_id, mi->is_dir ? 
"dir" : "file"); + return 0; +} + +static int create_plain_mountpoint(struct mount_info *mi) +{ + BUG_ON(mi->is_dir == -1); + + pr_debug("Create plain mountpoint %s for %d\n", mi->plain_mountpoint, mi->mnt_id); + if (mi->is_dir) { + if (mkdir(mi->plain_mountpoint, 0600)) { + pr_perror("Unable to mkdir mountpoint %s", mi->plain_mountpoint); + return -1; + } + } else { + int fd; + + fd = creat(mi->plain_mountpoint, 0600); + if (fd < 0) { + pr_perror("Unable to create mountpoint %s", mi->plain_mountpoint); + return -1; + } + close(fd); + } + + return 0; +} + +/* + * At this point we already have a mount in service mount namespace now we + * bind-mount it to the final restored mount namespace via new kernel mount + * API. + */ +static int do_mount_in_right_mntns(struct mount_info *mi) +{ + int nsfd = -1, orig_nsfd = -1, detached_fd = -1, exit_code = -1; + + if (!mi->nsid) + return 0; + + detached_fd = + sys_open_tree(AT_FDCWD, mi->plain_mountpoint, AT_NO_AUTOMOUNT | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE); + if (detached_fd == -1) { + pr_perror("Failed to open_tree %s", mi->plain_mountpoint); + goto err; + } + + nsfd = fdstore_get(mi->nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, &orig_nsfd)) + goto err; + + if (create_plain_mountpoint(mi)) + goto err; + + if (sys_move_mount(detached_fd, "", AT_FDCWD, mi->plain_mountpoint, MOVE_MOUNT_F_EMPTY_PATH)) { + pr_perror("Failed to cross-mntns move_mount plain mount %d", mi->mnt_id); + goto err; + } + + exit_code = 0; +err: + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + close_safe(&nsfd); + close_safe(&detached_fd); + return exit_code; +} + +static int do_mount_one_v2(struct mount_info *mi) +{ + int ret; + + if (mi->mounted) + return 0; + + if (!can_mount_now_v2(mi)) { + pr_debug("Postpone mount %d\n", mi->mnt_id); + return 1; + } + + if (detect_is_dir(mi)) + return -1; + + if (create_plain_mountpoint(mi)) + return -1; + + pr_debug("\tMounting %s @%d 
(%d)\n", mi->fstype->name, mi->mnt_id, mi->need_plugin); + + if (rst_mnt_is_root(mi)) { + if (opts.root == NULL) { + pr_err("The --root option is required to restore a mount namespace\n"); + return -1; + } + ret = do_mount_root_v2(mi); + } else if (!mi->bind && !mi->need_plugin && (!mi->external || !strcmp(mi->external, EXTERNAL_DEV_MOUNT))) { + ret = do_new_mount_v2(mi); + } else { + ret = do_bind_mount_v2(mi); + } + + if (ret == 0 && fetch_rt_stat(mi, mi->plain_mountpoint)) + return -1; + + if (ret == 0 && propagate_mount_v2(mi)) + return -1; + + if (mi->fstype->code == FSTYPE__UNSUPPORTED) { + struct statfs st; + + if (statfs(mi->plain_mountpoint, &st)) { + pr_perror("Unable to statfs %s", mi->plain_mountpoint); + return -1; + } + if (st.f_type == BTRFS_SUPER_MAGIC) + mi->fstype = find_fstype_by_name("btrfs"); + } + + if (ret == 0 && do_mount_in_right_mntns(mi)) + return -1; + + return ret; +} + +static int populate_mnt_ns_v2(void) +{ + if (make_yard(mnt_roots)) + return -1; + + if (mnt_tree_for_each(root_yard_mp, do_mount_one_v2)) + return -1; + + return set_unbindable_v2(); +} + +/* + * This function moves plain mounts into actual mount tree. + * + * Mounts in children list are sorted the way that sibling overmount goes after + * all siblings which it overmounts (see __mnt_resort_children). The function + * mnt_tree_for_each is effectively DFS (in case we don't postpone), thus all + * descendants of all mounts which we sibling-overmount are mounted before us. + * Be carefull, we can't postpone (return >0) from this function because of it. 
+ */ +static int move_mount_to_tree(struct mount_info *mi) +{ + int fd; + + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); + return -1; + } + + mi->mp_fd_id = fdstore_add(fd); + close(fd); + if (mi->mp_fd_id < 0) { + pr_err("Can't add mountpoint of mount %d to fdstore\n", mi->mnt_id); + return -1; + } + + pr_info("Move mount %d from %s to %s\n", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); + if (sys_move_mount(AT_FDCWD, mi->plain_mountpoint, AT_FDCWD, mi->mountpoint, 0)) { + pr_perror("Failed to move mount %d from %s to %s", mi->mnt_id, mi->plain_mountpoint, mi->mountpoint); + return -1; + } + + fd = open(mi->mountpoint, O_PATH); + if (fd < 0) { + pr_perror("Failed to open real mountpoint of %d", mi->mnt_id); + return -1; + } + + mi->mnt_fd_id = fdstore_add(fd); + close(fd); + if (mi->mnt_fd_id < 0) { + pr_err("Can't add mount %d fd to fdstore\n", mi->mnt_id); + return -1; + } + + return 0; +} + +static int assemble_tree_from_plain_mounts(struct ns_id *nsid) +{ + return mnt_tree_for_each(nsid->mnt.mntinfo_tree, move_mount_to_tree); +} + +/* + * With MOVE_MOUNT_SET_GROUP source mount should have wider root than + * destination, thus let's choose widest mount from group as first. 
+ */ +static struct mount_info *get_first_mount(struct sharing_group *sg) +{ + struct mount_info *first = NULL, *tmp; + int min_len = 0; + + list_for_each_entry(tmp, &sg->mnt_list, mnt_sharing) { + int len = strlen(tmp->root); + + if (!first || len < min_len) { + first = tmp; + min_len = len; + } + } + + return first; +} + +struct set_group_arg { + int src_id; + char source[PATH_MAX]; + int dst_id; +}; + +static int __move_mount_set_group(void *arg, int dfd, int pid) +{ + struct set_group_arg *sga = (struct set_group_arg *)arg; + int src_fd, dst_fd, exit_code = -1; + + if (sga->src_id != -1) { + src_fd = fdstore_get(sga->src_id); + BUG_ON(src_fd < 0); + } else { + char *source_mp; + + BUG_ON(sga->source[0] == '\0'); + /* + * Source path should not always be a mountpoint as we + * automatically resolve it to mountpoint below. + */ + source_mp = resolve_mountpoint(sga->source); + if (!source_mp) { + pr_err("Failed to find %s mountpoint\n", sga->source); + return -1; + } + + src_fd = open(source_mp, O_PATH); + if (src_fd < 0) { + pr_perror("Failed to open %s mountpoint", source_mp); + xfree(source_mp); + return -1; + } + xfree(source_mp); + } + + dst_fd = fdstore_get(sga->dst_id); + BUG_ON(dst_fd < 0); + + /* Copy shared_id of the source */ + if (sys_move_mount(src_fd, "", dst_fd, "", + MOVE_MOUNT_SET_GROUP | MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH)) { + pr_perror("Failed to copy sharing from %d:%s to %d", sga->src_id, sga->source ?: "", sga->dst_id); + goto err; + } + + exit_code = 0; +err: + close(src_fd); + close(dst_fd); + return exit_code; +} + +/* + * Copy sharing between mounts passing mountpoint fds via fdstore ids. Also it + * is possible (for external mounts) to pass path on mountpoint via source path, + * it would resolve to mountpoint automatically. 
+ */ +static int move_mount_set_group(int src_id, char *source, int dst_id) +{ + struct set_group_arg sga = { + .src_id = src_id, + .dst_id = dst_id, + }; + + sga.source[0] = '\0'; + if (source) { + if (snprintf(sga.source, sizeof(sga.source), "%s", source) >= sizeof(sga.source)) { + pr_err("Source %s is too long\n", source); + return -1; + } + } + + if (userns_call(__move_mount_set_group, 0, &sga, sizeof(sga), -1)) + return -1; + + return 0; +} + +static int restore_one_sharing_group(struct sharing_group *sg) +{ + struct mount_info *first, *other; + char first_path[PATH_MAX]; + int first_fd; + + first = get_first_mount(sg); + first_fd = fdstore_get(first->mnt_fd_id); + BUG_ON(first_fd < 0); + snprintf(first_path, sizeof(first_path), "/proc/self/fd/%d", first_fd); + + /* Restore first's master_id from shared_id of the source */ + if (sg->master_id) { + if (sg->parent) { + struct mount_info *p; + + /* Get shared_id from parent sharing group */ + p = get_first_mount(sg->parent); + if (move_mount_set_group(p->mnt_fd_id, NULL, first->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", p->mnt_id, first->mnt_id); + close(first_fd); + return -1; + } + } else { + /* + * External slavery. We rely on the user to give us the + * right source for external mount with all proper + * sharing optioins setup (it should be either shared + * or non-shared slave). If source is a private mount + * we would fail. 
+ */ + if (move_mount_set_group(-1, sg->source, first->mnt_fd_id)) { + pr_err("Failed to copy sharing from source %s to %d\n", sg->source, first->mnt_id); + close(first_fd); + return -1; + } + } + + /* Convert shared_id to master_id */ + if (mount(NULL, first_path, NULL, MS_SLAVE, NULL)) { + pr_perror("Failed to make mount %d slave", first->mnt_id); + close(first_fd); + return -1; + } + } + + /* Restore first's shared_id */ + if (sg->shared_id) { + if (mount(NULL, first_path, NULL, MS_SHARED, NULL)) { + pr_perror("Failed to make mount %d shared", first->mnt_id); + close(first_fd); + return -1; + } + } + close(first_fd); + + /* Restore sharing for other mounts from the sharing group */ + list_for_each_entry(other, &sg->mnt_list, mnt_sharing) { + if (other == first) + continue; + + if (move_mount_set_group(first->mnt_fd_id, NULL, other->mnt_fd_id)) { + pr_err("Failed to copy sharing from %d to %d\n", first->mnt_id, other->mnt_id); + return -1; + } + } + + return 0; +} + +static struct sharing_group *sharing_group_next(struct sharing_group *sg) +{ + if (!list_empty(&sg->children)) + return list_entry(sg->children.next, struct sharing_group, siblings); + + while (sg->parent) { + if (sg->siblings.next == &sg->parent->children) + sg = sg->parent; + else + return list_entry(sg->siblings.next, struct sharing_group, siblings); + } + + return NULL; +} + +static int restore_mount_sharing_options(void) +{ + struct sharing_group *sg; + + list_for_each_entry(sg, &sharing_groups, list) { + struct sharing_group *t; + + if (sg->parent) + continue; + + /* Handle dependant sharing groups in tree order */ + for (t = sg; t != NULL; t = sharing_group_next(t)) { + if (restore_one_sharing_group(t)) + return -1; + } + } + + return 0; +} + +static int remove_source_of_deleted_mount(struct mount_info *mi) +{ + char *cut_root, path[PATH_MAX], *root; + + BUG_ON(!mi->deleted || !mi->bind); + + cut_root = get_relative_path(mi->root, mi->bind->root); + if (!cut_root) { + pr_err("Failed to find 
root for %d in our supposed bind %d\n", mi->mnt_id, mi->bind->mnt_id); + return -1; + } + + if (cut_root[0]) { + snprintf(path, sizeof(path), "%s/%s", mi->bind->plain_mountpoint, cut_root); + root = path; + } else { + root = mi->bind->plain_mountpoint; + } + + if (mi->is_dir) { + if (rmdir(root)) { + pr_perror("Can't remove deleted directory %s", root); + return -1; + } + } else { + if (unlink(root)) { + pr_perror("Can't unlink deleted file %s", root); + return -1; + } + } + + if (mi->deleted_level) + rm_parent_dirs(-1, root, mi->deleted_level); + + return 0; +} + +/* Delay making mounts deleted until we've restored sharing groups */ +static int remove_sources_of_deleted_mounts(void) +{ + struct mount_info *mi; + int ret = 0; + + list_for_each_entry(mi, &deleted_mounts, deleted_list) { + if (remove_source_of_deleted_mount(mi)) + ret = -1; + } + + return ret; +} + +static int get_empty_mntns(void) +{ + int orig_nsfd, nsfd = -1; + + orig_nsfd = open_proc(PROC_SELF, "ns/mnt"); + if (orig_nsfd < 0) + return -1; + + /* Create the new mount namespace */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a new mntns"); + close(orig_nsfd); + return -1; + } + + if (mount("none", "/", NULL, MS_REC | MS_PRIVATE, NULL)) { + pr_perror("Can't remount \"/\" with MS_PRIVATE"); + goto err; + } + + if (make_yard(mnt_roots)) + goto err; + + if (cr_pivot_root(mnt_roots)) + goto err; + + if (mkdirpat(AT_FDCWD, mnt_roots, 0777)) { + pr_err("Failed to setup root yard in empty mntns\n"); + goto err; + } + + nsfd = open_proc(PROC_SELF, "ns/mnt"); +err: + if (restore_ns(orig_nsfd, &mnt_ns_desc)) + close_safe(&nsfd); + return nsfd; +} + +/* Create almost empty mount namespaces only with root yard precreated */ +static int pre_create_mount_namespaces(void) +{ + int orig_nsfd = -1, nsfd = -1, empty_mntns, exit_code = -1; + char path[PATH_MAX]; + struct ns_id *nsid; + + empty_mntns = get_empty_mntns(); + if (empty_mntns == -1) { + pr_err("Failed to create empty mntns\n"); + goto err; + 
} + + /* restore mount namespaces */ + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; /* only mount namespace ids are handled here */ + /* Enter the template mntns so that the unshare() below copies it; orig_nsfd is passed only while it is still -1, i.e. on the first switch */ + if (switch_ns_by_fd(empty_mntns, &mnt_ns_desc, orig_nsfd == -1 ? &orig_nsfd : NULL)) + goto err; + + /* Create the new mount namespace */ + if (unshare(CLONE_NEWNS)) { + pr_perror("Unable to create a new mntns"); + goto err; + } + + nsfd = open_proc(PROC_SELF, "ns/mnt"); + if (nsfd < 0) + goto err; + + /* Pin new mntns with a file descriptor */ + nsid->mnt.nsfd_id = fdstore_add(nsfd); + close(nsfd); /* closed whether or not fdstore_add() succeeded; its result is checked next */ + if (nsid->mnt.nsfd_id < 0) { + pr_err("Can't add mntns fd to fdstore\n"); + goto err; + } + + if (make_yard(mnt_roots)) + goto err; + /* path from print_ns_root() is the directory that assemble_mount_namespaces() later pivots into */ + print_ns_root(nsid, 0, path, sizeof(path)); + if (mkdir(path, 0600)) { + pr_perror("Unable to create %s", path); + goto err; + } + } + + exit_code = 0; +err: /* reached on success as well; the switch back is skipped while orig_nsfd is still -1 */ + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + close_safe(&empty_mntns); + return exit_code; +} + +/* + * Assemble the mount tree for each restored mount namespace + * from pre-created plain mounts. Each namespace is entered through the fd returned by fdstore_get(), its tree is assembled, the process pivots into the namespace root directory, and an fd of the resulting root is stored as nsid->mnt.root_fd_id. Returns 0 on success and -1 on error. + */ +static int assemble_mount_namespaces(void) +{ + int orig_nsfd = -1, nsfd = -1, rootfd = -1, exit_code = -1; + char path[PATH_MAX]; + struct ns_id *nsid; + + for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) { + if (nsid->nd != &mnt_ns_desc) + continue; + + nsfd = fdstore_get(nsid->mnt.nsfd_id); + if (nsfd < 0) + goto err; + + if (switch_ns_by_fd(nsfd, &mnt_ns_desc, orig_nsfd == -1 ? 
&orig_nsfd : NULL)) { + close(nsfd); + goto err; + } + close(nsfd); /* namespace entered; the fd obtained from fdstore_get() is no longer needed */ + + if (assemble_tree_from_plain_mounts(nsid)) + goto err; + + /* Set its root */ + print_ns_root(nsid, 0, path, sizeof(path) - 1); + if (cr_pivot_root(path)) + goto err; + + /* root fd is used to restore file mappings */ + rootfd = open_proc(PROC_SELF, "root"); + if (rootfd < 0) + goto err; + nsid->mnt.root_fd_id = fdstore_add(rootfd); + if (nsid->mnt.root_fd_id < 0) { + pr_err("Can't add root fd to fdstore\n"); + close(rootfd); + goto err; + } + close(rootfd); /* the id stored in root_fd_id is what later code uses; the local fd is dropped */ + } + + exit_code = 0; +err: /* reached on success as well; the switch back is skipped while orig_nsfd is still -1 */ + if (orig_nsfd >= 0 && restore_ns(orig_nsfd, &mnt_ns_desc)) + exit_code = -1; + return exit_code; +} + +/* The main entry point of mount-v2 for creating mounts. Returns 0 immediately when root_ns_mask lacks CLONE_NEWNS; otherwise runs the restore stages in order and returns -1 as soon as one of them fails. */ +int prepare_mnt_ns_v2(void) +{ + if (!(root_ns_mask & CLONE_NEWNS)) + return 0; + +#ifdef CONFIG_BINFMT_MISC_VIRTUALIZED + if (!opts.has_binfmt_misc && !list_empty(&binfmt_misc_list)) { + /* + * Add to root yard along with other plain mounts and mntns + * directories. This mount would be created and restored by + * generic mount creation code, but it would never be moved to + * any restored mount namespaces. 
+ */ + if (!add_cr_time_mount(root_yard_mp, "binfmt_misc", "binfmt_misc", 0, true)) + return -1; + } +#endif + + if (validate_mounts(mntinfo, false)) + return -1; + + if (pre_create_mount_namespaces()) /* one nearly empty mntns per ns_id, holding only the root yard */ + return -1; + + if (populate_mnt_ns_v2()) + return -1; + + if (assemble_mount_namespaces()) /* build each mntns tree from the plain mounts and pivot into its root */ + return -1; + + if (restore_mount_sharing_options()) + return -1; + + return remove_sources_of_deleted_mounts(); /* intentionally last: deletion is delayed until sharing groups are restored */ +} diff --git a/criu/mount.c b/criu/mount.c index c39b44ac6..85fcb8843 100644 --- a/criu/mount.c +++ b/criu/mount.c @@ -17,6 +17,7 @@ #include "plugin.h" #include "filesystems.h" #include "mount.h" +#include "mount-v2.h" #include "pstree.h" #include "image.h" #include "namespaces.h" @@ -1680,6 +1681,7 @@ struct mount_info __maybe_unused *add_cr_time_mount(struct mount_info *root, cha goto err; } mi->mnt_id = HELPER_MNT_ID; + mi->is_dir = true; mi->flags = mi->sb_flags = 0; mi->root = xstrdup("/"); mi->fsname = xstrdup(fsname); @@ -2987,6 +2989,9 @@ struct mount_info *mnt_entry_alloc(bool rst) } memset(new->rmi, 0, sizeof(struct rst_mount_info)); } + new->mp_fd_id = -1; + new->mnt_fd_id = -1; + new->is_dir = -1; new->fd = -1; new->is_overmounted = -1; INIT_LIST_HEAD(&new->children); @@ -2999,6 +3004,7 @@ struct mount_info *mnt_entry_alloc(bool rst) INIT_LIST_HEAD(&new->mnt_notprop); INIT_LIST_HEAD(&new->mnt_unbindable); INIT_LIST_HEAD(&new->postpone); + INIT_LIST_HEAD(&new->deleted_list); } return new; } @@ -3314,6 +3320,7 @@ static int merge_mount_trees(void) root_yard_mp->plain_mountpoint = xstrdup(mnt_roots); if (!root_yard_mp->plain_mountpoint) return -1; + root_yard_mp->is_dir = true; root_yard_mp->mounted = true; root_yard_mp->mnt_bind_is_populated = true; root_yard_mp->is_overmounted = false; @@ -3359,6 +3366,9 @@ int read_mnt_ns_img(void) if (!nsid->mnt.mntinfo_tree) return -1; + /* mntns root mounts are always directories */ + nsid->mnt.mntinfo_tree->is_dir = true; + tail->next = pms; pms = head; } @@ -3368,6 +3378,9 @@ int read_mnt_ns_img(void) 
search_bindmounts(); prepare_is_overmounted(); + if (!opts.mntns_compat_mode && resolve_shared_mounts_v2()) + return -1; + if (merge_mount_trees()) return -1; @@ -3691,6 +3704,9 @@ int prepare_mnt_ns(void) free_mntinfo(old); } + if (!opts.mntns_compat_mode) + return prepare_mnt_ns_v2(); + ret = populate_mnt_ns(); if (ret) return -1;