criu/mount.c

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include <sys/mount.h>
#include <sys/types.h>
#include <sys/wait.h>

#include "cr_options.h"
#include "asm/types.h"
#include "util.h"
#include "util-pie.h"
#include "log.h"
#include "plugin.h"
#include "mount.h"
#include "pstree.h"
#include "proc_parse.h"
#include "image.h"
#include "namespaces.h"
#include "protobuf.h"
#include "kerndat.h"
#include "fs-magic.h"
#include "sysfs_parse.h"

#include "protobuf/mnt.pb-c.h"

#define AUTODETECTED_MOUNT "CRIU:AUTOGENERATED"

/*
 * Structure to keep external mount points resolving info.
 *
 * On dump the key is the mountpoint as seen from the mount
 * namespace, the val is some name that will be put into image
 * instead of the mount point's root path.
 *
 * On restore the key is the name from the image (the one
 * mentioned above) and the val is the path in criu's mount
 * namespace that will become the mount point's root, i.e. --
 * be bind mounted to the respective mountpoint.
 */

struct ext_mount {
	char *key;		/* string the mapping is looked up by */
	char *val;		/* string the key maps to */
	struct list_head l;	/* link in the opts.ext_mounts list */
};

int ext_mount_add(char *key, char *val)
{
	struct ext_mount *em;

	em = xmalloc(sizeof(*em));
	if (!em)
		return -1;

	em->key = key;
	em->val = val;
	list_add_tail(&em->l, &opts.ext_mounts);
	pr_info("Added %s:%s ext mount mapping\n", key, val);
	return 0;
}

/* Lookup ext_mount by key field */
static struct ext_mount *ext_mount_lookup(char *key)
{
	struct ext_mount *em;

	list_for_each_entry(em, &opts.ext_mounts, l)
		if (!strcmp(em->key, key))
			return em;

	return NULL;
}

/*
 * Singly linked list of mount points obtained from proc/images
 */
struct mount_info *mntinfo;

/*
 * Append @new to the tail of the global mntinfo list.
 * Only the tail link is written, @new->next is left as is.
 */
static void mntinfo_add_list(struct mount_info *new)
{
	struct mount_info **link = &mntinfo;

	/* Find the terminating NULL link. (FIXME -- make O(1) ) */
	while (*link != NULL)
		link = &(*link)->next;

	*link = new;
}

static int open_mountpoint(struct mount_info *pm);

static struct mount_info *mnt_build_tree(struct mount_info *list);
static int validate_mounts(struct mount_info *info, bool for_dump);

/*
 * True if @p is exactly "/".
 * Absolute paths are used on dump and relative paths are used on restore.
 */
static inline int is_root(char *p)
{
	return strcmp(p, "/") == 0;
}

/*
 * True for the root mount (the topmost one), i.e. when the mountpoint
 * with its first character skipped is "/".
 */
static inline int is_root_mount(struct mount_info *mi)
{
	char *path = mi->mountpoint + 1;

	return is_root(path);
}

/*
 * True if the mount shows the root of its file system, i.e. mi->root
 * is "/".
 *
 * This tells whether mounting has to be postponed. E.g. one can
 * bind mount some subdir from a disk, and in this case the root
 * disk mount has to be obtained first and bind-mounted afterwards.
 * See do_mount_one().
 */
static inline int fsroot_mounted(struct mount_info *mi)
{
	char *fs_path = mi->root;

	return is_root(fs_path);
}

/*
 * Scan @list through the ->next links and return the first entry
 * whose mnt_id equals @id, or NULL if there is none.
 */
static struct mount_info *__lookup_mnt_id(struct mount_info *list, int id)
{
	struct mount_info *cur = list;

	while (cur != NULL) {
		if (cur->mnt_id == id)
			break;
		cur = cur->next;
	}

	return cur;
}

/* Look the mount with the given id up in the global mntinfo list */
struct mount_info *lookup_mnt_id(unsigned int id)
{
	struct mount_info *head = mntinfo;

	return __lookup_mnt_id(head, id);
}

/*
 * Return the first entry of the global mntinfo list whose s_dev
 * equals @s_dev, or NULL if there is none.
 */
struct mount_info *lookup_mnt_sdev(unsigned int s_dev)
{
	struct mount_info *cur = mntinfo;

	while (cur != NULL) {
		if (cur->s_dev == s_dev)
			break;
		cur = cur->next;
	}

	return cur;
}

/*
 * Walk down the mount tree starting at @mntinfo_tree and return the
 * deepest mount whose mountpoint (with its first character skipped) is
 * a path-wise prefix of @path. When no child matches on the very first
 * level, @mntinfo_tree itself is returned.
 */
static struct mount_info *mount_resolve_path(struct mount_info *mntinfo_tree, const char *path)
{
	size_t pathlen = strlen(path);
	struct mount_info *m = mntinfo_tree, *c;

	while (1) {
		list_for_each_entry(c, &m->children, siblings) {
			size_t n;

			n = strlen(c->mountpoint + 1);
			/* A mountpoint longer than the path can't be its prefix */
			if (n > pathlen)
				continue;

			if (strncmp(c->mountpoint + 1, path, min(n, pathlen)))
				continue;
			/* Must match whole components: "/foo" is not a prefix of "/foobar" */
			if (n < pathlen && path[n] != '/')
				continue;

			/* Go one level deeper and rescan from the matched child */
			m = c;
			break;
		}
		/*
		 * This is true only when the scan above found no match, i.e.
		 * the iterator came back to the head of m->children
		 * (presumably the usual list_for_each_entry() termination
		 * state). After a match m == c, two different members of the
		 * same entry are compared, and the walk continues.
		 */
		if (&c->siblings == &m->children)
			break;
	}

	pr_debug("Path `%s' resolved to `%s' mountpoint\n", path, m->mountpoint);
	return m;
}

/*
 * Translate the device id reported by stat() for @path into the one
 * to compare with mountinfo data.
 *
 * BTRFS returns a subvolume dev-id instead of the superblock dev-id,
 * in such a case the device obtained from mountinfo (i.e. subvolume0)
 * is returned. For any other fs the stat's major/minor pair is
 * re-encoded with MKKDEV().
 */
dev_t phys_stat_resolve_dev(struct ns_id *ns, dev_t st_dev, const char *path)
{
	struct mount_info *mi = mount_resolve_path(ns->mnt.mntinfo_tree, path);
	bool on_btrfs = !strcmp(mi->fstype->name, "btrfs");

	return on_btrfs ? mi->s_dev : MKKDEV(major(st_dev), minor(st_dev));
}

/*
 * Check whether the stat()-reported device @st_dev of @path matches
 * @phys_dev. The cheap direct comparison is tried first, the
 * mount-tree based resolution is only done when that one fails.
 */
bool phys_stat_dev_match(dev_t st_dev, dev_t phys_dev,
		struct ns_id *ns, const char *path)
{
	return st_dev == kdev_to_odev(phys_dev) ||
	       phys_dev == phys_stat_resolve_dev(ns, st_dev, path);
}

/*
 * Compare two mounts. Return true if they differ in mount points only,
 * i.e. the device, fs type, source and options are all the same.
 * If bind is true, the root and the mountpoints are not compared at
 * all; otherwise the roots and the last components of the mountpoints
 * have to match as well.
 */
static bool mounts_equal(struct mount_info* mi, struct mount_info *c, bool bind)
{
	if (mi->s_dev != c->s_dev)
		return false;
	if (c->fstype != mi->fstype)
		return false;
	if (strcmp(c->source, mi->source) != 0)
		return false;
	if (strcmp(c->options, mi->options) != 0)
		return false;

	if (bind)
		return true;

	return strcmp(c->root, mi->root) == 0 &&
	       strcmp(basename(c->mountpoint), basename(mi->mountpoint)) == 0;
}

/*
 * mnt_roots is a temporary directory for restoring sub-trees of
 * non-root namespaces.
 */
static char *mnt_roots;

/*
 * Link every entry of @list to its parent according to the
 * mnt_id:parent_mnt_id relations and return the root of the
 * resulting tree, or NULL on error.
 *
 * When mnt_roots is set, an extra mount_info describing it is
 * allocated; roots of sub mount namespaces are hung on it, and it is
 * itself attached to the main root at the end.
 *
 * NOTE(review): the extra entry is not released on the error paths
 * below; its mountpoint aliases mnt_roots, so confirm what
 * mnt_entry_free() does before adding a free here.
 */
static struct mount_info *mnt_build_ids_tree(struct mount_info *list)
{
	struct mount_info *m, *root = NULL;
	struct mount_info *tmp_root_mount = NULL;

	if (mnt_roots) {
		/* mnt_roots is a tmpfs mount and it's private */
		tmp_root_mount = mnt_entry_alloc();
		if (!tmp_root_mount)
			return NULL;

		tmp_root_mount->mountpoint = mnt_roots;
		tmp_root_mount->mounted = true;
	}

	/*
	 * Just resolve the mnt_id:parent_mnt_id relations
	 */

	pr_debug("\tBuilding plain mount tree\n");
	for (m = list; m != NULL; m = m->next) {
		struct mount_info *p;

		pr_debug("\t\tWorking on %d->%d\n", m->mnt_id, m->parent_mnt_id);

		if (m->mnt_id != m->parent_mnt_id)
			p = __lookup_mnt_id(list, m->parent_mnt_id);
		else /* a circular mount reference. It's rootfs or smth like it. */
			p = NULL;

		if (!p) {
			/* This should be / */
			if (root == NULL && is_root_mount(m)) {
				root = m;
				continue;
			}

			pr_err("Mountpoint %d w/o parent %d found @%s (root %s)\n",
					m->mnt_id, m->parent_mnt_id, m->mountpoint,
					root ? "found" : "not found");
			if (root && m->is_ns_root) {
				if (!mounts_equal(root, m, true) ||
						strcmp(root->root, m->root)) {
					pr_err("Nested mount namespaces with different roots are not supported yet\n");
					return NULL;
				}

				/*
				 * Without mnt_roots there is no temporary
				 * mount to hang this root on; bail out
				 * instead of dereferencing NULL below.
				 */
				if (!tmp_root_mount) {
					pr_err("No temporary root to attach mount %d to\n",
							m->mnt_id);
					return NULL;
				}

				/*
				 * A root of a sub mount namespace is
				 * mounted in a temporary directory in the
				 * root mount namespace, so its parent is
				 * the main root.
				 */
				p = tmp_root_mount;
			} else
				return NULL;
		}

		m->parent = p;
		list_add_tail(&m->siblings, &p->children);
	}

	if (!root) {
		pr_err("No root found for tree\n");
		return NULL;
	}

	if (mnt_roots) {
		tmp_root_mount->parent = root;
		list_add_tail(&tmp_root_mount->siblings, &root->children);
	}

	return root;
}

/* Count the '/' characters in the mountpoint path of @m */
static int mnt_depth(struct mount_info *m)
{
	int slashes = 0;
	char *pos = m->mountpoint;

	while (*pos != '\0') {
		if (*pos == '/')
			slashes++;
		pos++;
	}

	return slashes;
}

/*
 * Recursively reorder the children of every node of @tree so that
 * deeper mountpoints come first.
 */
static void mnt_resort_siblings(struct mount_info *tree)
{
	struct mount_info *m, *p;
	LIST_HEAD(list);

	/*
	 * Put siblings of each node in an order they can be (u)mounted
	 * I.e. if we have mounts on foo/bar/, foo/bar/foobar/ and foo/
	 * we should put them in the foo/bar/foobar/, foo/bar/, foo/ order.
	 * Otherwise we will not be able to (u)mount them in a sequence.
	 *
	 * Funny, but all we need for this is to sort them in the descending
	 * order of the amount of /-s in a path =)
	 *
	 * Use stupid insertion sort here, we're not expecting mount trees
	 * to contain hundreds (or more) elements.
	 */

	pr_info("\tResorting siblings on %d\n", tree->mnt_id);
	while (!list_empty(&tree->children)) {
		int depth;

		m = list_first_entry(&tree->children, struct mount_info, siblings);
		list_del(&m->siblings);

		/* Find the first already sorted entry that is shallower than m */
		depth = mnt_depth(m);
		list_for_each_entry(p, &list, siblings)
			if (mnt_depth(p) < depth)
				break;

		/*
		 * Insert m BEFORE that entry (or at the tail if the scan
		 * reached the list head). Inserting after it would put a
		 * deeper mount behind a shallower one and break the
		 * descending order; entries of equal depth keep their
		 * original relative order.
		 */
		list_add_tail(&m->siblings, &p->siblings);
		mnt_resort_siblings(m);
	}

	list_splice(&list, &tree->children);
}

static void mnt_tree_show(struct mount_info *tree, int off)
{
	struct mount_info *m;

	pr_info("%*s[%s](%d->%d)\n", off, "",
			tree->mountpoint, tree->mnt_id, tree->parent_mnt_id);

	list_for_each_entry(m, &tree->children, siblings)
		mnt_tree_show(m, off + 1);

	pr_info("%*s<--\n", off, "");
}

/*
 * Check whether there is an external mount mapping registered for
 * the mountpoint of @info and, if so, remember it in info->external.
 *
 * Returns 0 when a mapping is found and -ENOTSUP otherwise.
 */
static int try_resolve_ext_mount(struct mount_info *info)
{
	/* the mappings are keyed w/o the leading . of the mountpoint */
	char *key = info->mountpoint + 1;
	struct ext_mount *mapping = ext_mount_lookup(key);

	if (!mapping)
		return -ENOTSUP;

	pr_info("Found %s mapping for %s mountpoint\n",
			mapping->val, info->mountpoint);
	info->external = mapping;
	return 0;
}

static struct mount_info *get_widest_peer(struct mount_info *m)
{
	struct mount_info *p;

	/*
	 * Try to find a mount, which is wider or equal.
	 * A is wider than B, if A->root is a subpath of B->root.
	 */
	list_for_each_entry(p, &m->mnt_share, mnt_share)
		if (issubpath(m->root, p->root))
			return p;

	return NULL;
}

/*
 * Among the children of @m find the one whose mountpoint, with the
 * first @m_mpnt_l characters skipped, is equal to @ct_mountpoint.
 *
 * Only the first child with a matching path is considered: it is
 * returned if it is equal to @ct (see mounts_equal()), otherwise
 * the search stops and NULL is returned. NULL is also returned when
 * no child has a matching path.
 */
static struct mount_info *find_shared_peer(struct mount_info *m,
		struct mount_info *ct, char *ct_mountpoint, int m_mpnt_l)
{
	struct mount_info *child;

	list_for_each_entry(child, &m->children, siblings) {
		char *rel_path = child->mountpoint + m_mpnt_l;

		if (strcmp(ct_mountpoint, rel_path) != 0)
			continue;

		return mounts_equal(child, ct, false) ? child : NULL;
	}

	return NULL;
}

/*
 * Length of @path with one trailing slash (if any) not counted.
 * An empty path has the length of zero.
 */
static inline int path_length(char *path)
{
	int off;

	off = strlen(path);
	/*
	 * If we're pure / then set length to zero so that adding this
	 * value as sub-path offset would produce the correct result.
	 * E.g. the tail path of the "/foo/bar" relative to the "/foo"
	 * will be the "/foo/bar" + len("/foo") == "/bar", while the
	 * same relative to the "/" should be +0 to be the "/foo/bar",
	 * not +1 and the "foo/bar".
	 *
	 * The off > 0 check keeps us from looking at the byte before
	 * the buffer when the path is empty.
	 */
	if (off > 0 && path[off - 1] == '/')
		off--;

	return off;
}

/*
 * Check that @m has the same set of visible children as a wider peer
 * from its shared group.
 *
 * Returns 0 if the sets match (or if no wider peer is found) and -1
 * otherwise. The children list of @m may be reordered by this call.
 */
static int validate_shared(struct mount_info *m)
{
	struct mount_info *t, *ct;
	int t_root_l, m_root_l, t_mpnt_l, m_mpnt_l;
	char *m_root_rpath;
	LIST_HEAD(children);

	/*
	 * Check that all mounts in one shared group have the same set of
	 * children. Only visible children are accounted. A non-root bind-mount
	 * doesn't see children out of its root and it's an expected case.
	 *
	 * Here are a few conditions:
	 * 1. t is wider than m
	 * 2. We search a wider mount in the same direction, so when we
	 *    enumerate all mounts, we can't be sure that all of them
	 *    have the same set of children.
	 */

	t = get_widest_peer(m);
	if (!t)
		/*
		 * The current mount is the widest one in its shared group,
		 * all others will be compared to it or with some other,
		 * which will be compared to it.
		 */
		return 0;

	/* The set of children which are visible for both should be the same */

	t_root_l = path_length(t->root);
	m_root_l = path_length(m->root);
	t_mpnt_l = path_length(t->mountpoint);
	m_mpnt_l = path_length(m->mountpoint);

	/* For example:
	 * t->root = /		t->mp = ./zdtm/live/static/mntns_root_bind.test
	 * m->root = /test	m->mp = ./zdtm/live/static/mntns_root_bind.test/test.bind
	 * t_root_l = 0	t_mpnt_l = 39
	 * m_root_l = 5	m_mpnt_l = 49
	 * ct->root = /		ct->mp = ./zdtm/live/static/mntns_root_bind.test/test/sub
	 * tp = /test/sub	mp = /test len=5
	 */

	/*
	 * ct:  | t->root       |	child mount point	|
	 * cm:  |       m->root         | child mount point	|
	 * ct:  |		|  /test/sub			|
	 * cm:  |		  /test	| /sub			|
	 *                      | A     | B                     |
	 *			| ct->mountpoint + t_mpnt_l
	 *			| m->root + strlen(t->root)
	 */

	m_root_rpath = m->root + t_root_l;	/* path from t->root to m->root */

	/* Search a child, which is visible in both mounts. */
	list_for_each_entry(ct, &t->children, siblings) {
		char *ct_mpnt_rpath;
		struct mount_info *cm;

		if (ct->is_ns_root)
			continue;

		ct_mpnt_rpath = ct->mountpoint + t_mpnt_l; /* path from t->mountpoint to ct->mountpoint */

		/*
		 * Check whether ct can be visible at m, i.e. the
		 * ct's rpath starts (as path) with m's rpath.
		 */

		if (!issubpath(ct_mpnt_rpath, m_root_rpath))
			continue;

		/*
		 * The ct has a peer in m but with the mount path deeper according
		 * to m's depth relative to t. Thus -- trim this difference (the
		 * length of m_root_rpath) from ct's mountpoint path.
		 */

		ct_mpnt_rpath += m_root_l - t_root_l;

		/*
		 * Find in m the mountpoint that fully matches with ct (with the
		 * described above path corrections).
		 */

		cm = find_shared_peer(m, ct, ct_mpnt_rpath, m_mpnt_l);
		if (!cm)
			goto err;

		/*
		 * Keep this one aside. At the end of t's children scan we should
		 * move _all_ m's children here (the list_empty check below).
		 */
		list_move(&cm->siblings, &children);
	}

	/* Anything left on m->children has no counterpart under t */
	if (!list_empty(&m->children))
		goto err;

	/* Give the matched children back to m */
	list_splice(&children, &m->children);
	return 0;

err:
	/* Give the children set aside so far back to m before failing */
	list_splice(&children, &m->children);
	pr_err("%d:%s and %d:%s have different set of mounts\n",
			m->mnt_id, m->mountpoint, t->mnt_id, t->mountpoint);
	return -1;
}

/*
 * Find the mount_info from which the respective bind-mount
 * can be created. It can be either an FS-root mount, or the
 * root of the tree (the latter only if its root path is the
 * sub-path of the bind mount's root).
 */

static struct mount_info *find_fsroot_mount_for(struct mount_info *bm)
{
	struct mount_info *sm;

	list_for_each_entry(sm, &bm->mnt_bind, mnt_bind)
		if (fsroot_mounted(sm) ||
				(sm->parent == NULL &&
				 strstartswith(bm->root, sm->root)))
			return sm;

	return NULL;
}

/*
 * Check that every mount on the @info list can be handled: shared
 * groups are consistent, FS-root mounts have a supported fs type,
 * bind mounts have a source (or are external / served by a plugin)
 * and nothing is overmounted by a sibling.
 *
 * @for_dump selects the dump-time behaviour (plugins are asked about
 * unresolved bind mounts) vs the restore-time one.
 *
 * Returns 0 if everything is fine and -1 otherwise.
 */
static int validate_mounts(struct mount_info *info, bool for_dump)
{
	struct mount_info *m, *t;

	for (m = info; m; m = m->next) {
		if (m->parent == NULL || m->is_ns_root)
			/* root mount can be any */
			continue;

		if (m->shared_id && validate_shared(m))
			return -1;

		/*
		 * Mountpoint can point to / of an FS. In that case this FS
		 * should be of some known type so that we can just mount one.
		 *
		 * Otherwise it's a bindmount mountpoint and we try to find
		 * what fsroot mountpoint it's bound to. If this point is the
		 * root mount, the path to bindmount root should be accessible
		 * from the rootmount path (the strstartswith check in
		 * find_fsroot_mount_for()).
		 */

		if (fsroot_mounted(m)) {
			if (m->fstype->code == FSTYPE__UNSUPPORTED) {
				pr_err("FS mnt %s dev %#x root %s unsupported id %d\n",
						m->mountpoint, m->s_dev, m->root, m->mnt_id);
				return -1;
			}
		} else {
			t = find_fsroot_mount_for(m);
			if (!t) {
				int ret;

				if (for_dump) {
					// We've already resolved the mount
					// and it is external.
					if (m->external) {
						ret = 0;
					} else {
						/* Ask plugins whether any of them handles this mount */
						ret = run_plugins(DUMP_EXT_MOUNT, m->mountpoint, m->mnt_id);
						if (ret == 0)
							m->need_plugin = true;
					}
				} else {
					if (m->need_plugin || m->external)
						/*
						 * plugin should take care of this one
						 * in restore_ext_mount, or do_bind_mount
						 * will mount it as external
						 */
						ret = 0;
					else
						ret = -ENOTSUP;
				}

				if (ret < 0) {
					/* Other errors are not reported here */
					if (ret == -ENOTSUP)
						pr_err("%d:%s doesn't have a proper root mount\n",
								m->mnt_id, m->mountpoint);
					return -1;
				}
			}
		}

		/* Any sibling for which the issubpath() check holds is an error */
		list_for_each_entry(t, &m->parent->children, siblings) {
			if (m == t)
				continue;
			if (!issubpath(m->mountpoint, t->mountpoint))
				continue;

			pr_err("%d:%s is overmounted\n", m->mnt_id, m->mountpoint);
			return -1;
		}
	}

	return 0;
}

/*
 * Find on @list the mount that @info is most likely bind-mounted
 * from.
 *
 * Consider the case of:
 *
 * mount /xxx
 * mount --bind /xxx /yyy
 * mount --make-shared /yyy
 * mount --bind /xxx /zzz
 * mount --make-shared /zzz
 * bind mount a shared mount into the namespace
 *
 * Here, we want to return the /right/ mount, not just a mount
 * that's equal. However, in the case:
 *
 * bind mount a shared mount into the namespace
 * inside the namespace, remount MS_PRIVATE
 * inside the namespace, remount MS_SHARED
 *
 * there will be no external mount with matching sharing
 * because the sharing is only internal; we still want to bind
 * mount from this mountinfo so we should return it, but we
 * should make the sharing namespace private after that bind
 * mount.
 *
 * So: the first equal mount with matching sharing is returned right
 * away; if there is no such mount, the last equal one seen is
 * returned; NULL means that nothing on the list is equal to @info.
 */
static struct mount_info *find_best_external_match(struct mount_info *list, struct mount_info *info)
{
	struct mount_info *cur, *fallback = NULL;

	for (cur = list; cur != NULL; cur = cur->next) {
		bool same_group, is_master;

		if (!mounts_equal(info, cur, true))
			continue;

		fallback = cur;

		/* Below are the cases where we found an exact match. */
		same_group = (info->flags & MS_SHARED) &&
			     info->shared_id == cur->shared_id;
		is_master = (info->flags & MS_SLAVE) &&
			    info->master_id == cur->shared_id;

		if (same_group || is_master)
			return cur;
	}

	return fallback;
}

/*
 * Mark the mounts on the @info list which come from outside of the
 * dumped tree as external.
 *
 * Mounts with an explicit mapping (see try_resolve_ext_mount()) are
 * resolved first. If opts.autodetect_ext_mounts is set, the rest are
 * matched against the mounts of criu's own mount namespace; for a
 * match the mount's source is replaced with the path of the matched
 * mount plus the mount's root, and an AUTODETECTED_MOUNT mapping is
 * attached to it.
 *
 * Returns 0 on success and -1 on error.
 */
static int resolve_external_mounts(struct mount_info *info)
{
	struct mount_info *m;
	struct ns_id *ns = NULL, *iter;

	/*
	 * Look at every entry, the last one included. Testing iter
	 * rather than iter->next also keeps an empty list from being
	 * dereferenced.
	 */
	for (iter = ns_ids; iter; iter = iter->next) {
		if (iter->pid == getpid() && iter->nd == &mnt_ns_desc) {
			ns = iter;
			break;
		}
	}

	if (!ns) {
		pr_err("Failed to find criu pid's mount ns!\n");
		return -1;
	}

	for (m = info; m; m = m->next) {
		int ret, size;
		char *p;
		struct ext_mount *em;
		struct mount_info *match;

		if (m->parent == NULL || m->is_ns_root)
			continue;

		ret = try_resolve_ext_mount(m);
		if (ret < 0 && ret != -ENOTSUP) {
			return -1;
		} else if (ret == -ENOTSUP && !opts.autodetect_ext_mounts) {
			continue;
		} else if (ret == 0) {
			/* explicitly mapped, nothing to detect */
			continue;
		}

		match = find_best_external_match(ns->mnt.mntinfo_list, m);
		if (!match)
			continue;

		if (m->flags & MS_SHARED) {
			if (!opts.enable_external_sharing)
				continue;

			/* the sharing was set up inside the dumped tree */
			if (m->shared_id != match->shared_id)
				m->internal_sharing = true;
		}

		if (m->flags & MS_SLAVE) {
			if (!opts.enable_external_masters)
				continue;

			/*
			 * In order to support something like internal slavery,
			 * we need to teach can_mount_now and do_mount_one
			 * about slavery relationships in external mounts. This
			 * seems like an uncommon case, so we punt for now.
			 */
			if (m->master_id != match->shared_id)
				continue;
		}

		/* both strings plus the terminating zero */
		size = strlen(match->mountpoint + 1) + strlen(m->root) + 1;
		p = xmalloc(sizeof(char) * size);
		if (!p)
			return -1;

		/* the bound is the real size of the buffer */
		ret = snprintf(p, size, "%s%s", match->mountpoint + 1, m->root);
		if (ret < 0 || ret >= size) {
			free(p);
			return -1;
		}

		em = xmalloc(sizeof(struct ext_mount));
		if (!em) {
			free(p);
			return -1;
		}

		em->val = AUTODETECTED_MOUNT;
		em->key = p;

		m->external = em;

		/* p is now referenced by both em->key and m->source */
		xfree(m->source);
		m->source = p;

		pr_info("autodetected external mount %s for %s\n", p, m->mountpoint);
	}

	return 0;
}

/*
 * Fill in the sharing relations between the mounts on the @info
 * list: the shared group members (mnt_share), the master of a slave
 * mount (mnt_master / mnt_slave) and the bind-mounts of the same
 * file system (mnt_bind).
 *
 * Returns 0 on success and -1 if a slave mount has no master on the
 * list and is not external. @for_dump is not used here.
 */
static int collect_shared(struct mount_info *info, bool for_dump)
{
	struct mount_info *m, *t;

	/*
	 * If we have shared mounts, both master and
	 * slave targets are to be present in the mount
	 * list, otherwise we can't be sure if we can
	 * recreate the scheme later on restore.
	 */
	for (m = info; m; m = m->next) {
		bool need_share, need_master;

		/* A non-empty mnt_share means the group is collected already */
		need_share = m->shared_id && list_empty(&m->mnt_share);
		need_master = m->master_id;

		for (t = info; t && (need_share || need_master); t = t->next) {
			if (t == m)
				continue;
			/* Only the first mount of the master group is taken */
			if (need_master && t->shared_id == m->master_id) {
				pr_debug("The mount %d is slave for %d\n", m->mnt_id, t->mnt_id);
				list_add(&m->mnt_slave, &t->mnt_slave_list);
				m->mnt_master = t;
				need_master = false;
			}

			/* Collect all mounts from this group */
			if (need_share && t->shared_id == m->shared_id) {
				pr_debug("Mount %d is shared with %d group %d\n",
						m->mnt_id, t->mnt_id, m->shared_id);
				list_add(&t->mnt_share, &m->mnt_share);
			}
		}

		// If we haven't already determined this mount is external,
		// then we don't know where it came from.
		if (need_master && m->parent && !m->external) {
			pr_err("Mount %d %s (master_id: %d shared_id: %d) "
			       "has unreachable sharing. Try --enable-external-masters.\n", m->mnt_id,
				m->mountpoint, m->master_id, m->shared_id);
			return -1;
		}

		/* Search bind-mounts */
		if (list_empty(&m->mnt_bind)) {
			/*
			 * A first mounted point will be set up as a source point
			 * for others. Look at propagate_mount()
			 */
			for (t = m->next; t; t = t->next) {
				if (mounts_equal(m, t, true))
					list_add(&t->mnt_bind, &m->mnt_bind);
			}
		}
	}

	return 0;
}

/*
 * Turn the plain @list of mounts into a tree, with the siblings
 * organized in a sequence in which they can be mounted/umounted,
 * and log the result.
 *
 * Returns the root of the tree or NULL if it can't be built.
 */
static struct mount_info *mnt_build_tree(struct mount_info *list)
{
	struct mount_info *root;

	pr_info("Building mountpoints tree\n");
	root = mnt_build_ids_tree(list);
	if (root != NULL) {
		mnt_resort_siblings(root);
		pr_info("Done:\n");
		mnt_tree_show(root, 0);
	}

	return root;
}

/*
 * Get a file descriptor on the mountpoint of @pm and check that it
 * really refers to the file system described by @pm.
 *
 * mnt_fd is a file descriptor on the mountpoint, which is closed in an error case.
 * If mnt_fd is -1, the mountpoint will be opened by this function.
 *
 * Returns the descriptor on success and -1 on error.
 */
static int __open_mountpoint(struct mount_info *pm, int mnt_fd)
{
	dev_t dev;
	struct stat st;
	int ret;

	if (mnt_fd == -1) {
		int mntns_root;

		mntns_root = mntns_get_root_fd(pm->nsid);
		if (mntns_root < 0)
			return -1;

		/* The mountpoint is opened relative to the descriptor obtained above */
		mnt_fd = openat(mntns_root, pm->mountpoint, O_RDONLY);
		if (mnt_fd < 0) {
			pr_perror("Can't open %s", pm->mountpoint);
			return -1;
		}
	}

	ret = fstat(mnt_fd, &st);
	if (ret < 0) {
		pr_perror("fstat(%s) failed", pm->mountpoint);
		goto err;
	}

	/*
	 * If the device of what we've opened differs from the one of
	 * the mount, we're looking at some other file system.
	 */
	dev = phys_stat_resolve_dev(pm->nsid, st.st_dev, pm->mountpoint + 1);
	if (dev != pm->s_dev) {
		pr_err("The file system %#x (%#x) %s %s is inaccessible\n",
				pm->s_dev, (int)dev, pm->fstype->name, pm->mountpoint);
		goto err;
	}

	return mnt_fd;
err:
	close(mnt_fd);
	return -1;
}

/*
 * Open the mountpoint of the first known mount with the given
 * device id.
 *
 * Returns a file descriptor, -ENOENT if there is no such mount, or
 * whatever __open_mountpoint() reports on failure.
 */
int open_mount(unsigned int s_dev)
{
	struct mount_info *mi = lookup_mnt_sdev(s_dev);

	return mi ? __open_mountpoint(mi, -1) : -ENOENT;
}

/*
 * Get a file descriptor through which the contents of the @pm mount
 * itself (and not of anything mounted on top of it) can be accessed.
 *
 * Returns the descriptor on success and -1 on error.
 */
static int open_mountpoint(struct mount_info *pm)
{
	int fd = -1, ns_old = -1;
	char mnt_path_tmp[] = "/tmp/cr-tmpfs.XXXXXX";
	char mnt_path_root[] = "/cr-tmpfs.XXXXXX";
	char *mnt_path = mnt_path_tmp;
	int cwd_fd;

	/*
	 * If a mount doesn't have children, we can open a mount point,
	 * otherwise we need to create a "private" copy.
	 */
	if (list_empty(&pm->children))
		return __open_mountpoint(pm, -1);

	pr_info("Something is mounted on top of %s\n", pm->mountpoint);

	/*
	 * To create a "private" copy, the target mount is bind-mounted
	 * in a temporary place w/o MS_REC (non-recursively).
	 * A mount point can't be bind-mounted in criu's namespace, it will be
	 * mounted in a target namespace. The sequence of actions is
	 * mkdtemp, setns(tgt), mount, open, detach, setns(old).
	 */

	/* Remember the cwd so it can be restored with fchdir() at the end */
	cwd_fd = open(".", O_DIRECTORY);
	if (cwd_fd < 0) {
		pr_perror("Unable to open cwd");
		return -1;
	}

	if (switch_ns(root_item->pid.real, &mnt_ns_desc, &ns_old) < 0)
		goto out;

	/* Fall back to a directory in / if /tmp doesn't exist */
	mnt_path = mkdtemp(mnt_path_tmp);
	if (mnt_path == NULL && errno == ENOENT)
		mnt_path = mkdtemp(mnt_path_root);
	if (mnt_path == NULL) {
		pr_perror("Can't create a temporary directory");
		goto out;
	}

	if (mount(pm->mountpoint, mnt_path, NULL, MS_BIND, NULL)) {
		pr_perror("Can't bind-mount %d:%s to %s",
				pm->mnt_id, pm->mountpoint, mnt_path);
		rmdir(mnt_path);
		goto out;
	}

	/*
	 * NOTE(review): presumably this also umounts and removes the
	 * temporary directory -- confirm against open_detach_mount().
	 */
	fd = open_detach_mount(mnt_path);
	if (fd < 0)
		goto out;

	if (restore_ns(ns_old, &mnt_ns_desc)) {
		/* don't try to restore the namespace once more below */
		ns_old = -1;
		goto out;
	}
	if (fchdir(cwd_fd)) {
		pr_perror("Unable to restore cwd");
		close(cwd_fd);
		close(fd);
		return -1;
	}
	close(cwd_fd);

	/* fd is closed by __open_mountpoint() if the check there fails */
	return __open_mountpoint(pm, fd);
out:
	if (ns_old >= 0)
		 restore_ns(ns_old, &mnt_ns_desc);
	close_safe(&fd);
	if (fchdir(cwd_fd))
		pr_perror("Unable to restore cwd");
	close(cwd_fd);
	return -1;
}

/*
 * Append @opt to the comma-separated options string of @pm. A comma
 * is put in between unless the string is empty or already ends with
 * one. The string is reallocated to fit the result.
 *
 * Returns 0 on success and -1 if the reallocation fails, in which
 * case pm->options is left untouched.
 */
static int attach_option(struct mount_info *pm, char *opt)
{
	int cur_len = strlen(pm->options);
	int opt_len = strlen(opt);
	char *grown;

	/* room for a separating comma and the terminating zero */
	grown = xrealloc(pm->options, cur_len + opt_len + 2);
	if (!grown)
		return -1;
	pm->options = grown;

	if (cur_len != 0 && grown[cur_len - 1] != ',')
		grown[cur_len++] = ',';

	/* the terminating zero of opt is copied as well */
	memcpy(grown + cur_len, opt, opt_len + 1);

	return 0;
}

/*
 * Find out whether devpts is mounted w or w/o the newinstance option.
 *
 * Kernel hides this option, but if the fs instance is new
 * (virtualized) we know that it was created with -o newinstance,
 * so the option is added to the mount's options by hand.
 *
 * A zero or negative result of the virtualization check is
 * returned as is.
 */
static int devpts_parse(struct mount_info *pm)
{
	int virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVPTS, pm->s_dev);

	if (virt > 0)
		return attach_option(pm, "newinstance");

	return virt;
}

/*
 * Dump the contents of the @pm mount into the CR_FD_TMPFS_DEV image
 * as a gzip-ed tarball created by tar.
 *
 * Returns 0 on success and non-zero on error.
 */
static int tmpfs_dump(struct mount_info *pm)
{
	int ret = -1;
	char tmpfs_path[PSFDS];
	int fd = -1, fd_flags;
	struct cr_img *img;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	/*
	 * tar reaches the mount via /proc/self/fd/N, so the descriptor
	 * has to survive exec. Check the F_GETFD result before using
	 * it: -1 must not be written back as flags.
	 */
	fd_flags = fcntl(fd, F_GETFD);
	if (fd_flags == -1 ||
	    fcntl(fd, F_SETFD, fd_flags & ~FD_CLOEXEC) == -1) {
		pr_perror("Can not drop FD_CLOEXEC");
		goto out;
	}

	img = open_image(CR_FD_TMPFS_DEV, O_DUMP, pm->s_dev);
	if (!img)
		goto out;

	snprintf(tmpfs_path, sizeof(tmpfs_path), "/proc/self/fd/%d", fd);

	/* tar's output is the image file */
	ret = cr_system(-1, img_raw_fd(img), -1, "tar", (char *[])
			{ "tar", "--create",
			"--gzip",
			"--one-file-system",
			"--check-links",
			"--preserve-permissions",
			"--sparse",
			"--numeric-owner",
			"--directory", tmpfs_path, ".", NULL });

	if (ret)
		pr_err("Can't dump tmpfs content\n");

	close_image(img);
out:
	close_safe(&fd);
	return ret;
}

/*
 * Virtualized devtmpfs on any side (dump or restore)
 * means, that we should try to handle it as a plain
 * tmpfs.
 *
 * Interesting case -- shared on dump and virtual on
 * restore -- will fail, since no tarball with the fs
 * contents will be found.
 *
 * The result of the kerndat check is returned as is.
 */

static int devtmpfs_virtual(struct mount_info *pm)
{
	int virt = kerndat_fs_virtualized(KERNDAT_FS_STAT_DEVTMPFS, pm->s_dev);

	return virt;
}

/*
 * Dump devtmpfs: if it's virtualized (the check says 1) it's dumped
 * as a plain tmpfs, otherwise the result of the check is returned.
 */
static int devtmpfs_dump(struct mount_info *pm)
{
	int virt = devtmpfs_virtual(pm);

	if (virt != 1)
		return virt;

	return tmpfs_dump(pm);
}

/*
 * Restore the contents of the @pm mount by unpacking the tarball
 * from the image into the mountpoint.
 *
 * The image keyed by the device id is tried first; if it is empty,
 * the one keyed by the mount id is used.
 *
 * Returns 0 on success and -1 on error.
 */
static int tmpfs_restore(struct mount_info *pm)
{
	int ret;
	struct cr_img *img;

	/*
	 * NOTE(review): empty_image() is called on a possibly NULL img
	 * here, just as before -- presumably it copes with that, confirm.
	 */
	img = open_image(CR_FD_TMPFS_DEV, O_RSTR, pm->s_dev);
	if (empty_image(img)) {
		close_image(img);
		img = open_image(CR_FD_TMPFS_IMG, O_RSTR, pm->mnt_id);
	}
	if (!img)
		return -1;
	if (empty_image(img)) {
		/* nothing to unpack, but the image still has to be released */
		close_image(img);
		return -1;
	}

	/* tar's input is the image file */
	ret = cr_system(img_raw_fd(img), -1, -1, "tar",
			(char *[]) {"tar", "--extract", "--gzip",
				"--directory", pm->mountpoint, NULL});
	close_image(img);

	if (ret) {
		pr_err("Can't restore tmpfs content\n");
		return -1;
	}

	return 0;
}

/*
 * Restore devtmpfs: if it's virtualized (the check says 1) it's
 * restored as a plain tmpfs, otherwise the result of the check is
 * returned.
 */
static int devtmpfs_restore(struct mount_info *pm)
{
	int virt = devtmpfs_virtual(pm);

	if (virt != 1)
		return virt;

	return tmpfs_restore(pm);
}

/*
 * "Dump" binfmt_misc: nothing is saved, it's only checked that the
 * fs contains nothing but the "register" and "status" entries.
 *
 * Returns 0 if so and -1 if some other entry is found or the mount
 * can't be read.
 */
static int binfmt_misc_dump(struct mount_info *pm)
{
	struct dirent *ent;
	DIR *dir;
	int fd;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		return -1;
	}

	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
		if (dir_dots(ent) ||
		    !strcmp(ent->d_name, "register") ||
		    !strcmp(ent->d_name, "status"))
			continue;

		pr_err("binfmt_misc isn't empty: %s\n", ent->d_name);
		closedir(dir);
		return -1;
	}

	/* closedir() closes the underlying fd too */
	closedir(dir);
	return 0;
}

/*
 * "Dump" fusectl: nothing is saved, every entry of the fs is only
 * checked against the known fuse mounts.
 *
 * Returns 0 if all the entries are fine and -1 if an entry's name
 * doesn't start with a number, if it refers to a fuse mount that is
 * not external, or if the mount can't be read.
 */
static int fusectl_dump(struct mount_info *pm)
{
	int fd, ret = -1;
	struct dirent *de;
	DIR *fdir = NULL;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	fdir = fdopendir(fd);
	if (fdir == NULL) {
		close(fd);
		return -1;
	}

	while ((de = readdir(fdir))) {
		int id;
		struct mount_info *it;

		if (dir_dots(de))
			continue;

		/* Entries are expected to be named with a number */
		if (sscanf(de->d_name, "%d", &id) != 1) {
			pr_err("wrong number of items scanned in fusectl dump\n");
			goto out;
		}

		/* The number is matched against the minor of fuse mounts' devices */
		for (it = mntinfo; it; it = it->next) {
			if (it->fstype->code == FSTYPE__FUSE && id == minor(it->s_dev) && !it->external) {
				pr_err("%s is a fuse mount but not external\n", it->mountpoint);
				goto out;
			}
		}
	}

	ret = 0;
out:
	/* closedir() closes the underlying fd too */
	closedir(fdir);
	return ret;
}

/*
 * "Dump" a file system that is only supported when it's empty:
 * nothing is saved, it's only checked that the mount has no entries
 * except for the dots.
 *
 * Returns 0 if so and -1 if an entry is found or the mount can't be
 * read.
 */
static int dump_empty_fs(struct mount_info *pm)
{
	struct dirent *ent;
	DIR *dir;
	int fd;

	fd = open_mountpoint(pm);
	if (fd < 0)
		return -1;

	dir = fdopendir(fd);
	if (!dir) {
		close(fd);
		return -1;
	}

	for (ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
		if (dir_dots(ent))
			continue;

		pr_err("%s isn't empty: %s\n", pm->fstype->name, ent->d_name);
		closedir(dir);
		return -1;
	}

	/* closedir() closes the underlying fd too */
	closedir(dir);
	return 0;
}

/*
 * Some fses (fuse) cannot be dumped, so we should always fail on dump/restore
 * of these fses.
 *
 * @pm is not looked at, -1 is returned unconditionally.
 */
static int always_fail(struct mount_info *pm)
{
	return -1;
}

/*
 * Table of the known file systems.
 *
 * The entry #0 is the one returned by the lookup routines below for
 * anything they can't resolve; the lookups themselves start from the
 * entry #1. The slots past the last initializer have no name and are
 * handed out to auto-detected file systems by __find_fstype_by_name().
 */
static struct fstype fstypes[32] = {
	{
		.name = "unsupported",
		.code = FSTYPE__UNSUPPORTED,
	}, {
		.name = "proc",
		.code = FSTYPE__PROC,
	}, {
		.name = "sysfs",
		.code = FSTYPE__SYSFS,
	}, {
		.name = "devtmpfs",
		.code = FSTYPE__DEVTMPFS,
		.dump = devtmpfs_dump,
		.restore = devtmpfs_restore,
	}, {
		.name = "binfmt_misc",
		.code = FSTYPE__BINFMT_MISC,
		.dump = binfmt_misc_dump,
	}, {
		.name = "tmpfs",
		.code = FSTYPE__TMPFS,
		.dump = tmpfs_dump,
		.restore = tmpfs_restore,
	}, {
		.name = "devpts",
		.parse = devpts_parse,
		.code = FSTYPE__DEVPTS,
	}, {
		.name = "simfs",
		.code = FSTYPE__SIMFS,
	}, {
		/* known by name (see phys_stat_resolve_dev()), but coded as unsupported */
		.name = "btrfs",
		.code = FSTYPE__UNSUPPORTED,
	}, {
		.name = "pstore",
		.dump = dump_empty_fs,
		.code = FSTYPE__PSTORE,
	}, {
		.name = "mqueue",
		.dump = dump_empty_fs,
		.code = FSTYPE__MQUEUE,
	}, {
		.name = "securityfs",
		.code = FSTYPE__SECURITYFS,
	}, {
		.name = "fusectl",
		.dump = fusectl_dump,
		.code = FSTYPE__FUSECTL,
	}, {
		.name = "debugfs",
		.code = FSTYPE__DEBUGFS,
	}, {
		.name = "cgroup",
		.code = FSTYPE__CGROUP,
	}, {
		.name = "aufs",
		.code = FSTYPE__AUFS,
		.parse = aufs_parse,
	}, {
		.name = "fuse",
		.code = FSTYPE__FUSE,
		.dump = always_fail,
		.restore = always_fail,
	},
};

/* Comma-separated list of fs names set by add_fsname_auto(), or "all" */
static char *fsauto_names;

/*
 * Check whether the fs called @name is on the fsauto_names list.
 *
 * Returns true if the list is exactly "all" or has an entry equal
 * to @name, and false otherwise (including the case of no list set).
 * Empty entries on the list are ignored.
 *
 * The list is scanned in place and is never modified, so every call
 * sees all of its entries (strtok() would cut the string at the
 * first comma for good).
 */
static bool fsname_is_auto(const char *name)
{
	const char *tok;
	size_t name_len;

	if (!fsauto_names)
		return false;

	if (strcmp(fsauto_names, "all") == 0)
		return true;

	name_len = strlen(name);
	tok = fsauto_names;
	while (*tok != '\0') {
		/* length of the entry, up to the next comma or the end */
		size_t tok_len = strcspn(tok, ",");

		if (tok_len != 0 && tok_len == name_len &&
		    strncmp(tok, name, tok_len) == 0)
			return true;

		tok += tok_len;
		if (*tok == ',')
			tok++;
	}

	return false;
}

/*
 * Replace the list of auto-detected fs names with a copy of @names.
 * The previous list is released first.
 *
 * Returns false if the copy can't be made (the list is then unset).
 */
bool add_fsname_auto(const char *names)
{
	char *copy;

	xfree(fsauto_names);
	copy = xstrdup(names);
	fsauto_names = copy;

	return copy != NULL;
}

/*
 * Find the fstypes[] entry for the fs called @_fst (anything after
 * the first dot is ignored).
 *
 * If the name is not known and either @force_auto is set or the name
 * is on the auto list, the first free slot of the table is turned
 * into an FSTYPE__AUTO entry with this name and returned.
 *
 * In all other cases the "unsupported" entry (#0) is returned, so the
 * result is never NULL.
 */
static struct fstype *__find_fstype_by_name(char *_fst, bool force_auto)
{
	int i;

	/*
	 * This fn is required for two things.
	 * 1st -- to check supported filesystems (as just mounting
	 * anything is wrong, almost every fs has its own features)
	 * 2nd -- save some space in the image (since we scan all
	 * names anyway)
	 *
	 * The kernel reports "subtypes" sometimes and the valid
	 * type-vs-subtype delimiter is the dot symbol. We disregard any
	 * subtypes for the purpose of finding the fstype.
	 */
	char fst[1024];

	/* Copy the name up to the dot, silently truncating too long ones */
	for (i = 0; _fst[i] && i < sizeof(fst) - 1; i++) {
		if (_fst[i] == '.')
			break;

		fst[i] = _fst[i];
	}

	fst[i] = 0;

	/* The entry #0 is the fallback, it's never matched by name */
	for (i = 1; i < ARRAY_SIZE(fstypes); i++) {
		struct fstype *fstype = fstypes + i;

		/* A nameless slot marks the end of the used part of the table */
		if (!fstype->name) {
			if (!force_auto && !fsname_is_auto(fst))
				break;

			/*
			 * NOTE(review): the xstrdup() result is not checked;
			 * if it can return NULL, the slot stays nameless but
			 * is still returned with the AUTO code.
			 */
			fstype->name = xstrdup(fst);
			fstype->code = FSTYPE__AUTO;
			return fstype;
		}

		if (!strcmp(fstype->name, fst))
			return fstype;
	}

	/* Only true if the loop ran through the whole table w/o a free slot */
	if (i == ARRAY_SIZE(fstypes)) /* ensure we have a room for auto */
		pr_err_once("fstypes[] overflow!\n");

	return &fstypes[0];
}

/*
 * Find the fstype by its name. Unknown names become auto-detected
 * types only if they are on the auto list.
 */
struct fstype *find_fstype_by_name(char *fst)
{
	const bool force_auto = false;

	return __find_fstype_by_name(fst, force_auto);
}

/*
 * Map the fs type code (and, for FSTYPE__AUTO, the fs name) to the
 * fstypes[] entry.
 *
 * Auto types are resolved by name, with an entry created if needed.
 * Other codes are looked up among the named entries starting from
 * #1. FSTYPE__UNSUPPORTED and codes that are not found result in the
 * entry #0.
 */
static struct fstype *decode_fstype(u32 fst, char *fsname)
{
	struct fstype *cur, *end = fstypes + ARRAY_SIZE(fstypes);

	if (fst == FSTYPE__AUTO)
		return __find_fstype_by_name(fsname, true);

	if (fst != FSTYPE__UNSUPPORTED) {
		/* the first nameless slot ends the used part of the table */
		for (cur = &fstypes[1]; cur < end && cur->name; cur++) {
			if (cur->code == fst)
				return cur;
		}
	}

	return &fstypes[0];
}

static int dump_one_mountpoint(struct mount_info *pm, struct cr_img *img)
{
	MntEntry me = MNT_ENTRY__INIT;

	pr_info("\t%d: %x:%s @ %s\n", pm->mnt_id, pm->s_dev,
			pm->root, pm->mountpoint);

	me.fstype		= pm->fstype->code;

	if (me.fstype == FSTYPE__AUTO)
		me.fsname = pm->fstype->name;

	if (pm->parent && !pm->dumped && !pm->need_plugin &&
	    pm->fstype->dump && fsroot_mounted(pm)) {
		struct mount_info *t;

		if (pm->fstype->dump(pm))
			return -1;

		list_for_each_entry(t, &pm->mnt_bind, mnt_bind)
			t->dumped = true;
	}

	me.mnt_id		= pm->mnt_id;
	me.root_dev		= pm->s_dev;
	me.parent_mnt_id	= pm->parent_mnt_id;
	me.flags		= pm->flags;
	me.mountpoint		= pm->mountpoint + 1;
	me.source		= pm->source;
	me.options		= pm->options;
	me.shared_id		= pm->shared_id;
	me.has_shared_id	= true;
	me.master_id		= pm->master_id;
	me.has_master_id	= true;
	if (pm->need_plugin) {
		me.has_with_plugin = true;
		me.with_plugin = true;
	}

	if (pm->internal_sharing) {
		me.has_internal_sharing = true;
		me.internal_sharing = true;
	}

	if (pm->external) {
		/*
		 * For external mount points dump the mapping's
		 * value instead of root. See collect_mnt_from_image
		 * for reverse mapping details.
		 */
		me.root	= pm->external->val;
		me.has_ext_mount = true;
		me.ext_mount = true;
	} else
		me.root = pm->root;

	if (pb_write_one(img, &me, PB_MNT))
		return -1;

	return 0;
}

/* Free every entry of the singly linked mount list starting at @pms */
static void free_mntinfo(struct mount_info *pms)
{
	struct mount_info *next;

	for (; pms; pms = next) {
		/* Grab the link before the entry goes away */
		next = pms->next;
		mnt_entry_free(pms);
	}
}

/*
 * Parse the mountinfo of the namespace @ns and build its mount tree.
 *
 * On success the parsed list is returned and is also stored in
 * ns->mnt.mntinfo_list, the tree root is stored in
 * ns->mnt.mntinfo_tree.
 *
 * On failure NULL is returned and ns->mnt.mntinfo_list is NULL, so
 * the namespace never keeps a pointer to freed entries.
 */
struct mount_info *collect_mntinfo(struct ns_id *ns, bool for_dump)
{
	struct mount_info *pm;

	ns->mnt.mntinfo_list = pm = parse_mountinfo(ns->pid, ns, for_dump);
	if (!pm) {
		pr_err("Can't parse %d's mountinfo\n", ns->pid);
		return NULL;
	}

	ns->mnt.mntinfo_tree = mnt_build_tree(pm);
	if (ns->mnt.mntinfo_tree == NULL)
		goto err;

	return pm;
err:
	/* The list is freed below, don't leave it reachable via @ns */
	ns->mnt.mntinfo_list = NULL;
	free_mntinfo(pm);
	return NULL;
}

/*
 * Dump all mount points of the namespace @ns into its own image.
 * @pms is the head of the list; the walk stops at the first entry
 * that belongs to another namespace. Returns 0 on success, -1 on error.
 */
static int dump_mnt_ns(struct ns_id *ns, struct mount_info *pms)
{
	struct mount_info *cur;
	struct cr_img *img;
	int exit_code = 0;

	pr_info("Dumping mountpoints\n");

	img = open_image(CR_FD_MNTS, O_DUMP, ns->id);
	if (!img)
		return -1;

	for (cur = pms; cur && cur->nsid == ns; cur = cur->next) {
		if (dump_one_mountpoint(cur, img)) {
			exit_code = -1;
			break;
		}
	}

	close_image(img);
	return exit_code;
}

/*
 * Iterative (non-recursive) walk over the mount tree rooted at _r.
 *
 * _r     - root of the (sub)tree to walk
 * _el    - list_head member (next or prev) used to pick the first
 *	    child of an element and to step to its sibling, i.e. it
 *	    selects the direction in which children are enumerated
 * _fn_f  - pre-order traversal function
 * _fn_r  - post-order traversal function
 * _plist - a postpone list. The current element is added to this list
 *	    if _fn_f returns a positive value, and all lower elements
 *	    are not enumerated.
 * _prgs  - counter, incremented for every element for which _fn_f
 *	    returned zero
 *
 * The macro executes "return -1" in the enclosing function if _fn_f
 * returns a negative value or _fn_r returns non-zero. It also defines
 * the label "up", so it can be expanded only once per function.
 */
#define MNT_TREE_WALK(_r, _el, _fn_f, _fn_r, _plist, _prgs) do {		\
		struct mount_info *_mi = _r;					\
										\
		while (1) {							\
			int ret;						\
										\
			list_del_init(&_mi->postpone);				\
										\
			ret = _fn_f(_mi);					\
			if (ret < 0)						\
				return -1;					\
			else if (ret > 0) {					\
				list_add_tail(&_mi->postpone, _plist);		\
				goto up;					\
			}							\
										\
			_prgs++;					\
										\
			if (!list_empty(&_mi->children)) {			\
				_mi = list_entry(_mi->children._el,		\
						struct mount_info, siblings);	\
				continue;					\
			}							\
	up:									\
			if (_fn_r(_mi))						\
				return -1;					\
			if (_mi == _r)						\
				break;						\
			if (_mi->siblings._el == &_mi->parent->children) {	\
				_mi = _mi->parent;				\
				goto up;					\
			}							\
			_mi = list_entry(_mi->siblings._el,			\
					struct mount_info, siblings);		\
		}								\
	} while (0)

/*
 * Placeholder for an unused _fn_f/_fn_r argument of MNT_TREE_WALK:
 * "MNT_WALK_NONE(_mi)" expands to "0 && (_mi)", which is always zero.
 */
#define MNT_WALK_NONE	0 &&


/*
 * Call @fn for every mount of the tree rooted at @start, parents
 * before children. A mount for which @fn returns a positive value is
 * postponed together with its subtree and retried on the next pass.
 *
 * Returns 0 when every mount was handled, -1 if @fn failed or if a
 * whole pass made no progress.
 */
static int mnt_tree_for_each(struct mount_info *start,
		int (*fn)(struct mount_info *))
{
	LIST_HEAD(postpone);
	LIST_HEAD(postpone2);
	struct mount_info *tmp;
	int progress;

	pr_debug("Start with %d:%s\n", start->mnt_id, start->mountpoint);
	list_add(&start->postpone, &postpone);

	do {
		progress = 0;

		list_for_each_entry_safe(start, tmp, &postpone, postpone)
			MNT_TREE_WALK(start, next, fn, MNT_WALK_NONE, &postpone2, progress);

		if (!progress) {
			struct mount_info *m;

			pr_err("A few mount points can't be mounted\n");
			list_for_each_entry(m, &postpone2, postpone) {
				pr_err("%d:%d %s %s %s\n", m->mnt_id,
					m->parent_mnt_id, m->root,
					m->mountpoint, m->source);
			}
			return -1;
		}

		/* Whatever was postponed in this pass is the next work list */
		list_splice_init(&postpone2, &postpone);
	} while (!list_empty(&postpone));

	return 0;
}

/*
 * Call @fn for every mount of the tree rooted at @m in post-order
 * (children before their parent), enumerating children backwards.
 * Returns 0 on success, -1 as soon as @fn returns non-zero.
 */
static int mnt_tree_for_each_reverse(struct mount_info *m,
		int (*fn)(struct mount_info *))
{
	/* The walker wants a counter, its value is of no interest here */
	int visited = 0;

	MNT_TREE_WALK(m, prev, MNT_WALK_NONE, fn, (struct list_head *) NULL, visited);

	return 0;
}

/*
 * Get the source argument for mounting @mi anew. Only mounts sitting
 * on an anonymous block device (major 0, the kernel creates them for
 * diskless mounts) are supported; NULL is returned for anything else.
 */
static char *resolve_source(struct mount_info *mi)
{
	if (kdev_major(mi->s_dev) != 0) {
		pr_err("No device for %s mount\n", mi->mountpoint);
		return NULL;
	}

	return mi->source;
}

/*
 * Apply propagation flags to an already mounted @mi. The requested
 * changes are applied in a fixed order: private, then slave, then
 * shared. Returns 0 on success, -1 on the first failing mount() call.
 */
static int restore_shared_options(struct mount_info *mi, bool private, bool shared, bool slave)
{
	const char *target = mi->mountpoint;

	pr_debug("%d:%s private %d shared %d slave %d\n",
			mi->mnt_id, mi->mountpoint, private, shared, slave);

	if (private) {
		if (mount(NULL, target, NULL, MS_PRIVATE, NULL)) {
			pr_perror("Unable to make %s private", mi->mountpoint);
			return -1;
		}
	}

	if (slave) {
		if (mount(NULL, target, NULL, MS_SLAVE, NULL)) {
			pr_perror("Unable to make %s slave", mi->mountpoint);
			return -1;
		}
	}

	if (shared) {
		if (mount(NULL, target, NULL, MS_SHARED, NULL)) {
			pr_perror("Unable to make %s shared", mi->mountpoint);
			return -1;
		}
	}

	return 0;
}

/*
 * A new mount is propagated into the already mounted slaves of its
 * parent. We can't know whether those copies existed in real life,
 * so they are unmounted here.
 *
 * Returns 0 on success, -1 if one of the copies can't be unmounted.
 */
static int umount_from_slaves(struct mount_info *mi)
{
	char mpath[PATH_MAX];
	struct mount_info *slave;

	list_for_each_entry(slave, &mi->parent->mnt_slave_list, mnt_slave) {
		if (slave->mounted) {
			/* Same last path component, under the slave */
			snprintf(mpath, sizeof(mpath), "%s/%s",
					slave->mountpoint, basename(mi->mountpoint));
			pr_debug("\t\tUmount %s\n", mpath);
			if (umount(mpath) == -1) {
				pr_perror("Can't umount %s", mpath);
				return -1;
			}
		}
	}

	return 0;
}

/*
 * If something is mounted in one shared point, it will be spread in
 * all other points from this shared group.
 *
 * Look at Documentation/filesystems/sharedsubtree.txt for more details
 *
 * Here every not yet mounted member of @mi's shared group and every
 * not yet mounted slave of @mi gets @mi as its bind source, so that
 * it inherits the shared group or the master id. Always returns 0.
 */
static int propagate_siblings(struct mount_info *mi)
{
	struct mount_info *peer;

	/* Members of the same shared group */
	list_for_each_entry(peer, &mi->mnt_share, mnt_share) {
		if (!peer->mounted) {
			pr_debug("\t\tBind %s\n", peer->mountpoint);
			peer->bind = mi;
		}
	}

	/* Slaves of this mount */
	list_for_each_entry(peer, &mi->mnt_slave_list, mnt_slave) {
		if (!peer->mounted) {
			pr_debug("\t\tBind %s\n", peer->mountpoint);
			peer->bind = mi;
		}
	}

	return 0;
}

static int propagate_mount(struct mount_info *mi)
{
	struct mount_info *t;

	propagate_siblings(mi);

	if (!mi->parent)
		goto skip_parent;

	umount_from_slaves(mi);

	/* Propagate this mount to everyone from a parent group */

	list_for_each_entry(t, &mi->parent->mnt_share, mnt_share) {
		struct mount_info *c;

		list_for_each_entry(c, &t->children, siblings) {
			if (mounts_equal(mi, c, false)) {
				pr_debug("\t\tPropogate %s\n", c->mountpoint);
				c->mounted = true;
				propagate_siblings(c);
				umount_from_slaves(c);
			}
		}
	}

skip_parent:
	/*
	 * FIXME Currently non-root mounts can be restored
	 * only if a proper root mount exists
	 */
	if (fsroot_mounted(mi) || mi->parent == NULL)
		list_for_each_entry(t, &mi->mnt_bind, mnt_bind) {
			if (t->mounted)
				continue;
			if (t->bind)
				continue;
			if (t->master_id)
				continue;
			t->bind = mi;
		}

	return 0;
}

/*
 * Mount a brand new filesystem instance at @mi. The mount is created
 * without MS_SHARED, sharing is set up afterwards, then the
 * fs-specific ->restore callback (if any) is called.
 * Returns 0 on success, -1 on error.
 */
static int do_new_mount(struct mount_info *mi)
{
	struct fstype *tp = mi->fstype;
	unsigned long mflags = mi->flags & (~MS_SHARED);
	char *src;

	src = resolve_source(mi);
	if (src == NULL)
		return -1;

	if (mount(src, mi->mountpoint, tp->name, mflags, mi->options) < 0) {
		pr_perror("Can't mount at %s", mi->mountpoint);
		return -1;
	}

	if (restore_shared_options(mi, 0, mi->shared_id, 0))
		return -1;

	mi->mounted = true;

	if (tp->restore) {
		if (tp->restore(mi))
			return -1;
	}

	return 0;
}

/*
 * Ask the plugins to restore the external mount @mi.
 * Returns whatever the plugin hook returned (0 means success).
 */
static int restore_ext_mount(struct mount_info *mi)
{
	int err;

	pr_debug("Restoring external bind mount %s\n", mi->mountpoint);

	err = run_plugins(RESTORE_EXT_MOUNT, mi->mnt_id, mi->mountpoint, "/", NULL);
	if (err != 0)
		pr_err("Can't restore ext mount (%d)\n", err);

	return err;
}

/*
 * Restore @mi without creating a new filesystem instance. There are
 * three ways to do so:
 *  - mi->need_plugin: the mount is delegated to restore_ext_mount();
 *  - mi->external: mi->root (the user supplied mapping) is bind
 *    mounted to the mountpoint;
 *  - otherwise: a path inside the already mounted mi->bind is bind
 *    mounted to the mountpoint.
 *
 * After that the propagation flags are restored and the mount is
 * marked as mounted. Returns 0 on success, -1 on error.
 */
static int do_bind_mount(struct mount_info *mi)
{
	/* True when mi ends up in the very same shared group as mi->bind */
	bool shared = 0;
	bool force_private_remount = false;

	if (!mi->need_plugin) {
		char *root, rpath[PATH_MAX];
		int tok = 0;

		if (mi->external) {
			/*
			 * We have / pointing to criu's ns root still,
			 * so just use the mapping's path. The mountpoint
			 * is tuned in collect_mnt_from_image to refer
			 * to proper location in the namespace we restore.
			 */
			root = mi->root;
			force_private_remount = mi->internal_sharing;
			goto do_bind;
		}

		shared = mi->shared_id && mi->shared_id == mi->bind->shared_id;

		/*
		 * Cut common part of root.
		 * For non-root binds the source is always "/" (checked)
		 * so this will result in this slash removal only.
		 */
		while (mi->root[tok] == mi->bind->root[tok]) {
			tok++;
			if (mi->bind->root[tok] == '\0')
				break;
			BUG_ON(mi->root[tok] == '\0');
		}

		/* Source path = where the bind source is mounted + the rest of root */
		snprintf(rpath, sizeof(rpath), "%s/%s",
				mi->bind->mountpoint, mi->root + tok);
		root = rpath;
do_bind:
		pr_info("\tBind %s to %s\n", root, mi->mountpoint);
		if (mount(root, mi->mountpoint, NULL,
					MS_BIND, NULL) < 0) {
			pr_perror("Can't mount at %s", mi->mountpoint);
			return -1;
		}
	} else {
		if (restore_ext_mount(mi))
			return -1;
	}

	/*
	 * shared - the mount is in the same shared group with mi->bind
	 * mi->shared_id && !shared - create a new shared group
	 */
	if (restore_shared_options(mi, force_private_remount || (!shared && !mi->master_id),
					mi->shared_id && !shared,
					mi->master_id))
		return -1;

	mi->mounted = true;

	return 0;
}

static bool can_mount_now(struct mount_info *mi)
{
	/* The root mount */
	if (!mi->parent)
		return true;
	if (mi->is_ns_root)
		return true;

	if (mi->external)
		return true;

	if (mi->master_id && mi->bind == NULL)
		return false;

	if (!fsroot_mounted(mi) && (mi->bind == NULL && !mi->need_plugin && !mi->external))
		return false;

	if (mi->parent->shared_id) {
		struct mount_info *p = mi->parent, *n;

		if (mi->parent->shared_id == mi->shared_id) {
			int rlen = strlen(mi->root);
			list_for_each_entry(n, &p->mnt_share, mnt_share)
				if (strlen(n->root) < rlen && !n->mounted)
					return false;
		} else {
			list_for_each_entry(n, &p->mnt_share, mnt_share)
				if (!n->mounted)
					return false;
			list_for_each_entry(n, &p->mnt_slave_list, mnt_slave)
				if (!n->mounted)
					return false;
		}
	}

	return true;
}

/*
 * "Mount" the root of the tree: nothing is mounted here, only the
 * propagation flags are restored (private if the mount is neither
 * shared nor slave). Returns 0 on success, -1 on error.
 */
static int do_mount_root(struct mount_info *mi)
{
	bool make_private = !mi->shared_id && !mi->master_id;

	if (restore_shared_options(mi, make_private,
				mi->shared_id, mi->master_id) != 0)
		return -1;

	mi->mounted = true;
	return 0;
}

/*
 * Tree-walk callback which restores one mount point.
 *
 * Returns 0 if the mount is in place (or already was), a positive
 * value if it has to be postponed, a negative value on error.
 */
static int do_mount_one(struct mount_info *mi)
{
	int err;

	if (mi->mounted)
		return 0;

	if (!can_mount_now(mi)) {
		pr_debug("Postpone slave %s\n", mi->mountpoint);
		return 1;
	}

	pr_debug("\tMounting %s @%s (%d)\n", mi->fstype->name, mi->mountpoint, mi->need_plugin);

	if (mi->parent == NULL)
		err = do_mount_root(mi);
	else if (mi->bind || mi->need_plugin || mi->external)
		err = do_bind_mount(mi);
	else
		err = do_new_mount(mi);

	if (err == 0) {
		if (propagate_mount(mi))
			return -1;
	}

	if (mi->fstype->code == FSTYPE__UNSUPPORTED) {
		struct statfs st;

		if (statfs(mi->mountpoint, &st)) {
			pr_perror("Unable to statfs %s", mi->mountpoint);
			return -1;
		}

		/* Recognize btrfs by the magic of what is mounted there */
		if (st.f_type == BTRFS_SUPER_MAGIC)
			mi->fstype = find_fstype_by_name("btrfs");
	}

	return err;
}

/*
 * Tree-walk callback which unmounts @mi. The root of the tree is left
 * alone. The parent is made recursively private first.
 * Returns 0 on success, -1 on error.
 */
static int do_umount_one(struct mount_info *mi)
{
	struct mount_info *parent = mi->parent;

	if (parent == NULL)
		return 0;

	if (mount("none", parent->mountpoint, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
		pr_perror("Can't mark %s as private", mi->parent->mountpoint);
		return -1;
	}

	if (umount(mi->mountpoint) != 0) {
		pr_perror("Can't umount at %s", mi->mountpoint);
		return -1;
	}

	pr_info("Umounted at %s\n", mi->mountpoint);
	return 0;
}

/*
 * Unmount everything below the root of @mntinfo_tree, children before
 * parents. The mountinfos were collected at the prepare stage.
 */
static int clean_mnt_ns(struct mount_info *mntinfo_tree)
{
	int err;

	pr_info("Cleaning mount namespace\n");

	err = mnt_tree_for_each_reverse(mntinfo_tree, do_umount_one);
	return err;
}

/*
 * Make @root (or the current directory if @root is NULL) the root of
 * the mount namespace with pivot_root(), and detach the old root.
 *
 * A temporary directory is created in the new root and bind-mounted
 * onto itself to serve as the "put old" location for pivot_root().
 *
 * Returns 0 on success, -1 on error. Failures after a successful
 * pivot_root() return immediately without any cleanup.
 */
static int cr_pivot_root(char *root)
{
	/* Relative template: the directory is created in the cwd */
	char put_root[] = "crtools-put-root.XXXXXX";
	int exit_code = -1;

	pr_info("Move the root to %s\n", root ? : ".");

	if (root) {
		if (chdir(root)) {
			pr_perror("chdir(%s) failed", root);
			return -1;
		}
	}

	if (mkdtemp(put_root) == NULL) {
		pr_perror("Can't create a temporary directory");
		return -1;
	}

	/* NOTE: this is a bind mount of the directory, not a tmpfs one */
	if (mount(put_root, put_root, NULL, MS_BIND, NULL)) {
		pr_perror("Unable to mount tmpfs in %s", put_root);
		goto err_root;
	}

	if (mount(NULL, put_root, NULL, MS_PRIVATE, NULL)) {
		pr_perror("Can't remount %s with MS_PRIVATE", put_root);
		goto err_tmpfs;
	}

	if (pivot_root(".", put_root)) {
		pr_perror("pivot_root(., %s) failed", put_root);
		goto err_tmpfs;
	}

	if (mount("none", put_root, "none", MS_REC|MS_PRIVATE, NULL)) {
		pr_perror("Can't remount root with MS_PRIVATE");
		return -1;
	}

	exit_code = 0;

	/*
	 * On success put_root is unmounted twice: here and under the
	 * err_tmpfs label below. NOTE(review): presumably the first call
	 * detaches the old root which pivot_root() has put there and the
	 * second one detaches the bind mount made above -- confirm
	 * against pivot_root(2).
	 */
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_tmpfs:
	if (umount2(put_root, MNT_DETACH)) {
		pr_perror("Can't umount %s", put_root);
		return -1;
	}

err_root:
	if (rmdir(put_root)) {
		pr_perror("Can't remove the directory %s", put_root);
		return -1;
	}

	return exit_code;
}

/*
 * Allocate a zeroed mount_info with all its list heads initialized.
 * Returns NULL if the allocation fails.
 */
struct mount_info *mnt_entry_alloc()
{
	struct mount_info *mi = xzalloc(sizeof(struct mount_info));

	if (mi == NULL)
		return NULL;

	INIT_LIST_HEAD(&mi->children);
	INIT_LIST_HEAD(&mi->siblings);
	INIT_LIST_HEAD(&mi->mnt_slave_list);
	INIT_LIST_HEAD(&mi->mnt_share);
	INIT_LIST_HEAD(&mi->mnt_bind);
	INIT_LIST_HEAD(&mi->postpone);

	return mi;
}

/*
 * Free @mi together with its root, mountpoint, source and options
 * strings. NULL is accepted and ignored.
 */
void mnt_entry_free(struct mount_info *mi)
{
	if (mi != NULL) {
		xfree(mi->root);
		xfree(mi->mountpoint);
		xfree(mi->source);
		xfree(mi->options);
		xfree(mi);
	}
}

/*
 * Print into @buf (of @bs bytes) the path where the root of the
 * namespace @ns is re-constructed: "<mnt_roots>/<ns id>".
 * Returns what snprintf() returned.
 */
static inline int print_ns_root(struct ns_id *ns, char *buf, int bs)
{
	int len;

	len = snprintf(buf, bs, "%s/%d", mnt_roots, ns->id);
	return len;
}

/*
 * Create (once) the temporary directory which hosts the roots of the
 * restored mount namespaces. The directory is created in opts.root
 * (or in "/" if it is not set); as a side effect the working directory
 * is changed to that place. The relative name is kept in mnt_roots.
 *
 * Returns 0 on success (or if the directory already exists), -1 on
 * error, in which case mnt_roots stays NULL.
 */
static int create_mnt_roots(void)
{
	const char *base = opts.root ? : "/";

	if (mnt_roots)
		return 0;

	if (chdir(base)) {
		/* Print the path really used, opts.root may be NULL */
		pr_perror("Unable to change working directory on %s", base);
		return -1;
	}

	mnt_roots = strdup(".criu.mntns.XXXXXX");
	if (mnt_roots == NULL) {
		pr_perror("Can't allocate memory");
		return -1;
	}

	if (mkdtemp(mnt_roots) == NULL) {
		/* Report first: free() is allowed to change errno */
		pr_perror("Unable to create a temporary directory");
		free(mnt_roots);
		mnt_roots = NULL;
		return -1;
	}

	return 0;
}

/*
 * Used on restore when no mount namespace is restored: register
 * criu's own mount namespace under id 0, collect its mounts into the
 * global mntinfo list and mark the namespace as created.
 * Returns 0 on success, -1 on error.
 */
static int rst_collect_local_mntns(void)
{
	struct ns_id *self_ns;

	self_ns = rst_new_ns_id(0, getpid(), &mnt_ns_desc);
	if (self_ns == NULL)
		return -1;

	mntinfo = collect_mntinfo(self_ns, false);
	if (mntinfo == NULL)
		return -1;

	futex_set(&self_ns->ns_created, 1);
	return 0;
}

static int collect_mnt_from_image(struct mount_info **pms, struct ns_id *nsid)
{
	MntEntry *me = NULL;
	int ret, root_len = 1;
	struct cr_img *img;
	char root[PATH_MAX] = ".";

	img = open_image(CR_FD_MNTS, O_RSTR, nsid->id);
	if (img < 0)
		return -1;

	if (nsid->id != root_item->ids->mnt_ns_id)
		root_len = print_ns_root(nsid, root, sizeof(root));

	pr_debug("Reading mountpoint images\n");

	while (1) {
		struct mount_info *pm;
		int len;

		ret = pb_read_one_eof(img, &me, PB_MNT);
		if (ret <= 0)
			break;

		pm = mnt_entry_alloc();
		if (!pm)
			goto err;

		pm->nsid = nsid;
		pm->next = *pms;
		*pms = pm;

		pm->mnt_id		= me->mnt_id;
		pm->parent_mnt_id	= me->parent_mnt_id;
		pm->s_dev		= me->root_dev;
		pm->flags		= me->flags;
		pm->shared_id		= me->shared_id;
		pm->master_id		= me->master_id;
		pm->need_plugin		= me->with_plugin;
		pm->is_ns_root		= is_root(me->mountpoint);

		pr_debug("\t\tGetting source for %d\n", pm->mnt_id);
		pm->source = xstrdup(me->source);
		if (!pm->source)
			goto err;

		if (me->has_internal_sharing)
			pm->internal_sharing = me->internal_sharing;

		/* FIXME: abort unsupported early */
		pm->fstype		= decode_fstype(me->fstype, me->fsname);

		if (me->ext_mount) {
			struct ext_mount *em;

			/*
			 * External mount point -- get the reverse mapping
			 * from the command line and put into root's place
			 */

			em = ext_mount_lookup(me->root);
			if (!em) {
				if (!opts.autodetect_ext_mounts) {
					pr_err("No mapping for %s mountpoint\n", me->mountpoint);
					goto err;
				}

				/*
				 * Make up an external mount entry for this
				 * mount point, since we couldn't find a user
				 * supplied one.
				 */
				em = xmalloc(sizeof(struct ext_mount));
				if (!em)
					goto err;

				em->val = pm->source;

				/*
				 * Put a : in here since those are invalid on
				 * the cli, so we know it's autogenerated in
				 * debugging.
				 */
				em->key = AUTODETECTED_MOUNT;
			}

			pm->external = em;
			pm->root = em->val;
			pr_debug("Mountpoint %s will have root from %s\n",
					me->mountpoint, pm->root);

		} else {
			pr_debug("\t\tGetting root for %d\n", pm->mnt_id);
			pm->root = xstrdup(me->root);
			if (!pm->root)
				goto err;
		}

		len  = strlen(me->mountpoint) + root_len + 1;
		pm->mountpoint = xmalloc(len);
		if (!pm->mountpoint)
			goto err;
		pm->ns_mountpoint = pm->mountpoint + root_len;
		/*
		 * For bind-mounts we would also fix the root here
		 * too, but bind-mounts restore merges mountpoint
		 * and root paths together, so there's no need in
		 * that.
		 */

		strcpy(pm->mountpoint, root);
		strcpy(pm->mountpoint + root_len, me->mountpoint);

		pr_debug("\t\tGetting mpt for %d %s\n", pm->mnt_id, pm->mountpoint);

		pr_debug("\t\tGetting opts for %d\n", pm->mnt_id);
		pm->options = xstrdup(me->options);
		if (!pm->options)
			goto err;

		pr_debug("\tRead %d mp @ %s\n", pm->mnt_id, pm->mountpoint);
	}

	if (me)
		mnt_entry__free_unpacked(me, NULL);

	close_image(img);

	return 0;
err:
	close_image(img);
	return -1;
}

/*
 * Read the mount images of all mount namespaces into one list, which
 * is also saved in the global mntinfo. The roots yard is created as
 * soon as a non-root mount namespace is met.
 * Returns the list head or NULL on error.
 */
static struct mount_info *read_mnt_ns_img(void)
{
	struct mount_info *head = NULL;
	struct ns_id *nsid = ns_ids;

	for (; nsid != NULL; nsid = nsid->next) {
		bool nested;

		if (nsid->nd != &mnt_ns_desc)
			continue;

		/*
		 * If we have more than one (root) namespace,
		 * then we'll need the roots yard.
		 */
		nested = nsid->id != root_item->ids->mnt_ns_id;
		if (nested && create_mnt_roots())
			return NULL;

		if (collect_mnt_from_image(&head, nsid))
			return NULL;
	}

	/* Here it doesn't matter where the mount list is saved */
	mntinfo = head;
	return head;
}

/*
 * Get the path, in criu's view, of the root of the mount namespace
 * which the mount @mnt_id belongs to.
 *
 * Returns "/" if mount namespaces are not restored, if @mnt_id is -1
 * or if the mount lives in the namespace of the calling process;
 * the namespace's directory in the roots yard otherwise; NULL if
 * @mnt_id is unknown.
 *
 * The result is kept in a static buffer which is overwritten by every
 * call.
 */
char *rst_get_mnt_root(int mnt_id)
{
	struct mount_info *m;
	static char path[PATH_MAX] = "/";

	if (!(root_ns_mask & CLONE_NEWNS))
		goto rroot;

	if (mnt_id == -1)
		goto rroot;

	m = lookup_mnt_id(mnt_id);
	if (m == NULL)
		return NULL;

	if (m->nsid->pid == getpid())
		goto rroot;

	print_ns_root(m->nsid, path, sizeof(path));
	return path;

rroot:
	/*
	 * The buffer may still hold the yard path printed by an
	 * earlier call, so "/" has to be written explicitly.
	 */
	path[0] = '/';
	path[1] = '\0';
	return path;
}

/*
 * Put the calling task into the mount namespace @nsid.
 *
 * The task whose pid is recorded in @nsid creates the namespace:
 * it unshares, pivots into the namespace's directory in the roots
 * yard and wakes up the waiters. Any other task waits for that and
 * then joins the namespace with setns().
 *
 * Returns 0 on success, -1 on error.
 */
static int do_restore_task_mnt_ns(struct ns_id *nsid)
{
	char path[PATH_MAX];

	if (nsid->pid != getpid()) {
		int fd;

		futex_wait_while_eq(&nsid->ns_created, 0);
		fd = open_proc(nsid->pid, "ns/mnt");
		if (fd < 0)
			return -1;

		if (setns(fd, CLONE_NEWNS)) {
			pr_perror("Unable to change mount namespace");
			/* Don't leak the descriptor on the error path */
			close(fd);
			return -1;
		}

		close(fd);
		return 0;
	}

	if (unshare(CLONE_NEWNS)) {
		pr_perror("Unable to unshare mount namespace");
		return -1;
	}

	print_ns_root(nsid, path, sizeof(path));
	if (cr_pivot_root(path))
		return -1;

	futex_set_and_wake(&nsid->ns_created, 1);

	return 0;
}

/*
 * Move the task @current into its target mount namespace, if it has
 * one recorded and it differs from the root task's namespace.
 * Returns 0 on success, -1 on error.
 */
int restore_task_mnt_ns(struct pstree_item *current)
{
	struct ns_id *nsid;
	unsigned int id;

	if (!current->ids || !current->ids->has_mnt_ns_id)
		return 0;

	id = current->ids->mnt_ns_id;

	/*
	 * Regardless of the namespace a task wants to
	 * live in, by that point they all will live in
	 * root's one (see prepare_pstree_kobj_ids() +
	 * get_clone_mask()). So if the current task's
	 * target namespace is the root's one -- it's
	 * already there, otherwise it will have to do
	 * setns().
	 */
	if (root_item->ids->mnt_ns_id == id)
		return 0;

	nsid = lookup_ns_by_id(id, &mnt_ns_desc);
	if (nsid == NULL) {
		pr_err("Can't find mount namespace %d\n", id);
		return -1;
	}

	return do_restore_task_mnt_ns(nsid) ? -1 : 0;
}

/*
 * All nested mount namespaces are restored as sub-trees of the root
 * namespace.
 *
 * If the roots yard is needed (mnt_roots is set), mount a private
 * tmpfs on it and create one directory per mount namespace there.
 * Returns 0 on success (or if there is nothing to do), -1 on error.
 */
static int prepare_roots_yard(void)
{
	char path[PATH_MAX];
	struct ns_id *nsid;

	if (mnt_roots == NULL)
		return 0;

	if (mount("none", mnt_roots, "tmpfs", 0, NULL)) {
		pr_perror("Unable to mount tmpfs in %s", mnt_roots);
		return -1;
	}
	if (mount("none", mnt_roots, NULL, MS_PRIVATE, NULL)) {
		/* Every other failure here is reported, so is this one */
		pr_perror("Unable to make %s private", mnt_roots);
		return -1;
	}

	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
		if (nsid->nd != &mnt_ns_desc)
			continue;

		print_ns_root(nsid, path, sizeof(path));
		if (mkdir(path, 0600)) {
			pr_perror("Unable to create %s", path);
			return -1;
		}
	}

	return 0;
}

/*
 * Recreate all the mounts from the list @mis: prepare the roots yard,
 * build one tree out of the list, collect the sharing information,
 * validate the result and finally mount everything.
 * Returns 0 on success, -1 on error.
 */
static int populate_mnt_ns(struct mount_info *mis)
{
	struct mount_info *tree;
	struct ns_id *nsid;

	if (prepare_roots_yard())
		return -1;

	tree = mnt_build_tree(mis);
	if (tree == NULL)
		return -1;

	if (collect_shared(mis, false))
		return -1;

	/*
	 * Make trees of all namespaces look the
	 * same, so that manual paths resolution
	 * works on them.
	 */
	nsid = ns_ids;
	while (nsid) {
		if (nsid->nd == &mnt_ns_desc)
			nsid->mnt.mntinfo_tree = tree;
		nsid = nsid->next;
	}

	if (validate_mounts(mis, false))
		return -1;

	return mnt_tree_for_each(tree, do_mount_one);
}

/*
 * Tear the roots yard down: make it recursively private, detach it
 * and remove its directory. Returns 0 if there was no yard or all the
 * steps succeeded, 1 if any of them failed.
 */
int fini_mnt_ns(void)
{
	int failed = 0;

	if (mnt_roots == NULL)
		return 0;

	/*
	 * Don't stop at the first error, because this function can
	 * be used to roll back in an error case, so every step is
	 * attempted.
	 */
	if (mount("none", mnt_roots, "none", MS_REC|MS_PRIVATE, NULL) != 0) {
		pr_perror("Can't remount root with MS_PRIVATE");
		failed = 1;
	}

	/*
	 * Don't worry about MNT_DETACH, because files are restored
	 * after this and nobody will be restored from a wrong mount
	 * namespace.
	 */
	if (umount2(mnt_roots, MNT_DETACH) != 0) {
		pr_perror("Can't unmount %s", mnt_roots);
		failed = 1;
	}

	if (rmdir(mnt_roots) != 0) {
		pr_perror("Can't remove the directory %s", mnt_roots);
		failed = 1;
	}

	return failed;
}

/*
 * Restore the mount namespace(s).
 *
 * If mount namespaces are not restored, only the local mounts are
 * collected. Otherwise the current mounts are collected, the images
 * are read, the place for the new tree is prepared (either by
 * unmounting everything, or by isolating opts.root), the mounts are
 * recreated and, with opts.root, the root is pivoted there.
 *
 * Returns 0 on success, non-zero on error.
 */
int prepare_mnt_ns(void)
{
	int ret = -1;
	struct mount_info *mis, *old;
	struct ns_id ns = { .pid = PROC_SELF, .nd = &mnt_ns_desc };

	if (!(root_ns_mask & CLONE_NEWNS))
		return rst_collect_local_mntns();

	pr_info("Restoring mount namespace\n");

	old = collect_mntinfo(&ns, false);
	if (old == NULL)
		return -1;

	close_proc();

	mis = read_mnt_ns_img();
	if (!mis)
		goto out;

	/*
	 * The new mount namespace is filled with the mountpoint
	 * clones from the original one. We have to umount them
	 * prior to recreating new ones.
	 */
	if (!opts.root) {
		if (chdir("/")) {
			pr_perror("chdir(\"/\") failed");
			goto out;
		}

		if (clean_mnt_ns(ns.mnt.mntinfo_tree))
			goto out;
	} else {
		struct mount_info *mi;

		/* moving a mount residing under a shared mount is invalid. */
		mi = mount_resolve_path(ns.mnt.mntinfo_tree, opts.root);
		if (mi == NULL) {
			pr_err("Unable to find mount point for %s\n", opts.root);
			goto out;
		}
		if (mi->parent == NULL) {
			pr_err("New root and old root are the same\n");
			goto out;
		}

		/* Our root is mounted over the parent (in the same directory) */
		if (!strcmp(mi->parent->mountpoint, mi->mountpoint)) {
			pr_err("The parent of the new root is unreachable\n");
			goto out;
		}

		if (mount("none", mi->parent->mountpoint + 1, "none", MS_SLAVE, NULL)) {
			pr_perror("Can't remount the parent of the new root with MS_SLAVE");
			goto out;
		}

		/* Unprivileged users can't reveal what is under a mount */
		if (root_ns_mask & CLONE_NEWUSER) {
			if (mount(opts.root, opts.root, NULL, MS_BIND | MS_REC, NULL)) {
				/* No "\n" here, as in every other pr_perror() call */
				pr_perror("Can't remount bind-mount %s into itself", opts.root);
				goto out;
			}
		}
		if (chdir(opts.root)) {
			pr_perror("chdir(%s) failed", opts.root ? : "/");
			goto out;
		}
	}

	/* The old list is not needed any longer */
	free_mntinfo(old);
	old = NULL;

	ret = populate_mnt_ns(mis);
	if (ret)
		goto out;

	if (opts.root)
		ret = cr_pivot_root(NULL);
out:
	/* Error paths get here with the old list still allocated */
	free_mntinfo(old);
	return ret;
}

/*
 * Get a descriptor of the root of the mount namespace of task @pid.
 *
 * The descriptor is installed as the ROOT_FD_OFF service fd and is
 * cached: asking for the same @pid again returns it without reopening.
 * If mount namespaces are not dumped, criu's own "/" is used.
 *
 * Returns the service descriptor, or a negative value on error.
 */
int __mntns_get_root_fd(pid_t pid)
{
	static int mntns_root_pid = -1;

	int fd, pfd;
	int ret;
	char path[PATH_MAX + 1];

	if (mntns_root_pid == pid) /* The required root is already opened */
		return get_service_fd(ROOT_FD_OFF);

	close_service_fd(ROOT_FD_OFF);
	/*
	 * The cached descriptor is gone. Forget its owner too, or a
	 * failure below would make the next call for the old pid
	 * return a closed descriptor.
	 */
	mntns_root_pid = -1;

	if (!(root_ns_mask & CLONE_NEWNS)) {
		/*
		 * If criu and tasks we dump live in the same mount
		 * namespace, we can just open the root directory.
		 * All paths resolution would occur relative to criu's
		 * root. Even if it is not namespace's root, provided
		 * file paths are resolved, we'd get consistent dump.
		 */
		fd = open("/", O_RDONLY | O_DIRECTORY);
		if (fd < 0) {
			pr_perror("Can't open root");
			return -1;
		}

		goto set_root;
	}

	/*
	 * If /proc/pid/root links on '/', it signs that a root of the task
	 * and a root of mntns is the same.
	 */

	pfd = open_pid_proc(pid);
	ret = readlinkat(pfd, "root", path, sizeof(path) - 1);
	if (ret < 0) {
		pr_perror("Can't read the root link of %d", pid);
		close_pid_proc();
		return -1;
	}

	path[ret] = '\0';

	if (ret != 1 || path[0] != '/') {
		pr_err("The root task has another root than mntns: %s\n", path);
		close_pid_proc();
		return -1;
	}

	fd = openat(pfd, "root", O_RDONLY | O_DIRECTORY, 0);
	close_pid_proc();
	if (fd < 0) {
		pr_perror("Can't open the task root");
		return -1;
	}

set_root:
	ret = install_service_fd(ROOT_FD_OFF, fd);
	if (ret >= 0)
		mntns_root_pid = pid;
	close(fd);
	return ret;
}

/* Get the root descriptor of the mount namespace @mntns, by its pid */
int mntns_get_root_fd(struct ns_id *mntns)
{
	pid_t pid = mntns->pid;

	return __mntns_get_root_fd(pid);
}

/*
 * Find the mount namespace which the mount @mnt_id belongs to.
 * Returns NULL if there is no such mount.
 */
struct ns_id *lookup_nsid_by_mnt_id(int mnt_id)
{
	struct mount_info *mi;

	/*
	 * Kernels before 3.15 don't show mnt_id for file descriptors,
	 * and mnt_id isn't saved for files if mntns isn't dumped.
	 * In both these cases there is only one root, so it doesn't
	 * matter which mount is taken -- use the head of the list.
	 */
	mi = (mnt_id == -1) ? mntinfo : lookup_mnt_id(mnt_id);

	return mi ? mi->nsid : NULL;
}

/*
 * Get the root descriptor of the mount namespace which the mount
 * @mnt_id belongs to. An unknown @mnt_id is treated as a bug.
 */
int mntns_get_root_by_mnt_id(int mnt_id)
{
	struct ns_id *mntns = lookup_nsid_by_mnt_id(mnt_id);

	BUG_ON(mntns == NULL);

	return mntns_get_root_fd(mntns);
}

/* Argument passed through walk_namespaces() to collect_mntns() */
struct collect_mntns_arg {
	/* out: set by collect_mntns() when dumping a namespace of another process */
	bool need_to_validate;
	/* in: passed down to collect_mntinfo() */
	bool for_dump;
};

/*
 * walk_namespaces() callback: collect the mounts of the namespace @ns.
 *
 * The mounts are appended to the global list unless this is criu's
 * own namespace while mount namespaces are dumped. On dump a foreign
 * namespace requests validation via the argument.
 * Returns 0 on success, -1 on error.
 */
static int collect_mntns(struct ns_id *ns, void *__arg)
{
	struct collect_mntns_arg *arg = __arg;
	struct mount_info *list;
	bool foreign;

	list = collect_mntinfo(ns, arg->for_dump);
	if (list == NULL)
		return -1;

	foreign = ns->pid != getpid();

	if (arg->for_dump && foreign)
		arg->need_to_validate = true;

	if (foreign || !(root_ns_mask & CLONE_NEWNS))
		mntinfo_add_list(list);

	return 0;
}

/*
 * Collect the mounts of all mount namespaces into the global mntinfo
 * list, resolve external mounts and, if any namespace asked for it,
 * collect the sharing information and validate the mounts.
 *
 * Returns 0 on success, non-zero on error.
 */
int collect_mnt_namespaces(bool for_dump)
{
	struct collect_mntns_arg arg;
	int ret;

	arg.for_dump = for_dump;
	arg.need_to_validate = false;

	ret = walk_namespaces(&mnt_ns_desc, opts.autodetect_ext_mounts, collect_mntns, &arg);
	if (ret)
		goto err;

	/*
	 * ret is zero here, so every failure below has to
	 * be turned into an error code explicitly.
	 */
	ret = -1;

	if (resolve_external_mounts(mntinfo))
		goto err;

	if (arg.need_to_validate) {
		if (collect_shared(mntinfo, true))
			goto err;
		if (validate_mounts(mntinfo, true))
			goto err;
	}

	ret = 0;
err:
	return ret;
}

/*
 * Dump every mount namespace except criu's own one. More than one
 * such namespace requires check_mnt_id() to succeed.
 * Returns 0 on success (or if mount namespaces are not dumped),
 * -1 on error.
 */
int dump_mnt_namespaces(void)
{
	struct ns_id *nsid;
	int dumped = 0;

	if (!(root_ns_mask & CLONE_NEWNS))
		return 0;

	for (nsid = ns_ids; nsid != NULL; nsid = nsid->next) {
		/* Skip other kinds of namespaces and criu's own one */
		if (nsid->nd != &mnt_ns_desc || nsid->pid == getpid())
			continue;

		dumped++;
		if (dumped == 2 && check_mnt_id()) {
			pr_err("Nested mount namespaces are not supported "
				"without mnt_id in fdinfo\n");
			return -1;
		}

		if (dump_mnt_ns(nsid, nsid->mnt.mntinfo_list))
			return -1;
	}

	return 0;
}

/* Descriptor of the mount namespace kind: the CLONE_NEWNS flag, named "mnt" */
struct ns_desc mnt_ns_desc = NS_DESC_ENTRY(CLONE_NEWNS, "mnt");