/* criu/files-reg.c */

#include <stdlib.h>
#include <unistd.h>
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/vfs.h>
#include <ctype.h>

/*
 * Stolen from kernel/fs/nfs/unlink.c
 *
 * SILLYNAME_PREF is the prefix a silly-renamed file name starts with,
 * SILLYNAME_SUFF_LEN is the number of hex digits expected right after
 * it: two digits per byte of a u64 plus two per byte of an unsigned int.
 */
#define SILLYNAME_PREF ".nfs"
#define SILLYNAME_SUFF_LEN (((unsigned)sizeof(u64) << 1) + ((unsigned)sizeof(unsigned int) << 1))

#include "cr_options.h"
#include "imgset.h"
#include "file-ids.h"
#include "mount.h"
#include "files.h"
#include "image.h"
#include "list.h"
#include "util.h"
#include "fs-magic.h"
#include "asm/atomic.h"
#include "namespaces.h"
#include "proc_parse.h"
#include "pstree.h"

#include "protobuf.h"
#include "protobuf/regfile.pb-c.h"
#include "protobuf/remap-file-path.pb-c.h"

#include "files-reg.h"
#include "plugin.h"

/* Declared by hand; linkat_hard() uses it to switch the fsuid around linkat(). */
int setfsuid(uid_t fsuid);

/*
 * Ghost files are those not visible from the FS. Dumping them is
 * nasty and the only way we have -- just carry its contents with
 * us. Any brave soul to implement link unlinked file back?
 */
struct ghost_file {
	struct list_head	list;	/* entry in the ghost_files list */
	u32			id;	/* ghost id, matched against remap_id of a remap entry */

	/* Device and inode of the original file, ghosts are looked up by them */
	u32			dev;
	u32			ino;

	struct file_remap	remap;	/* path, mount, owner and users counter of the ghost */
};

/* Next id handed out to a new ghost file by dump_ghost_remap() */
static u32 ghost_file_ids = 1;
/* All ghost files known so far */
static LIST_HEAD(ghost_files);

/*
 * Taken around remap users accounting and ghost linking/unlinking.
 * Allocated and initialized by prepare_shared_reg_files().
 */
static mutex_t *ghost_file_mutex;

/*
 * To rollback link remaps.
 */
struct link_remap_rlb {
	struct list_head	list;	/* entry in the link_remaps list */
	struct ns_id		*mnt_ns;	/* mount namespace whose root fd the path is relative to */
	char			*path;	/* heap-allocated name of the created link */
};
/* Link remaps created so far, walked by __rollback_link_remaps() */
static LIST_HEAD(link_remaps);

/*
 * Create the ghost object described by @gfe under @root at the path
 * stored in @gf->remap.path.
 *
 * FIFOs and devices are created with mknod(), directories with mkdir(),
 * anything else is created by the open() call itself. The object is then
 * chown-ed to the uid/gid from @gfe and, for regular files, filled with
 * the contents that follow the entry in @img.
 *
 * Returns 0 on success, -1 on error.
 */
static int create_ghost(struct ghost_file *gf, GhostFileEntry *gfe, char *root, struct cr_img *img)
{
	char ghost_path[PATH_MAX];
	int open_flags;
	int fd;
	int exit_code = -1;

	snprintf(ghost_path, sizeof(ghost_path), "%s/%s", root, gf->remap.path);

	if (S_ISFIFO(gfe->mode)) {
		if (mknod(ghost_path, gfe->mode, 0)) {
			pr_perror("Can't create node for ghost file");
			return -1;
		}

		/* To not block */
		open_flags = O_RDWR;
	} else if (S_ISCHR(gfe->mode) || S_ISBLK(gfe->mode)) {
		if (!gfe->has_rdev) {
			pr_err("No rdev for ghost device\n");
			return -1;
		}

		if (mknod(ghost_path, gfe->mode, gfe->rdev)) {
			pr_perror("Can't create node for ghost dev");
			return -1;
		}

		open_flags = O_WRONLY;
	} else if (S_ISDIR(gfe->mode)) {
		if (mkdir(ghost_path, gfe->mode)) {
			pr_perror("Can't make ghost dir");
			return -1;
		}

		open_flags = O_DIRECTORY;
	} else {
		/* Plain file: let open() create it, it must not exist yet */
		open_flags = O_WRONLY | O_CREAT | O_EXCL;
	}

	fd = open(ghost_path, open_flags, gfe->mode);
	if (fd < 0) {
		pr_perror("Can't open ghost file %s", ghost_path);
		return -1;
	}

	if (fchown(fd, gfe->uid, gfe->gid) < 0) {
		pr_perror("Can't reset user/group on ghost %s", ghost_path);
	} else if (!S_ISREG(gfe->mode)) {
		/* Nothing to copy for non-regular ghosts */
		exit_code = 0;
	} else if (copy_file(img_raw_fd(img), fd, 0) >= 0) {
		exit_code = 0;
	}

	close(fd);
	return exit_code;
}

/*
 * Attach the ghost file with id @rfe->remap_id to @rfi.
 *
 * If the ghost is already on the ghost_files list it is just reused.
 * Otherwise its entry is read from the ghost image, the file is created
 * on disk by create_ghost() and a new ghost_file is put on the list.
 *
 * Returns 0 on success, -1 on error.
 */
static int open_remap_ghost(struct reg_file_info *rfi,
		RemapFilePathEntry *rfe)
{
	struct ghost_file *gf;
	GhostFileEntry *gfe = NULL;
	struct cr_img *img;
	char *root;

	list_for_each_entry(gf, &ghost_files, list)
		if (gf->id == rfe->remap_id)
			goto gf_found;

	/*
	 * Ghost not found. We will create one in the same dir
	 * as the very first client of it thus resolving any
	 * issues with cross-device links.
	 */

	pr_info("Opening ghost file %#x for %s\n", rfe->remap_id, rfi->path);

	root = rst_get_mnt_root(rfi->rfe->mnt_id);
	if (root == NULL) {
		pr_err("The %d mount is not found\n", rfi->rfe->mnt_id);
		return -1;
	}

	gf = shmalloc(sizeof(*gf));
	if (!gf)
		return -1;
	gf->remap.path = xmalloc(PATH_MAX);
	gf->remap.mnt_id = rfi->rfe->mnt_id;
	if (!gf->remap.path)
		goto err;

	img = open_image(CR_FD_GHOST_FILE, O_RSTR, rfe->remap_id);
	if (!img)
		goto err;

	if (pb_read_one(img, &gfe, PB_GHOST_FILE) < 0)
		goto close_ifd;

	/*
	 * For old formats where optional has_[dev|ino] is
	 * not present we will have zeros here which is quite
	 * a sign for "absent" fields.
	 */
	gf->dev = gfe->dev;
	gf->ino = gfe->ino;

	/*
	 * Directories are created under the original name, other ghosts get
	 * a unique suffix. snprintf() always NUL-terminates the result,
	 * which strncpy() does not when the source fills the buffer.
	 */
	if (S_ISDIR(gfe->mode))
		snprintf(gf->remap.path, PATH_MAX, "%s", rfi->path);
	else
		snprintf(gf->remap.path, PATH_MAX, "%s.cr.%x.ghost", rfi->path, rfe->remap_id);

	if (create_ghost(gf, gfe, root, img))
		goto close_ifd;

	/* Pick everything we need from gfe before it is freed below */
	gf->id = rfe->remap_id;
	gf->remap.users = 0;
	gf->remap.is_dir = S_ISDIR(gfe->mode);
	gf->remap.owner = gfe->uid;

	ghost_file_entry__free_unpacked(gfe, NULL);
	close_image(img);

	list_add_tail(&gf->list, &ghost_files);
gf_found:
	rfi->remap = &gf->remap;
	return 0;

close_ifd:
	close_image(img);
err:
	if (gfe)
		ghost_file_entry__free_unpacked(gfe, NULL);
	xfree(gf->remap.path);
	shfree_last(gf);
	return -1;
}

/*
 * Set up a link remap for @rfi: the remap target is the path of the
 * regular file whose id is @rfe->remap_id.
 *
 * When a user namespace is in use, the owner of the target is read with
 * fstatat() so that it can be passed to linkat_hard() later; otherwise
 * the owner stays -1.
 *
 * Returns 0 on success, -1 on error.
 */
static int open_remap_linked(struct reg_file_info *rfi,
		RemapFilePathEntry *rfe)
{
	struct file_remap *rm;
	struct file_desc *rdesc;
	struct reg_file_info *rrfi;
	uid_t owner = -1;

	rdesc = find_file_desc_raw(FD_TYPES__REG, rfe->remap_id);
	if (!rdesc) {
		pr_err("Can't find target file %x\n", rfe->remap_id);
		return -1;
	}

	rm = xmalloc(sizeof(*rm));
	if (!rm)
		return -1;

	rrfi = container_of(rdesc, struct reg_file_info, d);
	pr_info("Remapped %s -> %s\n", rfi->path, rrfi->path);

	if (root_ns_mask & CLONE_NEWUSER) {
		int rfd;
		struct stat st;

		rfd = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
		if (fstatat(rfd, rrfi->path, &st, AT_SYMLINK_NOFOLLOW)) {
			pr_perror("Can't get owner of link remap %s", rrfi->path);
			/* rm is not published anywhere yet, don't leak it */
			xfree(rm);
			return -1;
		}

		owner = st.st_uid;
	}

	rm->path = rrfi->path;
	rm->users = 0;
	rm->is_dir = false;
	rm->owner = owner;
	rm->mnt_id = rfi->rfe->mnt_id;
	rfi->remap = rm;
	return 0;
}

/*
 * Make sure the process tree has an item with virtual pid
 * @rfe->remap_id. If none exists, a helper item is allocated and
 * attached as a child of root_item, inheriting its sid and pgid.
 *
 * Returns 0 on success, -1 if the helper can't be allocated.
 */
static int open_remap_dead_process(struct reg_file_info *rfi,
		RemapFilePathEntry *rfe)
{
	struct pstree_item *item;
	struct pstree_item *new_helper;

	/* don't need to add multiple tasks */
	for_each_pstree_item(item) {
		if (item->pid.virt == rfe->remap_id) {
			pr_info("Skipping helper for restoring /proc/%d; pid exists\n", rfe->remap_id);
			return 0;
		}
	}

	new_helper = alloc_pstree_helper();
	if (new_helper == NULL)
		return -1;

	new_helper->pid.virt = rfe->remap_id;
	new_helper->pgid = root_item->pgid;
	new_helper->sid = root_item->sid;
	new_helper->parent = root_item;
	list_add_tail(&new_helper->sibling, &root_item->children);

	pr_info("Added a helper for restoring /proc/%d\n", new_helper->pid.virt);

	return 0;
}

/*
 * Collect callback for remap entries.
 *
 * Finds the regular file the entry refers to by orig_id, works out the
 * remap type (entries without an explicit type encode "ghost" in the
 * REMAP_GHOST bit of remap_id) and hands over to the matching
 * open_remap_*() routine.
 *
 * Returns what the open_remap_*() routine returned, or -1 if the file
 * is unknown or the type is not recognized.
 */
static int collect_one_remap(void *obj, ProtobufCMessage *msg)
{
	RemapFilePathEntry *entry;
	struct file_desc *desc;
	struct reg_file_info *info;

	entry = pb_msg(msg, RemapFilePathEntry);

	desc = find_file_desc_raw(FD_TYPES__REG, entry->orig_id);
	if (desc == NULL) {
		pr_err("Remap for non existing file %#x\n",
				entry->orig_id);
		return -1;
	}

	info = container_of(desc, struct reg_file_info, d);
	pr_info("Configuring remap %#x -> %#x\n", info->rfe->id, entry->remap_id);

	if (!entry->has_remap_type) {
		/* backward compatibility with images */
		entry->has_remap_type = true;
		if (entry->remap_id & REMAP_GHOST) {
			entry->remap_id &= ~REMAP_GHOST;
			entry->remap_type = REMAP_TYPE__GHOST;
		} else {
			entry->remap_type = REMAP_TYPE__LINKED;
		}
	}

	if (entry->remap_type == REMAP_TYPE__LINKED)
		return open_remap_linked(info, entry);
	if (entry->remap_type == REMAP_TYPE__GHOST)
		return open_remap_ghost(info, entry);
	if (entry->remap_type == REMAP_TYPE__PROCFS)
		return open_remap_dead_process(info, entry);

	pr_err("unknown remap type %u\n", entry->remap_type);
	return -1;
}

/* Collect descriptor for remap entries: each one is fed to collect_one_remap() */
struct collect_image_info remap_cinfo = {
	.fd_type = CR_FD_REMAP_FPATH,
	.pb_type = PB_REMAP_FPATH,
	.collect = collect_one_remap,
};

/*
 * Write the ghost image for the file opened as @_fd.
 *
 * The image gets a GhostFileEntry built from @st (owner, mode, inode),
 * @phys_dev and, for devices, the rdev. For regular files the entry is
 * followed by the file contents.
 *
 * Returns 0 on success, -1 on error. The image is closed on every path.
 */
static int dump_ghost_file(int _fd, u32 id, const struct stat *st, dev_t phys_dev)
{
	struct cr_img *img;
	GhostFileEntry gfe = GHOST_FILE_ENTRY__INIT;
	int exit_code = -1;

	pr_info("Dumping ghost file contents (id %#x)\n", id);

	img = open_image(CR_FD_GHOST_FILE, O_DUMP, id);
	if (!img)
		return -1;

	gfe.uid = userns_uid(st->st_uid);
	gfe.gid = userns_gid(st->st_gid);
	gfe.mode = st->st_mode;

	gfe.has_dev = gfe.has_ino = true;
	gfe.dev = phys_dev;
	gfe.ino = st->st_ino;

	if (S_ISCHR(st->st_mode) || S_ISBLK(st->st_mode)) {
		gfe.has_rdev = true;
		gfe.rdev = st->st_rdev;
	}

	if (pb_write_one(img, &gfe, PB_GHOST_FILE))
		goto out;

	if (S_ISREG(st->st_mode)) {
		int fd, ret;
		char lpath[PSFDS];

		/*
		 * Reopen file locally since it may have no read
		 * permissions when drained
		 */
		snprintf(lpath, sizeof(lpath), "/proc/self/fd/%d", _fd);
		fd = open(lpath, O_RDONLY);
		if (fd < 0) {
			pr_perror("Can't open ghost original file");
			goto out;
		}
		ret = copy_file(fd, img_raw_fd(img), st->st_size);
		close(fd);
		if (ret)
			goto out;
	}

	exit_code = 0;
out:
	/* Single exit so the image is released on failures as well */
	close_image(img);
	return exit_code;
}

void remap_put(struct file_remap *remap)
{
	mutex_lock(ghost_file_mutex);
	if (--remap->users == 0) {
		int mntns_root;

		pr_info("Unlink the ghost %s\n", remap->path);

		mntns_root = mntns_get_root_by_mnt_id(remap->mnt_id);
		unlinkat(mntns_root, remap->path, 0);
	}
	mutex_unlock(ghost_file_mutex);
}

/*
 * Find the ghost file with the given device and inode numbers.
 *
 * On a match the users counter of its remap is bumped and the remap is
 * returned, otherwise NULL. The list walk and the counter update are
 * done under ghost_file_mutex.
 */
struct file_remap *lookup_ghost_remap(u32 dev, u32 ino)
{
	struct ghost_file *cur;
	struct file_remap *found = NULL;

	mutex_lock(ghost_file_mutex);
	list_for_each_entry(cur, &ghost_files, list) {
		if (cur->ino != ino || cur->dev != dev)
			continue;

		cur->remap.users++;
		found = &cur->remap;
		break;
	}
	mutex_unlock(ghost_file_mutex);

	return found;
}

/*
 * Dump a ghost remap for the file opened as @lfd with regfile id @id.
 *
 * Files larger than opts.ghost_limit are refused. A file is identified
 * by its physical device and inode; the ghost image is written only the
 * first time such a pair is seen, after that just one more remap entry
 * pointing to the same ghost id is emitted.
 *
 * Returns 0 on success, non-zero on error.
 */
static int dump_ghost_remap(char *path, const struct stat *st,
				int lfd, u32 id, struct ns_id *nsid)
{
	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
	struct ghost_file *gf;
	dev_t phys_dev;
	bool known = false;

	pr_info("Dumping ghost file for fd %d id %#x\n", lfd, id);

	if (st->st_size > opts.ghost_limit) {
		pr_err("Can't dump ghost file %s of %"PRIu64" size, increase limit\n",
				path, st->st_size);
		return -1;
	}

	phys_dev = phys_stat_resolve_dev(nsid, st->st_dev, path);

	list_for_each_entry(gf, &ghost_files, list) {
		if ((gf->dev == phys_dev) && (gf->ino == st->st_ino)) {
			known = true;
			break;
		}
	}

	if (!known) {
		/* First time we meet this file: register it and save the contents */
		gf = xmalloc(sizeof(*gf));
		if (gf == NULL)
			return -1;

		gf->dev = phys_dev;
		gf->ino = st->st_ino;
		gf->id = ghost_file_ids++;
		list_add_tail(&gf->list, &ghost_files);

		if (dump_ghost_file(lfd, gf->id, st, phys_dev))
			return -1;
	}

	rpe.orig_id = id;
	rpe.remap_id = gf->id;
	rpe.has_remap_type = true;
	rpe.remap_type = REMAP_TYPE__GHOST;

	return pb_write_one(img_from_set(glob_imgset, CR_FD_REMAP_FPATH),
			&rpe, PB_REMAP_FPATH);
}

/*
 * Walk the link_remaps list and free every entry; with @do_unlink set
 * the link itself is removed from the filesystem first.
 *
 * Does nothing when link remaps are not enabled. The walk stops at the
 * first entry whose mount namespace root fd can't be obtained.
 */
static void __rollback_link_remaps(bool do_unlink)
{
	struct link_remap_rlb *cur, *next;

	if (!opts.link_remap_ok)
		return;

	list_for_each_entry_safe(cur, next, &link_remaps, list) {
		int root_fd = mntns_get_root_fd(cur->mnt_ns);

		if (root_fd < 0)
			return;

		list_del(&cur->list);
		if (do_unlink)
			unlinkat(root_fd, cur->path, 0);

		xfree(cur->path);
		xfree(cur);
	}
}

/* Forget all registered link remaps and remove the links from disk. */
void delete_link_remaps(void)
{
	__rollback_link_remaps(true);
}
/* Forget all registered link remaps, leaving the links on disk. */
void free_link_remaps(void)
{
	__rollback_link_remaps(false);
}

/*
 * Create a hard link to the file opened as @lfd next to its original
 * location and dump a regfile entry for this link.
 *
 * @path: path of the file, @len bytes long (not required to be
 *        NUL-terminated at @len)
 * @idp:  receives the id generated for the link's regfile entry
 * @nsid: mount namespace the link is created in
 *
 * The created link is remembered on the link_remaps list so that it can
 * be removed later.
 *
 * Returns 0 on success, non-zero on error.
 */
static int create_link_remap(char *path, int len, int lfd,
				u32 *idp, struct ns_id *nsid)
{
	char link_name[PATH_MAX], *tmp;
	RegFileEntry rfe = REG_FILE_ENTRY__INIT;
	FownEntry fwn = FOWN_ENTRY__INIT;
	struct link_remap_rlb *rlb;
	int mntns_root, n;
	size_t room;

	if (!opts.link_remap_ok) {
		pr_err("Can't create link remap for %s. "
				"Use " LREMAP_PARAM " option.\n", path);
		return -1;
	}

	/* The leading '.', the path itself and a NUL have to fit */
	if (len < 0 || (size_t)len + 2 > sizeof(link_name)) {
		pr_err("Path %s is too long for link remap\n", path);
		return -1;
	}

	/*
	 * Linked remapping -- we create a hard link on a removed file
	 * in the directory original file used to sit.
	 *
	 * Bad news is that we can't easily open lfd's parent dir. Thus
	 * we have to just generate an absolute path and use it. The linkat
	 * will fail if we chose the bad one.
	 */

	link_name[0] = '.';
	memcpy(link_name + 1, path, len);
	/* Step back to the last '/' -- the new name goes right after it */
	tmp = link_name + len;
	while (*tmp != '/') {
		BUG_ON(tmp == link_name);
		tmp--;
	}

	fd_id_generate_special(NULL, idp);
	rfe.id		= *idp;
	rfe.flags	= 0;
	rfe.pos		= 0;
	rfe.fown	= &fwn;
	rfe.name	= link_name + 1;

	/*
	 * Any 'unique' name works here actually. Remap works by reg-file ids.
	 *
	 * The name is written at tmp + 1, so that is where the remaining
	 * room has to be counted from.
	 */
	room = sizeof(link_name) - (size_t)(tmp + 1 - link_name);
	n = snprintf(tmp + 1, room, "link_remap.%d", rfe.id);
	if (n < 0 || (size_t)n >= room) {
		pr_err("Link remap name for %s is too long\n", path);
		return -1;
	}

	mntns_root = mntns_get_root_fd(nsid);
	if (mntns_root < 0)
		return -1;

	if (linkat(lfd, "", mntns_root, link_name, AT_EMPTY_PATH) < 0) {
		pr_perror("Can't link remap to %s", path);
		return -1;
	}

	/*
	 * Remember the name to delete it if needed on error or
	 * rollback action. Note we don't expect that there will
	 * be a HUGE number of link remaps, so in a sake of speed
	 * we keep all data in memory.
	 */
	rlb = xmalloc(sizeof(*rlb));
	if (!rlb)
		goto err1;

	rlb->path = strdup(link_name);
	if (!rlb->path)
		goto err2;

	rlb->mnt_ns = nsid;
	list_add(&rlb->list, &link_remaps);

	return pb_write_one(img_from_set(glob_imgset, CR_FD_REG_FILES), &rfe, PB_REG_FILE);

err2:
	xfree(rlb);
err1:
	pr_perror("Can't register rollback for %s", path);
	return -1;
}

/*
 * Create a link remap for the file opened as @lfd and write the remap
 * entry binding regfile @id to the id of the created link. The entry
 * is written without an explicit remap type.
 *
 * Returns 0 on success, non-zero on error.
 */
static int dump_linked_remap(char *path, int len, const struct stat *ost,
				int lfd, u32 id, struct ns_id *nsid)
{
	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
	struct cr_img *remap_img;
	u32 link_id;

	if (create_link_remap(path, len, lfd, &link_id, nsid) != 0)
		return -1;

	rpe.remap_id = link_id;
	rpe.orig_id = id;

	remap_img = img_from_set(glob_imgset, CR_FD_REMAP_FPATH);
	return pb_write_one(remap_img, &rpe, PB_REMAP_FPATH);
}

/*
 * Remember dead pids met so far in a growing static array.
 *
 * Returns 1 if @pid has been recorded before, 0 if it has just been
 * added and -1 if the array can't be grown.
 */
static int have_seen_dead_pid(pid_t pid)
{
	static pid_t *dead_pids = NULL;
	static int n_dead_pids = 0;
	size_t idx = 0;

	while (idx < n_dead_pids) {
		if (dead_pids[idx] == pid)
			return 1;
		idx++;
	}

	/* Not seen yet: make room for one more pid and store it */
	if (xrealloc_safe(&dead_pids, sizeof(*dead_pids) * (n_dead_pids + 1)))
		return -1;

	dead_pids[n_dead_pids] = pid;
	n_dead_pids++;

	return 0;
}

/*
 * Write a PROCFS remap entry binding regfile @id to the dead @pid.
 * Only the first file met for a given pid produces an entry; for later
 * ones the function logs that and returns 0.
 *
 * Returns 0 on success, non-zero on error.
 */
static int dump_dead_process_remap(pid_t pid, char *path, int len, const struct stat *ost,
				int lfd, u32 id, struct ns_id *nsid)
{
	RemapFilePathEntry rpe = REMAP_FILE_PATH_ENTRY__INIT;
	struct cr_img *remap_img;
	int seen;

	seen = have_seen_dead_pid(pid);
	if (seen < 0)
		return -1;

	if (seen != 0) {
		pr_info("Found dead pid %d already, skipping remap\n", pid);
		return 0;
	}

	rpe.has_remap_type = true;
	rpe.remap_type = REMAP_TYPE__PROCFS;
	rpe.remap_id = pid;
	rpe.orig_id = id;

	remap_img = img_from_set(glob_imgset, CR_FD_REMAP_FPATH);
	return pb_write_one(remap_img, &rpe, PB_REMAP_FPATH);
}

/*
 * Check whether the last component of @name looks like an NFS
 * silly-rename name: SILLYNAME_PREF followed by SILLYNAME_SUFF_LEN
 * hex digits. @name must contain at least one '/'.
 */
static bool is_sillyrename_name(char *name)
{
	int i;

	name = strrchr(name, '/');
	BUG_ON(name == NULL); /* see check in dump_one_reg_file */
	name++;

	/*
	 * Strictly speaking this check is not bullet-proof. User
	 * can create file with this name by hands and we have no
	 * API to distinguish really-silly-renamed files from those
	 * fake names :(
	 *
	 * But since NFS people expect .nfsXXX files to be unstable,
	 * we treat them as such too.
	 */

	if (strncmp(name, SILLYNAME_PREF, sizeof(SILLYNAME_PREF) - 1))
		return false;

	name += sizeof(SILLYNAME_PREF) - 1;
	/*
	 * A too short name stops the loop at its terminating NUL, which is
	 * not a hex digit. The cast is needed because passing a negative
	 * char to isxdigit() is undefined.
	 */
	for (i = 0; i < SILLYNAME_SUFF_LEN; i++)
		if (!isxdigit((unsigned char)name[i]))
			return false;

	return true;
}

/* True when the file sits on NFS and @rpath has a silly-rename name. */
static inline bool nfs_silly_rename(char *rpath, const struct fd_parms *parms)
{
	if (parms->fs_type != NFS_SUPER_MAGIC)
		return false;

	return is_sillyrename_name(rpath);
}

/*
 * Cut a trailing " (deleted)" or "//deleted" tag off @link->name.
 *
 * The tag is only removed when the name is strictly longer than the
 * tag. On removal link->len is decreased accordingly.
 *
 * Returns 1 if a tag was stripped, 0 otherwise.
 */
int strip_deleted(struct fd_link *link)
{
	static const struct dcache_prepends {
		const char	*str;
		size_t		len;
	} prepends[] = {
		{
			.str	= " (deleted)",
			.len	= sizeof(" (deleted)") - 1,
		}, {
			.str	= "//deleted",
			.len	= sizeof("//deleted") - 1,
		}
	};
	size_t i;

	for (i = 0; i < ARRAY_SIZE(prepends); i++) {
		size_t at;

		if (link->len <= prepends[i].len)
			continue;

		/* Compare the tail of the name with the tag */
		at = link->len - prepends[i].len;
		if (!strcmp(&link->name[at], prepends[i].str)) {
			pr_debug("Strip '%s' tag from '%s'\n",
				 prepends[i].str, link->name);
			link->name[at] = '\0';
			link->len -= prepends[i].len;
			return 1;
		}
	}
	return 0;
}

/*
 * Decide whether the file behind @link needs a remap and dump one if so.
 *
 * Cases handled, in order:
 *  - procfs: a path under a pid directory that is no longer accessible
 *    gets a dead-process remap;
 *  - devpts: the "(deleted)" tag is stripped, no remap is made;
 *  - no links left (st_nlink == 0): ghost remap;
 *  - NFS silly-renamed file: linked remap;
 *  - path not found by name (ENOENT): linked remap;
 *  - path resolves to another inode/device: error, unless
 *    opts.evasive_devices is set and it is the same device node.
 *
 * Returns 0 when the file can be dumped by its name or the remap has
 * been written, non-zero on error.
 */
static int check_path_remap(struct fd_link *link, const struct fd_parms *parms,
				int lfd, u32 id, struct ns_id *nsid)
{
	char *rpath = link->name;
	int plen = link->len;
	int ret, mntns_root;
	struct stat pst;
	const struct stat *ost = &parms->stat;

	if (parms->fs_type == PROC_SUPER_MAGIC) {
		/* The file points to /proc/pid/<foo> where pid is a dead
		 * process. We remap this file by adding this pid to be
		 * fork()ed into a TASK_HELPER state so that we can point to it
		 * on restore.
		 */
		pid_t pid;
		char *start, *end;

		/*
		 * skip "./proc/"
		 *
		 * The search result has to be checked before stepping over
		 * the slash: NULL + 1 would never compare equal to NULL.
		 */
		start = strchr(rpath, '/');
		if (!start)
			return -1;
		start = strchr(start + 1, '/');
		if (!start)
			return -1;
		start++;
		pid = strtol(start, &end, 10);

		/* if we didn't find another /, this path something
		 * like ./proc/kmsg, which we shouldn't mess with. */
		if (*end == '/') {
			/* Temporarily cut the path right after the pid component */
			*end = 0;
			ret = access(rpath, F_OK);
			*end = '/';

			if (ret) {
				pr_info("Dumping dead process remap of %d\n", pid);
				return dump_dead_process_remap(pid, rpath + 1, plen - 1, ost, lfd, id, nsid);
			}
		}

		return 0;
	} else if (parms->fs_type == DEVPTS_SUPER_MAGIC) {
		/*
		 * It's safe to call stripping here because
		 * file paths are having predefined format for
		 * this FS and can't have a valid " (deleted)"
		 * postfix as a part of not deleted filename.
		 */
		strip_deleted(link);
		/*
		 * Devpts devices/files are generated by the
		 * kernel itself so we should not try to generate
		 * any kind of ghost files here even if file is
		 * no longer exist.
		 */
		return 0;
	}

	if (ost->st_nlink == 0) {
		/*
		 * Unpleasant, but easy case. File is completely invisible
		 * from the FS. Just dump its contents and that's it. But
		 * be careful whether anybody still has any of its hardlinks
		 * also open.
		 */
		strip_deleted(link);
		return dump_ghost_remap(rpath + 1, ost, lfd, id, nsid);
	}

	if (nfs_silly_rename(rpath, parms)) {
		/*
		 * If this is NFS silly-rename file the path we have at hands
		 * will be accessible by fstat(), but once we kill the dumping
		 * tasks it will disappear. So we just go ahead an dump it as
		 * linked-remap file (NFS will allow us to create more hard
		 * links on it) to have some persistent name at hands.
		 */
		pr_debug("Dump silly-rename linked remap for %x\n", id);
		return dump_linked_remap(rpath + 1, plen - 1, ost, lfd, id, nsid);
	}

	mntns_root = mntns_get_root_fd(nsid);
	if (mntns_root < 0)
		return -1;

	ret = fstatat(mntns_root, rpath, &pst, 0);
	if (ret < 0) {
		/*
		 * Linked file, but path is not accessible (unless any
		 * other error occurred). We can create a temporary link to it
		 * using linkat with AT_EMPTY_PATH flag and remap it to this
		 * name.
		 */

		if (errno == ENOENT)
			return dump_linked_remap(rpath + 1, plen - 1,
							ost, lfd, id, nsid);

		pr_perror("Can't stat path");
		return -1;
	}

	if ((pst.st_ino != ost->st_ino) || (pst.st_dev != ost->st_dev)) {
		if (opts.evasive_devices &&
		    (S_ISCHR(ost->st_mode) || S_ISBLK(ost->st_mode)) &&
		    pst.st_rdev == ost->st_rdev)
			return 0;
		/*
		 * FIXME linked file, but the name we see it by is reused
		 * by somebody else. We can dump it with linked remaps, but
		 * we'll have difficulties on restore -- we will have to
		 * move the existing file aside, then restore this one,
		 * unlink, then move the original file back. It's fairly
		 * easy to do, but we don't do it now, since unlinked files
		 * have the "(deleted)" suffix in proc and name conflict
		 * is unlikely :)
		 */
		pr_err("Unaccessible path opened %u:%u, need %u:%u\n",
				(unsigned int)pst.st_dev, (unsigned int)pst.st_ino,
				(unsigned int)ost->st_dev, (unsigned int)ost->st_ino);
		return -1;
	}

	/*
	 * File is linked and visible by the name it is opened by
	 * this task. Go ahead and dump it.
	 */
	return 0;
}

/*
 * Tell whether the size of a file opened with @flags should be recorded.
 * The size is skipped only for files opened write-only with O_APPEND
 * (e.g. log file).
 */
static bool should_check_size(int flags)
{
	bool write_only = (flags & O_ACCMODE) == O_WRONLY;
	bool appending = (flags & O_APPEND) != 0;

	return !(write_only && appending);
}

/*
 * Dump one regular file opened as @lfd under regfile id @id.
 *
 * The link of the file is taken from @p or read with fill_fdlink().
 * Only paths starting with a slash are supported. Remaps are handled
 * by check_path_remap() before the regfile entry is written; the size
 * is recorded for regular files unless should_check_size() says no.
 *
 * Returns 0 on success, non-zero on error.
 */
int dump_one_reg_file(int lfd, u32 id, const struct fd_parms *p)
{
	RegFileEntry rfe = REG_FILE_ENTRY__INIT;
	struct fd_link local_link;
	struct fd_link *link = p->link;
	struct ns_id *nsid;
	char *name;

	if (!link) {
		if (fill_fdlink(lfd, p, &local_link))
			return -1;
		link = &local_link;
	}

	/* The link name starts with a dot which is never dumped */
	name = &link->name[1];

	nsid = lookup_nsid_by_mnt_id(p->mnt_id);
	if (nsid == NULL) {
		pr_err("Can't lookup mount=%d for fd=%d path=%s\n",
			p->mnt_id, p->fd, name);
		return -1;
	}

	if (p->mnt_id >= 0 && (root_ns_mask & CLONE_NEWNS)) {
		rfe.has_mnt_id = true;
		rfe.mnt_id = p->mnt_id;
	}

	pr_info("Dumping path for %d fd via self %d [%s]\n",
			p->fd, lfd, name);

	/*
	 * The regular path we can handle should start with slash.
	 */
	if (name[0] != '/') {
		pr_err("The path [%s] is not supported\n", name);
		return -1;
	}

	if (check_path_remap(link, p, lfd, id, nsid))
		return -1;

	rfe.id		= id;
	rfe.name	= name;
	rfe.flags	= p->flags;
	rfe.pos		= p->pos;
	rfe.fown	= (FownEntry *)&p->fown;

	if (S_ISREG(p->stat.st_mode) && should_check_size(rfe.flags)) {
		rfe.size = p->stat.st_size;
		rfe.has_size = true;
	}

	return pb_write_one(img_from_set(glob_imgset, CR_FD_REG_FILES), &rfe, PB_REG_FILE);
}

/* Dump operations for regular files: entries are written by dump_one_reg_file() */
const struct fdtype_ops regfile_dump_ops = {
	.type		= FD_TYPES__REG,
	.dump		= dump_one_reg_file,
};

/*
 * Rewrite @src, a path seen through mount @smi, as a path seen through
 * mount @dmi, and put the result into @dst (at most @dlen bytes).
 *
 * The root of @smi must not be shorter than the root of @dmi.
 */
static void convert_path_from_another_mp(char *src, char *dst, int dlen,
					struct mount_info *smi,
					struct mount_info *dmi)
{
	const char *root_diff;
	const char *tail;

	/*
	 * mi->mountpoint	./foo/bar
	 * mi->ns_mountpoint	/foo/bar
	 * rfi->path		foo/bar/baz
	 */
	tail = src + (int)strlen(smi->ns_mountpoint + 1);
	BUG_ON(strlen(smi->root) < strlen(dmi->root));
	root_diff = smi->root + strlen(dmi->root);

	/*
	 * Create paths relative to this mount.
	 * Absolute path to the mount point + difference between source
	 * and destination roots + path relative to the mountpoint.
	 */
	snprintf(dst, dlen, "%s/%s/%s",
				dmi->ns_mountpoint + 1,
				root_diff,
				tail);
}

/*
 * linkat() wrapper that, when a user namespace is in use, temporarily
 * switches the fsuid to @owner around the call.
 *
 * Returns what linkat() returned. On failure errno still holds the
 * linkat() error on return, so callers may test it (open_path() checks
 * for EEXIST).
 */
static int linkat_hard(int odir, char *opath, int ndir, char *npath, uid_t owner)
{
	int ret, old_fsuid = -1;
	int saved_errno = 0;

	if (root_ns_mask & CLONE_NEWUSER)
		/*
		 * Kernel has strange security restrictions about
		 * linkat. If the fsuid of the caller doesn't equals
		 * the uid of the file and the file is not "safe"
		 * one, then only global CAP_CHOWN will be allowed
		 * to link().
		 *
		 * Next, when we're in user namespace we're ns root,
		 * but not global CAP_CHOWN. Thus, even though we
		 * ARE ns root, we will not be allowed to link() at
		 * files that belong to regular users %)
		 *
		 * Fortunately, the setfsuid() requires ns-level
		 * CAP_SETUID which we have.
		 */

		old_fsuid = setfsuid(owner);

	ret = linkat(odir, opath, ndir, npath, 0);
	if (ret < 0) {
		/* Logging and setfsuid() below may overwrite errno */
		saved_errno = errno;
		pr_perror("Can't link %s -> %s", opath, npath);
	}

	if (root_ns_mask & CLONE_NEWUSER) {
		setfsuid(old_fsuid);
		if (setfsuid(-1) != old_fsuid) {
			pr_warn("Failed to restore old fsuid!\n");
			/*
			 * Don't fail here. We still have chances to run till
			 * the pie/restorer, and if _this_ guy fails to set
			 * the proper fsuid, then we'll abort the restore.
			 */
		}
	}

	if (ret < 0)
		errno = saved_errno;

	return ret;
}

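/*
 * open_path() below properly resolves d's path handling ghost/link-remaps.
 * The open_cb is a routine that does actual open, it differs for
 * files, directories, fifos, etc.
 *
 * rfi_remap() is its helper: it hard-links the remap target under the
 * name the file is about to be opened by.
 */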

/*
 * Hard-link the remap target of @rfi under the name @rfi->path.
 *
 * With no mount id known, or when the file and the remap sit on the
 * same mount, the stored paths are used as is. Otherwise both paths are
 * recalculated relative to the mount found by following the ->bind
 * chain from the file's mount.
 *
 * Returns what linkat_hard() returned.
 */
static int rfi_remap(struct reg_file_info *rfi)
{
	char link_buf[PATH_MAX], target_buf[PATH_MAX];
	char *link_path, *target_path;
	struct mount_info *file_mi, *remap_mi, *base_mi;
	int root_fd;

	if (rfi->rfe->mnt_id == -1) {
		/* Know nothing about mountpoints */
		root_fd = mntns_get_root_by_mnt_id(-1);
		return linkat_hard(root_fd, rfi->remap->path, root_fd,
				rfi->path, rfi->remap->owner);
	}

	file_mi = lookup_mnt_id(rfi->rfe->mnt_id);
	if (rfi->rfe->mnt_id == rfi->remap->mnt_id) {
		/* Both links on the same mount point */
		base_mi = file_mi;
		link_path = rfi->path;
		target_path = rfi->remap->path;
	} else {
		remap_mi = lookup_mnt_id(rfi->remap->mnt_id);

		/*
		 * Find the common bind-mount. We know that one mount point was
		 * really mounted and all other were bind-mounted from it, so the
		 * lowest mount must contains all bind-mounts.
		 */
		base_mi = file_mi;
		while (base_mi->bind)
			base_mi = base_mi->bind;

		BUG_ON(base_mi->s_dev != remap_mi->s_dev);
		BUG_ON(base_mi->s_dev != file_mi->s_dev);

		/* Calculate paths on the device (root mount) */
		convert_path_from_another_mp(rfi->path, link_buf,
				sizeof(link_buf), file_mi, base_mi);
		convert_path_from_another_mp(rfi->remap->path, target_buf,
				sizeof(target_buf), remap_mi, base_mi);

		link_path = link_buf;
		target_path = target_buf;
	}

	pr_debug("%d: Link %s -> %s\n", base_mi->mnt_id, target_path, link_path);
	root_fd = mntns_get_root_fd(base_mi->nsid);

	return linkat_hard(root_fd, target_path, root_fd, link_path, rfi->remap->owner);
}

/*
 * Open the file described by @d with @open_cb, handling remaps.
 *
 * @open_cb: does the actual open; gets the root fd of the file's mount,
 *           the file info and @arg
 *
 * For a remapped file the remap target is first linked under the name
 * to open (ghost directories are opened under the ghost name instead),
 * then the file is opened, the temporary name is removed and one remap
 * user is dropped; the ghost itself is removed with the last user. All
 * of this runs under ghost_file_mutex.
 *
 * Returns the opened (or inherited) descriptor, or -1 on error. On
 * error the mutex is released, @rfi->path is restored and a descriptor
 * that has already been opened is closed.
 */
int open_path(struct file_desc *d,
		int(*open_cb)(int mntns_root, struct reg_file_info *, void *), void *arg)
{
	struct reg_file_info *rfi;
	int tmp, mntns_root;
	char *orig_path = NULL;

	if (inherited_fd(d, &tmp))
		return tmp;

	rfi = container_of(d, struct reg_file_info, d);
	if (rfi->remap) {
		mutex_lock(ghost_file_mutex);
		if (rfi->remap->is_dir) {
			/*
			 * FIXME Can't make directory under new name.
			 * Will have to open it under the ghost one :(
			 */
			orig_path = rfi->path;
			rfi->path = rfi->remap->path;
		} else if (rfi_remap(rfi) < 0) {
			static char tmp_path[PATH_MAX];

			if (errno != EEXIST) {
				pr_err("Can't link %s -> %s\n", rfi->path,
						rfi->remap->path);
				goto err;
			}

			/*
			 * The file whose name we're trying to create
			 * exists. Need to pick some other one, we're
			 * going to remove it anyway.
			 *
			 * Strictly speaking, this is cheating, file
			 * name shouldn't change. But since NFS with
			 * its silly-rename doesn't care, why should we?
			 */

			orig_path = rfi->path;
			rfi->path = tmp_path;
			snprintf(tmp_path, sizeof(tmp_path), "%s.cr_link", orig_path);
			pr_debug("Fake %s -> %s link\n", rfi->path, rfi->remap->path);

			if (rfi_remap(rfi) < 0) {
				pr_perror("Can't create even fake link!");
				goto err;
			}
		}
	}

	mntns_root = mntns_get_root_by_mnt_id(rfi->rfe->mnt_id);
	tmp = open_cb(mntns_root, rfi, arg);
	if (tmp < 0) {
		pr_perror("Can't open file %s", rfi->path);
		goto err;
	}

	if (rfi->rfe->has_size && !rfi->size_checked) {
		struct stat st;

		if (fstat(tmp, &st) < 0) {
			pr_perror("Can't fstat opened file");
			goto err_close;
		}

		if (st.st_size != rfi->rfe->size) {
			pr_err("File %s has bad size %"PRIu64" (expect %"PRIu64")\n",
					rfi->path, st.st_size,
					rfi->rfe->size);
			goto err_close;
		}

		/*
		 * This is only visible in the current process, so
		 * change w/o locks. Other tasks sharing the same
		 * file will get one via unix sockets.
		 */
		rfi->size_checked = true;
	}

	if (rfi->remap) {
		/* The temporary name has done its job */
		if (!rfi->remap->is_dir) {
			unlinkat(mntns_root, rfi->path, 0);
		}

		BUG_ON(!rfi->remap->users);
		if (--rfi->remap->users == 0) {
			pr_info("Unlink the ghost %s\n", rfi->remap->path);
			mntns_root = mntns_get_root_by_mnt_id(rfi->remap->mnt_id);
			unlinkat(mntns_root, rfi->remap->path, rfi->remap->is_dir ? AT_REMOVEDIR : 0);
		}

		if (orig_path)
			rfi->path = orig_path;
		mutex_unlock(ghost_file_mutex);
	}

	/* The mutex is already released here, so clean up locally */
	if (restore_fown(tmp, rfi->rfe->fown)) {
		close(tmp);
		return -1;
	}

	return tmp;

err_close:
	close(tmp);
err:
	if (orig_path)
		rfi->path = orig_path;
	/* The mutex is held whenever the file has a remap */
	if (rfi->remap)
		mutex_unlock(ghost_file_mutex);
	return -1;
}

/*
 * Open @rfi->path relative to @ns_root_fd with the flags @arg points to
 * (a u32). Returns the descriptor or the negative openat() result.
 */
int do_open_reg_noseek_flags(int ns_root_fd, struct reg_file_info *rfi, void *arg)
{
	u32 *flagsp = arg;
	u32 open_flags = *flagsp;
	int fd;

	fd = openat(ns_root_fd, rfi->path, open_flags);
	if (fd < 0)
		pr_perror("Can't open file %s on restore", rfi->path);

	return fd;
}

/* Open @rfi with the flags saved in its regfile entry; @arg is not used. */
static int do_open_reg_noseek(int ns_root_fd, struct reg_file_info *rfi, void *arg)
{
	void *saved_flags = &rfi->rfe->flags;

	return do_open_reg_noseek_flags(ns_root_fd, rfi, saved_flags);
}

/*
 * Open @rfi with its saved flags and move the file position to the
 * saved one, unless the saved position is -1ULL.
 *
 * Returns the descriptor, a negative value if the open failed or -1 if
 * the position can't be set (the descriptor is closed then).
 */
static int do_open_reg(int ns_root_fd, struct reg_file_info *rfi, void *arg)
{
	int fd = do_open_reg_noseek(ns_root_fd, rfi, arg);

	if (fd < 0)
		return fd;

	/* No position to restore */
	if (rfi->rfe->pos == -1ULL)
		return fd;

	if (lseek(fd, rfi->rfe->pos, SEEK_SET) >= 0)
		return fd;

	pr_perror("Can't restore file pos");
	close(fd);
	return -1;
}

/* Open the regular file behind @fd without touching the file position. */
int open_reg_fd(struct file_desc *fd)
{
	int opened = open_path(fd, do_open_reg_noseek, NULL);

	return opened;
}

/*
 * Open the regular file with regfile id @id.
 *
 * This one gets called by exe link, chroot and cwd
 * restoring code. No need in calling lseek on either
 * of them.
 *
 * Returns the descriptor or -1 on error.
 */
int open_reg_by_id(u32 id)
{
	struct file_desc *desc = find_file_desc_raw(FD_TYPES__REG, id);

	if (desc != NULL)
		return open_reg_fd(desc);

	pr_err("Can't find regfile for %#x\n", id);
	return -1;
}

/*
 * Open the file backing @vma.
 *
 * The open flags are the saved fdflags when the entry has them,
 * O_RDWR for writable VMA_FILE_SHARED areas and O_RDONLY otherwise.
 * vma->vmfd must be set by the time this is called.
 */
int get_filemap_fd(struct vma_area *vma)
{
	u32 flags = O_RDONLY;

	/*
	 * The vma->vmfd should have been assigned in collect_filemap
	 *
	 * We open file w/o lseek, as mappings don't care about it
	 */

	BUG_ON(vma->vmfd == NULL);

	if (vma->e->has_fdflags)
		flags = vma->e->fdflags;
	else if ((vma->e->prot & PROT_WRITE) &&
			vma_area_is(vma, VMA_FILE_SHARED))
		flags = O_RDWR;

	return open_path(vma->vmfd, do_open_reg_noseek_flags, &flags);
}

/*
 * Account one more user of the remap attached to @fdesc, if there is
 * one. @typ is only used in the debug message.
 */
static void remap_get(struct file_desc *fdesc, char typ)
{
	struct reg_file_info *info;

	info = container_of(fdesc, struct reg_file_info, d);
	if (!info->remap)
		return;

	pr_debug("One more remap user (%c) for %s\n",
			typ, info->remap->path);
	/* No lock, we're still single-process here */
	info->remap->users++;
}

/*
 * collect_fd callback of regular files: the remap user is accounted
 * only while the descriptor's fd_info_head list is still empty, then
 * the generic collection is done.
 */
static void collect_reg_fd(struct file_desc *fdesc,
		struct fdinfo_list_entry *fle, struct rst_info *ri)
{
	bool no_fds_yet = list_empty(&fdesc->fd_info_head);

	if (no_fds_yet)
		remap_get(fdesc, 'f');

	collect_gen_fd(fle, ri);
}

/* open callback of regular files: opens the file and restores its position. */
static int open_fe_fd(struct file_desc *fd)
{
	int opened = open_path(fd, do_open_reg, NULL);

	return opened;
}

/* name callback of regular files: returns the stored path, @buf and @s are not used. */
static char *reg_file_path(struct file_desc *d, char *buf, size_t s)
{
	struct reg_file_info *info = container_of(d, struct reg_file_info, d);

	return info->path;
}

/* File descriptor operations registered for every collected regular file */
static struct file_desc_ops reg_desc_ops = {
	.type = FD_TYPES__REG,
	.open = open_fe_fd,
	.collect_fd = collect_reg_fd,
	.name = reg_file_path,
};

/*
 * Look up the regular file with id @id and account one more remap user
 * for it.
 *
 * Files dumped for vmas/exe links can have remaps
 * configured. Need to bump-up users for them, otherwise
 * the open_path() would unlink the remap file after
 * the very first open.
 *
 * Returns the descriptor or NULL if there is none; the error is logged
 * only when @optional is zero.
 */
struct file_desc *try_collect_special_file(u32 id, int optional)
{
	struct file_desc *desc = find_file_desc_raw(FD_TYPES__REG, id);

	if (desc != NULL) {
		remap_get(desc, 's');
		return desc;
	}

	if (!optional)
		pr_err("No entry for reg-file-ID %#x\n", id);

	return NULL;
}

/*
 * Collect callback for regfile entries: fills the reg_file_info @o from
 * the entry and registers its file descriptor.
 *
 * Returns what file_desc_add() returned.
 */
static int collect_one_regfile(void *o, ProtobufCMessage *base)
{
	static char root_path[] = ".";
	struct reg_file_info *info = o;
	RegFileEntry *entry;

	entry = pb_msg(base, RegFileEntry);
	info->rfe = entry;
	info->remap = NULL;
	info->size_checked = false;

	/* change "/foo" into "foo" and "/" into "." */
	info->path = (entry->name[1] == '\0') ? root_path : entry->name + 1;

	pr_info("Collected [%s] ID %#x\n", info->path, info->rfe->id);
	return file_desc_add(&info->d, info->rfe->id, &reg_desc_ops);
}

/*
 * Collect descriptor for regfile entries: each one is fed to
 * collect_one_regfile() along with a struct reg_file_info sized
 * private area.
 */
struct collect_image_info reg_file_cinfo = {
	.fd_type = CR_FD_REG_FILES,
	.pb_type = PB_REG_FILE,
	.priv_size = sizeof(struct reg_file_info),
	.collect = collect_one_regfile,
};

int prepare_shared_reg_files(void)
{
	ghost_file_mutex = shmalloc(sizeof(*ghost_file_mutex));
	if (!ghost_file_mutex)
		return -1;

	mutex_init(ghost_file_mutex);
	return 0;
}