From af55c059fb6b0080b618dff6af6559c9e1036a0e Mon Sep 17 00:00:00 2001
From: Andrew Vagin <avagin@virtuozzo.com>
Date: Wed, 9 Dec 2015 18:58:00 +0300
Subject: [PATCH] mount: fix a race between restoring namespaces and file
 mappings (v2)

Currently we wait until a namespace is restored to get its root.
We need to open a namespace root to open a file to restore a memory mapping.

A process restores mappings and only then forks children. So we can have
a situation, when we need to open a file from a namespace, which will be
"restored" by one of our children.

The root task restores all mount namespaces and opens a file descriptor
for each of them. In this patch we open root for each mntns in the root
task.

If we need to get the root of a namespace which isn't populated, we can get
it from the root task. After the CR_STATE_FORKING stage, the root task
closes all namespace descriptors and we know that all namespaces are
populated at this moment.

v2: don't close root_fd for root ns, because it was not opened
Signed-off-by: Andrew Vagin <avagin@virtuozzo.com>
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
---
 include/namespaces.h |  1 +
 mount.c              | 39 +++++++++++++++++++++++++++++++++------
 2 files changed, 34 insertions(+), 6 deletions(-)

diff --git a/include/namespaces.h b/include/namespaces.h
index c65589072..953b8748a 100644
--- a/include/namespaces.h
+++ b/include/namespaces.h
@@ -38,6 +38,7 @@ struct ns_id {
 			struct mount_info *mntinfo_list;
 			struct mount_info *mntinfo_tree;
 			int ns_fd;
+			int root_fd;
 		} mnt;
 
 		struct {
diff --git a/mount.c b/mount.c
index 06e657483..e1608a9fd 100644
--- a/mount.c
+++ b/mount.c
@@ -2972,6 +2972,8 @@ void fini_restore_mntns(void)
 		if (nsid->nd != &mnt_ns_desc)
 			continue;
 		close(nsid->mnt.ns_fd);
+		if (nsid->type != NS_ROOT)
+			close(nsid->mnt.root_fd);
 	}
 }
 
@@ -3179,6 +3181,8 @@ int prepare_mnt_ns(void)
 			nsid->mnt.ns_fd = open_proc(PROC_SELF, "ns/mnt");
 			if (nsid->mnt.ns_fd < 0)
 				goto err;
+			/* we set ns_populated so we don't need to open root_fd */
+			futex_set(&nsid->ns_populated, 1);
 			continue;
 		}
 
@@ -3199,6 +3203,11 @@ int prepare_mnt_ns(void)
 		if (nsid->mnt.ns_fd < 0)
 			goto err;
 
+		/* root_fd is used to restore file mappings */
+		nsid->mnt.root_fd = open_proc(PROC_SELF, "root");
+		if (nsid->mnt.root_fd < 0)
+			goto err;
+
 		/* And return back to regain the access to the roots yard */
 		if (setns(rst, CLONE_NEWNS)) {
 			pr_perror("Can't restore mntns back");
@@ -3289,15 +3298,33 @@ set_root:
 
 int mntns_get_root_fd(struct ns_id *mntns) {
 	/*
-	 * We need to find a task from the target namespace and open its root.
-	 * For that we need to wait when one of tasks enters into required
-	 * namespaces.
+	 * All namespaces are restored from the root task and during the
+	 * CR_STATE_FORKING stage the root task has two file descriptors for
+	 * each mntns. One is associated with a namespace and another one is a
+	 * root of this mntns.
 	 *
-	 * The root task is born in the root mount namespace.
+	 * When a non-root task is forked, it enters into a proper mount
+	 * namespace, restores private mappings and forks children. Some of
+	 * these mappings can be associated with files from other namespaces.
+	 *
+	 * After the CR_STATE_FORKING stage the root task has to close all
+	 * mntns file descriptors to restore its descriptors and at this moment
+	 * we know that all tasks live in their mount namespaces.
+	 *
+	 * If we find that a mount namespace isn't populated, we can get its
+	 * root from the root task.
 	 */
 
-	if (mntns->type != NS_ROOT)
-		futex_wait_while_eq(&mntns->ns_populated, 0);
+	if (!futex_get(&mntns->ns_populated)) {
+		int fd;
+
+		fd = open_proc(root_item->pid.virt, "fd/%d", mntns->mnt.root_fd);
+		if (fd < 0)
+			return -1;
+
+		return mntns_set_root_fd(mntns->ns_pid, fd);
+	}
+
 	return __mntns_get_root_fd(mntns->ns_pid);
 }