
restorer: Add a lock around cgroupd communication.

Threads are put into cgroups through the cgroupd thread, which
communicates with other threads using a socketpair.
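
Each ack identifies its target thread via SCM_CREDENTIALS ancillary
data (the cred->pid checked in the diff below). As an illustrative
sketch of the daemon side only (send_ack and its details are
assumptions here, not CRIU's actual code), the reply could be built
like this:

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Stamp the requesting thread's tid into SCM_CREDENTIALS so the
     * receiver can tell whose ack this is. Passing a pid other than
     * our own requires CAP_SYS_ADMIN (the restorer has it), and the
     * receiving socket must have SO_PASSCRED enabled. */
    static int send_ack(int sk, pid_t requester_tid)
    {
        struct ucred cred = { .pid = requester_tid,
                              .uid = getuid(), .gid = getgid() };
        char dummy = 0;
        struct iovec iov = { .iov_base = &dummy, .iov_len = sizeof(dummy) };
        union {
            char buf[CMSG_SPACE(sizeof(struct ucred))];
            struct cmsghdr align;
        } u;
        struct msghdr h = {
            .msg_iov = &iov,
            .msg_iovlen = 1,
            .msg_control = u.buf,
            .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *ch = CMSG_FIRSTHDR(&h);

        ch->cmsg_level = SOL_SOCKET;
        ch->cmsg_type = SCM_CREDENTIALS;
        ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
        memcpy(CMSG_DATA(ch), &cred, sizeof(cred));

        return sendmsg(sk, &h, 0) < 0 ? -1 : 0;
    }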

Previously, each thread received a dup'd copy of the socket and did
the following:

    sendmsg(socket_dup_fd, my_cgroup_set);

    // Wait for the ack addressed to this thread.
    while (1) {
        recvmsg(socket_dup_fd, &h, MSG_PEEK);
        if (h.pid != my_pid)
            continue;
        recvmsg(socket_dup_fd, &h, 0);  // consume our ack
        break;
    }
    close(socket_dup_fd);

Since MSG_PEEK does not dequeue anything, an ack addressed to one
thread stays at the head of the queue and keeps waking all the others.
When restoring many threads, most of them end up spinning in the above
loop, waiting for their own PID to appear.

In my test case, restoring a process with an 11.5G heap and 491
threads took anywhere between 10 and 60 seconds to complete.

To avoid the spinning, we drop the loop and the MSG_PEEK, and take a
lock around the send/receive sequence. This does not reduce
parallelism, as the cgroupd daemon serves requests from a single
thread anyway.
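
In the same pseudocode style, the per-thread sequence becomes:

    lock(cgroupd_sync_lock);
    sendmsg(socket_dup_fd, my_cgroup_set);
    // No peeking needed: the only pending ack is ours.
    recvmsg(socket_dup_fd, &h, 0);
    unlock(cgroupd_sync_lock);
    close(socket_dup_fd);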

With the lock in place, the same restore consistently takes around 10
seconds on my machine (Thinkpad P14s, AMD Ryzen 8840HS).

There is a similar "daemon" thread for user namespaces; it is already
protected by the analogous userns_sync_lock in __userns_call().

Fixes #2614

Signed-off-by: Han-Wen Nienhuys <hanwen@engflow.com>
Author:    Han-Wen Nienhuys <hanwen@engflow.com>
Date:      2025-03-13 08:46:16 +01:00
Committer: Andrei Vagin
Parent:    5b4c819d54
Commit:    8d5cef546a
3 changed files with 31 additions and 32 deletions

@@ -2329,6 +2329,7 @@ int prepare_task_entries(void)
 	task_entries->nr_helpers = 0;
 	futex_set(&task_entries->start, CR_STATE_FAIL);
 	mutex_init(&task_entries->userns_sync_lock);
+	mutex_init(&task_entries->cgroupd_sync_lock);
 	mutex_init(&task_entries->last_pid_mutex);
 	return 0;

@@ -14,6 +14,7 @@ struct task_entries {
 	futex_t start;
 	atomic_t cr_err;
 	mutex_t userns_sync_lock;
+	mutex_t cgroupd_sync_lock;
 	mutex_t last_pid_mutex;
 };
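
The new mutex lives in struct task_entries because that structure is
placed in memory shared by all restored tasks, so every thread takes
the same lock. CRIU's mutex_t is its own futex-based lock; purely as
an analogy (this is not CRIU code), a process-shared lock can be put
in shared memory like so:

    #include <pthread.h>
    #include <stddef.h>
    #include <sys/mman.h>

    /* Analogy only: a POSIX process-shared mutex in anonymous shared
     * memory. The mapping is inherited across fork(), so parent and
     * children all see, and can take, the same lock. */
    static pthread_mutex_t *make_shared_lock(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutex_t *m = mmap(NULL, sizeof(*m), PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (m == MAP_FAILED)
            return NULL;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
        pthread_mutex_init(m, &attr);
        return m;
    }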

@@ -704,9 +704,8 @@ static int send_cg_set(int sk, int cg_set)
 }
 
 /*
- * As this socket is shared among threads, recvmsg(MSG_PEEK)
- * from the socket until getting its own thread id as an
- * acknowledge of successful threaded cgroup fixup
+ * As the cgroupd socket is shared among threads and processes, this
+ * should be called with task_entries->cgroupd_sync_lock held.
  */
 static int recv_cg_set_restore_ack(int sk)
 {
@@ -719,10 +718,9 @@ static int recv_cg_set_restore_ack(int sk)
 	h.msg_control = cmsg;
 	h.msg_controllen = sizeof(cmsg);
 
-	while (1) {
-		ret = sys_recvmsg(sk, &h, MSG_PEEK);
-		if (ret < 0) {
-			pr_err("Unable to peek from cgroupd %d\n", ret);
-			return -1;
-		}
+	ret = sys_recvmsg(sk, &h, 0);
+	if (ret < 0) {
+		pr_err("Unable to receive from cgroupd %d\n", ret);
+		return -1;
+	}
@@ -733,20 +731,10 @@ static int recv_cg_set_restore_ack(int sk)
 	ch = CMSG_FIRSTHDR(&h);
 	cred = (struct ucred *)CMSG_DATA(ch);
-	if (cred->pid != sys_gettid())
-		continue;
-
-	/*
-	 * Actual remove message from recv queue of socket
-	 */
-	ret = sys_recvmsg(sk, &h, 0);
-	if (ret < 0) {
-		pr_err("Unable to receive from cgroupd %d\n", ret);
+	if (cred->pid != sys_gettid()) {
+		pr_err("cred pid %d != gettid\n", cred->pid);
 		return -1;
 	}
-
-	break;
-	}
 	return 0;
 }
@@ -782,12 +770,21 @@ __visible long __export_restore_thread(struct thread_restore_args *args)
 	rt_sigframe = (void *)&args->mz->rt_sigframe;
 
 	if (args->cg_set != -1) {
+		int err = 0;
+
+		mutex_lock(&task_entries_local->cgroupd_sync_lock);
+
 		pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set);
-		if (send_cg_set(args->cgroupd_sk, args->cg_set))
-			goto core_restore_end;
-		if (recv_cg_set_restore_ack(args->cgroupd_sk))
-			goto core_restore_end;
+
+		err = send_cg_set(args->cgroupd_sk, args->cg_set);
+		if (!err)
+			err = recv_cg_set_restore_ack(args->cgroupd_sk);
+		mutex_unlock(&task_entries_local->cgroupd_sync_lock);
+
 		sys_close(args->cgroupd_sk);
+		if (err)
+			goto core_restore_end;
 	}
 
 	if (restore_thread_common(args))