
restorer: Add a lock around cgroupd communication.

Threads are put into cgroups through the cgroupd thread, which
communicates with other threads using a socketpair.
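
Each ack identifies its target thread via SCM_CREDENTIALS ancillary
data (the cred->pid checked in the diff below). As an illustrative
sketch of the daemon side only (send_ack and its details are
assumptions here, not CRIU's actual code), the reply could be built
like this:

    #define _GNU_SOURCE
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/types.h>
    #include <unistd.h>

    /* Stamp the requesting thread's tid into SCM_CREDENTIALS so the
     * receiver can tell whose ack this is. Passing a pid other than
     * our own requires CAP_SYS_ADMIN (the restorer has it), and the
     * receiving socket must have SO_PASSCRED enabled. */
    static int send_ack(int sk, pid_t requester_tid)
    {
        struct ucred cred = { .pid = requester_tid,
                              .uid = getuid(), .gid = getgid() };
        char dummy = 0;
        struct iovec iov = { .iov_base = &dummy, .iov_len = sizeof(dummy) };
        union {
            char buf[CMSG_SPACE(sizeof(struct ucred))];
            struct cmsghdr align;
        } u;
        struct msghdr h = {
            .msg_iov = &iov,
            .msg_iovlen = 1,
            .msg_control = u.buf,
            .msg_controllen = sizeof(u.buf),
        };
        struct cmsghdr *ch = CMSG_FIRSTHDR(&h);

        ch->cmsg_level = SOL_SOCKET;
        ch->cmsg_type = SCM_CREDENTIALS;
        ch->cmsg_len = CMSG_LEN(sizeof(struct ucred));
        memcpy(CMSG_DATA(ch), &cred, sizeof(cred));

        return sendmsg(sk, &h, 0) < 0 ? -1 : 0;
    }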

Previously, each thread received a dup'd copy of the socket and did
the following:

    sendmsg(socket_dup_fd, my_cgroup_set);

    // Wait for the ack addressed to this thread.
    while (1) {
        recvmsg(socket_dup_fd, &h, MSG_PEEK);
        if (h.pid != my_pid)
            continue;
        recvmsg(socket_dup_fd, &h, 0);  // consume our ack
        break;
    }
    close(socket_dup_fd);

Since MSG_PEEK does not dequeue anything, an ack addressed to one
thread stays at the head of the queue and keeps waking all the others.
When restoring many threads, most of them end up spinning in the above
loop, waiting for their own PID to appear.

In my test case, restoring a process with an 11.5G heap and 491
threads took anywhere between 10 and 60 seconds to complete.

To avoid the spinning, we drop the loop and the MSG_PEEK, and take a
lock around the send/receive sequence. This does not reduce
parallelism, as the cgroupd daemon serves requests from a single
thread anyway.
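
In the same pseudocode style, the per-thread sequence becomes:

    lock(cgroupd_sync_lock);
    sendmsg(socket_dup_fd, my_cgroup_set);
    // No peeking needed: the only pending ack is ours.
    recvmsg(socket_dup_fd, &h, 0);
    unlock(cgroupd_sync_lock);
    close(socket_dup_fd);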

With the lock in place, the same restore consistently takes around 10
seconds on my machine (Thinkpad P14s, AMD Ryzen 8840HS).

There is a similar "daemon" thread for user namespaces; it is already
protected by the analogous userns_sync_lock in __userns_call().

Fixes #2614

Signed-off-by: Han-Wen Nienhuys <hanwen@engflow.com>
Author:    Han-Wen Nienhuys <hanwen@engflow.com>
Date:      2025-03-13 08:46:16 +01:00
Committer: Andrei Vagin
Parent:    5b4c819d54
Commit:    8d5cef546a
3 changed files with 31 additions and 32 deletions

@@ -2329,6 +2329,7 @@ int prepare_task_entries(void)
 	task_entries->nr_helpers = 0;
 	futex_set(&task_entries->start, CR_STATE_FAIL);
 	mutex_init(&task_entries->userns_sync_lock);
+	mutex_init(&task_entries->cgroupd_sync_lock);
 	mutex_init(&task_entries->last_pid_mutex);
 	return 0;

@@ -14,6 +14,7 @@ struct task_entries {
 	futex_t start;
 	atomic_t cr_err;
 	mutex_t userns_sync_lock;
+	mutex_t cgroupd_sync_lock;
 	mutex_t last_pid_mutex;
 };
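
The new mutex lives in struct task_entries because that structure is
placed in memory shared by all restored tasks, so every thread takes
the same lock. CRIU's mutex_t is its own futex-based lock; purely as
an analogy (this is not CRIU code), a process-shared lock can be put
in shared memory like so:

    #include <pthread.h>
    #include <stddef.h>
    #include <sys/mman.h>

    /* Analogy only: a POSIX process-shared mutex in anonymous shared
     * memory. The mapping is inherited across fork(), so parent and
     * children all see, and can take, the same lock. */
    static pthread_mutex_t *make_shared_lock(void)
    {
        pthread_mutexattr_t attr;
        pthread_mutex_t *m = mmap(NULL, sizeof(*m), PROT_READ | PROT_WRITE,
                                  MAP_SHARED | MAP_ANONYMOUS, -1, 0);

        if (m == MAP_FAILED)
            return NULL;

        pthread_mutexattr_init(&attr);
        pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
        pthread_mutex_init(m, &attr);
        return m;
    }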

@@ -704,9 +704,8 @@ static int send_cg_set(int sk, int cg_set)
 }
 
 /*
- * As this socket is shared among threads, recvmsg(MSG_PEEK)
- * from the socket until getting its own thread id as an
- * acknowledge of successful threaded cgroup fixup
+ * As the cgroupd socket is shared among threads and processes, this
+ * should be called with task_entries->cgroupd_sync_lock held.
  */
 static int recv_cg_set_restore_ack(int sk)
 {
@@ -719,10 +718,9 @@ static int recv_cg_set_restore_ack(int sk)
 	h.msg_control = cmsg;
 	h.msg_controllen = sizeof(cmsg);
 
-	while (1) {
-		ret = sys_recvmsg(sk, &h, MSG_PEEK);
-		if (ret < 0) {
-			pr_err("Unable to peek from cgroupd %d\n", ret);
-			return -1;
-		}
+	ret = sys_recvmsg(sk, &h, 0);
+	if (ret < 0) {
+		pr_err("Unable to receive from cgroupd %d\n", ret);
+		return -1;
+	}
@@ -733,20 +731,10 @@ static int recv_cg_set_restore_ack(int sk)
 	ch = CMSG_FIRSTHDR(&h);
 	cred = (struct ucred *)CMSG_DATA(ch);
-	if (cred->pid != sys_gettid())
-		continue;
-
-	/*
-	 * Actual remove message from recv queue of socket
-	 */
-	ret = sys_recvmsg(sk, &h, 0);
-	if (ret < 0) {
-		pr_err("Unable to receive from cgroupd %d\n", ret);
+	if (cred->pid != sys_gettid()) {
+		pr_err("cred pid %d != gettid\n", cred->pid);
 		return -1;
 	}
-
-	break;
-	}
 	return 0;
 }
@@ -782,12 +770,21 @@ __visible long __export_restore_thread(struct thread_restore_args *args)
 	rt_sigframe = (void *)&args->mz->rt_sigframe;
 
 	if (args->cg_set != -1) {
+		int err = 0;
+
+		mutex_lock(&task_entries_local->cgroupd_sync_lock);
+
 		pr_info("Restore cg_set in thread cg_set: %d\n", args->cg_set);
-		if (send_cg_set(args->cgroupd_sk, args->cg_set))
-			goto core_restore_end;
-		if (recv_cg_set_restore_ack(args->cgroupd_sk))
-			goto core_restore_end;
+
+		err = send_cg_set(args->cgroupd_sk, args->cg_set);
+		if (!err)
+			err = recv_cg_set_restore_ack(args->cgroupd_sk);
+		mutex_unlock(&task_entries_local->cgroupd_sync_lock);
+
 		sys_close(args->cgroupd_sk);
+		if (err)
+			goto core_restore_end;
 	}
 
 	if (restore_thread_common(args))