2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-29 13:28:27 +00:00

Use clone3() with set_tid to create processes

With clone3() and its set_tid argument, introduced in Linux kernel 5.4,
it is no longer necessary to write to /proc/../ns_last_pid to influence
the next PID number. clone3() can directly select a PID for the newly
created process/thread.

After checking for the availability of clone3() with set_tid and adding
the assembler wrapper for clone3() in previous patches, this extends
criu/pie/restorer.c and criu/clone-noasan.c to use the newly added
assembler clone3() wrapper to create processes with a certain PID.

This is an RFC and a work in progress, but I wanted to share it and run
it through CI for feedback. As the CI will probably not use a 5.4-based
kernel, it should just keep on working as before.

Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
Adrian Reber 2019-12-16 10:42:13 +00:00 committed by Andrei Vagin
parent 97c03b97d0
commit a1ea8deb4c
6 changed files with 117 additions and 47 deletions

View File

@ -1,4 +1,10 @@
#include <stdlib.h>
#include <sched.h> #include <sched.h>
#include <unistd.h>
#include <compel/plugins/std/syscall-codes.h>
#include "sched.h"
#include "common/compiler.h" #include "common/compiler.h"
#include "log.h" #include "log.h"
#include "common/bug.h" #include "common/bug.h"
@ -31,6 +37,7 @@
int clone_noasan(int (*fn)(void *), int flags, void *arg) int clone_noasan(int (*fn)(void *), int flags, void *arg)
{ {
void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16); void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16);
BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK)); BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK));
/* /*
* Reserve some bytes for clone() internal needs * Reserve some bytes for clone() internal needs
@ -38,3 +45,28 @@ int clone_noasan(int (*fn)(void *), int flags, void *arg)
*/ */
return clone(fn, stack_ptr, flags, arg); return clone(fn, stack_ptr, flags, arg);
} }
int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags,
int exit_signal, pid_t pid)
{
struct _clone_args c_args = {};
BUG_ON(flags & CLONE_VM);
/*
* Make sure no child signals are requested. clone3() uses
* exit_signal for that.
*/
BUG_ON(flags & 0xff);
pr_debug("Creating process using clone3()\n");
c_args.exit_signal = exit_signal;
c_args.flags = flags;
c_args.set_tid = ptr_to_u64(&pid);
c_args.set_tid_size = 1;
pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
if (pid == 0)
exit(fn(arg));
return pid;
}

View File

@ -1374,32 +1374,43 @@ static inline int fork_with_pid(struct pstree_item *item)
if (!(ca.clone_flags & CLONE_NEWPID)) { if (!(ca.clone_flags & CLONE_NEWPID)) {
char buf[32]; char buf[32];
int len; int len;
int fd; int fd = -1;
if (!kdat.has_clone3_set_tid) {
fd = open_proc_rw(PROC_GEN, LAST_PID_PATH); fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
if (fd < 0) if (fd < 0)
goto err; goto err;
}
lock_last_pid(); lock_last_pid();
if (!kdat.has_clone3_set_tid) {
len = snprintf(buf, sizeof(buf), "%d", pid - 1); len = snprintf(buf, sizeof(buf), "%d", pid - 1);
if (write(fd, buf, len) != len) { if (write(fd, buf, len) != len) {
pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH); pr_perror("%d: Write %s to %s", pid, buf,
LAST_PID_PATH);
close(fd); close(fd);
goto err_unlock; goto err_unlock;
} }
close(fd); close(fd);
}
} else { } else {
BUG_ON(pid != INIT_PID); BUG_ON(pid != INIT_PID);
} }
if (kdat.has_clone3_set_tid) {
ret = clone3_with_pid_noasan(restore_task_with_children,
&ca, (ca.clone_flags &
~(CLONE_NEWNET | CLONE_NEWCGROUP)),
SIGCHLD, pid);
} else {
/* /*
* Some kernel modules, such as network packet generator * Some kernel modules, such as network packet generator
* run kernel thread upon net-namespace creattion taking * run kernel thread upon net-namespace creation taking
* the @pid we've been requeting via LAST_PID_PATH interface * the @pid we've been requesting via LAST_PID_PATH interface
* so that we can't restore a take with pid needed. * so that we can't restore a take with pid needed.
* *
* Here is an idea -- unhare net namespace in callee instead. * Here is an idea -- unshare net namespace in callee instead.
*/ */
/* /*
* The cgroup namespace is also unshared explicitly in the * The cgroup namespace is also unshared explicitly in the
@ -1407,7 +1418,11 @@ static inline int fork_with_pid(struct pstree_item *item)
*/ */
close_pid_proc(); close_pid_proc();
ret = clone_noasan(restore_task_with_children, ret = clone_noasan(restore_task_with_children,
(ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca); (ca.clone_flags &
~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD,
&ca);
}
if (ret < 0) { if (ret < 0) {
pr_perror("Can't fork for %d", pid); pr_perror("Can't fork for %d", pid);
goto err_unlock; goto err_unlock;
@ -3588,6 +3603,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
task_args->vdso_maps_rt = vdso_maps_rt; task_args->vdso_maps_rt = vdso_maps_rt;
task_args->vdso_rt_size = vdso_rt_size; task_args->vdso_rt_size = vdso_rt_size;
task_args->can_map_vdso = kdat.can_map_vdso; task_args->can_map_vdso = kdat.can_map_vdso;
task_args->has_clone3_set_tid = kdat.has_clone3_set_tid;
new_sp = restorer_stack(task_args->t->mz); new_sp = restorer_stack(task_args->t->mz);

View File

@ -2,5 +2,7 @@
#define __CR_CLONE_NOASAN_H__ #define __CR_CLONE_NOASAN_H__
int clone_noasan(int (*fn)(void *), int flags, void *arg); int clone_noasan(int (*fn)(void *), int flags, void *arg);
int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags,
int exit_signal, pid_t pid);
#endif /* __CR_CLONE_NOASAN_H__ */ #endif /* __CR_CLONE_NOASAN_H__ */

View File

@ -221,6 +221,7 @@ struct task_restore_args {
#endif #endif
int lsm_type; int lsm_type;
int child_subreaper; int child_subreaper;
bool has_clone3_set_tid;
} __aligned(64); } __aligned(64);
/* /*

View File

@ -4,6 +4,7 @@
#include "common/lock.h" #include "common/lock.h"
#include "common/list.h" #include "common/list.h"
#include "vma.h" #include "vma.h"
#include "kerndat.h"
struct task_entries { struct task_entries {
int nr_threads, nr_tasks, nr_helpers; int nr_threads, nr_tasks, nr_helpers;

View File

@ -35,6 +35,7 @@
#include "sk-inet.h" #include "sk-inet.h"
#include "vma.h" #include "vma.h"
#include "uffd.h" #include "uffd.h"
#include "sched.h"
#include "common/lock.h" #include "common/lock.h"
#include "common/page.h" #include "common/page.h"
@ -1771,9 +1772,11 @@ long __export_restore_task(struct task_restore_args *args)
long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND | long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS; CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS;
long last_pid_len; long last_pid_len;
pid_t thread_pid;
long parent_tid; long parent_tid;
int i, fd = -1; int i, fd = -1;
if (!args->has_clone3_set_tid) {
/* One level pid ns hierarhy */ /* One level pid ns hierarhy */
fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0); fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
if (fd < 0) { if (fd < 0) {
@ -1781,6 +1784,7 @@ long __export_restore_task(struct task_restore_args *args)
goto core_restore_end; goto core_restore_end;
} }
}
mutex_lock(&task_entries_local->last_pid_mutex); mutex_lock(&task_entries_local->last_pid_mutex);
for (i = 0; i < args->nr_threads; i++) { for (i = 0; i < args->nr_threads; i++) {
@ -1791,6 +1795,20 @@ long __export_restore_task(struct task_restore_args *args)
continue; continue;
new_sp = restorer_stack(thread_args[i].mz); new_sp = restorer_stack(thread_args[i].mz);
if (args->has_clone3_set_tid) {
struct _clone_args c_args = {};
thread_pid = thread_args[i].pid;
c_args.set_tid = ptr_to_u64(&thread_pid);
c_args.flags = clone_flags;
c_args.set_tid_size = 1;
/* The kernel does stack + stack_size. */
c_args.stack = new_sp - RESTORE_STACK_SIZE;
c_args.stack_size = RESTORE_STACK_SIZE;
c_args.child_tid = ptr_to_u64(&thread_args[i].pid);
c_args.parent_tid = ptr_to_u64(&parent_tid);
pr_debug("Using clone3 to restore the process\n");
RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn);
} else {
last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s); last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
sys_lseek(fd, 0, SEEK_SET); sys_lseek(fd, 0, SEEK_SET);
ret = sys_write(fd, s, last_pid_len); ret = sys_write(fd, s, last_pid_len);
@ -1807,8 +1825,8 @@ long __export_restore_task(struct task_restore_args *args)
* thread will run with own stack and we must not * thread will run with own stack and we must not
* have any additional instructions... oh, dear... * have any additional instructions... oh, dear...
*/ */
RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn); RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
}
if (ret != thread_args[i].pid) { if (ret != thread_args[i].pid) {
pr_err("Unable to create a thread: %ld\n", ret); pr_err("Unable to create a thread: %ld\n", ret);
mutex_unlock(&task_entries_local->last_pid_mutex); mutex_unlock(&task_entries_local->last_pid_mutex);