mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-29 13:28:27 +00:00
Use clone3() with set_tid to create processes
With clone3() and its set_tid argument, introduced in Linux kernel 5.4, it is no longer necessary to write to /proc/../ns_last_pid to influence the next PID number. clone3() can directly select a PID for the newly created process/thread. After checking for the availability of clone3() with set_tid and adding the assembler wrapper for clone3() in previous patches, this extends criu/pie/restorer.c and criu/clone-noasan.c to use the newly added assembler clone3() wrapper to create processes with a certain PID. This is an RFC and WIP, but I wanted to share it and run it through CI for feedback. As the CI will probably not use a 5.4-based kernel, it should just keep on working as before. Signed-off-by: Adrian Reber <areber@redhat.com>
This commit is contained in:
parent
97c03b97d0
commit
a1ea8deb4c
@ -1,4 +1,10 @@
|
||||
#include <stdlib.h>
|
||||
#include <sched.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#include <compel/plugins/std/syscall-codes.h>
|
||||
|
||||
#include "sched.h"
|
||||
#include "common/compiler.h"
|
||||
#include "log.h"
|
||||
#include "common/bug.h"
|
||||
@ -31,6 +37,7 @@
|
||||
int clone_noasan(int (*fn)(void *), int flags, void *arg)
|
||||
{
|
||||
void *stack_ptr = (void *)round_down((unsigned long)&stack_ptr - 1024, 16);
|
||||
|
||||
BUG_ON((flags & CLONE_VM) && !(flags & CLONE_VFORK));
|
||||
/*
|
||||
* Reserve some bytes for clone() internal needs
|
||||
@ -38,3 +45,28 @@ int clone_noasan(int (*fn)(void *), int flags, void *arg)
|
||||
*/
|
||||
return clone(fn, stack_ptr, flags, arg);
|
||||
}
|
||||
|
||||
int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags,
|
||||
int exit_signal, pid_t pid)
|
||||
{
|
||||
struct _clone_args c_args = {};
|
||||
|
||||
BUG_ON(flags & CLONE_VM);
|
||||
|
||||
/*
|
||||
* Make sure no child signals are requested. clone3() uses
|
||||
* exit_signal for that.
|
||||
*/
|
||||
BUG_ON(flags & 0xff);
|
||||
|
||||
pr_debug("Creating process using clone3()\n");
|
||||
|
||||
c_args.exit_signal = exit_signal;
|
||||
c_args.flags = flags;
|
||||
c_args.set_tid = ptr_to_u64(&pid);
|
||||
c_args.set_tid_size = 1;
|
||||
pid = syscall(__NR_clone3, &c_args, sizeof(c_args));
|
||||
if (pid == 0)
|
||||
exit(fn(arg));
|
||||
return pid;
|
||||
}
|
||||
|
@ -1374,40 +1374,55 @@ static inline int fork_with_pid(struct pstree_item *item)
|
||||
if (!(ca.clone_flags & CLONE_NEWPID)) {
|
||||
char buf[32];
|
||||
int len;
|
||||
int fd;
|
||||
int fd = -1;
|
||||
|
||||
fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
|
||||
if (fd < 0)
|
||||
goto err;
|
||||
if (!kdat.has_clone3_set_tid) {
|
||||
fd = open_proc_rw(PROC_GEN, LAST_PID_PATH);
|
||||
if (fd < 0)
|
||||
goto err;
|
||||
}
|
||||
|
||||
lock_last_pid();
|
||||
|
||||
len = snprintf(buf, sizeof(buf), "%d", pid - 1);
|
||||
if (write(fd, buf, len) != len) {
|
||||
pr_perror("%d: Write %s to %s", pid, buf, LAST_PID_PATH);
|
||||
if (!kdat.has_clone3_set_tid) {
|
||||
len = snprintf(buf, sizeof(buf), "%d", pid - 1);
|
||||
if (write(fd, buf, len) != len) {
|
||||
pr_perror("%d: Write %s to %s", pid, buf,
|
||||
LAST_PID_PATH);
|
||||
close(fd);
|
||||
goto err_unlock;
|
||||
}
|
||||
close(fd);
|
||||
goto err_unlock;
|
||||
}
|
||||
close(fd);
|
||||
} else {
|
||||
BUG_ON(pid != INIT_PID);
|
||||
}
|
||||
|
||||
/*
|
||||
* Some kernel modules, such as network packet generator
|
||||
* run kernel thread upon net-namespace creattion taking
|
||||
* the @pid we've been requeting via LAST_PID_PATH interface
|
||||
* so that we can't restore a take with pid needed.
|
||||
*
|
||||
* Here is an idea -- unhare net namespace in callee instead.
|
||||
*/
|
||||
/*
|
||||
* The cgroup namespace is also unshared explicitly in the
|
||||
* move_in_cgroup(), so drop this flag here as well.
|
||||
*/
|
||||
close_pid_proc();
|
||||
ret = clone_noasan(restore_task_with_children,
|
||||
(ca.clone_flags & ~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD, &ca);
|
||||
if (kdat.has_clone3_set_tid) {
|
||||
ret = clone3_with_pid_noasan(restore_task_with_children,
|
||||
&ca, (ca.clone_flags &
|
||||
~(CLONE_NEWNET | CLONE_NEWCGROUP)),
|
||||
SIGCHLD, pid);
|
||||
} else {
|
||||
/*
|
||||
* Some kernel modules, such as network packet generator
|
||||
* run kernel thread upon net-namespace creation taking
|
||||
* the @pid we've been requesting via LAST_PID_PATH interface
|
||||
* so that we can't restore a take with pid needed.
|
||||
*
|
||||
* Here is an idea -- unshare net namespace in callee instead.
|
||||
*/
|
||||
/*
|
||||
* The cgroup namespace is also unshared explicitly in the
|
||||
* move_in_cgroup(), so drop this flag here as well.
|
||||
*/
|
||||
close_pid_proc();
|
||||
ret = clone_noasan(restore_task_with_children,
|
||||
(ca.clone_flags &
|
||||
~(CLONE_NEWNET | CLONE_NEWCGROUP)) | SIGCHLD,
|
||||
&ca);
|
||||
}
|
||||
|
||||
if (ret < 0) {
|
||||
pr_perror("Can't fork for %d", pid);
|
||||
goto err_unlock;
|
||||
@ -3588,6 +3603,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
|
||||
task_args->vdso_maps_rt = vdso_maps_rt;
|
||||
task_args->vdso_rt_size = vdso_rt_size;
|
||||
task_args->can_map_vdso = kdat.can_map_vdso;
|
||||
task_args->has_clone3_set_tid = kdat.has_clone3_set_tid;
|
||||
|
||||
new_sp = restorer_stack(task_args->t->mz);
|
||||
|
||||
|
@ -2,5 +2,7 @@
|
||||
#define __CR_CLONE_NOASAN_H__
|
||||
|
||||
int clone_noasan(int (*fn)(void *), int flags, void *arg);
|
||||
int clone3_with_pid_noasan(int (*fn)(void *), void *arg, int flags,
|
||||
int exit_signal, pid_t pid);
|
||||
|
||||
#endif /* __CR_CLONE_NOASAN_H__ */
|
||||
|
@ -221,6 +221,7 @@ struct task_restore_args {
|
||||
#endif
|
||||
int lsm_type;
|
||||
int child_subreaper;
|
||||
bool has_clone3_set_tid;
|
||||
} __aligned(64);
|
||||
|
||||
/*
|
||||
|
@ -4,6 +4,7 @@
|
||||
#include "common/lock.h"
|
||||
#include "common/list.h"
|
||||
#include "vma.h"
|
||||
#include "kerndat.h"
|
||||
|
||||
struct task_entries {
|
||||
int nr_threads, nr_tasks, nr_helpers;
|
||||
|
@ -35,6 +35,7 @@
|
||||
#include "sk-inet.h"
|
||||
#include "vma.h"
|
||||
#include "uffd.h"
|
||||
#include "sched.h"
|
||||
|
||||
#include "common/lock.h"
|
||||
#include "common/page.h"
|
||||
@ -1771,16 +1772,19 @@ long __export_restore_task(struct task_restore_args *args)
|
||||
long clone_flags = CLONE_VM | CLONE_FILES | CLONE_SIGHAND |
|
||||
CLONE_THREAD | CLONE_SYSVSEM | CLONE_FS;
|
||||
long last_pid_len;
|
||||
pid_t thread_pid;
|
||||
long parent_tid;
|
||||
int i, fd = -1;
|
||||
|
||||
/* One level pid ns hierarhy */
|
||||
fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
|
||||
if (fd < 0) {
|
||||
pr_err("can't open last pid fd %d\n", fd);
|
||||
goto core_restore_end;
|
||||
}
|
||||
if (!args->has_clone3_set_tid) {
|
||||
/* One level pid ns hierarhy */
|
||||
fd = sys_openat(args->proc_fd, LAST_PID_PATH, O_RDWR, 0);
|
||||
if (fd < 0) {
|
||||
pr_err("can't open last pid fd %d\n", fd);
|
||||
goto core_restore_end;
|
||||
}
|
||||
|
||||
}
|
||||
mutex_lock(&task_entries_local->last_pid_mutex);
|
||||
|
||||
for (i = 0; i < args->nr_threads; i++) {
|
||||
@ -1791,24 +1795,38 @@ long __export_restore_task(struct task_restore_args *args)
|
||||
continue;
|
||||
|
||||
new_sp = restorer_stack(thread_args[i].mz);
|
||||
last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
|
||||
sys_lseek(fd, 0, SEEK_SET);
|
||||
ret = sys_write(fd, s, last_pid_len);
|
||||
if (ret < 0) {
|
||||
pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
|
||||
sys_close(fd);
|
||||
mutex_unlock(&task_entries_local->last_pid_mutex);
|
||||
goto core_restore_end;
|
||||
if (args->has_clone3_set_tid) {
|
||||
struct _clone_args c_args = {};
|
||||
thread_pid = thread_args[i].pid;
|
||||
c_args.set_tid = ptr_to_u64(&thread_pid);
|
||||
c_args.flags = clone_flags;
|
||||
c_args.set_tid_size = 1;
|
||||
/* The kernel does stack + stack_size. */
|
||||
c_args.stack = new_sp - RESTORE_STACK_SIZE;
|
||||
c_args.stack_size = RESTORE_STACK_SIZE;
|
||||
c_args.child_tid = ptr_to_u64(&thread_args[i].pid);
|
||||
c_args.parent_tid = ptr_to_u64(&parent_tid);
|
||||
pr_debug("Using clone3 to restore the process\n");
|
||||
RUN_CLONE3_RESTORE_FN(ret, c_args, sizeof(c_args), &thread_args[i], args->clone_restore_fn);
|
||||
} else {
|
||||
last_pid_len = std_vprint_num(last_pid_buf, sizeof(last_pid_buf), thread_args[i].pid - 1, &s);
|
||||
sys_lseek(fd, 0, SEEK_SET);
|
||||
ret = sys_write(fd, s, last_pid_len);
|
||||
if (ret < 0) {
|
||||
pr_err("Can't set last_pid %ld/%s\n", ret, last_pid_buf);
|
||||
sys_close(fd);
|
||||
mutex_unlock(&task_entries_local->last_pid_mutex);
|
||||
goto core_restore_end;
|
||||
}
|
||||
|
||||
/*
|
||||
* To achieve functionality like libc's clone()
|
||||
* we need a pure assembly here, because clone()'ed
|
||||
* thread will run with own stack and we must not
|
||||
* have any additional instructions... oh, dear...
|
||||
*/
|
||||
RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
|
||||
}
|
||||
|
||||
/*
|
||||
* To achieve functionality like libc's clone()
|
||||
* we need a pure assembly here, because clone()'ed
|
||||
* thread will run with own stack and we must not
|
||||
* have any additional instructions... oh, dear...
|
||||
*/
|
||||
|
||||
RUN_CLONE_RESTORE_FN(ret, clone_flags, new_sp, parent_tid, thread_args, args->clone_restore_fn);
|
||||
if (ret != thread_args[i].pid) {
|
||||
pr_err("Unable to create a thread: %ld\n", ret);
|
||||
mutex_unlock(&task_entries_local->last_pid_mutex);
|
||||
|
Loading…
x
Reference in New Issue
Block a user