From 84eb0a1927e60f682aa357f08f00ca32f5cb78c0 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Mon, 30 Jun 2014 20:30:44 +0400 Subject: [PATCH] criu: Restore tasks as siblings in swrk Andrey validly pointed out, that restoring pdeath_sig is not compatible with criu_restore_child() call -- after criu restore children, it will exit and fire the pdeath_sig into restored tree root, potentially killing it. The fix for that could be -- when started in swrk more, criu can restore tree not as children tasks, but as siblings, using the CLONE_PARENT flag when fork()-ing the root task. With this we should also take care about errors handing -- right now criu catches the SIGCHILD from dying children tasks, and since we plan to create them be children of the criu parent (the library caller) we will not be able to catch them. To do so we SEIZE the root task in advance thus causing all SIGCHLD-s go to criu, not to its parent. Having this done we no longer need the SUBREAPER trick in the library call -- tasks get restored right as callers kids :) Some thoughts for future -- using this trick we can finally make "natural" restoration of shell jobs. I.e. -- make criu restore some subtree right under bash, w/o leaving itself as intermediate task and w/o re-parenting the subtree to init after restore. Signed-off-by: Pavel Emelyanov Acked-by: Andrey Vagin --- cr-restore.c | 55 ++++++++++++++++++++++++++++++++++++++++---- crtools.c | 4 +++- include/cr_options.h | 1 + lib/criu.c | 15 ------------ 4 files changed, 54 insertions(+), 21 deletions(-) diff --git a/cr-restore.c b/cr-restore.c index 270d1d162..dc560593c 100644 --- a/cr-restore.c +++ b/cr-restore.c @@ -1374,7 +1374,7 @@ static int restore_switch_stage(int next_stage) return restore_wait_inprogress_tasks(); } -static int attach_to_tasks() +static int attach_to_tasks(bool root_seized) { struct pstree_item *item; @@ -1394,9 +1394,21 @@ static int attach_to_tasks() for (i = 0; i < item->nr_threads; i++) { pid = item->threads[i].real; - if (ptrace(PTRACE_ATTACH, pid, 0, 0)) { - pr_perror("Can't attach to %d", pid); - return -1; + if (item != root_item || !root_seized) { + if (ptrace(PTRACE_ATTACH, pid, 0, 0)) { + pr_perror("Can't attach to %d", pid); + return -1; + } + } else { + /* + * Root item is SEIZE-d, so we only need + * to stop one (INTERRUPT) to make wait4 + * and SYSCALL below work. + */ + if (ptrace(PTRACE_INTERRUPT, pid, 0, 0)) { + pr_perror("Can't interrupt task"); + return -1; + } } if (wait4(pid, &status, __WALL, NULL) != pid) { @@ -1513,10 +1525,43 @@ static int restore_root_task(struct pstree_item *init) futex_set(&task_entries->nr_in_progress, stage_participants(CR_STATE_RESTORE_NS)); + /* + * This means we're called from lib's criu_restore_child(). + * In that case create the root task as the child one to+ + * the caller. This is the only way to correctly restore the + * pdeath_sig of the root task. But also looks nice. + */ + if (opts.swrk_restore) + init->rst->clone_flags |= CLONE_PARENT; + ret = fork_with_pid(init); if (ret < 0) return -1; + if (opts.swrk_restore) { + /* + * Root task is not our sibling. This means, that + * we will not notice when (if) it dies in SIGCHLD + * handler, but we should. To do this -- attach to + * the guy with ptrace and (!) make the kernel + * deliver us the signal when it will get stopped. + * It will in case of e.g. segfault before handling + * the signal. + */ + + act.sa_flags &= ~SA_NOCLDSTOP; + ret = sigaction(SIGCHLD, &act, NULL); + if (ret < 0) { + pr_perror("sigaction() failed"); + goto out; + } + + if (ptrace(PTRACE_SEIZE, init->pid.real, 0, 0)) { + pr_perror("Can't attach to init"); + goto out; + } + } + pr_info("Wait until namespaces are created\n"); ret = restore_wait_inprogress_tasks(); if (ret) @@ -1570,7 +1615,7 @@ static int restore_root_task(struct pstree_item *init) timing_stop(TIME_RESTORE); - ret = attach_to_tasks(); + ret = attach_to_tasks(opts.swrk_restore); pr_info("Restore finished successfully. Resuming tasks.\n"); futex_set_and_wake(&task_entries->start, CR_STATE_COMPLETE); diff --git a/crtools.c b/crtools.c index c3a2e270b..b662fff36 100644 --- a/crtools.c +++ b/crtools.c @@ -183,14 +183,16 @@ int main(int argc, char *argv[]) if (init_service_fd()) return 1; - if (!strcmp(argv[1], "swrk")) + if (!strcmp(argv[1], "swrk")) { /* * This is to start criu service worker from libcriu calls. * The usage is "criu swrk " and is not for CLI/scripts. * The arguments semantics can change at any tyme with the * corresponding lib call change. */ + opts.swrk_restore = true; return cr_service_work(atoi(argv[2])); + } while (1) { idx = -1; diff --git a/include/cr_options.h b/include/cr_options.h index 2732e58d6..55ca70b32 100644 --- a/include/cr_options.h +++ b/include/cr_options.h @@ -34,6 +34,7 @@ struct cr_options { bool link_remap_ok; unsigned int rst_namespaces_flags; bool log_file_per_pid; + bool swrk_restore; char *output; char *root; char *pidfile; diff --git a/lib/criu.c b/lib/criu.c index 86256fd0a..7c1ac0785 100644 --- a/lib/criu.c +++ b/lib/criu.c @@ -15,10 +15,6 @@ #include "rpc.pb-c.h" #include "cr-service-const.h" -#ifndef PR_SET_CHILD_SUBREAPER -#define PR_SET_CHILD_SUBREAPER 36 -#endif - const char *criu_lib_version = CRIU_VERSION; static char *service_address = CR_DEFAULT_SERVICE_ADDRESS; @@ -582,15 +578,6 @@ int criu_restore_child(void) if (socketpair(PF_LOCAL, SOCK_SEQPACKET, 0, sks)) goto out; - /* - * Set us as child subreaper so that after the swrk - * finishes restore and exits the restored subtree - * gets reparented to us. - */ - - if (prctl(PR_SET_CHILD_SUBREAPER, 1, 0, 0)) - goto err; - pid = fork(); if (pid < 0) goto err; @@ -633,8 +620,6 @@ int criu_restore_child(void) close(sks[0]); waitpid(pid, NULL, 0); - /* Drop the subreaper role _after_ swrk exits */ - prctl(PR_SET_CHILD_SUBREAPER, 0, 0, 0); if (!ret) { ret = resp->success ? resp->restore->pid : -EBADE;