From 651df375bd3cbacd27bde44546f19f29546db576 Mon Sep 17 00:00:00 2001
From: Andrei Vagin
Date: Wed, 11 Sep 2024 17:37:45 -0700
Subject: [PATCH] criu: Allow disabling freeze cgroups

Some plugins (e.g., CUDA) may not function correctly when processes are
frozen using cgroups. This change introduces a mechanism to disable the
use of freeze cgroups during process seizing, even if explicitly
requested via the --freeze-cgroup option.

The CUDA plugin is updated to utilize this new mechanism to ensure
compatibility.

Signed-off-by: Andrei Vagin
---
 criu/include/seize.h       |  1 +
 criu/seize.c               | 66 +++++++++++++++++++++++++++++++-------
 plugins/cuda/cuda_plugin.c |  2 ++
 3 files changed, 58 insertions(+), 11 deletions(-)

diff --git a/criu/include/seize.h b/criu/include/seize.h
index 4545bf262..3225029dd 100644
--- a/criu/include/seize.h
+++ b/criu/include/seize.h
@@ -8,5 +8,6 @@ extern bool alarm_timeouted(void);
 
 extern char *task_comm_info(pid_t pid, char *comm, size_t size);
 extern char *__task_comm_info(pid_t pid);
+extern void dont_use_freeze_cgroup(void);
 
 #endif
diff --git a/criu/seize.c b/criu/seize.c
index ba26072e6..edeb57cc8 100644
--- a/criu/seize.c
+++ b/criu/seize.c
@@ -25,6 +25,19 @@
 #include "xmalloc.h"
 #include "util.h"
 
+static bool freeze_cgroup_disabled;
+
+/*
+ * Disable the freezer cgroup for seizing even if --freeze-cgroup was given.
+ * Needed by plugins (e.g., CUDA) that misbehave when tasks are frozen via
+ * cgroups: tasks are then interrupted with compel_interrupt_task() instead,
+ * and collect_pstree() fails if the requested cgroup is not already THAWED.
+ */
+void __attribute__((used)) dont_use_freeze_cgroup(void)
+{
+	freeze_cgroup_disabled = true;
+}
+
 char *task_comm_info(pid_t pid, char *comm, size_t size)
 {
 	bool is_read = false;
@@ -397,7 +410,7 @@ static int freezer_detach(void)
 {
 	int i;
 
-	if (!opts.freeze_cgroup)
+	if (!opts.freeze_cgroup || freeze_cgroup_disabled)
 		return 0;
 
 	for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) {
@@ -492,6 +505,31 @@ static int log_unfrozen_stacks(char *root)
 	return 0;
 }
 
+static int check_freezer_cgroup(void)
+{
+	enum freezer_state state = THAWED;
+	int fd;
+
+	BUG_ON(!freeze_cgroup_disabled);
+
+	fd = freezer_open();
+	if (fd < 0)
+		return -1;
+
+	state = get_freezer_state(fd);
+	close(fd);
+	if (state == FREEZER_ERROR) {
+		return -1;
+	}
+
+	if (state != THAWED) {
+		pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n");
+		return -1;
+	}
+
+	return 0;
+}
+
 static int freeze_processes(void)
 {
 	int fd, exit_code = -1;
@@ -643,7 +681,7 @@ static int collect_children(struct pstree_item *item)
 			goto free;
 		}
 
-		if (!opts.freeze_cgroup)
+		if (!opts.freeze_cgroup || freeze_cgroup_disabled)
 			/* fails when meets a zombie */
 			__ignore_value(compel_interrupt_task(pid));
 
@@ -831,7 +869,8 @@ static int collect_threads(struct pstree_item *item)
 
 		pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid);
 
-		if (!opts.freeze_cgroup && compel_interrupt_task(pid))
+		if ((!opts.freeze_cgroup || freeze_cgroup_disabled) &&
+		    compel_interrupt_task(pid))
 			continue;
 
 		ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL);
@@ -887,7 +926,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i
 {
 	int attempts = NR_ATTEMPTS, nr_inprogress = 1;
 
-	if (opts.freeze_cgroup)
+	if (opts.freeze_cgroup && !freeze_cgroup_disabled)
 		attempts = 1;
 
 	/*
@@ -993,12 +1032,16 @@ int collect_pstree(void)
 
 	pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1);
 
-	if (opts.freeze_cgroup && freeze_processes())
-		goto err;
-
-	if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
-		set_cr_errno(ESRCH);
-		goto err;
+	if (opts.freeze_cgroup && !freeze_cgroup_disabled) {
+		if (freeze_processes())
+			goto err;
+	} else {
+		if (opts.freeze_cgroup && check_freezer_cgroup())
+			goto err;
+		if (compel_interrupt_task(pid)) {
+			set_cr_errno(ESRCH);
+			goto err;
+		}
 	}
 
 	ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL);
@@ -1024,7 +1067,8 @@ int collect_pstree(void)
 	if (ret < 0)
 		goto err;
 
-	if (opts.freeze_cgroup && freezer_wait_processes()) {
+	if (opts.freeze_cgroup && !freeze_cgroup_disabled &&
+	    freezer_wait_processes()) {
 		ret = -1;
 		goto err;
 	}
diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c
index 174545476..04d70b114 100644
--- a/plugins/cuda/cuda_plugin.c
+++ b/plugins/cuda/cuda_plugin.c
@@ -483,6 +483,8 @@ int cuda_plugin_init(int stage)
 		INIT_LIST_HEAD(&cuda_pids);
 	}
 
+	dont_use_freeze_cgroup();
+
 	return 0;
 }
 