2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

criu: Allow disabling freeze cgroups

Some plugins (e.g., CUDA) may not function correctly when processes are
frozen using cgroups. This change introduces a mechanism to disable the
use of freeze cgroups during process seizing, even if explicitly
requested via the --freeze-cgroup option.

The CUDA plugin is updated to utilize this new mechanism to ensure
compatibility.

Signed-off-by: Andrei Vagin <avagin@google.com>
This commit is contained in:
Andrei Vagin 2024-09-11 17:37:45 -07:00 committed by Andrei Vagin
parent 59f49c6276
commit 651df375bd
3 changed files with 58 additions and 11 deletions

View File

@ -8,5 +8,6 @@ extern bool alarm_timeouted(void);
extern char *task_comm_info(pid_t pid, char *comm, size_t size);
extern char *__task_comm_info(pid_t pid);
extern void dont_use_freeze_cgroup(void);
#endif

View File

@ -25,6 +25,19 @@
#include "xmalloc.h"
#include "util.h"
/* True once dont_use_freeze_cgroup() has been called; false by default. */
static bool freeze_cgroup_disabled;

/*
 * Turn off freezer-cgroup based seizing of processes.
 *
 * The request takes precedence over an explicit --freeze-cgroup option.
 * It exists for plugins (e.g., CUDA) that do not function correctly when
 * processes are frozen using cgroups.
 */
void __attribute__((used)) dont_use_freeze_cgroup(void)
{
	freeze_cgroup_disabled = true;
}
char *task_comm_info(pid_t pid, char *comm, size_t size)
{
bool is_read = false;
@ -397,7 +410,7 @@ static int freezer_detach(void)
{
int i;
if (!opts.freeze_cgroup)
if (!opts.freeze_cgroup || freeze_cgroup_disabled)
return 0;
for (i = 0; i < processes_to_wait && processes_to_wait_pids; i++) {
@ -492,6 +505,31 @@ static int log_unfrozen_stacks(char *root)
return 0;
}
static int check_freezer_cgroup(void)
{
enum freezer_state state = THAWED;
int fd;
BUG_ON(!freeze_cgroup_disabled);
fd = freezer_open();
if (fd < 0)
return -1;
state = get_freezer_state(fd);
close(fd);
if (state == FREEZER_ERROR) {
return -1;
}
if (state != THAWED) {
pr_err("One or more plugins are incompatible with the freezer cgroup in the FROZEN state.\n");
return -1;
}
return 0;
}
static int freeze_processes(void)
{
int fd, exit_code = -1;
@ -643,7 +681,7 @@ static int collect_children(struct pstree_item *item)
goto free;
}
if (!opts.freeze_cgroup)
if (!opts.freeze_cgroup || freeze_cgroup_disabled)
/* fails when meets a zombie */
__ignore_value(compel_interrupt_task(pid));
@ -831,7 +869,8 @@ static int collect_threads(struct pstree_item *item)
pr_info("\tSeizing %d's %d thread\n", item->pid->real, pid);
if (!opts.freeze_cgroup && compel_interrupt_task(pid))
if ((!opts.freeze_cgroup || freeze_cgroup_disabled) &&
compel_interrupt_task(pid))
continue;
ret = compel_wait_task(pid, item_ppid(item), parse_pid_status, NULL, &t_creds.s, NULL);
@ -887,7 +926,7 @@ static int collect_loop(struct pstree_item *item, int (*collect)(struct pstree_i
{
int attempts = NR_ATTEMPTS, nr_inprogress = 1;
if (opts.freeze_cgroup)
if (opts.freeze_cgroup && !freeze_cgroup_disabled)
attempts = 1;
/*
@ -993,12 +1032,16 @@ int collect_pstree(void)
pr_debug("Detected cgroup V%d freezer\n", cgroup_v2 ? 2 : 1);
if (opts.freeze_cgroup && freeze_processes())
goto err;
if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
if (opts.freeze_cgroup && !freeze_cgroup_disabled) {
if (freeze_processes())
goto err;
} else {
if (opts.freeze_cgroup && check_freezer_cgroup())
goto err;
if (compel_interrupt_task(pid)) {
set_cr_errno(ESRCH);
goto err;
}
}
ret = compel_wait_task(pid, -1, parse_pid_status, NULL, &creds.s, NULL);
@ -1024,7 +1067,8 @@ int collect_pstree(void)
if (ret < 0)
goto err;
if (opts.freeze_cgroup && freezer_wait_processes()) {
if (opts.freeze_cgroup && !freeze_cgroup_disabled &&
freezer_wait_processes()) {
ret = -1;
goto err;
}

View File

@ -483,6 +483,8 @@ int cuda_plugin_init(int stage)
INIT_LIST_HEAD(&cuda_pids);
}
dont_use_freeze_cgroup();
return 0;
}