2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00

seize: use separate checkpoint_devices function

Move `run_plugins(CHECKPOINT_DEVICES)` out of `collect_pstree()` to
ensure that the function's sole responsibility is to use the cgroup
freezer for the process tree. This allows us to avoid a time-out
error when checkpointing applications with large GPU state.

v2: This patch calls `checkpoint_devices()` only for `criu dump`.
Support for GPU checkpointing with `pre-dump` will be introduced in
a separate patch.

Suggested-by: Andrei Vagin <avagin@google.com>
Suggested-by: Jesus Ramos <jeramos@nvidia.com>
Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
Radostin Stoyanov 2024-12-21 14:17:35 +00:00 committed by Andrei Vagin
parent 59b022db35
commit dcd8808db0
3 changed files with 22 additions and 9 deletions

View File

@ -2192,6 +2192,9 @@ int cr_dump_tasks(pid_t pid)
if (collect_pstree()) if (collect_pstree())
goto err; goto err;
if (checkpoint_devices())
goto err;
if (collect_pstree_ids()) if (collect_pstree_ids())
goto err; goto err;

View File

@ -2,6 +2,7 @@
#define __CR_SEIZE_H__ #define __CR_SEIZE_H__
extern int collect_pstree(void); extern int collect_pstree(void);
extern int checkpoint_devices(void);
struct pstree_item; struct pstree_item;
extern void pstree_switch_state(struct pstree_item *root_item, int st); extern void pstree_switch_state(struct pstree_item *root_item, int st);
extern const char *get_real_freezer_state(void); extern const char *get_real_freezer_state(void);

View File

@ -1050,7 +1050,6 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real; pid_t pid = root_item->pid->real;
int ret, exit_code = -1; int ret, exit_code = -1;
struct proc_status_creds creds; struct proc_status_creds creds;
struct pstree_item *iter;
timing_start(TIME_FREEZING); timing_start(TIME_FREEZING);
@ -1111,14 +1110,6 @@ int collect_pstree(void)
goto err; goto err;
} }
for_each_pstree_item(iter) {
if (!task_alive(iter))
continue;
ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real);
if (ret < 0 && ret != -ENOTSUP)
goto err;
}
exit_code = 0; exit_code = 0;
timing_stop(TIME_FREEZING); timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN); timing_start(TIME_FROZEN);
@ -1128,3 +1119,21 @@ err:
alarm(0); alarm(0);
return exit_code; return exit_code;
} }
/*
 * Invoke the CHECKPOINT_DEVICES plugin hook once for every alive task
 * in the process tree, passing the task's real PID.
 *
 * A plugin result of -ENOTSUP is tolerated and the walk continues; any
 * other negative result stops the walk immediately.
 *
 * Returns 0 when every alive task was processed, -1 on the first
 * failing hook.
 */
int checkpoint_devices(void)
{
	struct pstree_item *item;

	for_each_pstree_item(item) {
		int rc;

		if (task_alive(item)) {
			rc = run_plugins(CHECKPOINT_DEVICES, item->pid->real);
			/* -ENOTSUP is not treated as a failure. */
			if (rc < 0 && rc != -ENOTSUP)
				return -1;
		}
	}

	return 0;
}