2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00

criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection

PAUSE_DEVICES is called before a process is frozen. The CUDA plugin uses it
to place the process in a state that is ready to be checkpointed and to
quiesce any pending work.

CHECKPOINT_DEVICES is called after all processes in the tree have been frozen
and paused (via PAUSE_DEVICES), and it performs the actual checkpointing
operation for CUDA applications.

Signed-off-by: Jesus Ramos <jeramos@nvidia.com>
This commit is contained in:
Jesus Ramos 2024-06-06 11:16:07 -07:00 committed by Andrei Vagin
parent 1012e542e5
commit 5f486d5aee
3 changed files with 26 additions and 0 deletions

View File

@ -56,6 +56,10 @@ enum {
CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9, CR_PLUGIN_HOOK__RESUME_DEVICES_LATE = 9,
CR_PLUGIN_HOOK__PAUSE_DEVICES = 10,
CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,
CR_PLUGIN_HOOK__MAX CR_PLUGIN_HOOK__MAX
}; };
@ -72,6 +76,8 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, int fd, const struct
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr, DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const uint64_t addr,
const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd); const uint64_t old_pgoff, uint64_t *new_pgoff, int *plugin_fd);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid); DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
enum { enum {
CR_PLUGIN_STAGE__DUMP, CR_PLUGIN_STAGE__DUMP,

View File

@ -57,6 +57,8 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma"); __assign_hook(HANDLE_DEVICE_VMA, "cr_plugin_handle_device_vma");
__assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map"); __assign_hook(UPDATE_VMA_MAP, "cr_plugin_update_vma_map");
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late"); __assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
#undef __assign_hook #undef __assign_hook

View File

@ -16,6 +16,7 @@
#include "pstree.h" #include "pstree.h"
#include "criu-log.h" #include "criu-log.h"
#include <compel/ptrace.h> #include <compel/ptrace.h>
#include "plugin.h"
#include "proc_parse.h" #include "proc_parse.h"
#include "seccomp.h" #include "seccomp.h"
#include "seize.h" #include "seize.h"
@ -637,6 +638,11 @@ static int collect_children(struct pstree_item *item)
goto free; goto free;
} }
ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto free;
}
if (!opts.freeze_cgroup) if (!opts.freeze_cgroup)
/* fails when meets a zombie */ /* fails when meets a zombie */
__ignore_value(compel_interrupt_task(pid)); __ignore_value(compel_interrupt_task(pid));
@ -966,6 +972,7 @@ int collect_pstree(void)
pid_t pid = root_item->pid->real; pid_t pid = root_item->pid->real;
int ret = -1; int ret = -1;
struct proc_status_creds creds; struct proc_status_creds creds;
struct pstree_item *iter;
timing_start(TIME_FREEZING); timing_start(TIME_FREEZING);
@ -984,6 +991,11 @@ int collect_pstree(void)
if (opts.freeze_cgroup && freeze_processes()) if (opts.freeze_cgroup && freeze_processes())
goto err; goto err;
ret = run_plugins(PAUSE_DEVICES, pid);
if (ret < 0 && ret != -ENOTSUP) {
goto err;
}
if (!opts.freeze_cgroup && compel_interrupt_task(pid)) { if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
set_cr_errno(ESRCH); set_cr_errno(ESRCH);
goto err; goto err;
@ -1017,6 +1029,12 @@ int collect_pstree(void)
goto err; goto err;
} }
for_each_pstree_item(iter) {
ret = run_plugins(CHECKPOINT_DEVICES, iter->pid->real);
if (ret < 0 && ret != -ENOTSUP)
goto err;
}
ret = 0; ret = 0;
timing_stop(TIME_FREEZING); timing_stop(TIME_FREEZING);
timing_start(TIME_FROZEN); timing_start(TIME_FROZEN);