mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 01:51:51 +00:00
seize: fix pause-devices plugin hook
The plugin hook "PAUSE_DEVICES" was recently introduced in the commit referenced below. This hook was intended to execute the cuda-checkpoint tool before the process tree is frozen. However, the run_plugins() call was placed immediately *after* freeze_processes(). This causes the cuda-checkpoint tool to hang indefinitely during the checkpointing of CUDA applications running in containers, eventually leading to its termination by the timeout alarm.

Commit that introduced the hook:
a85f488595e0a3a6e6cc6ca7c94d4a00b1341aaf
criu/plugin: Introduce new plugin hooks PAUSE_DEVICES and CHECKPOINT_DEVICES to be used during pstree collection

This problem can be reproduced with the following example:

    sudo podman run -d --rm \
        --device nvidia.com/gpu=all --security-opt=label=disable \
        quay.io/radostin/cuda-counter

    sudo podman container checkpoint -l -e /tmp/checkpoint.tar

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
21108b40de
commit
85050be66b
10
criu/seize.c
10
criu/seize.c
@@ -983,6 +983,11 @@ int collect_pstree(void)
|
||||
*/
|
||||
alarm(opts.timeout);
|
||||
|
||||
ret = run_plugins(PAUSE_DEVICES, pid);
|
||||
if (ret < 0 && ret != -ENOTSUP) {
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (opts.freeze_cgroup && cgroup_version())
|
||||
goto err;
|
||||
|
||||
@@ -991,11 +996,6 @@ int collect_pstree(void)
|
||||
if (opts.freeze_cgroup && freeze_processes())
|
||||
goto err;
|
||||
|
||||
ret = run_plugins(PAUSE_DEVICES, pid);
|
||||
if (ret < 0 && ret != -ENOTSUP) {
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (!opts.freeze_cgroup && compel_interrupt_task(pid)) {
|
||||
set_cr_errno(ESRCH);
|
||||
goto err;
|
||||
|
Loading…
x
Reference in New Issue
Block a user