From c42b58f4fb2e05feaf0228f63d0fcf828cc2875c Mon Sep 17 00:00:00 2001 From: Radostin Stoyanov Date: Wed, 24 Jul 2024 11:30:59 +0100 Subject: [PATCH] plugin: enable multiple plugins for the same hook CRIU provides two plugins for checkpoint/restore of GPU applications: amdgpu and cuda. Both plugins use the `RESUME_DEVICES_LATE` hook to enable restore: CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late) CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late) However, CRIU currently does not support running more than one plugin for the same hook: it stops at the first plugin whose hook returns anything other than `-ENOTSUP`. As a result, when both plugins are installed, the resume function for CUDA applications is not executed. To fix this, we need to make sure that both `plugin_resume_devices_late()` functions return `-ENOTSUP` when restore is not supported, so that CRIU moves on to the next plugin registered for the hook. Signed-off-by: Radostin Stoyanov --- plugins/amdgpu/amdgpu_plugin.c | 3 ++- plugins/cuda/cuda_plugin.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index a41469a50..b73b5101d 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -1809,7 +1809,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC); if (fd < 0) { pr_perror("failed to open kfd in plugin"); - return -1; + return -ENOTSUP; } args.pid = target_pid; @@ -1818,6 +1818,7 @@ int amdgpu_plugin_resume_devices_late(int target_pid) if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) { if (errno == ESRCH) { pr_info("Pid %d has no kfd process info\n", target_pid); + exit_code = -ENOTSUP; } else { pr_perror("restore late ioctl failed"); exit_code = -1; diff --git a/plugins/cuda/cuda_plugin.c b/plugins/cuda/cuda_plugin.c index b3f2fc8df..f16c4c505 100644 --- a/plugins/cuda/cuda_plugin.c +++ b/plugins/cuda/cuda_plugin.c @@ -408,7 +408,7 @@ interrupt: int cuda_plugin_resume_devices_late(int pid) {
if (plugin_disabled) { - return 0; + return -ENOTSUP; } return resume_device(pid, 1);