2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

cuda: fix check for GPU device availability

The check for `/dev/nvidiactl` to determine if the CUDA plugin can be
used is unreliable because in some cases the default path for driver
installation is different [1]. This patch changes the logic to check
if a GPU device is available in `/proc/driver/nvidia/gpus/`. This
approach is similar to `torch.cuda.is_available()` and it is a more
accurate indicator.

The subsequent check for the `cuda-checkpoint --action` option then
confirms whether the driver supports checkpoint/restore.

[1] https://github.com/NVIDIA/gpu-operator

Fixes: #2509

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
Radostin Stoyanov 2024-11-02 08:29:43 +00:00 committed by Andrei Vagin
parent 31b38d662d
commit 26dcc216c2

View File

@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
/**
 * Report whether the NVIDIA driver's per-GPU procfs directory is present.
 *
 * Returns true when "/proc/driver/nvidia/gpus/" can be stat()ed and refers
 * to a directory; returns false when stat() fails or the path is not a
 * directory. The contents of the directory are not inspected.
 */
static bool is_cuda_device_available(void)
{
	struct stat info;
	int rc = stat("/proc/driver/nvidia/gpus/", &info);

	/* Only a successfully stat()ed directory counts as "available". */
	if (rc == 0 && S_ISDIR(info.st_mode))
		return true;

	return false;
}
int cuda_plugin_init(int stage)
{
int ret;
@ -481,8 +495,8 @@ int cuda_plugin_init(int stage)
}
}
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) {
pr_info("No GPU device found; CUDA plugin is disabled\n");
plugin_disabled = true;
return 0;
}