mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 09:58:09 +00:00
cuda: fix check for GPU device availability
The check for `/dev/nvidiactl` to determine if the CUDA plugin can be used is unreliable because in some cases the default path for driver installation is different [1]. This patch changes the logic to check if a GPU device is available in `/proc/driver/nvidia/gpus/`. This approach is similar to `torch.cuda.is_available()` and it is a more accurate indicator. The subsequent check for support of the `cuda-checkpoint --action` option would confirm if the driver supports checkpoint/restore. [1] https://github.com/NVIDIA/gpu-operator Fixes: #2509 Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
This commit is contained in:
parent
31b38d662d
commit
26dcc216c2
@ -470,6 +470,20 @@ int cuda_plugin_resume_devices_late(int pid)
|
||||
}
|
||||
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
|
||||
|
||||
/**
 * Report whether the NVIDIA driver's GPU listing directory is present.
 *
 * Returns true only when "/proc/driver/nvidia/gpus/" can be stat()ed and
 * refers to a directory. Any stat() failure yields false. Only the
 * directory itself is examined; its contents are not inspected.
 */
static bool is_cuda_device_available(void)
{
	struct stat st;
	bool present = false;

	/* A failed stat() leaves the default answer (not available) in place. */
	if (stat("/proc/driver/nvidia/gpus/", &st) == 0)
		present = S_ISDIR(st.st_mode);

	return present;
}
|
||||
|
||||
int cuda_plugin_init(int stage)
|
||||
{
|
||||
int ret;
|
||||
@ -481,8 +495,8 @@ int cuda_plugin_init(int stage)
|
||||
}
|
||||
}
|
||||
|
||||
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
|
||||
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
|
||||
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && !is_cuda_device_available()) {
|
||||
pr_info("No GPU device found; CUDA plugin is disabled\n");
|
||||
plugin_disabled = true;
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user