2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

criu/plugin: Add NVIDIA CUDA plugin

Adding support for the NVIDIA cuda-checkpoint utility, requires the use of an
r555 or higher driver along with the cuda-checkpoint binary.

Signed-off-by: Jesus Ramos <jeramos@nvidia.com>
This commit is contained in:
Jesus Ramos 2024-05-31 13:38:54 -07:00 committed by Andrei Vagin
parent 5f486d5aee
commit bf417dd050
5 changed files with 578 additions and 4 deletions

View File

@ -165,7 +165,7 @@ HOSTCFLAGS += $(WARNINGS) $(DEFINES) -iquote include/
export AFLAGS CFLAGS USERCLFAGS HOSTCFLAGS
# Default target
all: criu lib crit
all: criu lib crit cuda_plugin
.PHONY: all
#
@ -298,15 +298,19 @@ clean-amdgpu_plugin:
$(Q) $(MAKE) -C plugins/amdgpu clean
.PHONY: clean-amdgpu_plugin
clean-cuda_plugin:
$(Q) $(MAKE) -C plugins/cuda clean
.PHONY: clean-cuda_plugin
clean-top:
$(Q) $(MAKE) -C Documentation clean
$(Q) $(MAKE) $(build)=test/compel clean
$(Q) $(RM) .gitid
.PHONY: clean-top
clean: clean-top clean-amdgpu_plugin
clean: clean-top clean-amdgpu_plugin clean-cuda_plugin
mrproper-top: clean-top clean-amdgpu_plugin
mrproper-top: clean-top clean-amdgpu_plugin clean-cuda_plugin
$(Q) $(RM) $(CONFIG_HEADER)
$(Q) $(RM) $(VERSION_HEADER)
$(Q) $(RM) $(COMPEL_VERSION_HEADER)
@ -338,6 +342,10 @@ amdgpu_plugin: criu
$(Q) $(MAKE) -C plugins/amdgpu all
.PHONY: amdgpu_plugin
cuda_plugin: criu
$(Q) $(MAKE) -C plugins/cuda all
.PHONY: cuda_plugin
crit: lib
$(Q) $(MAKE) -C crit
.PHONY: crit
@ -424,6 +432,7 @@ help:
@echo ' lint - Run code linters'
@echo ' indent - Indent C code'
@echo ' amdgpu_plugin - Make AMD GPU plugin'
@echo ' cuda_plugin - Make NVIDIA CUDA plugin'
.PHONY: help
ruff:

View File

@ -72,12 +72,16 @@ install-amdgpu_plugin: amdgpu_plugin
$(Q) $(MAKE) -C plugins/amdgpu install
.PHONY: install-amdgpu_plugin
install-cuda_plugin: cuda_plugin
$(Q) $(MAKE) -C plugins/cuda install
.PHONY: install-cuda_plugin
install-compel: $(compel-install-targets)
$(Q) $(MAKE) $(build)=compel install
$(Q) $(MAKE) $(build)=compel/plugins install
.PHONY: install-compel
install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin ;
install: install-man install-lib install-crit install-criu install-compel install-amdgpu_plugin install-cuda_plugin ;
.PHONY: install
uninstall:
@ -88,4 +92,5 @@ uninstall:
$(Q) $(MAKE) $(build)=compel $@
$(Q) $(MAKE) $(build)=compel/plugins $@
$(Q) $(MAKE) -C plugins/amdgpu $@
$(Q) $(MAKE) -C plugins/cuda $@
.PHONY: uninstall

42
plugins/cuda/Makefile Normal file
View File

@ -0,0 +1,42 @@
# Build, install and clean rules for the CRIU NVIDIA CUDA plugin (cuda_plugin.so).

PLUGIN_NAME := cuda_plugin
PLUGIN_SOBJ := cuda_plugin.so

DEPS_CUDA := $(PLUGIN_SOBJ)

PLUGIN_INCLUDE := -iquote../../include
PLUGIN_INCLUDE += -iquote../../criu/include
PLUGIN_INCLUDE += -iquote../../criu/arch/$(ARCH)/include/
PLUGIN_INCLUDE += -iquote../../

COMPEL := ../../compel/compel-host
CC := gcc
PLUGIN_CFLAGS := -g -Wall -Werror -shared -nostartfiles -fPIC

__nmk_dir ?= ../../scripts/nmk/scripts/
include $(__nmk_dir)msg.mk

all: $(DEPS_CUDA)
.PHONY: all

# PLUGIN_LDFLAGS is not set in this file; it expands to nothing unless the
# caller or the environment provides it.
cuda_plugin.so: cuda_plugin.c
	$(call msg-gen, $@)
	$(Q) $(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS)

clean:
	$(call msg-clean, $@)
	$(Q) $(RM) $(PLUGIN_SOBJ)
.PHONY: clean

# mrproper has no recipe of its own; it only triggers clean.
mrproper: clean
.PHONY: mrproper

install:
	$(Q) mkdir -p $(DESTDIR)$(PLUGINDIR)
	$(E) " INSTALL " $(PLUGIN_NAME)
	$(Q) install -m 644 $(PLUGIN_SOBJ) $(DESTDIR)$(PLUGINDIR)
.PHONY: install

uninstall:
	$(E) " UNINSTALL" $(PLUGIN_NAME)
	$(Q) $(RM) $(DESTDIR)$(PLUGINDIR)/$(PLUGIN_SOBJ)
.PHONY: uninstall

59
plugins/cuda/README.md Normal file
View File

@ -0,0 +1,59 @@
Checkpoint and Restore for CUDA applications with CRIU
======================================================
# Requirements
The cuda-checkpoint utility should be placed somewhere in your $PATH and an r555
or higher GPU driver is required for CUDA CRIU integration support.
## cuda-checkpoint
The cuda-checkpoint utility can be found at:
https://github.com/NVIDIA/cuda-checkpoint
cuda-checkpoint is a binary utility used to issue checkpointing commands to CUDA
applications. Updating the cuda-checkpoint utility between driver releases
should not be necessary as the utility simply exposes some extra driver behavior
so driver updates are all that's needed to get access to newer features.
# Checkpointing Procedure
cuda-checkpoint exposes 4 actions used in the checkpointing process: lock,
checkpoint, restore, unlock.
* lock - Used with the PAUSE_DEVICES hook while a process is still running to
quiesce the application into a state where it can be checkpointed
* checkpoint - Used with the CHECKPOINT_DEVICES hook once a process has been
seized/frozen to perform the actual checkpointing operation
* restore/unlock - Used with the RESUME_DEVICES_LATE hook to restore the CUDA
state and release the process back to its running state
These actions are facilitated by a CUDA checkpoint+restore thread that the CUDA
plugin will re-wake when needed.
# Known Limitations
* Currently GPU memory contents are brought into main system memory and CRIU
then checkpoints that as part of the normal procedure. On systems with many
GPUs with high GPU memory usage this can cause memory thrashing. A future
CUDA release will add support for dumping the memory contents to files to
alleviate this, as well as support in the CRIU plugin.
* There's currently a small race window: if a process calls cuInit() and
finishes initializing CUDA after the PAUSE_DEVICES hook has been issued on it
but before the process is frozen for checkpointing, cuda-checkpoint will
report that the process is in an illegal state for checkpointing. This should
be very rare; if it happens, it's recommended to just attempt the CRIU
procedure again.
* Applications that use NVML will leave some leftover device references as NVML
is not currently supported for checkpointing. There will be support for this
in later drivers. A possible temporary workaround is to have the
{DUMP,RESTORE}_EXT_FILE hook just ignore /dev/nvidiactl and /dev/nvidia{0..N}
remaining references for these applications as in most cases NVML is used to
get info such as gpu count and some capabilities and these values are never
accessed again and unlikely to change.
* CUDA applications that fork() but don't call exec() but also don't issue any
CUDA API calls will have some leftover references to /dev/nvidia* and fail to
checkpoint as a result. This can be worked around in a similar fashion to the
NVML case where the leftover references can be ignored as CUDA is not fork()
safe anyway.
* Restore currently requires that you restore on a system with similar GPUs and
the same GPU count.
* NVIDIA UVM Managed Memory, MIG (Multi Instance GPU), and MPS (Multi-Process
Service) are currently not supported for checkpointing. Future CUDA releases
will add support for these.

459
plugins/cuda/cuda_plugin.c Normal file
View File

@ -0,0 +1,459 @@
#include "criu-log.h"
#include "plugin.h"
#include "util.h"
#include "cr_options.h"
#include "pid.h"
#include "proc_parse.h"

#include <common/list.h>
#include <compel/infect.h>

#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
/* cuda-checkpoint binary should live in your PATH */
#define CUDA_CHECKPOINT "cuda-checkpoint"
/* cuda-checkpoint --action flags */
#define ACTION_LOCK "lock"
#define ACTION_CHECKPOINT "checkpoint"
#define ACTION_RESTORE "restore"
#define ACTION_UNLOCK "unlock"
#define CUDA_CKPT_BUF_SIZE (128)
#ifdef LOG_PREFIX
#undef LOG_PREFIX
#endif
#define LOG_PREFIX "cuda_plugin: "
/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver
* version doesn't support --action flag
*/
bool plugin_disabled = false;
/* One tracked process: a node on a list of PIDs handled by this plugin. */
struct pid_info {
/* PID passed to add_pid_to_buf() when the entry was created */
int pid;
/* 0 when the entry is created; set to 1 by update_checkpointed_pid() */
char checkpointed;
/* Linkage into the list the entry was added to */
struct list_head list;
};
/* Tracks the PIDs on which CUDA operations have been paused so far, so that
 * they can be released once the DUMP is done
 */
struct list_head cuda_pids;
/* Unlink and free every pid_info entry linked on pid_buf. */
static void dealloc_pid_buffer(struct list_head *pid_buf)
{
	struct pid_info *entry, *next;

	/* The _safe iterator is used because the current entry is removed while walking */
	list_for_each_entry_safe(entry, next, pid_buf, list) {
		list_del(&entry->list);
		xfree(entry);
	}
}
/*
 * Append a new, not-yet-checkpointed entry for pid to the tail of pid_buf.
 *
 * Returns 0 on success, -1 if the entry could not be allocated.
 */
static int add_pid_to_buf(struct list_head *pid_buf, int pid)
{
	struct pid_info *entry;

	entry = xmalloc(sizeof(*entry));
	if (!entry) {
		return -1;
	}

	entry->checkpointed = 0;
	entry->pid = pid;
	list_add_tail(&entry->list, pid_buf);

	return 0;
}
/*
 * Mark the first entry for pid in pid_buf as checkpointed.
 *
 * Returns 0 if an entry was found and updated, -1 if pid is not on the list.
 */
static int update_checkpointed_pid(struct list_head *pid_buf, int pid)
{
	struct pid_info *entry;

	list_for_each_entry(entry, pid_buf, list) {
		if (entry->pid != pid) {
			continue;
		}

		entry->checkpointed = 1;
		return 0;
	}

	return -1;
}
/*
 * Run the command described by args (args[0] is looked up via execvp(), so it
 * is resolved through $PATH) in a child process and capture the beginning of
 * its combined stdout/stderr output.
 *
 * @args:     NULL-terminated argument vector, args[0] being the program name
 * @buf:      receives the first chunk of the child's output as a
 *            NUL-terminated string with one trailing newline stripped; set to
 *            the empty string if nothing could be read
 * @buf_size: size of buf in bytes, must be at least 1
 *
 * Returns the child's exit status (0 on success, 127 if the child could not
 * set up its output or exec the program), or -1 if the child could not be
 * spawned or did not exit normally.
 */
static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
{
#define READ 0
#define WRITE 1
	int fd[2];
	int child_pid;
	int wait_ret;
	int status;
	ssize_t bytes_read;

	if (buf == NULL || buf_size <= 0) {
		return -1;
	}

	if (pipe(fd) != 0) {
		pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
		return -1;
	}

	buf[0] = '\0';

	child_pid = fork();
	if (child_pid == -1) {
		pr_err("Failed to fork to exec cuda-checkpoint\n");
		close(fd[READ]);
		close(fd[WRITE]);
		return -1;
	}

	if (child_pid == 0) { // child
		/* The child is a copy of the calling process: it must never return
		 * into the caller's code, so every failure path ends in _exit()
		 */
		if (dup2(fd[WRITE], STDOUT_FILENO) == -1 || dup2(fd[WRITE], STDERR_FILENO) == -1) {
			_exit(127);
		}
		close(fd[READ]);

		execvp(args[0], (char **)args);

		/* Only reached if execvp() failed; stderr is the pipe, so the
		 * parent gets the reason in buf
		 */
		perror(args[0]);
		_exit(127);
	}

	// parent
	close(fd[WRITE]);

	/* Leave room for the terminating NUL */
	do {
		bytes_read = read(fd[READ], buf, (size_t)buf_size - 1);
	} while (bytes_read == -1 && errno == EINTR);

	if (bytes_read > 0) {
		if (buf[bytes_read - 1] == '\n') {
			bytes_read--;
		}
		buf[bytes_read] = '\0';
	}

	/* Drain whatever is left in the pipe in case the buffer wasn't large
	 * enough, otherwise the child could block on a full pipe and never exit.
	 * read() returns 0 once every write end has been closed.
	 */
	for (;;) {
		char scratch[64];
		ssize_t drained = read(fd[READ], scratch, sizeof(scratch));

		if (drained > 0 || (drained == -1 && errno == EINTR)) {
			continue;
		}
		break;
	}
	close(fd[READ]);

	/* Always reap the child so that no zombie is left behind */
	do {
		wait_ret = waitpid(child_pid, &status, 0);
	} while (wait_ret == -1 && errno == EINTR);

	if (wait_ret == -1 || !WIFEXITED(status)) {
		pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
		return -1;
	}

	return WEXITSTATUS(status);
}
/*
 * Check whether the help text printed by "cuda-checkpoint -h" mentions flag.
 *
 * Returns false if the utility could not be run successfully or if flag does
 * not appear in the captured output, true otherwise.
 */
static bool cuda_checkpoint_supports_flag(const char *flag)
{
	const char *help_cmd[] = { CUDA_CHECKPOINT, "-h", NULL };
	char help_text[2048];

	if (launch_cuda_checkpoint(help_cmd, help_text, sizeof(help_text)) != 0) {
		pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
		return false;
	}

	return strstr(help_text, flag) != NULL;
}
/*
 * Retrieve the cuda restore thread TID from the root pid by running
 * "cuda-checkpoint --get-restore-tid --pid <root_pid>".
 *
 * Returns the TID (always > 0) on success, or -1 if cuda-checkpoint exited
 * with a non-zero status or its output does not start with a positive number
 * that fits in an int.
 */
static int get_cuda_restore_tid(int root_pid)
{
	char pid_buf[16];
	char pid_out[CUDA_CKPT_BUF_SIZE];
	char *end;
	long tid;

	snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid);

	const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL };
	int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out));
	if (ret != 0) {
		pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out);
		return -1;
	}

	/* Unlike atoi(), strtol() lets us tell "no digits at all" apart from a
	 * real value, so unparsable output is reported rather than being used
	 * as TID 0
	 */
	tid = strtol(pid_out, &end, 10);
	if (end == pid_out || tid <= 0 || tid != (long)(int)tid) {
		pr_err("Unexpected restore tid output from cuda-checkpoint: %s\n", pid_out);
		return -1;
	}

	return (int)tid;
}
/*
 * Run "cuda-checkpoint --action <action> --pid <pid>", optionally followed by
 * "--timeout <timeout>".
 *
 * @pid:      value passed to --pid
 * @action:   value passed to --action (one of the ACTION_* strings)
 * @timeout:  passed through as the --timeout value; the option is omitted
 *            when this is 0
 * @msg_buf:  receives the output captured by launch_cuda_checkpoint()
 * @buf_size: size of msg_buf in bytes
 *
 * Returns whatever launch_cuda_checkpoint() returns.
 */
static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf,
					  int buf_size)
{
	char pid_buf[16];
	char timeout_buf[16];

	snprintf(pid_buf, sizeof(pid_buf), "%d", pid);

	/* Slots 5 and 6 stay NULL (terminating the vector early) unless a
	 * timeout was requested
	 */
	const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
			       NULL /* timeout_val */, NULL };
	if (timeout > 0) {
		/* timeout is unsigned, so it must be formatted with %u */
		snprintf(timeout_buf, sizeof(timeout_buf), "%u", timeout);
		args[5] = "--timeout";
		args[6] = timeout_buf;
	}

	return launch_cuda_checkpoint(args, msg_buf, buf_size);
}
/*
 * Stop the CUDA restore thread again after it has been resumed, and put back
 * the ptrace options and the signal mask saved in restore_sigset.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset)
{
	struct proc_status_creds creds;
	int task_state;

	/* The thread was frozen by CRIU earlier and then resumed by us, so it has
	 * to be INTERRUPTed once more. It is already SEIZE'd, which is why a
	 * compel_interrupt_task() is not needed here
	 */
	if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n",
		       restore_tid);
		return -1;
	}

	task_state = compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL);
	if (task_state != COMPEL_TASK_ALIVE) {
		pr_err("compel_wait_task failed after interrupt\n");
		return -1;
	}

	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) != 0) {
		pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset) != 0) {
		pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
/*
 * Let the CUDA restore thread run: save its current signal mask into
 * save_sigset, block every signal except SIGTRAP, clear the ptrace options
 * and continue the thread.
 *
 * Returns 0 on success, -1 as soon as any ptrace request fails.
 */
static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset)
{
	k_rtsigset_t all_but_trap;

	/* Everything blocked except SIGTRAP */
	ksigfillset(&all_but_trap);
	ksigdelset(&all_but_trap, SIGTRAP);

	if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset) != 0) {
		pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(all_but_trap), &all_but_trap) != 0) {
		pr_err("Failed to block signals on restore tid %d\n", restore_tid);
		return -1;
	}

	/* Options are reset to 0 so that PTRACE_O_SUSPEND_SECCOMP is no longer
	 * set while the restore thread runs
	 */
	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0) != 0) {
		pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_CONT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not resume cuda restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
/*
 * CHECKPOINT_DEVICES hook: run the cuda-checkpoint "checkpoint" action on pid
 * and record the pid as checkpointed in cuda_pids.
 *
 * Returns 0 when the plugin is disabled, when no restore TID could be obtained
 * for pid, or when everything succeeded. Returns non-zero if the restore
 * thread could not be resumed, the checkpoint action failed, the pid could not
 * be marked as checkpointed, or the restore thread could not be stopped again.
 */
int cuda_plugin_checkpoint_devices(int pid)
{
	int restore_tid;
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	int int_ret;
	int status;
	k_rtsigset_t save_sigset;

	if (plugin_disabled) {
		return 0;
	}

	restore_tid = get_cuda_restore_tid(pid);

	/* We can possibly hit a race with cuInit() where we are past the point of
	 * locking the process but at lock time cuInit() hadn't completed in which
	 * case cuda-checkpoint will report that we're in an invalid state to
	 * checkpoint
	 */
	if (restore_tid == -1) {
		pr_info("No need to checkpoint devices on pid %d\n", pid);
		return 0;
	}

	pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);

	/* We need to resume the checkpoint thread to prepare the mappings for
	 * checkpointing
	 */
	if (resume_restore_thread(restore_tid, &save_sigset)) {
		return -1;
	}

	status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf));
	if (status) {
		pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
		goto interrupt;
	}

	status = update_checkpointed_pid(&cuda_pids, pid);
	if (status) {
		pr_err("Failed to track checkpointed pid %d\n", pid);
		/* Roll the checkpoint back, but keep status non-zero: even if the
		 * rollback succeeds this pid has not been checkpointed, so the hook
		 * must still report failure
		 */
		if (cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf))) {
			pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
		}
	}

interrupt:
	/* The restore thread was resumed above, so stop it again on every path */
	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

	return status != 0 ? status : int_ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices);
/*
 * PAUSE_DEVICES hook: run the cuda-checkpoint "lock" action on pid and add the
 * pid to cuda_pids.
 *
 * Returns 0 when the plugin is disabled, when no restore TID could be obtained
 * for pid, or on success; -1 if locking or tracking the pid failed.
 */
int cuda_plugin_pause_devices(int pid)
{
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	int lock_status;

	if (plugin_disabled) {
		return 0;
	}

	if (get_cuda_restore_tid(pid) == -1) {
		pr_info("no need to pause devices on pid %d\n", pid);
		return 0;
	}

	pr_info("pausing devices on pid %d\n", pid);

	lock_status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
	if (lock_status != 0) {
		pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
		return -1;
	}

	if (add_pid_to_buf(&cuda_pids, pid) == 0) {
		return 0;
	}

	/* The pid can't be tracked, so undo the lock right away */
	pr_err("unable to track paused pid %d\n", pid);
	if (cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
	}

	return -1;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
/*
 * Release CUDA on pid: run the "restore" action first when checkpointed is
 * non-zero, then the "unlock" action. The unlock is skipped if the restore
 * fails.
 *
 * Returns 0 if no restore TID could be obtained for pid or on success, -1 if
 * the restore thread could not be resumed or one of the actions failed;
 * otherwise the result of stopping the restore thread again.
 */
int resume_device(int pid, int checkpointed)
{
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	k_rtsigset_t saved_mask;
	int stop_ret;
	int ret = 0;
	int restore_tid;

	restore_tid = get_cuda_restore_tid(pid);
	if (restore_tid == -1) {
		pr_info("No need to resume devices on pid %d\n", pid);
		return 0;
	}

	pr_info("resuming devices on pid %d\n", pid);

	/* The process being resumed must remain frozen for now: touching a UVM
	 * pointer before the underlying mappings are restored would crash it
	 */
	pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid);

	/* Wake the restore thread up so it can carry out the restore for this
	 * pid, rseq_cs has to be restored before execution
	 */
	if (resume_restore_thread(restore_tid, &saved_mask) != 0) {
		return -1;
	}

	if (checkpointed && cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf);
		ret = -1;
	} else if (cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf);
		ret = -1;
	}

	stop_ret = interrupt_restore_thread(restore_tid, &saved_mask);

	return ret != 0 ? ret : stop_ret;
}
/*
 * RESUME_DEVICES_LATE hook: restore and unlock CUDA on pid, unless the plugin
 * is disabled, in which case nothing is done and 0 is returned.
 */
int cuda_plugin_resume_devices_late(int pid)
{
	return plugin_disabled ? 0 : resume_device(pid, 1);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
/*
 * Plugin init callback. Disables the plugin when cuda-checkpoint does not
 * advertise the --action flag; otherwise sets up the paused-PID list for the
 * DUMP stage. Always returns 0.
 */
int cuda_plugin_init(int stage)
{
	bool action_supported = cuda_checkpoint_supports_flag("--action");

	if (action_supported) {
		pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage);

		/* During DUMP every PID whose CUDA operations we pause is recorded,
		 * so they can be released at the end if the user requested the
		 * leave-running option
		 */
		if (stage == CR_PLUGIN_STAGE__DUMP) {
			INIT_LIST_HEAD(&cuda_pids);
		}
	} else {
		pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
		plugin_disabled = true;
	}

	return 0;
}
/*
 * Plugin fini callback. Does nothing when the plugin is disabled. At the end
 * of the DUMP stage it resumes every tracked PID if the tasks are to be left
 * alive or the stage failed (ret != 0), then frees the tracking list.
 */
void cuda_plugin_fini(int stage, int ret)
{
	struct pid_info *entry;

	if (plugin_disabled) {
		return;
	}

	pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret);

	/* Only the DUMP stage has paused PIDs to deal with */
	if (stage != CR_PLUGIN_STAGE__DUMP) {
		return;
	}

	/* Let all paused PIDs go again when the user passed the -R
	 * (leave-running) flag or when an error occurred
	 */
	if (opts.final_state == TASK_ALIVE || ret != 0) {
		list_for_each_entry(entry, &cuda_pids, list) {
			resume_device(entry->pid, entry->checkpointed);
		}
	}

	dealloc_pid_buffer(&cuda_pids);
}
CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini)