#include "criu-log.h"
#include "plugin.h"
#include "util.h"
#include "cr_options.h"
#include "pid.h"
#include "proc_parse.h"

/*
 * NOTE(review): the header names of the angle-bracket includes were missing
 * from the reviewed copy of this file. The list below is rebuilt from the
 * symbols the code uses; confirm the two project headers (common/list.h and
 * compel/infect.h) against the tree.
 */
#include <errno.h>
#include <fcntl.h>
#include <poll.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

#include <common/list.h>
#include <compel/infect.h>

/* cuda-checkpoint binary should live in your PATH */
#define CUDA_CHECKPOINT "cuda-checkpoint"

/* cuda-checkpoint --action flags */
#define ACTION_LOCK	  "lock"
#define ACTION_CHECKPOINT "checkpoint"
#define ACTION_RESTORE	  "restore"
#define ACTION_UNLOCK	  "unlock"

/* Size of the buffers used to capture cuda-checkpoint output */
#define CUDA_CKPT_BUF_SIZE (128)

#ifdef LOG_PREFIX
#undef LOG_PREFIX
#endif
#define LOG_PREFIX "cuda_plugin: "

/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver
 * version doesn't support --action flag */
bool plugin_disabled = false;

/* One entry per PID that has been locked by cuda_plugin_pause_devices() */
struct pid_info {
	int pid;
	/* Set to 1 once the CHECKPOINT action succeeded for this pid */
	char checkpointed;
	struct list_head list;
};

/* Used to track which PID's we've paused CUDA operations on so far so we can
 * release them after we're done with the DUMP */
struct list_head cuda_pids;

/* Unlink and free every pid_info entry on @pid_buf */
static void dealloc_pid_buffer(struct list_head *pid_buf)
{
	struct pid_info *info;
	struct pid_info *n;

	list_for_each_entry_safe(info, n, pid_buf, list) {
		list_del(&info->list);
		xfree(info);
	}
}

/*
 * Append a new, not yet checkpointed, entry for @pid to @pid_buf.
 * Returns 0 on success, -1 if the allocation failed.
 */
static int add_pid_to_buf(struct list_head *pid_buf, int pid)
{
	struct pid_info *new = xmalloc(sizeof(*new));

	if (new == NULL) {
		return -1;
	}

	new->pid = pid;
	new->checkpointed = 0;
	list_add_tail(&new->list, pid_buf);

	return 0;
}

/*
 * Mark the entry for @pid in @pid_buf as checkpointed.
 * Returns 0 on success, -1 if @pid is not on the list.
 */
static int update_checkpointed_pid(struct list_head *pid_buf, int pid)
{
	struct pid_info *info;

	list_for_each_entry(info, pid_buf, list) {
		if (info->pid == pid) {
			info->checkpointed = 1;
			return 0;
		}
	}

	return -1;
}

/*
 * Fork and exec @args (NULL-terminated argv, args[0] is looked up in $PATH),
 * capturing the child's stdout and stderr into @buf.
 *
 * At most @buf_size - 1 bytes of output are stored; @buf is always
 * NUL-terminated and trailing newlines are stripped. Output that does not fit
 * is read and discarded so the child cannot block on a full pipe.
 *
 * Returns the child's exit status, or -1 if the child could not be started,
 * its output could not be read, or it did not exit normally.
 */
static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
{
#define READ  0
#define WRITE 1
	int fd[2];
	pid_t child_pid;
	pid_t waited;
	int status;
	int filled = 0;
	bool drain_failed = false;
	ssize_t nread;
	char scratch[64];

	if (buf == NULL || buf_size <= 0) {
		pr_err("Invalid output buffer for cuda-checkpoint\n");
		return -1;
	}

	if (pipe(fd) != 0) {
		pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
		return -1;
	}

	buf[0] = '\0';

	child_pid = fork();
	if (child_pid == -1) {
		pr_err("Failed to fork to exec cuda-checkpoint\n");
		close(fd[READ]);
		close(fd[WRITE]);
		return -1;
	}

	if (child_pid == 0) {
		/*
		 * Child. It must never return from this function: doing so would
		 * leave a second copy of the parent running the caller's code.
		 * Every failure path therefore ends in _exit().
		 */
		if (dup2(fd[WRITE], STDOUT_FILENO) == -1 || dup2(fd[WRITE], STDERR_FILENO) == -1) {
			_exit(EXIT_FAILURE);
		}
		close(fd[READ]);
		/* Don't leak the extra descriptor, unless it already is stdout/stderr */
		if (fd[WRITE] != STDOUT_FILENO && fd[WRITE] != STDERR_FILENO) {
			close(fd[WRITE]);
		}

		execvp(args[0], (char **)args);

		/* Only reached if exec failed; stderr is the pipe, so the parent sees this */
		fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno));
		_exit(EXIT_FAILURE);
	}

	/* Parent */
	close(fd[WRITE]);

	/*
	 * A single read() may return only part of the output, so keep reading
	 * until the buffer is full (leaving room for the terminator) or EOF.
	 */
	while (filled < buf_size - 1) {
		nread = read(fd[READ], buf + filled, (size_t)(buf_size - 1 - filled));
		if (nread > 0) {
			filled += (int)nread;
			continue;
		}
		if (nread == -1 && errno == EINTR) {
			continue;
		}
		break;
	}
	buf[filled] = '\0';
	while (filled > 0 && buf[filled - 1] == '\n') {
		buf[--filled] = '\0';
	}

	/* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */
	for (;;) {
		nread = read(fd[READ], scratch, sizeof(scratch));
		if (nread > 0) {
			continue;
		}
		if (nread == -1 && errno == EINTR) {
			continue;
		}
		break;
	}
	if (nread == -1) {
		pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n");
		drain_failed = true;
	}

	/* Close before waiting so a still-writing child gets EPIPE instead of blocking */
	close(fd[READ]);

	/* Always reap the child, even after a read error, to avoid leaving a zombie */
	do {
		waited = waitpid(child_pid, &status, 0);
	} while (waited == -1 && errno == EINTR);

	if (waited == -1 || !WIFEXITED(status)) {
		pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
		return -1;
	}

	if (drain_failed) {
		return -1;
	}

	return WEXITSTATUS(status);
}

/*
 * Run "cuda-checkpoint -h" and report whether @flag appears in the captured
 * help text. Returns false as well if the utility could not be run.
 */
static bool cuda_checkpoint_supports_flag(const char *flag)
{
	char msg_buf[2048];
	const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
	int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf));

	if (ret != 0) {
		pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
		return false;
	}

	return strstr(msg_buf, flag) != NULL;
}

/*
 * Retrieve the cuda restore thread TID from the root pid.
 *
 * Returns -1 if cuda-checkpoint exits with a non-zero status, otherwise the
 * atoi() conversion of its output (which is 0 for non-numeric output).
 */
static int get_cuda_restore_tid(int root_pid)
{
	char pid_buf[16];
	char pid_out[CUDA_CKPT_BUF_SIZE];

	snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid);

	const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL };
	int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out));

	if (ret != 0) {
		pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out);
		return -1;
	}

	return atoi(pid_out);
}

/*
 * Run "cuda-checkpoint --action @action --pid @pid", adding
 * "--timeout @timeout" when @timeout is non-zero. The utility's output is
 * stored in @msg_buf (@buf_size bytes).
 *
 * Returns whatever launch_cuda_checkpoint() returns.
 */
static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf,
					  int buf_size)
{
	char pid_buf[16];
	char timeout_buf[16];

	snprintf(pid_buf, sizeof(pid_buf), "%d", pid);

	const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
			       NULL /* timeout_val */, NULL };

	if (timeout > 0) {
		/* timeout is unsigned, so it must be formatted with %u */
		snprintf(timeout_buf, sizeof(timeout_buf), "%u", timeout);
		args[5] = "--timeout";
		args[6] = timeout_buf;
	}

	return launch_cuda_checkpoint(args, msg_buf, buf_size);
}

/*
 * Stop the restore thread again after it was resumed by
 * resume_restore_thread(), then reapply the ptrace options and the signal
 * mask saved in @restore_sigset.
 *
 * Returns 0 on success, -1 on the first step that fails.
 */
static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset)
{
	/* Since we resumed a thread that CRIU previously already froze we need to
	 * INTERRUPT it once again, task was already SEIZE'd so we don't need to do
	 * a compel_interrupt_task() */
	if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0)) {
		pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n",
		       restore_tid);
		return -1;
	}

	struct proc_status_creds creds;

	if (compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL) != COMPEL_TASK_ALIVE) {
		pr_err("compel_wait_task failed after interrupt\n");
		return -1;
	}

	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD)) {
		pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset)) {
		pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}

/*
 * Let the restore thread run: save its current signal mask into
 * @save_sigset, block every signal except SIGTRAP, clear the ptrace options
 * and continue the thread.
 *
 * Returns 0 on success, -1 on the first step that fails.
 */
static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset)
{
	k_rtsigset_t block;

	if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset)) {
		pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid);
		return -1;
	}

	ksigfillset(&block);
	ksigdelset(&block, SIGTRAP);

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(block), &block)) {
		pr_err("Failed to block signals on restore tid %d\n", restore_tid);
		return -1;
	}

	// Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread
	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0)) {
		pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_CONT, restore_tid, NULL, 0)) {
		pr_err("Could not resume cuda restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}

/*
 * CHECKPOINT_DEVICES hook: run the cuda-checkpoint "checkpoint" action on
 * @pid and mark the pid as checkpointed in cuda_pids.
 *
 * Returns 0 when the plugin is disabled, when no restore tid is reported for
 * @pid, or on success; non-zero otherwise.
 */
int cuda_plugin_checkpoint_devices(int pid)
{
	int restore_tid;
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	int int_ret;
	int status;
	k_rtsigset_t save_sigset;

	if (plugin_disabled) {
		return 0;
	}

	restore_tid = get_cuda_restore_tid(pid);

	/* We can possibly hit a race with cuInit() where we are past the point of
	 * locking the process but at lock time cuInit() hadn't completed in which
	 * case cuda-checkpoint will report that we're in an invalid state to
	 * checkpoint */
	if (restore_tid == -1) {
		pr_info("No need to checkpoint devices on pid %d\n", pid);
		return 0;
	}

	pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);

	/* We need to resume the checkpoint thread to prepare the mappings for
	 * checkpointing */
	if (resume_restore_thread(restore_tid, &save_sigset)) {
		return -1;
	}

	status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf));
	if (status) {
		pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
		goto interrupt;
	}

	status = update_checkpointed_pid(&cuda_pids, pid);
	if (status) {
		pr_err("Failed to track checkpointed pid %d\n", pid);
		/*
		 * Roll the checkpoint back. The result of the rollback must not
		 * replace 'status': the hook failed either way, and reporting
		 * success here would hide that the pid is not checkpointed.
		 */
		if (cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf))) {
			pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
		}
	}

interrupt:
	/* The restore thread is re-stopped on every path once it was resumed */
	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

	return status != 0 ? status : int_ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices)

/*
 * PAUSE_DEVICES hook: run the cuda-checkpoint "lock" action on @pid (with a
 * timeout of opts.timeout * 1000) and record the pid in cuda_pids.
 *
 * Returns 0 when the plugin is disabled, when no restore tid is reported for
 * @pid, or on success; -1 otherwise.
 */
int cuda_plugin_pause_devices(int pid)
{
	int restore_tid;
	char msg_buf[CUDA_CKPT_BUF_SIZE];

	if (plugin_disabled) {
		return 0;
	}

	restore_tid = get_cuda_restore_tid(pid);

	if (restore_tid == -1) {
		pr_info("no need to pause devices on pid %d\n", pid);
		return 0;
	}

	pr_info("pausing devices on pid %d\n", pid);

	int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));

	if (status) {
		pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
		return -1;
	}

	if (add_pid_to_buf(&cuda_pids, pid)) {
		pr_err("unable to track paused pid %d\n", pid);
		/* The pid can't be released later if it isn't tracked, so unlock it now */
		status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
		if (status) {
			pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
		}
		return -1;
	}

	return 0;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)

/*
 * Release CUDA on @pid: run the "restore" action first when @checkpointed is
 * non-zero, then the "unlock" action.
 *
 * Returns 0 when no restore tid is reported for @pid or on success, -1
 * otherwise.
 */
int resume_device(int pid, int checkpointed)
{
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	int status;
	int ret = 0;
	int int_ret;
	k_rtsigset_t save_sigset;

	int restore_tid = get_cuda_restore_tid(pid);

	if (restore_tid == -1) {
		pr_info("No need to resume devices on pid %d\n", pid);
		return 0;
	}

	pr_info("resuming devices on pid %d\n", pid);
	/* The resuming process has to stay frozen during this time otherwise
	 * attempting to access a UVM pointer will crash if we haven't restored the
	 * underlying mappings yet */
	pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid);

	/* wakeup the restore thread so we can handle the restore for this pid,
	 * rseq_cs has to be restored before execution */
	if (resume_restore_thread(restore_tid, &save_sigset)) {
		return -1;
	}

	if (checkpointed) {
		status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf));
		if (status) {
			pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf);
			ret = -1;
			goto interrupt;
		}
	}

	status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
	if (status) {
		pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf);
		ret = -1;
	}

interrupt:
	/* The restore thread is re-stopped on every path once it was resumed */
	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

	return ret != 0 ? ret : int_ret;
}

/* RESUME_DEVICES_LATE hook: restore and unlock CUDA on @pid unless the plugin is disabled */
int cuda_plugin_resume_devices_late(int pid)
{
	if (plugin_disabled) {
		return 0;
	}

	return resume_device(pid, 1);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)

/*
 * Plugin init: disable the plugin when cuda-checkpoint does not advertise the
 * --action flag, otherwise set up the pid tracking list for the DUMP stage.
 * Always returns 0.
 */
int cuda_plugin_init(int stage)
{
	if (!cuda_checkpoint_supports_flag("--action")) {
		pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
		plugin_disabled = true;
		return 0;
	}

	pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage);

	/* In the DUMP stage track all the PID's we've paused CUDA operations on to
	 * release them when we're done if the user requested the leave-running option */
	if (stage == CR_PLUGIN_STAGE__DUMP) {
		INIT_LIST_HEAD(&cuda_pids);
	}

	return 0;
}

/*
 * Plugin fini: at the end of the DUMP stage, resume every tracked pid when
 * the tasks are left running or the dump failed, then free the tracking list.
 */
void cuda_plugin_fini(int stage, int ret)
{
	if (plugin_disabled) {
		return;
	}

	pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret);

	if (stage != CR_PLUGIN_STAGE__DUMP) {
		return;
	}

	/* Release all the paused PID's at the end of the DUMP stage in case the
	 * user provides the -R (leave-running) flag or an error occurred */
	if (opts.final_state == TASK_ALIVE || ret != 0) {
		struct pid_info *info;

		list_for_each_entry(info, &cuda_pids, list) {
			/* Best effort: resume_device() logs its own failures */
			resume_device(info->pid, info->checkpointed);
		}
	}

	dealloc_pid_buffer(&cuda_pids);
}

CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini)