2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 18:07:57 +00:00
criu/plugins/cuda/cuda_plugin.c

460 lines
12 KiB
C
Raw Normal View History

#include "criu-log.h"
#include "plugin.h"
#include "util.h"
#include "cr_options.h"
#include "pid.h"
#include "proc_parse.h"
#include <common/list.h>
#include <compel/infect.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
/* Name of the cuda-checkpoint binary; it is resolved through $PATH, so it
 * should live in your PATH
 */
#define CUDA_CHECKPOINT "cuda-checkpoint"
/* Values passed to cuda-checkpoint after its --action flag */
#define ACTION_LOCK "lock"
#define ACTION_CHECKPOINT "checkpoint"
#define ACTION_RESTORE "restore"
#define ACTION_UNLOCK "unlock"
/* Size in bytes of the stack buffers that receive cuda-checkpoint output for
 * logging and for parsing the restore tid
 */
#define CUDA_CKPT_BUF_SIZE (128)
/* Replace any previously defined LOG_PREFIX with this plugin's own
 * (presumably consumed by the pr_* logging macros -- confirm in criu-log.h)
 */
#ifdef LOG_PREFIX
#undef LOG_PREFIX
#endif
#define LOG_PREFIX "cuda_plugin: "
/* Disable plugin functionality if cuda-checkpoint is not in $PATH or driver
* version doesn't support --action flag.
*
* Set to true by cuda_plugin_init() when the "--action" probe fails; the
* checkpoint, pause, resume-late and fini hooks all return early while it is
* set.
*/
bool plugin_disabled = false;
/* One entry per process that has been locked through cuda-checkpoint */
struct pid_info {
/* PID that was handed to add_pid_to_buf() */
int pid;
/* 0 when the entry is created, set to 1 by update_checkpointed_pid() */
char checkpointed;
/* Linkage into the list that owns this entry */
struct list_head list;
};
/* Used to track which PIDs we've paused CUDA operations on so far so we can
* release them after we're done with the DUMP.
*
* Only initialised by cuda_plugin_init() in the DUMP stage; entries are added
* by cuda_plugin_pause_devices() and freed in cuda_plugin_fini().
*/
struct list_head cuda_pids;
/*
 * Unlink and free every pid_info entry linked on @pid_buf.
 *
 * The "safe" iterator is required because each entry is released while the
 * list is being walked.
 */
static void dealloc_pid_buffer(struct list_head *pid_buf)
{
	struct pid_info *entry, *next;

	list_for_each_entry_safe(entry, next, pid_buf, list) {
		list_del(&entry->list);
		xfree(entry);
	}
}
/*
 * Append a new, not-yet-checkpointed entry for @pid to the tail of @pid_buf.
 *
 * Returns 0 on success, -1 if the entry could not be allocated.
 */
static int add_pid_to_buf(struct list_head *pid_buf, int pid)
{
	struct pid_info *entry;

	entry = xmalloc(sizeof(*entry));
	if (!entry)
		return -1;

	entry->checkpointed = 0;
	entry->pid = pid;
	list_add_tail(&entry->list, pid_buf);

	return 0;
}
/*
 * Mark the first entry for @pid on @pid_buf as checkpointed.
 *
 * Returns 0 if an entry was found and updated, -1 if @pid is not tracked.
 */
static int update_checkpointed_pid(struct list_head *pid_buf, int pid)
{
	struct pid_info *entry;
	int rc = -1;

	list_for_each_entry(entry, pid_buf, list) {
		if (entry->pid != pid)
			continue;

		entry->checkpointed = 1;
		rc = 0;
		break;
	}

	return rc;
}
/*
 * Run a cuda-checkpoint command line and capture its combined stdout/stderr.
 *
 * @args:     NULL-terminated argv; args[0] is looked up through $PATH by
 *            execvp()
 * @buf:      receives the beginning of the child's output as a NUL-terminated
 *            string with trailing newlines removed; it is an empty string
 *            whenever nothing could be read, including on early failures
 * @buf_size: capacity of @buf in bytes
 *
 * Returns the child's exit status, or -1 if the pipe or the fork could not be
 * set up or the child did not exit normally. A child that fails before or
 * during exec terminates with EXIT_FAILURE, so that case is reported as a
 * non-zero exit status.
 */
static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
{
#define READ  0
#define WRITE 1
	int fd[2];
	int child_pid;
	int status;
	int used = 0;
	ssize_t nr;
	char scratch[64];

	/* Callers print buf with %s on every failure path, so it must always
	 * hold a valid string, even if we bail out before reading anything
	 */
	if (buf_size > 0)
		buf[0] = '\0';

	if (pipe(fd) != 0) {
		pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
		return -1;
	}

	child_pid = fork();
	if (child_pid == -1) {
		pr_err("Failed to fork to exec cuda-checkpoint\n");
		close(fd[READ]);
		close(fd[WRITE]);
		return -1;
	}

	if (child_pid == 0) { // child
		/* Never return from the child: returning would leave a second
		 * copy of the calling process running the caller's code
		 */
		if (dup2(fd[WRITE], STDOUT_FILENO) == -1 || dup2(fd[WRITE], STDERR_FILENO) == -1)
			_exit(EXIT_FAILURE);
		close(fd[READ]);
		/* Only drop the original write end if it is not itself stdout/stderr */
		if (fd[WRITE] > STDERR_FILENO)
			close(fd[WRITE]);
		execvp(args[0], (char **)args);
		/* Only reached if exec failed */
		_exit(EXIT_FAILURE);
	}

	// parent
	close(fd[WRITE]);

	/* Collect as much output as fits, keeping one byte for the terminator.
	 * A pipe read may return short, so keep reading until EOF or full.
	 */
	while (used < buf_size - 1) {
		nr = read(fd[READ], buf + used, (size_t)(buf_size - 1 - used));
		if (nr < 0 && errno == EINTR)
			continue;
		if (nr <= 0)
			break;
		used += (int)nr;
	}
	if (buf_size > 0) {
		/* Strip trailing newlines so the text can be embedded in log lines */
		while (used > 0 && buf[used - 1] == '\n')
			used--;
		buf[used] = '\0';
	}

	/* Drain whatever did not fit so the child never blocks on a full pipe */
	do {
		nr = read(fd[READ], scratch, sizeof(scratch));
	} while (nr > 0 || (nr < 0 && errno == EINTR));
	close(fd[READ]);

	/* Always reap the child, retrying if a signal interrupts the wait */
	while (waitpid(child_pid, &status, 0) == -1) {
		if (errno == EINTR)
			continue;
		pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
		return -1;
	}
	if (!WIFEXITED(status)) {
		pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
		return -1;
	}

	return WEXITSTATUS(status);
}
/*
 * Report whether the output of "cuda-checkpoint -h" mentions @flag.
 *
 * Returns false as well when the utility cannot be run or exits with a
 * non-zero status.
 */
static bool cuda_checkpoint_supports_flag(const char *flag)
{
	const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
	char help_text[2048];

	if (launch_cuda_checkpoint(args, help_text, sizeof(help_text)) != 0) {
		pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
		return false;
	}

	return strstr(help_text, flag) != NULL;
}
/*
 * Retrieve the cuda restore thread TID from the root pid.
 *
 * Runs "cuda-checkpoint --get-restore-tid --pid <root_pid>" and parses the
 * leading decimal number of its output.
 *
 * Returns the TID (always > 0), or -1 if cuda-checkpoint exits with a
 * non-zero status or its output does not start with a positive number that
 * fits in an int.
 */
static int get_cuda_restore_tid(int root_pid)
{
	char pid_buf[16];
	char pid_out[CUDA_CKPT_BUF_SIZE];
	char *end;
	long tid;

	snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid);

	const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL };
	if (launch_cuda_checkpoint(args, pid_out, sizeof(pid_out)) != 0) {
		pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out);
		return -1;
	}

	/* atoi() cannot tell "0" from garbage; a bogus tid must never reach
	 * ptrace(), so reject anything that is not a positive int
	 */
	errno = 0;
	tid = strtol(pid_out, &end, 10);
	if (end == pid_out || errno == ERANGE || tid <= 0 || tid > INT_MAX) {
		pr_err("Unexpected restore tid output from cuda-checkpoint: %s\n", pid_out);
		return -1;
	}

	return (int)tid;
}
/*
 * Run "cuda-checkpoint --action <action> --pid <pid> [--timeout <timeout>]".
 *
 * @pid:      target process
 * @action:   one of the ACTION_* strings
 * @timeout:  forwarded verbatim as the --timeout value; the flag is omitted
 *            entirely when this is 0
 * @msg_buf:  receives the tool's output (see launch_cuda_checkpoint())
 * @buf_size: capacity of @msg_buf in bytes
 *
 * Returns whatever launch_cuda_checkpoint() returns: the tool's exit status,
 * or -1 if it could not be run.
 */
static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf,
					  int buf_size)
{
	char pid_buf[16];
	char timeout_buf[16];

	snprintf(pid_buf, sizeof(pid_buf), "%d", pid);

	/* The two trailing slots stay NULL (terminating argv early) unless a
	 * timeout was requested
	 */
	const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
			       NULL /* timeout_val */, NULL };
	if (timeout > 0) {
		/* timeout is unsigned, so it must be formatted with %u */
		snprintf(timeout_buf, sizeof(timeout_buf), "%u", timeout);
		args[5] = "--timeout";
		args[6] = timeout_buf;
	}

	return launch_cuda_checkpoint(args, msg_buf, buf_size);
}
/*
 * Stop the CUDA restore thread again after it was temporarily resumed and put
 * back the signal mask saved by resume_restore_thread().
 *
 * @restore_tid:    thread to interrupt
 * @restore_sigset: signal mask to reinstall on the thread
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset)
{
	struct proc_status_creds status_creds;
	int wait_result;

	/* The thread was frozen by CRIU before we resumed it, so it has to be
	 * INTERRUPTed once more. It is already SEIZE'd, hence no
	 * compel_interrupt_task() is needed here.
	 */
	if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n",
		       restore_tid);
		return -1;
	}

	wait_result = compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &status_creds.s, NULL);
	if (wait_result != COMPEL_TASK_ALIVE) {
		pr_err("compel_wait_task failed after interrupt\n");
		return -1;
	}

	/* Reinstate the ptrace options that resume_restore_thread() cleared */
	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) != 0) {
		pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset) != 0) {
		pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
/*
 * Let the CUDA restore thread run while the rest of the task stays stopped.
 *
 * The thread's current signal mask is stored in @save_sigset, then every
 * signal except SIGTRAP is blocked, the ptrace options are cleared and the
 * thread is continued.
 *
 * Returns 0 on success, -1 as soon as any ptrace request fails.
 */
static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset)
{
	k_rtsigset_t all_but_trap;

	if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset) != 0) {
		pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid);
		return -1;
	}

	/* Block everything but SIGTRAP while the thread is running */
	ksigfillset(&all_but_trap);
	ksigdelset(&all_but_trap, SIGTRAP);
	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(all_but_trap), &all_but_trap) != 0) {
		pr_err("Failed to block signals on restore tid %d\n", restore_tid);
		return -1;
	}

	/* PTRACE_O_SUSPEND_SECCOMP must not stay set once the thread resumes */
	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0) != 0) {
		pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_CONT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not resume cuda restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
/*
 * CHECKPOINT_DEVICES hook: checkpoint the CUDA state of @pid.
 *
 * The restore thread is resumed for the duration of the cuda-checkpoint call
 * and interrupted again afterwards, on success and on failure alike.
 *
 * Returns 0 when the plugin is disabled, when no restore thread is reported
 * for @pid, or when the checkpoint succeeded and the pid was marked as
 * checkpointed. Returns -1 when the checkpoint action fails, when the pid
 * cannot be marked (the checkpoint is rolled back with a restore in that
 * case), or when the restore thread cannot be resumed or re-interrupted.
 */
int cuda_plugin_checkpoint_devices(int pid)
{
	int restore_tid;
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	int ret = 0;
	int int_ret;
	k_rtsigset_t save_sigset;

	if (plugin_disabled) {
		return 0;
	}

	restore_tid = get_cuda_restore_tid(pid);

	/* We can possibly hit a race with cuInit() where we are past the point of
	 * locking the process but at lock time cuInit() hadn't completed in which
	 * case cuda-checkpoint will report that we're in an invalid state to
	 * checkpoint
	 */
	if (restore_tid == -1) {
		pr_info("No need to checkpoint devices on pid %d\n", pid);
		return 0;
	}

	pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);

	/* We need to resume the checkpoint thread to prepare the mappings for
	 * checkpointing
	 */
	if (resume_restore_thread(restore_tid, &save_sigset)) {
		return -1;
	}

	if (cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf))) {
		pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
		ret = -1;
		goto interrupt;
	}

	if (update_checkpointed_pid(&cuda_pids, pid)) {
		pr_err("Failed to track checkpointed pid %d\n", pid);
		/* The hook has failed regardless of how the rollback below goes:
		 * record that first so a successful restore cannot mask it
		 */
		ret = -1;
		if (cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf))) {
			pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
		}
	}

interrupt:
	/* Always put the restore thread back into its stopped state */
	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

	return ret != 0 ? ret : int_ret;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices);
/*
 * PAUSE_DEVICES hook: lock CUDA activity on @pid and remember the pid on
 * cuda_pids so it can be released later.
 *
 * Returns 0 when the plugin is disabled, when no restore thread is reported
 * for @pid, or when the lock succeeded and the pid is tracked. Returns -1
 * when the lock action fails or the pid cannot be tracked; in the latter
 * case an unlock is attempted first.
 */
int cuda_plugin_pause_devices(int pid)
{
	char msg_buf[CUDA_CKPT_BUF_SIZE];

	if (plugin_disabled)
		return 0;

	if (get_cuda_restore_tid(pid) == -1) {
		pr_info("no need to pause devices on pid %d\n", pid);
		return 0;
	}

	pr_info("pausing devices on pid %d\n", pid);

	if (cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
		return -1;
	}

	if (add_pid_to_buf(&cuda_pids, pid) == 0)
		return 0;

	/* Untracked pids are never released at fini time, so undo the lock now */
	pr_err("unable to track paused pid %d\n", pid);
	if (cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)) != 0)
		pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);

	return -1;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
/*
 * Release CUDA on @pid: run the restore action first when @checkpointed is
 * non-zero, then the unlock action. The unlock is skipped if the restore
 * fails.
 *
 * The restore thread is resumed around the cuda-checkpoint calls and
 * interrupted again afterwards.
 *
 * Returns 0 on success or when no restore thread is reported for @pid, -1
 * if an action fails or the restore thread cannot be resumed or
 * re-interrupted.
 */
int resume_device(int pid, int checkpointed)
{
	char msg_buf[CUDA_CKPT_BUF_SIZE];
	k_rtsigset_t save_sigset;
	int ret = 0;
	int int_ret;
	int restore_tid;

	restore_tid = get_cuda_restore_tid(pid);
	if (restore_tid == -1) {
		pr_info("No need to resume devices on pid %d\n", pid);
		return 0;
	}

	pr_info("resuming devices on pid %d\n", pid);

	/* The process being resumed must remain frozen for now: touching a UVM
	 * pointer before the underlying mappings are restored would crash it
	 */
	pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid);

	/* Wake the restore thread so it can service the restore for this pid;
	 * rseq_cs has to be restored before execution
	 */
	if (resume_restore_thread(restore_tid, &save_sigset) != 0)
		return -1;

	if (checkpointed &&
	    cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf);
		ret = -1;
	} else if (cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf)) != 0) {
		pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf);
		ret = -1;
	}

	/* Stop the restore thread again regardless of the outcome above */
	int_ret = interrupt_restore_thread(restore_tid, &save_sigset);

	return ret != 0 ? ret : int_ret;
}
/*
 * RESUME_DEVICES_LATE hook: restore and unlock CUDA on @pid.
 *
 * Returns 0 when the plugin is disabled, otherwise the result of
 * resume_device() called with checkpointed set to 1.
 */
int cuda_plugin_resume_devices_late(int pid)
{
	return plugin_disabled ? 0 : resume_device(pid, 1);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
/*
 * Plugin init callback.
 *
 * Probes cuda-checkpoint for the "--action" flag; if it is missing, the
 * plugin is disabled. Otherwise, in the DUMP stage, the list of paused
 * pids is initialised.
 *
 * Always returns 0.
 */
int cuda_plugin_init(int stage)
{
	if (cuda_checkpoint_supports_flag("--action")) {
		pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage);

		/* During DUMP every pid we pause is recorded so that it can be
		 * released at the end if the user asked for leave-running
		 */
		if (stage == CR_PLUGIN_STAGE__DUMP)
			INIT_LIST_HEAD(&cuda_pids);
	} else {
		pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
		plugin_disabled = true;
	}

	return 0;
}
/*
 * Plugin fini callback.
 *
 * Does nothing when the plugin is disabled. At the end of the DUMP stage the
 * tracked pids are resumed if the tasks are left running or the dump failed
 * (@ret != 0), and the tracking list is freed in every case.
 */
void cuda_plugin_fini(int stage, int ret)
{
	struct pid_info *entry;

	if (plugin_disabled)
		return;

	pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret);

	/* The pid list only exists for the DUMP stage */
	if (stage != CR_PLUGIN_STAGE__DUMP)
		return;

	/* Release every paused pid when the user passed -R (leave-running) or
	 * an error occurred
	 */
	if (opts.final_state == TASK_ALIVE || ret != 0) {
		list_for_each_entry(entry, &cuda_pids, list) {
			resume_device(entry->pid, entry->checkpointed);
		}
	}

	dealloc_pid_buffer(&cuda_pids);
}
CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini)