mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 18:07:57 +00:00
This patch extends the inventory image with a `plugins` field that contains an array of the plugins that were used during checkpoint, for example, to save GPU state. In particular, the CUDA and AMDGPU plugins are added to this field only when the checkpoint contains GPU state. This makes it possible to disable unnecessary plugins during restore, to show appropriate error messages if required CRIU plugins are missing, and to migrate a process that does not use a GPU from a GPU-enabled system to a CPU-only environment. We use the `optional plugins_entry` for backwards compatibility. This entry allows us to distinguish between an *empty* and a *missing* field: - When the field is missing, it indicates that the checkpoint was created with a previous version of CRIU, and all plugins should be *enabled* during restore. - When the field is empty, it indicates that no plugins were used during checkpointing. Thus, all plugins can be *disabled* during restore. Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
539 lines
14 KiB
C
539 lines
14 KiB
C
#include "criu-log.h"
|
|
#include "plugin.h"
|
|
#include "util.h"
|
|
#include "cr_options.h"
|
|
#include "pid.h"
|
|
#include "proc_parse.h"
|
|
#include "seize.h"
|
|
#include "fault-injection.h"
|
|
|
|
#include <common/list.h>
|
|
#include <compel/infect.h>
|
|
|
|
#include <ctype.h>
|
|
#include <fcntl.h>
|
|
#include <stdio.h>
|
|
#include <unistd.h>
|
|
#include <sys/ptrace.h>
|
|
#include <sys/wait.h>
|
|
|
|
/* cuda-checkpoint binary should live in your PATH */
|
|
#define CUDA_CHECKPOINT "cuda-checkpoint"
|
|
|
|
/* cuda-checkpoint --action flags */
|
|
#define ACTION_LOCK "lock"
|
|
#define ACTION_CHECKPOINT "checkpoint"
|
|
#define ACTION_RESTORE "restore"
|
|
#define ACTION_UNLOCK "unlock"
|
|
|
|
/* Size in bytes of the buffers that capture cuda-checkpoint output for the
 * restore-tid query and the per-action (lock/checkpoint/restore/unlock) calls
 */
#define CUDA_CKPT_BUF_SIZE (128)
|
|
|
|
#ifdef LOG_PREFIX
|
|
#undef LOG_PREFIX
|
|
#endif
|
|
#define LOG_PREFIX "cuda_plugin: "
|
|
|
|
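/* Set by cuda_plugin_init() to disable plugin functionality when the restore
 * inventory does not list this plugin, /dev/nvidiactl does not exist,
 * cuda-checkpoint is not in $PATH, or the driver version doesn't support the
 * --action flag
 */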
|
|
bool plugin_disabled = false;
|
|
|
|
/* Set once add_inventory_plugin() has succeeded for this plugin, so the
 * checkpoint hook registers it in the inventory at most once
 */
bool plugin_added_to_inventory = false;
|
|
|
|
struct pid_info {
|
|
int pid;
|
|
char checkpointed;
|
|
struct list_head list;
|
|
};
|
|
|
|
/* Tracks the PIDs on which CUDA operations have been paused so far, so that
 * they can be released once the DUMP is done
 */
|
|
static LIST_HEAD(cuda_pids);
|
|
|
|
static void dealloc_pid_buffer(struct list_head *pid_buf)
|
|
{
|
|
struct pid_info *info;
|
|
struct pid_info *n;
|
|
|
|
list_for_each_entry_safe(info, n, pid_buf, list) {
|
|
list_del(&info->list);
|
|
xfree(info);
|
|
}
|
|
}
|
|
|
|
static int add_pid_to_buf(struct list_head *pid_buf, int pid)
|
|
{
|
|
struct pid_info *new = xmalloc(sizeof(*new));
|
|
|
|
if (new == NULL) {
|
|
return -1;
|
|
}
|
|
|
|
new->pid = pid;
|
|
new->checkpointed = 0;
|
|
list_add_tail(&new->list, pid_buf);
|
|
|
|
return 0;
|
|
}
|
|
|
|
static int update_checkpointed_pid(struct list_head *pid_buf, int pid)
|
|
{
|
|
struct pid_info *info;
|
|
|
|
list_for_each_entry(info, pid_buf, list) {
|
|
if (info->pid == pid) {
|
|
info->checkpointed = 1;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
return -1;
|
|
}
|
|
|
|
/*
 * Fork and exec the command described by @args, capturing its combined
 * stdout/stderr through a pipe.
 *
 * @args:     NULL-terminated argument vector; args[0] is resolved by execvp().
 * @buf:      receives at most @buf_size - 1 bytes of the child's output and is
 *            NUL-terminated on every return path, so callers may always print
 *            it with "%s".
 * @buf_size: size of @buf in bytes; must be at least 1.
 *
 * Output that does not fit into @buf is read and discarded so the child is
 * never blocked on a full pipe.
 *
 * Returns the child's exit code if it exited normally, or -1 if the pipe
 * could not be created, fork() failed, reading the output or waiting for the
 * child failed, or the child did not exit normally.
 */
static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
{
#define READ  0
#define WRITE 1
	int fd[2], buf_off;
	int child_pid;
	int status, exit_code = -1;

	/* Terminate first: callers log buf even when we fail before reading. */
	buf[0] = '\0';

	if (pipe(fd) != 0) {
		pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
		return -1;
	}

	child_pid = fork();
	if (child_pid == -1) {
		pr_err("Failed to fork to exec cuda-checkpoint\n");
		close(fd[READ]);
		close(fd[WRITE]);
		return -1;
	}

	if (child_pid == 0) { // child
		if (dup2(fd[WRITE], STDOUT_FILENO) == -1) {
			pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO);
			_exit(EXIT_FAILURE);
		}
		if (dup2(fd[WRITE], STDERR_FILENO) == -1) {
			pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO);
			_exit(EXIT_FAILURE);
		}
		close(fd[READ]);

		close_fds(STDERR_FILENO + 1);

		execvp(args[0], (char **)args);

		/* We can't use pr_error() as log file fd is closed. */
		fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno));

		_exit(EXIT_FAILURE);
	}

	close(fd[WRITE]);
	buf_off = 0;
	/* Reserve one byte for the null character. */
	buf_size--;
	while (buf_off < buf_size) {
		int bytes_read;

		bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off);
		if (bytes_read == -1) {
			pr_perror("Unable to read output of cuda-checkpoint");
			/* Keep whatever was captured so far printable. */
			buf[buf_off] = '\0';
			goto err_close;
		}
		if (bytes_read == 0)
			break;
		buf_off += bytes_read;
	}
	buf[buf_off] = '\0';

	/* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */
	while (true) {
		char scratch[1024];
		int bytes_read;

		bytes_read = read(fd[READ], scratch, sizeof(scratch));
		if (bytes_read == -1) {
			pr_perror("Unable to read output of cuda-checkpoint");
			goto err_close;
		}
		if (bytes_read == 0)
			break;
	}
	close(fd[READ]);

	if (waitpid(child_pid, &status, 0) == -1) {
		pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid);
		/* The read end is already closed here, so skip err_close. */
		goto err;
	}
	if (WIFSIGNALED(status)) {
		int sig = WTERMSIG(status);

		pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig));
	} else if (WIFEXITED(status)) {
		exit_code = WEXITSTATUS(status);
	} else {
		pr_err("cuda-checkpoint exited improperly: %d\n", status);
	}

	if (exit_code != EXIT_SUCCESS)
		pr_debug("cuda-checkpoint output ===>\n%s\n"
			 "<=== cuda-checkpoint output\n",
			 buf);

	return exit_code;

err_close:
	/* Read failures arrive here with the read end still open. */
	close(fd[READ]);
err:
	kill(child_pid, SIGKILL);
	waitpid(child_pid, NULL, 0);
	return -1;
}
|
|
|
|
/**
|
|
* Checks if a given flag is supported by the cuda-checkpoint utility
|
|
*
|
|
* Returns:
|
|
* 1 if the flag is supported,
|
|
* 0 if the flag is not supported,
|
|
* -1 if there was an error launching the cuda-checkpoint utility.
|
|
*/
|
|
static int cuda_checkpoint_supports_flag(const char *flag)
|
|
{
|
|
char msg_buf[2048];
|
|
const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
|
|
|
|
if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0)
|
|
return -1;
|
|
|
|
if (strstr(msg_buf, flag) == NULL)
|
|
return 0;
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* Retrieve the cuda restore thread TID from the root pid */
|
|
static int get_cuda_restore_tid(int root_pid)
|
|
{
|
|
char pid_buf[16];
|
|
char pid_out[CUDA_CKPT_BUF_SIZE];
|
|
|
|
snprintf(pid_buf, sizeof(pid_buf), "%d", root_pid);
|
|
|
|
const char *args[] = { CUDA_CHECKPOINT, "--get-restore-tid", "--pid", pid_buf, NULL };
|
|
int ret = launch_cuda_checkpoint(args, pid_out, sizeof(pid_out));
|
|
if (ret != 0) {
|
|
pr_err("Failed to launch cuda-checkpoint to retrieve restore tid: %s\n", pid_out);
|
|
return -1;
|
|
}
|
|
|
|
return atoi(pid_out);
|
|
}
|
|
|
|
static int cuda_process_checkpoint_action(int pid, const char *action, unsigned int timeout, char *msg_buf,
|
|
int buf_size)
|
|
{
|
|
char pid_buf[16];
|
|
char timeout_buf[16];
|
|
|
|
snprintf(pid_buf, sizeof(pid_buf), "%d", pid);
|
|
|
|
const char *args[] = { CUDA_CHECKPOINT, "--action", action, "--pid", pid_buf, NULL /* --timeout */,
|
|
NULL /* timeout_val */, NULL };
|
|
if (timeout > 0) {
|
|
snprintf(timeout_buf, sizeof(timeout_buf), "%d", timeout);
|
|
args[5] = "--timeout";
|
|
args[6] = timeout_buf;
|
|
}
|
|
|
|
return launch_cuda_checkpoint(args, msg_buf, buf_size);
|
|
}
|
|
|
|
/*
 * Stop the CUDA restore thread again after it was temporarily resumed.
 *
 * Interrupts @restore_tid with PTRACE_INTERRUPT, waits for it through
 * compel_wait_task(), re-applies the PTRACE_O_SUSPEND_SECCOMP and
 * PTRACE_O_TRACESYSGOOD options and writes @restore_sigset back as the
 * thread's signal mask.
 *
 * Returns 0 on success, -1 as soon as any step fails.
 */
static int interrupt_restore_thread(int restore_tid, k_rtsigset_t *restore_sigset)
{
	struct proc_status_creds creds;
	int wait_ret;

	/* Since we resumed a thread that CRIU previously already froze we need to
	 * INTERRUPT it once again, task was already SEIZE'd so we don't need to do
	 * a compel_interrupt_task()
	 */
	if (ptrace(PTRACE_INTERRUPT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not interrupt cuda restore tid %d after checkpoint, process may be in strange state\n",
		       restore_tid);
		return -1;
	}

	wait_ret = compel_wait_task(restore_tid, -1, parse_pid_status, NULL, &creds.s, NULL);
	if (wait_ret != COMPEL_TASK_ALIVE) {
		pr_err("compel_wait_task failed after interrupt\n");
		return -1;
	}

	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, PTRACE_O_SUSPEND_SECCOMP | PTRACE_O_TRACESYSGOOD) != 0) {
		pr_err("Failed to set ptrace options on interrupt for restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(*restore_sigset), restore_sigset) != 0) {
		pr_err("Unable to restore original sigmask to restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
|
|
|
|
/*
 * Let the CUDA restore thread run while it is still traced.
 *
 * Stores the thread's current signal mask in @save_sigset, installs a mask
 * built with ksigfillset() minus SIGTRAP, clears the ptrace options and
 * continues the thread with PTRACE_CONT.
 *
 * Returns 0 on success, -1 as soon as any ptrace request fails. Steps already
 * applied before a failure are not rolled back.
 */
static int resume_restore_thread(int restore_tid, k_rtsigset_t *save_sigset)
{
	k_rtsigset_t blocked;

	if (ptrace(PTRACE_GETSIGMASK, restore_tid, sizeof(*save_sigset), save_sigset) != 0) {
		pr_err("Failed to get current sigmask for restore tid %d\n", restore_tid);
		return -1;
	}

	ksigfillset(&blocked);
	ksigdelset(&blocked, SIGTRAP);

	if (ptrace(PTRACE_SETSIGMASK, restore_tid, sizeof(blocked), &blocked) != 0) {
		pr_err("Failed to block signals on restore tid %d\n", restore_tid);
		return -1;
	}

	// Clear out PTRACE_O_SUSPEND_SECCOMP when we resume the restore thread
	if (ptrace(PTRACE_SETOPTIONS, restore_tid, NULL, 0) != 0) {
		pr_err("Could not clear ptrace options on restore tid %d\n", restore_tid);
		return -1;
	}

	if (ptrace(PTRACE_CONT, restore_tid, NULL, 0) != 0) {
		pr_err("Could not resume cuda restore tid %d\n", restore_tid);
		return -1;
	}

	return 0;
}
|
|
|
|
int cuda_plugin_checkpoint_devices(int pid)
|
|
{
|
|
int restore_tid;
|
|
char msg_buf[CUDA_CKPT_BUF_SIZE];
|
|
int int_ret;
|
|
int status;
|
|
k_rtsigset_t save_sigset;
|
|
|
|
if (plugin_disabled) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
restore_tid = get_cuda_restore_tid(pid);
|
|
|
|
/* We can possibly hit a race with cuInit() where we are past the point of
|
|
* locking the process but at lock time cuInit() hadn't completed in which
|
|
* case cuda-checkpoint will report that we're in an invalid state to
|
|
* checkpoint
|
|
*/
|
|
if (restore_tid == -1) {
|
|
pr_info("No need to checkpoint devices on pid %d\n", pid);
|
|
return 0;
|
|
}
|
|
|
|
pr_info("Checkpointing CUDA devices on pid %d restore_tid %d\n", pid, restore_tid);
|
|
/* We need to resume the checkpoint thread to prepare the mappings for
|
|
* checkpointing
|
|
*/
|
|
if (resume_restore_thread(restore_tid, &save_sigset)) {
|
|
return -1;
|
|
}
|
|
status = cuda_process_checkpoint_action(pid, ACTION_CHECKPOINT, 0, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("CHECKPOINT_DEVICES failed with %s\n", msg_buf);
|
|
goto interrupt;
|
|
}
|
|
status = update_checkpointed_pid(&cuda_pids, pid);
|
|
if (status) {
|
|
pr_err("Failed to track checkpointed pid %d\n", pid);
|
|
status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("Failed to restore process after error %s on pid %d\n", msg_buf, pid);
|
|
}
|
|
}
|
|
|
|
if (!status && !plugin_added_to_inventory) {
|
|
status = add_inventory_plugin(CR_PLUGIN_DESC.name);
|
|
if (status)
|
|
pr_err("Failed to add CUDA plugin to inventory image\n");
|
|
else
|
|
plugin_added_to_inventory = true;
|
|
}
|
|
|
|
interrupt:
|
|
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
|
|
|
|
return status != 0 ? status : int_ret;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, cuda_plugin_checkpoint_devices);
|
|
|
|
int cuda_plugin_pause_devices(int pid)
|
|
{
|
|
int restore_tid;
|
|
char msg_buf[CUDA_CKPT_BUF_SIZE];
|
|
|
|
if (plugin_disabled) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
restore_tid = get_cuda_restore_tid(pid);
|
|
|
|
if (restore_tid == -1) {
|
|
pr_info("no need to pause devices on pid %d\n", pid);
|
|
return 0;
|
|
}
|
|
|
|
pr_info("pausing devices on pid %d\n", pid);
|
|
int status = cuda_process_checkpoint_action(pid, ACTION_LOCK, opts.timeout * 1000, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("PAUSE_DEVICES failed with %s\n", msg_buf);
|
|
if (alarm_timeouted())
|
|
goto unlock;
|
|
return -1;
|
|
}
|
|
|
|
if (add_pid_to_buf(&cuda_pids, pid)) {
|
|
pr_err("unable to track paused pid %d\n", pid);
|
|
goto unlock;
|
|
}
|
|
|
|
return 0;
|
|
unlock:
|
|
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("Failed to unlock process status %s, pid %d may hang\n", msg_buf, pid);
|
|
}
|
|
return -1;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__PAUSE_DEVICES, cuda_plugin_pause_devices)
|
|
|
|
int resume_device(int pid, int checkpointed)
|
|
{
|
|
char msg_buf[CUDA_CKPT_BUF_SIZE];
|
|
int status;
|
|
int ret = 0;
|
|
int int_ret;
|
|
k_rtsigset_t save_sigset;
|
|
|
|
int restore_tid = get_cuda_restore_tid(pid);
|
|
if (restore_tid == -1) {
|
|
pr_info("No need to resume devices on pid %d\n", pid);
|
|
return 0;
|
|
}
|
|
|
|
pr_info("resuming devices on pid %d\n", pid);
|
|
/* The resuming process has to stay frozen during this time otherwise
|
|
* attempting to access a UVM pointer will crash if we haven't restored the
|
|
* underlying mappings yet
|
|
*/
|
|
pr_debug("Restore thread pid %d found for real pid %d\n", restore_tid, pid);
|
|
/* wakeup the restore thread so we can handle the restore for this pid,
|
|
* rseq_cs has to be restored before execution
|
|
*/
|
|
if (resume_restore_thread(restore_tid, &save_sigset)) {
|
|
return -1;
|
|
}
|
|
|
|
if (checkpointed) {
|
|
status = cuda_process_checkpoint_action(pid, ACTION_RESTORE, 0, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("RESUME_DEVICES RESTORE failed with %s\n", msg_buf);
|
|
ret = -1;
|
|
goto interrupt;
|
|
}
|
|
}
|
|
|
|
status = cuda_process_checkpoint_action(pid, ACTION_UNLOCK, 0, msg_buf, sizeof(msg_buf));
|
|
if (status) {
|
|
pr_err("RESUME_DEVICES UNLOCK failed with %s\n", msg_buf);
|
|
ret = -1;
|
|
}
|
|
|
|
interrupt:
|
|
int_ret = interrupt_restore_thread(restore_tid, &save_sigset);
|
|
|
|
return ret != 0 ? ret : int_ret;
|
|
}
|
|
|
|
int cuda_plugin_resume_devices_late(int pid)
|
|
{
|
|
if (plugin_disabled) {
|
|
return -ENOTSUP;
|
|
}
|
|
|
|
return resume_device(pid, 1);
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_devices_late)
|
|
|
|
int cuda_plugin_init(int stage)
|
|
{
|
|
int ret;
|
|
|
|
if (stage == CR_PLUGIN_STAGE__RESTORE) {
|
|
if (!check_and_remove_inventory_plugin(CR_PLUGIN_DESC.name, strlen(CR_PLUGIN_DESC.name))) {
|
|
plugin_disabled = true;
|
|
return 0;
|
|
}
|
|
}
|
|
|
|
if (!fault_injected(FI_PLUGIN_CUDA_FORCE_ENABLE) && access("/dev/nvidiactl", F_OK)) {
|
|
pr_info("/dev/nvidiactl doesn't exist. The CUDA plugin is disabled.\n");
|
|
plugin_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
ret = cuda_checkpoint_supports_flag("--action");
|
|
if (ret == -1) {
|
|
pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT);
|
|
plugin_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
if (ret == 0) {
|
|
pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
|
|
plugin_disabled = true;
|
|
return 0;
|
|
}
|
|
|
|
pr_info("initialized: %s stage %d\n", CR_PLUGIN_DESC.name, stage);
|
|
|
|
/* In the DUMP stage track all the PID's we've paused CUDA operations on to
|
|
* release them when we're done if the user requested the leave-running option
|
|
*/
|
|
if (stage == CR_PLUGIN_STAGE__DUMP) {
|
|
INIT_LIST_HEAD(&cuda_pids);
|
|
}
|
|
|
|
dont_use_freeze_cgroup();
|
|
|
|
return 0;
|
|
}
|
|
|
|
void cuda_plugin_fini(int stage, int ret)
|
|
{
|
|
if (plugin_disabled) {
|
|
return;
|
|
}
|
|
|
|
pr_info("finished %s stage %d err %d\n", CR_PLUGIN_DESC.name, stage, ret);
|
|
|
|
/* Release all the paused PID's at the end of the DUMP stage in case the
|
|
* user provides the -R (leave-running) flag or an error occurred
|
|
*/
|
|
if (stage == CR_PLUGIN_STAGE__DUMP && (opts.final_state == TASK_ALIVE || ret != 0)) {
|
|
struct pid_info *info;
|
|
list_for_each_entry(info, &cuda_pids, list) {
|
|
resume_device(info->pid, info->checkpointed);
|
|
}
|
|
}
|
|
if (stage == CR_PLUGIN_STAGE__DUMP) {
|
|
dealloc_pid_buffer(&cuda_pids);
|
|
}
|
|
}
|
|
CR_PLUGIN_REGISTER("cuda_plugin", cuda_plugin_init, cuda_plugin_fini)
|