2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

cuda: fix launch cuda-checkpoint

When the cuda-checkpoint tool is not installed, execvp() is expected to
fail and return -1. In this case, we need to call exit() to terminate
the child process that was created earlier with fork().

Since CRIU can be used with applications that do not use CUDA, even
when the CUDA plugin is installed, this patch also updates the log
messages to show debug and warning (instead of error) when the
cuda-checkpoint tool is not found in $PATH.

Signed-off-by: Radostin Stoyanov <rstoyanov@fedoraproject.org>
Signed-off-by: Andrei Vagin <avagin@google.com>
This commit is contained in:
Radostin Stoyanov 2024-07-07 18:48:30 +01:00 committed by Andrei Vagin
parent fde0b7ac69
commit ad66c27a11

View File

@ -45,7 +45,7 @@ struct pid_info {
/* Used to track which PIDs we've paused CUDA operations on so far so we can
* release them after we're done with the DUMP
*/
struct list_head cuda_pids;
static LIST_HEAD(cuda_pids);
static void dealloc_pid_buffer(struct list_head *pid_buf)
{
@ -91,7 +91,7 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
{
#define READ 0
#define WRITE 1
int fd[2];
int fd[2], buf_off;
if (pipe(fd) != 0) {
pr_err("Couldn't create pipes for reading cuda-checkpoint output\n");
@ -110,68 +110,103 @@ static int launch_cuda_checkpoint(const char **args, char *buf, int buf_size)
if (child_pid == 0) { // child
if (dup2(fd[WRITE], STDOUT_FILENO) == -1) {
return -1;
pr_perror("unable to clone fd %d->%d", fd[WRITE], STDOUT_FILENO);
_exit(EXIT_FAILURE);
}
if (dup2(fd[WRITE], STDERR_FILENO) == -1) {
return -1;
pr_perror("unable to clone fd %d->%d", fd[WRITE], STDERR_FILENO);
_exit(EXIT_FAILURE);
}
close(fd[READ]);
close_fds(STDERR_FILENO + 1);
return execvp(args[0], (char **)args);
} else { // parent
close(fd[WRITE]);
execvp(args[0], (char **)args);
int bytes_read = read(fd[READ], buf, buf_size);
if (bytes_read > 0) {
buf[bytes_read - 1] = '\0';
}
/* We can't use pr_perror() here because the log file fd is already closed. */
fprintf(stderr, "execvp(\"%s\") failed: %s\n", args[0], strerror(errno));
// Clear out any of the remaining output in the pipe in case the buffer wasn't large enough
struct pollfd read_poll = { .fd = fd[READ], .events = POLLIN | POLLHUP };
while (true) {
int poll_status = poll(&read_poll, 1, -1);
if (poll_status == -1) {
close(fd[READ]);
pr_err("Unexpected error when clearing cuda-checkpoint output buffer\n");
return -1;
}
if (read_poll.revents & POLLHUP) {
break;
}
// POLLIN, read into scratch buffer to flush things out
char scratch[64];
bytes_read = read(fd[READ], scratch, sizeof(scratch));
}
int status;
if (waitpid(child_pid, &status, 0) == -1 || !WIFEXITED(status)) {
pr_err("cuda-checkpoint exited improperly, couldn't complete operation\n");
close(fd[READ]);
return -1;
}
close(fd[READ]);
return WEXITSTATUS(status);
_exit(EXIT_FAILURE);
}
close(fd[WRITE]);
buf_off = 0;
/* Reserve one byte for the terminating null character. */
buf_size--;
while (buf_off < buf_size) {
int bytes_read;
bytes_read = read(fd[READ], buf + buf_off, buf_size - buf_off);
if (bytes_read == -1) {
pr_perror("Unable to read output of cuda-checkpoint");
goto err;
}
if (bytes_read == 0)
break;
buf_off += bytes_read;
}
buf[buf_off] = '\0';
/* Clear out any of the remaining output in the pipe in case the buffer wasn't large enough */
while (true) {
char scratch[1024];
int bytes_read;
bytes_read = read(fd[READ], scratch, sizeof(scratch));
if (bytes_read == -1) {
pr_perror("Unable to read output of cuda-checkpoint");
goto err;
}
if (bytes_read == 0)
break;
}
close(fd[READ]);
int status, exit_code = -1;
if (waitpid(child_pid, &status, 0) == -1) {
pr_perror("Unable to wait for the cuda-checkpoint process %d", child_pid);
goto err;
}
if (WIFSIGNALED(status)) {
int sig = WTERMSIG(status);
pr_err("cuda-checkpoint unexpectedly signaled with %d: %s\n", sig, strsignal(sig));
} else if (WIFEXITED(status)) {
exit_code = WEXITSTATUS(status);
} else {
pr_err("cuda-checkpoint exited improperly: %u\n", status);
}
if (exit_code != EXIT_SUCCESS)
pr_debug("cuda-checkpoint output ===>\n%s\n"
"<=== cuda-checkpoint output\n",
buf);
return exit_code;
err:
kill(child_pid, SIGKILL);
waitpid(child_pid, NULL, 0);
return -1;
}
static bool cuda_checkpoint_supports_flag(const char *flag)
/**
* Checks if a given flag is supported by the cuda-checkpoint utility
*
* Returns:
* 1 if the flag is supported,
* 0 if the flag is not supported,
* -1 if there was an error launching the cuda-checkpoint utility.
*/
static int cuda_checkpoint_supports_flag(const char *flag)
{
char msg_buf[2048];
const char *args[] = { CUDA_CHECKPOINT, "-h", NULL };
int ret = launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf));
if (ret != 0) {
pr_err("Failed to launch cuda-checkpoint utility, check that the utility is present in your $PATH\n");
return false;
}
if (strstr(msg_buf, flag) == NULL) {
return false;
}
if (launch_cuda_checkpoint(args, msg_buf, sizeof(msg_buf)) != 0)
return -1;
return true;
if (strstr(msg_buf, flag) == NULL)
return 0;
return 1;
}
/* Retrieve the cuda restore thread TID from the root pid */
@ -419,7 +454,15 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, cuda_plugin_resume_
int cuda_plugin_init(int stage)
{
if (!cuda_checkpoint_supports_flag("--action")) {
int ret = cuda_checkpoint_supports_flag("--action");
if (ret == -1) {
pr_warn("check that %s is present in $PATH\n", CUDA_CHECKPOINT);
plugin_disabled = true;
return 0;
}
if (ret == 0) {
pr_warn("cuda-checkpoint --action flag not supported, an r555 or higher version driver is required. Disabling CUDA plugin\n");
plugin_disabled = true;
return 0;