mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 18:07:57 +00:00
To support Checkpoint Restore with AMDGPUs for ROCm workloads, introduce a new plugin to assist CRIU with the help of the AMD KFD kernel driver. This initial commit just provides the basic framework to build up further capabilities. Like CRIU, the amdgpu plugin also uses protobuf to serialize and save the amdkfd data, which is mostly VRAM contents with some metadata. We generate a data file "amdgpu-kfd-<id>.img" during the dump stage. On restore this file is read and extracted to re-create various types of buffer objects that belonged to the previously checkpointed process. Upon restore the mmap page offset within a device file might change, so we use the new hook to update and adjust the mmap offsets for the newly created target process. This is needed for the sys_mmap call in the pie restorer phase. Support for queues and events is added in future patches of this series. With the current implementation (amdgpu_plugin), we support: - Only compute workloads (Non-Gfx) are supported - GPU visible inside a container - AMD GPU Gfx 9 Family - Pytorch Benchmarks such as BERT Base The amdgpu plugin depends on libdrm and libdrm_amdgpu, which are typically installed with the libdrm-dev package. We build amdgpu_plugin only when the dependencies are met on the target system and when the user intends to install the amdgpu plugin, and not by default with the criu build. Suggested-by: Felix Kuehling <felix.kuehling@amd.com> Co-authored-by: David Yat Sin <david.yatsin@amd.com> Signed-off-by: David Yat Sin <david.yatsin@amd.com> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
931 lines
23 KiB
C
931 lines
23 KiB
C
#include <errno.h>
|
|
#include <fcntl.h>
|
|
#include <stdlib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
|
|
#include <linux/limits.h>
|
|
|
|
#include <sys/ioctl.h>
|
|
#include <sys/stat.h>
|
|
#include <sys/sysmacros.h>
|
|
#include <sys/mman.h>
|
|
#include <sys/types.h>
|
|
#include <stdint.h>
|
|
|
|
#include "criu-plugin.h"
|
|
#include "plugin.h"
|
|
#include "criu-amdgpu.pb-c.h"
|
|
|
|
#include "kfd_ioctl.h"
|
|
#include "xmalloc.h"
|
|
#include "criu-log.h"
|
|
|
|
#include "common/list.h"
|
|
|
|
#define DRM_FIRST_RENDER_NODE 128
|
|
#define DRM_LAST_RENDER_NODE 255
|
|
|
|
#define AMDGPU_KFD_DEVICE "/dev/kfd"
|
|
#define PROCPIDMEM "/proc/%d/mem"
|
|
|
|
#ifndef _GNU_SOURCE
|
|
#define _GNU_SOURCE 1
|
|
#endif
|
|
|
|
#ifdef LOG_PREFIX
|
|
#undef LOG_PREFIX
|
|
#endif
|
|
#define LOG_PREFIX "amdgpu_plugin: "
|
|
|
|
#ifdef DEBUG
|
|
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
|
|
#else
|
|
#define plugin_log_msg(fmt, ...) \
|
|
{ \
|
|
}
|
|
#endif
|
|
|
|
/*
 * Records the old -> new mmap page-offset translation for one device VMA.
 * Entries are queued on update_vma_info_list by restore_bo_data() and
 * consumed by the UPDATE_VMA_MAP hook (amdgpu_plugin_update_vmamap) so the
 * PIE restorer can re-create the mapping with the correct offset.
 */
struct vma_metadata {
	struct list_head list; /* linked into update_vma_info_list */
	uint64_t old_pgoff;    /* mmap offset at checkpoint time (bo_bucket->offset) */
	uint64_t new_pgoff;    /* mmap offset after restore (bo_bucket->restored_offset) */
	uint64_t vma_entry;    /* VMA start address (bo_bucket->addr) */
};

/* VMA translations collected during restore; matched by (addr, old_pgoff) */
static LIST_HEAD(update_vma_info_list);
|
|
|
|
int open_drm_render_device(int minor)
|
|
{
|
|
char path[128];
|
|
int fd;
|
|
|
|
if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
|
|
pr_perror("DRM render minor %d out of range [%d, %d]", minor, DRM_FIRST_RENDER_NODE,
|
|
DRM_LAST_RENDER_NODE);
|
|
return -EINVAL;
|
|
}
|
|
|
|
sprintf(path, "/dev/dri/renderD%d", minor);
|
|
fd = open(path, O_RDWR | O_CLOEXEC);
|
|
if (fd < 0) {
|
|
if (errno != ENOENT && errno != EPERM) {
|
|
pr_err("Failed to open %s: %s\n", path, strerror(errno));
|
|
if (errno == EACCES)
|
|
pr_err("Check user is in \"video\" group\n");
|
|
}
|
|
return -EBADFD;
|
|
}
|
|
|
|
return fd;
|
|
}
|
|
|
|
/*
 * Write @buf_len bytes from @buf into @file_path, created relative to the
 * CRIU image directory. Returns 0 on success, negative error code otherwise.
 */
int write_file(const char *file_path, const void *buf, const size_t buf_len)
{
	int fd;
	FILE *fp;
	size_t len_wrote;

	fd = openat(criu_get_image_dir(), file_path, O_WRONLY | O_CREAT, 0600);
	if (fd < 0) {
		pr_perror("Cannot open %s", file_path);
		return -errno;
	}

	fp = fdopen(fd, "w");
	if (!fp) {
		int err = errno; /* pr_perror may clobber errno */

		pr_perror("Cannot fdopen %s", file_path);
		/* fdopen failed, so fd was not adopted by the stream: close it here */
		close(fd);
		return -err;
	}

	len_wrote = fwrite(buf, 1, buf_len, fp);
	if (len_wrote != buf_len) {
		pr_perror("Unable to write %s (wrote:%ld buf_len:%ld)", file_path, len_wrote, buf_len);
		fclose(fp);
		return -EIO;
	}

	pr_info("Wrote file:%s (%ld bytes)\n", file_path, buf_len);
	/* this will also close fd */
	fclose(fp);
	return 0;
}
|
|
|
|
/*
 * Read exactly @buf_len bytes from @file_path (relative to the CRIU image
 * directory) into @buf. Returns 0 on success, negative error code otherwise.
 */
int read_file(const char *file_path, void *buf, const size_t buf_len)
{
	int fd;
	FILE *fp;
	size_t len_read;

	fd = openat(criu_get_image_dir(), file_path, O_RDONLY);
	if (fd < 0) {
		pr_perror("Cannot open %s", file_path);
		return -errno;
	}

	fp = fdopen(fd, "r");
	if (!fp) {
		int err = errno; /* pr_perror may clobber errno */

		pr_perror("Cannot fdopen %s", file_path);
		/* fdopen failed, so fd was not adopted by the stream: close it here */
		close(fd);
		return -err;
	}

	len_read = fread(buf, 1, buf_len, fp);
	if (len_read != buf_len) {
		pr_perror("Unable to read %s", file_path);
		fclose(fp);
		return -EIO;
	}

	pr_info("Read file:%s (%ld bytes)\n", file_path, buf_len);

	/* this will also close fd */
	fclose(fp);
	return 0;
}
|
|
|
|
/*
 * Issue an ioctl, retrying (up to 200 times) while it fails with EINTR or
 * EAGAIN. Returns the final ioctl() return value; errno is left as set by
 * the last attempt.
 */
int kmtIoctl(int fd, unsigned long request, void *arg)
{
	int retries_left = 200;
	int ret;

	for (;;) {
		ret = ioctl(fd, request, arg);
		if (ret != -1)
			break;
		if (retries_left-- <= 0)
			break;
		if (errno != EINTR && errno != EAGAIN)
			break;
	}

	if (ret == -1 && errno == EBADF)
		/* In case pthread_atfork didn't catch it, this will
		 * make any subsequent hsaKmt calls fail in CHECK_KFD_OPEN.
		 */
		pr_perror("KFD file descriptor not valid in this process");
	return ret;
}
|
|
|
|
/*
 * Free a CriuKfd protobuf message plus all BO and device sub-entries that
 * allocate_bo_entries()/allocate_device_entries() attached to it.
 * Safe to call with e == NULL: the dump error path can reach free_e()
 * before e was ever allocated (previously this dereferenced NULL).
 */
static void free_e(CriuKfd *e)
{
	if (!e)
		return;

	for (int i = 0; i < e->n_bo_entries; i++) {
		if (e->bo_entries[i]) {
			if (e->bo_entries[i]->rawdata.data)
				xfree(e->bo_entries[i]->rawdata.data);

			xfree(e->bo_entries[i]);
		}
	}

	for (int i = 0; i < e->n_device_entries; i++) {
		if (e->device_entries[i])
			xfree(e->device_entries[i]);
	}
	xfree(e);
}
|
|
|
|
/*
 * Allocate and init @num_of_devices DeviceEntry sub-messages on @e.
 * e->n_device_entries counts only fully-initialized slots, so free_e()
 * can safely release a partially-filled array after a mid-loop failure.
 * Returns 0 on success, -ENOMEM on allocation failure.
 */
static int allocate_device_entries(CriuKfd *e, int num_of_devices)
{
	/* xzalloc so unfilled slots are NULL instead of indeterminate pointers */
	e->device_entries = xzalloc(sizeof(DeviceEntry *) * num_of_devices);
	if (!e->device_entries) {
		pr_err("Failed to allocate device_entries\n");
		return -ENOMEM;
	}

	for (int i = 0; i < num_of_devices; i++) {
		DeviceEntry *entry = xzalloc(sizeof(*entry));

		if (!entry) {
			pr_err("Failed to allocate entry\n");
			return -ENOMEM;
		}

		device_entry__init(entry);

		e->device_entries[i] = entry;
		e->n_device_entries++;
	}
	return 0;
}
|
|
|
|
/*
 * Allocate and init @num_bos BoEntry sub-messages on @e. For VRAM/GTT BOs
 * (the only kinds whose contents get checkpointed) a rawdata buffer of the
 * BO's size is allocated as well.
 * e->n_bo_entries counts only fully-initialized slots so free_e() can
 * release a partially-filled array. Returns 0 or -ENOMEM.
 */
static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucket *bo_bucket_ptr)
{
	/* xzalloc so unfilled slots are NULL instead of indeterminate pointers */
	e->bo_entries = xzalloc(sizeof(BoEntry *) * num_bos);
	if (!e->bo_entries) {
		pr_err("Failed to allocate bo_info\n");
		return -ENOMEM;
	}

	for (int i = 0; i < num_bos; i++) {
		BoEntry *entry = xzalloc(sizeof(*entry));

		if (!entry) {
			pr_err("Failed to allocate botest\n");
			return -ENOMEM;
		}

		bo_entry__init(entry);

		if (bo_bucket_ptr[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM ||
		    bo_bucket_ptr[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
			entry->rawdata.data = xmalloc(bo_bucket_ptr[i].size);
			if (!entry->rawdata.data) {
				/* Previously unchecked: save_bos() would have copied into NULL */
				pr_err("Failed to allocate BO rawdata buffer\n");
				xfree(entry);
				return -ENOMEM;
			}
			entry->rawdata.len = bo_bucket_ptr[i].size;
		}

		e->bo_entries[i] = entry;
		e->n_bo_entries++;
	}
	return 0;
}
|
|
|
|
int amdgpu_plugin_init(int stage)
|
|
{
|
|
pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
|
|
|
|
return 0;
|
|
}
|
|
|
|
void amdgpu_plugin_fini(int stage, int ret)
|
|
{
|
|
pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
|
|
}
|
|
|
|
CR_PLUGIN_REGISTER("amdgpu_plugin", amdgpu_plugin_init, amdgpu_plugin_fini)
|
|
|
|
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
|
|
{
|
|
struct stat st_kfd, st_dri_min;
|
|
char img_path[128];
|
|
int ret = 0;
|
|
|
|
pr_debug("amdgpu_plugin: Enter %s\n", __func__);
|
|
ret = stat(AMDGPU_KFD_DEVICE, &st_kfd);
|
|
if (ret == -1) {
|
|
pr_perror("stat error for /dev/kfd");
|
|
return ret;
|
|
}
|
|
|
|
snprintf(img_path, sizeof(img_path), "/dev/dri/renderD%d", DRM_FIRST_RENDER_NODE);
|
|
|
|
ret = stat(img_path, &st_dri_min);
|
|
if (ret == -1) {
|
|
pr_perror("stat error for %s", img_path);
|
|
return ret;
|
|
}
|
|
|
|
if (major(st_buf->st_rdev) == major(st_kfd.st_rdev) || ((major(st_buf->st_rdev) == major(st_dri_min.st_rdev)) &&
|
|
(minor(st_buf->st_rdev) >= minor(st_dri_min.st_rdev) &&
|
|
minor(st_buf->st_rdev) >= DRM_FIRST_RENDER_NODE))) {
|
|
pr_debug("Known non-regular mapping, kfd-renderD%d -> OK\n", minor(st_buf->st_rdev));
|
|
pr_debug("AMD KFD(maj) = %d, DRI(maj,min) = %d:%d VMA Device fd(maj,min) = %d:%d\n",
|
|
major(st_kfd.st_rdev), major(st_dri_min.st_rdev), minor(st_dri_min.st_rdev),
|
|
major(st_buf->st_rdev), minor(st_buf->st_rdev));
|
|
/* VMA belongs to kfd */
|
|
return 0;
|
|
}
|
|
|
|
pr_perror("amdgpu_plugin: Can't handle the VMA mapping");
|
|
return -ENOTSUP;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
|
|
|
|
static int unpause_process(int fd)
|
|
{
|
|
int ret = 0;
|
|
struct kfd_ioctl_criu_args args = { 0 };
|
|
|
|
args.op = KFD_CRIU_OP_UNPAUSE;
|
|
|
|
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
|
|
if (ret) {
|
|
pr_perror("amdgpu_plugin: Failed to unpause process");
|
|
goto exit;
|
|
}
|
|
|
|
exit:
|
|
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_device_bucket *device_buckets,
|
|
CriuKfd *e)
|
|
{
|
|
int ret = 0;
|
|
|
|
pr_debug("Dumping %d devices\n", args->num_devices);
|
|
|
|
e->num_of_gpus = args->num_devices;
|
|
|
|
ret = allocate_device_entries(e, e->num_of_gpus);
|
|
if (ret) {
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
plugin_log_msg("Number of GPUs:%d\n", e->num_of_gpus);
|
|
|
|
exit:
|
|
pr_info("Dumped devices %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Serialize buffer-object metadata, and contents for VRAM/GTT BOs, into the
 * protobuf message. Contents are read either directly through an mmap of the
 * device file (host-visible "large BAR" BOs) or indirectly via
 * /proc/<pid>/mem. Returns 0 on success, negative code on failure.
 */
static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
	int ret = 0, i;
	char *fname = NULL;

	pr_debug("Dumping %d BOs\n", args->num_bos);

	e->num_of_bos = args->num_bos;
	ret = allocate_bo_entries(e, e->num_of_bos, bo_buckets);
	if (ret)
		goto exit;

	for (i = 0; i < e->num_of_bos; i++) {
		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
		BoEntry *boinfo = e->bo_entries[i];

		boinfo->gpu_id = bo_bucket->gpu_id;
		boinfo->addr = bo_bucket->addr;
		boinfo->size = bo_bucket->size;
		boinfo->offset = bo_bucket->offset;
		boinfo->alloc_flags = bo_bucket->alloc_flags;

		/* Only VRAM and GTT BOs carry contents that must be saved */
		if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM ||
		    bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
				void *addr;

				pr_info("amdgpu_plugin: large bar read possible\n");

				addr = mmap(NULL, boinfo->size, PROT_READ, MAP_SHARED, fd, boinfo->offset);
				if (addr == MAP_FAILED) {
					/* no "\n": pr_perror appends the errno string itself */
					pr_perror("amdgpu_plugin: mmap failed");
					ret = -errno;
					goto exit;
				}

				/* direct memcpy is possible on large bars */
				memcpy(boinfo->rawdata.data, addr, boinfo->size);
				munmap(addr, boinfo->size);
			} else {
				ssize_t bo_size;
				int mem_fd;

				pr_info("Now try reading BO contents with /proc/pid/mem\n");
				if (asprintf(&fname, PROCPIDMEM, args->pid) < 0) {
					/* fname is indeterminate after a failed asprintf: don't print it */
					pr_perror("failed in asprintf");
					ret = -1;
					goto exit;
				}

				mem_fd = open(fname, O_RDONLY);
				if (mem_fd < 0) {
					/* previously also close()d the invalid (negative) fd */
					pr_perror("Can't open %s for pid %d", fname, args->pid);
					free(fname);
					ret = -1;
					goto exit;
				}

				pr_info("Opened %s file for pid = %d\n", fname, args->pid);
				free(fname);

				if (lseek(mem_fd, (off_t)bo_bucket->addr, SEEK_SET) == -1) {
					pr_perror("Can't lseek for bo_offset for pid = %d", args->pid);
					close(mem_fd);
					ret = -1;
					goto exit;
				}

				/* read() returns ssize_t; keeping it in size_t hid the -1 case */
				bo_size = read(mem_fd, boinfo->rawdata.data, boinfo->size);
				if (bo_size < 0 || (uint64_t)bo_size != boinfo->size) {
					close(mem_fd);
					pr_perror("Can't read buffer");
					ret = -1;
					goto exit;
				}
				close(mem_fd);
			}
		}
	}
exit:
	pr_info("Dumped bos %s (ret:%d)\n", ret ? "failed" : "ok", ret);
	return ret;
}
|
|
|
|
int amdgpu_plugin_dump_file(int fd, int id)
|
|
{
|
|
struct kfd_ioctl_criu_args args = { 0 };
|
|
char img_path[PATH_MAX];
|
|
struct stat st, st_kfd;
|
|
unsigned char *buf;
|
|
CriuKfd *e = NULL;
|
|
int ret = 0;
|
|
size_t len;
|
|
|
|
if (fstat(fd, &st) == -1) {
|
|
pr_perror("amdgpu_plugin: fstat error");
|
|
return -1;
|
|
}
|
|
|
|
ret = stat(AMDGPU_KFD_DEVICE, &st_kfd);
|
|
if (ret == -1) {
|
|
pr_perror("amdgpu_plugin: fstat error for /dev/kfd");
|
|
return -1;
|
|
}
|
|
|
|
/* Check whether this plugin was called for kfd or render nodes */
|
|
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
|
/* This is RenderD dumper plugin, for now just save renderD
|
|
* minor number to be used during restore. In later phases this
|
|
* needs to save more data for video decode etc.
|
|
*/
|
|
|
|
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
|
|
|
|
pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev),
|
|
fd, id);
|
|
|
|
rd.minor_number = minor(st.st_rdev);
|
|
|
|
len = criu_render_node__get_packed_size(&rd);
|
|
buf = xmalloc(len);
|
|
if (!buf)
|
|
return -ENOMEM;
|
|
|
|
criu_render_node__pack(&rd, buf);
|
|
|
|
snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
|
|
ret = write_file(img_path, buf, len);
|
|
if (ret) {
|
|
xfree(buf);
|
|
return ret;
|
|
}
|
|
|
|
xfree(buf);
|
|
/* Need to return success here so that criu can call plugins for renderD nodes */
|
|
return ret;
|
|
}
|
|
|
|
pr_info("amdgpu_plugin: %s : %s() called for fd = %d\n", CR_PLUGIN_DESC.name, __func__, major(st.st_rdev));
|
|
|
|
args.op = KFD_CRIU_OP_PROCESS_INFO;
|
|
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
|
|
pr_perror("amdgpu_plugin: Failed to call process info ioctl");
|
|
ret = -1;
|
|
goto exit;
|
|
}
|
|
|
|
pr_info("amdgpu_plugin: devices:%d bos:%d objects:%d priv_data:%lld\n", args.num_devices, args.num_bos,
|
|
args.num_objects, args.priv_data_size);
|
|
|
|
e = xmalloc(sizeof(*e));
|
|
if (!e) {
|
|
pr_err("Failed to allocate proto structure\n");
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
criu_kfd__init(e);
|
|
e->pid = args.pid;
|
|
|
|
args.devices = (uintptr_t)xzalloc((args.num_devices * sizeof(struct kfd_criu_device_bucket)));
|
|
if (!args.devices) {
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
args.bos = (uintptr_t)xzalloc((args.num_bos * sizeof(struct kfd_criu_bo_bucket)));
|
|
if (!args.bos) {
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
args.priv_data = (uintptr_t)xzalloc((args.priv_data_size));
|
|
if (!args.priv_data) {
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
args.op = KFD_CRIU_OP_CHECKPOINT;
|
|
ret = kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args);
|
|
if (ret) {
|
|
pr_perror("amdgpu_plugin: Failed to call dumper (process) ioctl");
|
|
goto exit;
|
|
}
|
|
|
|
ret = save_devices(fd, &args, (struct kfd_criu_device_bucket *)args.devices, e);
|
|
if (ret)
|
|
goto exit;
|
|
|
|
ret = save_bos(fd, &args, (struct kfd_criu_bo_bucket *)args.bos, e);
|
|
if (ret)
|
|
goto exit;
|
|
|
|
e->num_of_objects = args.num_objects;
|
|
|
|
e->priv_data.data = (void *)args.priv_data;
|
|
e->priv_data.len = args.priv_data_size;
|
|
|
|
snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
|
|
pr_info("amdgpu_plugin: img_path = %s\n", img_path);
|
|
|
|
len = criu_kfd__get_packed_size(e);
|
|
|
|
pr_info("amdgpu_plugin: Len = %ld\n", len);
|
|
|
|
buf = xmalloc(len);
|
|
if (!buf) {
|
|
pr_perror("Failed to allocate memory to store protobuf");
|
|
ret = -ENOMEM;
|
|
goto exit;
|
|
}
|
|
|
|
criu_kfd__pack(e, buf);
|
|
|
|
ret = write_file(img_path, buf, len);
|
|
|
|
xfree(buf);
|
|
exit:
|
|
/* Restore all queues */
|
|
unpause_process(fd);
|
|
|
|
xfree((void *)args.devices);
|
|
xfree((void *)args.bos);
|
|
xfree((void *)args.priv_data);
|
|
|
|
free_e(e);
|
|
|
|
if (ret)
|
|
pr_err("amdgpu_plugin: Failed to dump (ret:%d)\n", ret);
|
|
else
|
|
pr_info("amdgpu_plugin: Dump successful\n");
|
|
|
|
return ret;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__DUMP_EXT_FILE, amdgpu_plugin_dump_file)
|
|
|
|
/* Restore per-device information */
|
|
static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
|
|
{
|
|
struct kfd_criu_device_bucket *device_buckets;
|
|
int ret = 0, bucket_index = 0;
|
|
|
|
pr_debug("Restoring %d devices\n", e->num_of_gpus);
|
|
|
|
args->num_devices = e->num_of_gpus;
|
|
device_buckets = xzalloc(sizeof(*device_buckets) * args->num_devices);
|
|
if (!device_buckets)
|
|
return -ENOMEM;
|
|
|
|
args->devices = (uintptr_t)device_buckets;
|
|
|
|
for (int i = 0; i < e->num_of_gpus; i++) {
|
|
struct kfd_criu_device_bucket *device_bucket;
|
|
DeviceEntry *devinfo = e->device_entries[i];
|
|
|
|
device_bucket = &device_buckets[bucket_index++];
|
|
|
|
device_bucket->user_gpu_id = devinfo->gpu_id;
|
|
|
|
device_bucket->drm_fd = open_drm_render_device(i + DRM_FIRST_RENDER_NODE);
|
|
if (device_bucket->drm_fd < 0) {
|
|
pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver");
|
|
goto exit;
|
|
} else {
|
|
pr_info("amdgpu_plugin: passing drm render fd = %d to driver\n", device_bucket->drm_fd);
|
|
}
|
|
}
|
|
|
|
exit:
|
|
pr_info("Restore devices %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
|
return ret;
|
|
}
|
|
|
|
/*
 * Rebuild the BO bucket array for the restore ioctl from the serialized
 * BO entries. Returns 0 on success, -ENOMEM on allocation failure.
 */
static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
{
	struct kfd_criu_bo_bucket *buckets;

	pr_debug("Restoring %ld BOs\n", e->num_of_bos);

	args->num_bos = e->num_of_bos;
	buckets = xzalloc(sizeof(*buckets) * args->num_bos);
	if (!buckets)
		return -ENOMEM;
	args->bos = (uintptr_t)buckets;

	for (int i = 0; i < args->num_bos; i++) {
		BoEntry *entry = e->bo_entries[i];
		struct kfd_criu_bo_bucket *bucket = &buckets[i];

		bucket->gpu_id = entry->gpu_id;
		bucket->addr = entry->addr;
		bucket->size = entry->size;
		bucket->offset = entry->offset;
		bucket->alloc_flags = entry->alloc_flags;

		plugin_log_msg("BO [%d] gpu_id:%x addr:%llx size:%llx offset:%llx\n", i, bucket->gpu_id,
			       bucket->addr, bucket->size, bucket->offset);
	}

	pr_info("Restore BOs Ok\n");
	return 0;
}
|
|
|
|
/*
 * Restore the contents of VRAM/GTT buffer objects and queue the
 * old->new mmap-offset translations needed by the UPDATE_VMA_MAP hook.
 *
 * Returns 0 on success, negative error code on failure. (Previously every
 * error path did a dead store into the local 'fd' and returned 0, silently
 * swallowing failures.)
 */
static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
	int mem_fd = -1;
	int ret = 0;

	for (int i = 0; i < e->num_of_bos; i++) {
		void *addr;

		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
		BoEntry *bo_entry = e->bo_entries[i];

		if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT |
					      KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
			struct vma_metadata *vma_md;

			vma_md = xmalloc(sizeof(*vma_md));
			if (!vma_md)
				return -ENOMEM;

			vma_md->old_pgoff = bo_bucket->offset;
			vma_md->vma_entry = bo_bucket->addr;
			vma_md->new_pgoff = bo_bucket->restored_offset;

			/* Bug fix: the old message referenced vma_md->new_minor,
			 * a field struct vma_metadata does not have; it only
			 * compiled because plugin_log_msg is a no-op without DEBUG.
			 */
			plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx "
				       "new_off:0x%lx\n",
				       vma_md->vma_entry, vma_md->old_pgoff, vma_md->new_pgoff);

			list_add_tail(&vma_md->list, &update_vma_info_list);
		}

		if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
			pr_info("amdgpu_plugin: Trying mmap in stage 2\n");
			if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC ||
			    bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
				plugin_log_msg("amdgpu_plugin: large bar write possible\n");
				addr = mmap(NULL, bo_bucket->size, PROT_WRITE, MAP_SHARED, fd,
					    bo_bucket->restored_offset);
				if (addr == MAP_FAILED) {
					pr_perror("amdgpu_plugin: mmap failed");
					ret = -EBADFD;
					goto exit;
				}

				/* direct memcpy is possible on large bars */
				memcpy(addr, (void *)bo_entry->rawdata.data, bo_entry->size);
				munmap(addr, bo_entry->size);
			} else {
				ssize_t bo_size;
				char *fname;
				/* Use indirect host data path via /proc/pid/mem
				 * on small pci bar GPUs or for Buffer Objects
				 * that don't have HostAccess permissions.
				 */
				plugin_log_msg("amdgpu_plugin: using PROCPIDMEM to restore BO contents\n");
				addr = mmap(NULL, bo_bucket->size, PROT_NONE, MAP_SHARED, fd,
					    bo_bucket->restored_offset);
				if (addr == MAP_FAILED) {
					pr_perror("amdgpu_plugin: mmap failed");
					ret = -EBADFD;
					goto exit;
				}

				if (asprintf(&fname, PROCPIDMEM, e->pid) < 0) {
					/* fname is indeterminate after a failed asprintf: don't print it */
					pr_perror("failed in asprintf");
					munmap(addr, bo_bucket->size);
					ret = -EBADFD;
					goto exit;
				}

				mem_fd = open(fname, O_RDWR);
				if (mem_fd < 0) {
					pr_perror("Can't open %s for pid %d", fname, e->pid);
					free(fname);
					munmap(addr, bo_bucket->size);
					ret = -EBADFD;
					goto exit;
				}

				plugin_log_msg("Opened %s file for pid = %d", fname, e->pid);
				free(fname);

				/* NOTE(review): seeks to this process's mapping
				 * address within the target's /proc/<pid>/mem —
				 * relies on the plugin running in the restored
				 * task's address space; confirm against caller.
				 */
				if (lseek(mem_fd, (off_t)addr, SEEK_SET) == -1) {
					pr_perror("Can't lseek for bo_offset for pid = %d", e->pid);
					munmap(addr, bo_entry->size);
					ret = -EBADFD;
					goto exit;
				}

				plugin_log_msg("Attempt writing now");
				/* write() returns ssize_t; keeping it in size_t hid the -1 case */
				bo_size = write(mem_fd, bo_entry->rawdata.data, bo_entry->size);
				if (bo_size < 0 || (uint64_t)bo_size != bo_entry->size) {
					pr_perror("Can't write buffer");
					munmap(addr, bo_entry->size);
					ret = -EBADFD;
					goto exit;
				}
				munmap(addr, bo_entry->size);
				close(mem_fd);
				/* prevent a double close of this fd at exit */
				mem_fd = -1;
			}
		} else {
			plugin_log_msg("Not a VRAM BO\n");
			continue;
		}
	}

exit:
	/* >= 0, not > 0: fd 0 is a valid descriptor */
	if (mem_fd >= 0)
		close(mem_fd);

	return ret;
}
|
|
|
|
int amdgpu_plugin_restore_file(int id)
|
|
{
|
|
int ret = 0, fd;
|
|
char img_path[PATH_MAX];
|
|
struct stat filestat;
|
|
unsigned char *buf;
|
|
CriuRenderNode *rd;
|
|
CriuKfd *e = NULL;
|
|
struct kfd_ioctl_criu_args args = { 0 };
|
|
|
|
pr_info("amdgpu_plugin: Initialized kfd plugin restorer with ID = %d\n", id);
|
|
|
|
snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
|
|
|
|
if (stat(img_path, &filestat) == -1) {
|
|
pr_perror("open(%s)", img_path);
|
|
/* This is restorer plugin for renderD nodes. Since criu doesn't
|
|
* gurantee that they will be called before the plugin is called
|
|
* for kfd file descriptor, we need to make sure we open the render
|
|
* nodes only once and before /dev/kfd is open, the render nodes
|
|
* are open too. Generally, it is seen that during checkpoint and
|
|
* restore both, the kfd plugin gets called first.
|
|
*/
|
|
snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
|
|
|
|
if (stat(img_path, &filestat) == -1) {
|
|
pr_perror("Failed to read file stats");
|
|
return -1;
|
|
}
|
|
pr_info("renderD file size on disk = %ld\n", filestat.st_size);
|
|
|
|
buf = xmalloc(filestat.st_size);
|
|
if (!buf) {
|
|
pr_perror("Failed to allocate memory");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
if (read_file(img_path, buf, filestat.st_size)) {
|
|
pr_perror("Unable to read from %s", img_path);
|
|
xfree(buf);
|
|
return -1;
|
|
}
|
|
|
|
rd = criu_render_node__unpack(NULL, filestat.st_size, buf);
|
|
if (rd == NULL) {
|
|
pr_perror("Unable to parse the KFD message %d", id);
|
|
xfree(buf);
|
|
return -1;
|
|
}
|
|
|
|
pr_info("amdgpu_plugin: render node minor num = %d\n", rd->minor_number);
|
|
fd = open_drm_render_device(rd->minor_number);
|
|
criu_render_node__free_unpacked(rd, NULL);
|
|
xfree(buf);
|
|
return fd;
|
|
}
|
|
|
|
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
|
|
if (fd < 0) {
|
|
pr_perror("failed to open kfd in plugin");
|
|
return -1;
|
|
}
|
|
|
|
pr_info("amdgpu_plugin: Opened kfd, fd = %d\n", fd);
|
|
|
|
pr_info("kfd img file size on disk = %ld\n", filestat.st_size);
|
|
|
|
buf = xmalloc(filestat.st_size);
|
|
if (!buf) {
|
|
pr_perror("Failed to allocate memory");
|
|
return -ENOMEM;
|
|
}
|
|
|
|
if (read_file(img_path, buf, filestat.st_size)) {
|
|
pr_perror("Unable to read from %s", img_path);
|
|
xfree(buf);
|
|
return -1;
|
|
}
|
|
e = criu_kfd__unpack(NULL, filestat.st_size, buf);
|
|
if (e == NULL) {
|
|
pr_err("Unable to parse the KFD message %#x\n", id);
|
|
xfree(buf);
|
|
return -1;
|
|
}
|
|
|
|
plugin_log_msg("amdgpu_plugin: read image file data\n");
|
|
|
|
ret = restore_devices(&args, e);
|
|
if (ret)
|
|
goto exit;
|
|
|
|
ret = restore_bos(&args, e);
|
|
if (ret)
|
|
goto exit;
|
|
|
|
args.num_objects = e->num_of_objects;
|
|
args.priv_data_size = e->priv_data.len;
|
|
args.priv_data = (uintptr_t)e->priv_data.data;
|
|
|
|
args.op = KFD_CRIU_OP_RESTORE;
|
|
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
|
|
pr_perror("Restore ioctl failed");
|
|
ret = -1;
|
|
goto exit;
|
|
}
|
|
|
|
ret = restore_bo_data(fd, (struct kfd_criu_bo_bucket *)args.bos, e);
|
|
if (ret)
|
|
goto exit;
|
|
|
|
exit:
|
|
if (e)
|
|
criu_kfd__free_unpacked(e, NULL);
|
|
|
|
xfree((void *)args.devices);
|
|
xfree((void *)args.bos);
|
|
xfree(buf);
|
|
|
|
if (ret) {
|
|
pr_err("amdgpu_plugin: Failed to restore (ret:%d)\n", ret);
|
|
fd = ret;
|
|
} else {
|
|
pr_info("amdgpu_plugin: Restore successful (fd:%d)\n", fd);
|
|
}
|
|
|
|
return fd;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, amdgpu_plugin_restore_file)
|
|
|
|
/* return 0 if no match found
|
|
* return -1 for error.
|
|
* return 1 if vmap map must be adjusted.
|
|
*/
|
|
int amdgpu_plugin_update_vmamap(const char *path, const uint64_t addr, const uint64_t old_offset, uint64_t *new_offset,
|
|
int *updated_fd)
|
|
{
|
|
struct vma_metadata *vma_md;
|
|
|
|
plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__);
|
|
|
|
/*
|
|
* On newer versions of AMD KFD driver, only the file descriptor that was used to open the
|
|
* device can be used for mmap, so we will have to return the proper file descriptor here
|
|
*/
|
|
*updated_fd = -1;
|
|
|
|
list_for_each_entry(vma_md, &update_vma_info_list, list) {
|
|
if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) {
|
|
*new_offset = vma_md->new_pgoff;
|
|
|
|
plugin_log_msg("amdgpu_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx path = %s\n",
|
|
vma_md->old_pgoff, vma_md->new_pgoff, path);
|
|
|
|
return 1;
|
|
}
|
|
}
|
|
pr_info("No match for addr:0x%lx offset:%lx\n", addr, old_offset);
|
|
return 0;
|
|
}
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, amdgpu_plugin_update_vmamap)
|
|
|
|
int amdgpu_plugin_resume_devices_late(int target_pid)
|
|
{
|
|
struct kfd_ioctl_criu_args args = { 0 };
|
|
int fd, ret = 0;
|
|
|
|
pr_info("amdgpu_plugin: Inside %s for target pid = %d\n", __func__, target_pid);
|
|
|
|
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
|
|
if (fd < 0) {
|
|
pr_perror("failed to open kfd in plugin");
|
|
return -1;
|
|
}
|
|
|
|
args.pid = target_pid;
|
|
args.op = KFD_CRIU_OP_RESUME;
|
|
pr_info("amdgpu_plugin: Calling IOCTL to start notifiers and queues\n");
|
|
if (kmtIoctl(fd, AMDKFD_IOC_CRIU_OP, &args) == -1) {
|
|
pr_perror("restore late ioctl failed");
|
|
ret = -1;
|
|
}
|
|
|
|
close(fd);
|
|
return ret;
|
|
}
|
|
|
|
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)
|