mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

criu/plugin: Remap GPUs on checkpoint restore

The device topology on the restore node can be different from the
topology on the checkpointed node. The GPUs on the restore node may
have different gpu_ids or minor numbers, and some GPUs may have
different properties than on the checkpointed node. During restore,
the CRIU plugin determines the target GPUs so that the restore does
not fail by trying to restore the process on a GPU that does not
match the one it was checkpointed on.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
David Yat Sin 2022-02-15 21:41:05 -05:00 committed by Andrei Vagin
parent 6e99fea2fa
commit 72905c9c9b
5 changed files with 827 additions and 28 deletions
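
The remapping is driven by a small source-to-destination id table, struct device_maps, whose helpers (maps_init(), maps_add_gpu_entry(), maps_get_dest_gpu(), maps_free()) are added below in the topology code. A minimal usage sketch, not part of the commit, with made-up gpu_id values:

struct device_maps maps;
uint32_t actual_gpu_id;

maps_init(&maps);
/* record that checkpointed GPU 0xEF50 maps onto local GPU 0xAC10 (illustrative ids) */
if (!maps_add_gpu_entry(&maps, 0xEF50, 0xAC10))
	return -ENOMEM;	/* allocation failure */
/* translate an id read from the image; a return of 0 means "no mapping" */
actual_gpu_id = maps_get_dest_gpu(&maps, 0xEF50);	/* yields 0xAC10 */
maps_free(&maps);

During dump, checkpoint_maps translates a device's actual_gpu_id back to the user_gpu_id recorded in the image; during restore, restore_maps is filled by set_restore_gpu_maps(), which matches the checkpointed topology against the local one.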

View File

@@ -9,8 +9,8 @@ userspace for AMD GPUs.
CURRENT SUPPORT
---------------
Single GPU systems (Gfx9)
Checkpoint / Restore on same system
Single and Multi GPU systems (Gfx9)
Checkpoint / Restore on different system
Checkpoint / Restore inside a docker container
Pytorch

View File

@@ -53,6 +53,8 @@ struct vma_metadata {
uint64_t old_pgoff;
uint64_t new_pgoff;
uint64_t vma_entry;
uint32_t new_minor;
int fd;
};
/************************************ Global Variables ********************************************/
@@ -257,7 +259,9 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE
devinfo->node_id = node->id;
if (NODE_IS_GPU(node)) {
devinfo->gpu_id = node->gpu_id;
devinfo->gpu_id = maps_get_dest_gpu(maps, node->gpu_id);
if (!devinfo->gpu_id)
return -EINVAL;
devinfo->simd_count = node->simd_count;
devinfo->mem_banks_count = node->mem_banks_count;
@@ -378,6 +382,8 @@ int amdgpu_plugin_init(int stage)
topology_init(&src_topology);
topology_init(&dest_topology);
maps_init(&checkpoint_maps);
maps_init(&restore_maps);
return 0;
}
@@ -386,6 +392,9 @@ void amdgpu_plugin_fini(int stage, int ret)
{
pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
maps_free(&checkpoint_maps);
maps_free(&restore_maps);
topology_free(&src_topology);
topology_free(&dest_topology);
}
@@ -455,6 +464,15 @@ static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_cri
pr_debug("Dumping %d devices\n", args->num_devices);
/* When checkpointing on a node where there was already a checkpoint-restore before, the
* user_gpu_id and actual_gpu_id will be different.
*
* We store the user_gpu_id in the stored image files so that the stored images always have
* the gpu_id's of the node where the application was first launched.
*/
for (int i = 0; i < args->num_devices; i++)
maps_add_gpu_entry(&checkpoint_maps, device_buckets[i].actual_gpu_id, device_buckets[i].user_gpu_id);
e->num_of_gpus = args->num_devices;
e->num_of_cpus = src_topology.num_nodes - args->num_devices;
@@ -638,11 +656,21 @@ int amdgpu_plugin_dump_file(int fd, int id)
*/
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
struct tp_node *tp_node;
pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev),
fd, id);
rd.minor_number = minor(st.st_rdev);
tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
if (!tp_node) {
pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev));
return -ENODEV;
}
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
if (!rd.gpu_id)
return -ENODEV;
len = criu_render_node__get_packed_size(&rd);
buf = xmalloc(len);
@@ -787,6 +815,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) {
struct kfd_criu_device_bucket *device_bucket;
DeviceEntry *devinfo = e->device_entries[entries_i];
struct tp_node *tp_node;
if (!devinfo->gpu_id)
continue;
@@ -794,8 +823,19 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
device_bucket = &device_buckets[bucket_index++];
device_bucket->user_gpu_id = devinfo->gpu_id;
device_bucket->actual_gpu_id = maps_get_dest_gpu(&restore_maps, devinfo->gpu_id);
if (!device_bucket->actual_gpu_id) {
ret = -ENODEV;
goto exit;
}
device_bucket->drm_fd = open_drm_render_device(bucket_index + DRM_FIRST_RENDER_NODE);
tp_node = sys_get_node_by_gpu_id(&dest_topology, device_bucket->actual_gpu_id);
if (!tp_node) {
ret = -ENODEV;
goto exit;
}
device_bucket->drm_fd = node_get_drm_render_device(tp_node);
if (device_bucket->drm_fd < 0) {
pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver");
goto exit;
@@ -842,25 +882,42 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
int mem_fd = -1;
int mem_fd = -1, ret = 0;
for (int i = 0; i < e->num_of_bos; i++) {
void *addr;
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
struct tp_node *tp_node;
BoEntry *bo_entry = e->bo_entries[i];
if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT |
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
struct vma_metadata *vma_md;
uint32_t target_gpu_id; /* actual gpu_id where the BO will be restored */
vma_md = xmalloc(sizeof(*vma_md));
if (!vma_md)
return -ENOMEM;
if (!vma_md) {
ret = -ENOMEM;
goto exit;
}
memset(vma_md, 0, sizeof(*vma_md));
vma_md->old_pgoff = bo_bucket->offset;
vma_md->vma_entry = bo_bucket->addr;
target_gpu_id = maps_get_dest_gpu(&restore_maps, bo_bucket->gpu_id);
tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!tp_node) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit;
}
vma_md->new_minor = tp_node->drm_render_minor;
vma_md->new_pgoff = bo_bucket->restored_offset;
vma_md->fd = node_get_drm_render_device(tp_node);
plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx "
"new_off:0x%lx new_minor:%d\n",
@@ -948,7 +1005,7 @@ exit:
if (mem_fd > 0)
close(mem_fd);
return 0;
return ret;
}
int amdgpu_plugin_restore_file(int id)
@@ -966,15 +1023,16 @@ int amdgpu_plugin_restore_file(int id)
snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
if (stat(img_path, &filestat) == -1) {
struct tp_node *tp_node;
uint32_t target_gpu_id;
pr_perror("open(%s)", img_path);
/* This is restorer plugin for renderD nodes. Since criu doesn't
* gurantee that they will be called before the plugin is called
* for kfd file descriptor, we need to make sure we open the render
* nodes only once and before /dev/kfd is open, the render nodes
* are open too. Generally, it is seen that during checkpoint and
* restore both, the kfd plugin gets called first.
/* This is restorer plugin for renderD nodes. Criu doesn't guarantee that they will
* be called before the plugin is called for kfd file descriptor.
* TODO: Currently, this code will only work if this function is called for /dev/kfd
* first as we assume restore_maps is already filled. Need to fix this later.
*/
snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
if (stat(img_path, &filestat) == -1) {
pr_perror("Failed to read file stats");
@@ -1001,8 +1059,26 @@ int amdgpu_plugin_restore_file(int id)
return -1;
}
pr_info("amdgpu_plugin: render node minor num = %d\n", rd->minor_number);
fd = open_drm_render_device(rd->minor_number);
pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id);
target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id);
if (!target_gpu_id) {
fd = -ENODEV;
goto fail;
}
tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!tp_node) {
fd = -ENODEV;
goto fail;
}
pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
fd = node_get_drm_render_device(tp_node);
if (fd < 0)
pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
fail:
criu_render_node__free_unpacked(rd, NULL);
xfree(buf);
return fd;
@@ -1054,6 +1130,12 @@ int amdgpu_plugin_restore_file(int id)
goto exit;
}
ret = set_restore_gpu_maps(&src_topology, &dest_topology, &restore_maps);
if (ret) {
pr_err("Failed to map GPUs\n");
goto exit;
}
ret = restore_devices(&args, e);
if (ret)
goto exit;
@@ -1078,6 +1160,8 @@ int amdgpu_plugin_restore_file(int id)
goto exit;
exit:
sys_close_drm_render_devices(&dest_topology);
if (e)
criu_kfd__free_unpacked(e, NULL);
@@ -1100,25 +1184,55 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, amdgpu_plugin_restore_
* return -1 for error.
* return 1 if vmap map must be adjusted.
*/
int amdgpu_plugin_update_vmamap(const char *path, const uint64_t addr, const uint64_t old_offset, uint64_t *new_offset,
int *updated_fd)
int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const uint64_t old_offset,
uint64_t *new_offset, int *updated_fd)
{
struct vma_metadata *vma_md;
char path[PATH_MAX];
char *p_begin;
char *p_end;
bool is_kfd = false, is_renderD = false;
plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__);
strncpy(path, in_path, sizeof(path));
p_begin = path;
p_end = p_begin + strlen(path);
/*
* On newer versions of AMD KFD driver, only the file descriptor that was used to open the
* device can be used for mmap, so we will have to return the proper file descriptor here
* Paths sometimes have double forward slashes (e.g //dev/dri/renderD*)
* replace all '//' with '/'.
*/
*updated_fd = -1;
while (p_begin < p_end - 1) {
if (*p_begin == '/' && *(p_begin + 1) == '/')
memmove(p_begin, p_begin + 1, p_end - p_begin);
else
p_begin++;
}
if (!strncmp(path, "/dev/dri/renderD", strlen("/dev/dri/renderD")))
is_renderD = true;
if (!strcmp(path, AMDGPU_KFD_DEVICE))
is_kfd = true;
if (!is_renderD && !is_kfd) {
pr_info("Skipping unsupported path:%s addr:%lx old_offset:%lx\n", in_path, addr, old_offset);
return 0;
}
list_for_each_entry(vma_md, &update_vma_info_list, list) {
if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) {
*new_offset = vma_md->new_pgoff;
plugin_log_msg("amdgpu_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx path = %s\n",
vma_md->old_pgoff, vma_md->new_pgoff, path);
if (is_renderD)
*updated_fd = vma_md->fd;
else
*updated_fd = -1;
plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff,
vma_md->new_pgoff, *updated_fd);
return 1;
}
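
As the comment above notes, the function returns -1 on error, 0 when the path is not one the plugin handles, and 1 when the mapping must be adjusted. A rough caller-side sketch of that contract; vma_path, vma_start, vma_pgoff and vma_fd are placeholder names, not CRIU internals:

uint64_t new_off = vma_pgoff;
int new_fd = -1;
int adjust = amdgpu_plugin_update_vmamap(vma_path, vma_start, vma_pgoff, &new_off, &new_fd);

if (adjust < 0)
	return -1;	/* plugin error */
if (adjust > 0) {
	vma_pgoff = new_off;	/* BO was restored at a new offset */
	if (new_fd >= 0)
		vma_fd = new_fd;	/* renderD mappings must use the per-GPU drm fd */
}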

View File

@@ -32,6 +32,31 @@
}
#endif
static int open_drm_render_device(int minor)
{
char path[128];
int fd;
if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
pr_perror("DRM render minor %d out of range [%d, %d]", minor, DRM_FIRST_RENDER_NODE,
DRM_LAST_RENDER_NODE);
return -EINVAL;
}
snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);
fd = open(path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
if (errno != ENOENT && errno != EPERM) {
pr_err("Failed to open %s: %s\n", path, strerror(errno));
if (errno == EACCES)
pr_err("Check user is in \"video\" group\n");
}
return -EBADFD;
}
return fd;
}
static const char *link_type(uint32_t type)
{
switch (type) {
@@ -54,6 +79,38 @@ static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *gro
return NULL;
}
int node_get_drm_render_device(struct tp_node *node)
{
if (node->drm_fd < 0)
node->drm_fd = open_drm_render_device(node->drm_render_minor);
return node->drm_fd;
}
void sys_close_drm_render_devices(struct tp_system *sys)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (node->drm_fd >= 0) {
close(node->drm_fd);
node->drm_fd = -1;
}
}
}
static struct tp_iolink *node_get_iolink_to_node_id(const struct tp_node *node, const uint32_t type,
const uint32_t node_id)
{
struct tp_iolink *iolink;
list_for_each_entry(iolink, &node->iolinks, listm) {
if (iolink->node_to_id == node_id && iolink->type == type)
return iolink;
}
return NULL;
}
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor)
{
struct tp_node *node;
@@ -114,6 +171,167 @@ static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, con
return NULL;
}
static bool maps_dest_cpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->cpu_maps, listm) {
if (id_map->dest == dest_id)
return true;
}
return false;
}
static uint32_t maps_get_dest_cpu(const struct device_maps *maps, const uint32_t src_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->cpu_maps, listm) {
if (id_map->src == src_id)
return id_map->dest;
}
return INVALID_CPU_ID;
}
bool maps_dest_gpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->gpu_maps, listm) {
if (id_map->dest == dest_id)
return true;
}
return false;
}
uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->gpu_maps, listm) {
if (id_map->src == src_id)
return id_map->dest;
}
return 0;
}
static struct id_map *maps_add_cpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
{
struct id_map *id_map = xzalloc(sizeof(*id_map));
if (!id_map) {
pr_err("Failed to allocate memory for id_map\n");
return NULL;
}
id_map->src = src_id;
id_map->dest = dest_id;
list_add_tail(&id_map->listm, &maps->cpu_maps);
maps->tail_cpu = &id_map->listm;
pr_debug("Added CPU mapping [%02d -> %02d]\n", src_id, dest_id);
return id_map;
}
struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
{
struct id_map *id_map = xzalloc(sizeof(*id_map));
if (!id_map) {
pr_err("Failed to allocate memory for id_map\n");
return NULL;
}
id_map->src = src_id;
id_map->dest = dest_id;
list_add_tail(&id_map->listm, &maps->gpu_maps);
maps->tail_gpu = &id_map->listm;
pr_debug("Added GPU mapping [0x%04X -> 0x%04X]\n", src_id, dest_id);
return id_map;
}
static void maps_print(struct device_maps *maps)
{
struct id_map *id_map;
pr_info("===Maps===============\n");
list_for_each_entry(id_map, &maps->gpu_maps, listm)
pr_info("GPU: 0x%04X -> 0x%04X\n", id_map->src, id_map->dest);
list_for_each_entry(id_map, &maps->cpu_maps, listm)
pr_info("CPU: %02d -> %02d\n", id_map->src, id_map->dest);
pr_info("======================\n");
}
void maps_init(struct device_maps *maps)
{
INIT_LIST_HEAD(&maps->cpu_maps);
INIT_LIST_HEAD(&maps->gpu_maps);
maps->tail_cpu = 0;
maps->tail_gpu = 0;
}
void maps_free(struct device_maps *maps)
{
while (!list_empty(&maps->cpu_maps)) {
struct id_map *map = list_first_entry(&maps->cpu_maps, struct id_map, listm);
list_del(&map->listm);
xfree(map);
}
while (!list_empty(&maps->gpu_maps)) {
struct id_map *map = list_first_entry(&maps->gpu_maps, struct id_map, listm);
list_del(&map->listm);
xfree(map);
}
}
static void maps_pop(struct device_maps *maps, struct device_maps *remove)
{
if (remove->tail_cpu)
list_cut_position(&remove->cpu_maps, &maps->cpu_maps, remove->tail_cpu);
if (remove->tail_gpu)
list_cut_position(&remove->gpu_maps, &maps->gpu_maps, remove->tail_gpu);
maps_free(remove);
}
static int maps_push(struct device_maps *maps, struct device_maps *new)
{
struct id_map *src_id_map, *dest_id_map;
list_for_each_entry(src_id_map, &new->cpu_maps, listm) {
list_for_each_entry(dest_id_map, &maps->cpu_maps, listm) {
if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
pr_err("CPU mapping already exists src [%02d->%02d] new [%02d->%02d]\n",
src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
return -EINVAL;
}
}
}
list_for_each_entry(src_id_map, &new->gpu_maps, listm) {
list_for_each_entry(dest_id_map, &maps->gpu_maps, listm) {
if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
pr_err("GPU mapping already exists src [0x%04X -> 0x%04X] new [0x%04X -> 0x%04X]\n",
src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
return -EINVAL;
}
}
}
list_splice(&new->cpu_maps, &maps->cpu_maps);
list_splice(&new->gpu_maps, &maps->gpu_maps);
return 0;
}
struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id)
{
struct tp_iolink *iolink = xzalloc(sizeof(*iolink));
@@ -475,6 +693,19 @@ static const char *p2pgroup_to_str(struct tp_p2pgroup *group)
return topology_printstr;
}
static const char *mapping_list_to_str(struct list_head *node_list)
{
static char topology_printstr[200];
struct tp_node *node;
size_t str_len = 0;
topology_printstr[0] = '\0';
list_for_each_entry(node, node_list, listm_mapping)
str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id);
return topology_printstr;
}
void topology_print(const struct tp_system *sys, const char *message)
{
struct tp_node *node;
@@ -717,4 +948,449 @@ int topology_parse(struct tp_system *sys, const char *message)
fail:
topology_free(sys);
return ret;
}
}
static bool device_properties_match(struct tp_node *src, struct tp_node *dest)
{
if (src->simd_count == dest->simd_count && src->mem_banks_count == dest->mem_banks_count &&
src->io_links_count == dest->io_links_count && src->max_waves_per_simd == dest->max_waves_per_simd &&
src->lds_size_in_kb == dest->lds_size_in_kb && src->wave_front_size == dest->wave_front_size &&
src->array_count == dest->array_count && src->simd_arrays_per_engine == dest->simd_arrays_per_engine &&
src->cu_per_simd_array == dest->cu_per_simd_array && src->simd_per_cu == dest->simd_per_cu &&
src->max_slots_scratch_cu == dest->max_slots_scratch_cu && src->vendor_id == dest->vendor_id &&
src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines &&
src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines &&
src->num_sdma_queues_per_engine == dest->num_sdma_queues_per_engine &&
src->num_cp_queues == dest->num_cp_queues && src->capability == dest->capability &&
src->vram_public == dest->vram_public && src->vram_size <= dest->vram_size &&
src->num_gws <= dest->num_gws && src->caches_count <= dest->caches_count &&
src->fw_version <= dest->fw_version && src->sdma_fw_version <= dest->sdma_fw_version) {
return true;
}
return false;
}
/**
* @brief Determines whether iolink dest can be used to replace src
*
* @param src source iolink
* @param dest destination iolink
* @return true if dest can replace src
*/
static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest)
{
if (!src->valid)
return true;
if (!dest->valid)
return false;
if (NODE_IS_GPU(src->node_to) != NODE_IS_GPU(dest->node_to))
return false;
/* An XGMI link can replace a PCIe link, but a PCIe link cannot replace an XGMI link */
if (src->type == TOPO_IOLINK_TYPE_XGMI && dest->type == TOPO_IOLINK_TYPE_PCIE)
return false;
/* bi-directional links can replace uni-directional links */
if (src->peer != NULL && dest->peer == NULL)
return false;
return true;
}
/**
* @brief Determines whether src_node can be mapped to dest_node
*
* Node compatibility is determined by:
* 1. Comparing the node properties
* 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps
*
* If src_node and dest_node are mappable, then map_device will push the new mapping
* for src_node -> dest_node into new_maps.
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_node source GPU
* @param dest_node destination GPU
* @param maps list of existing device maps
* @param new_maps if nodes are mappable, then GPU and CPU mappings will be added to this list
* @return true if src_node and dest_node are mappable
*/
static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, struct tp_node *src_node,
struct tp_node *dest_node, struct device_maps *maps, struct device_maps *new_maps)
{
struct tp_iolink *src_iolink;
pr_debug("Evaluating mapping nodes [0x%04X -> 0x%04X]\n", src_node->gpu_id, dest_node->gpu_id);
/* Compare GPU properties from /sys/class/kfd/kfd/topology/nodes/N/properties */
if (!device_properties_match(src_node, dest_node)) {
pr_debug("[0x%04X -> 0x%04X] Device properties do not match\n", src_node->gpu_id, dest_node->gpu_id);
return false;
}
if (src_node->num_valid_iolinks > dest_node->num_valid_iolinks) {
pr_debug("[0x%04X -> 0x%04X] Mismatch between number of iolinks\n", src_node->gpu_id,
dest_node->gpu_id);
return false;
}
list_for_each_entry(src_iolink, &src_node->iolinks, listm) {
/* Go through list of iolinks to CPU and compare them */
if (!NODE_IS_GPU(src_iolink->node_to)) {
bool matched_iolink = false;
/* This is an iolink to a CPU */
pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id);
uint32_t dest_cpu_node_id;
dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id);
if (dest_cpu_node_id == INVALID_CPU_ID)
dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id);
if (dest_cpu_node_id == INVALID_CPU_ID) {
struct tp_iolink *dest_iolink;
list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
if (iolink_match(src_iolink, dest_iolink) &&
!maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) &&
!maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) {
if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id,
dest_iolink->node_to->id))
/* This is a critical error because we
* are out of memory
*/
return false;
matched_iolink = true;
break;
}
}
} else {
pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id,
dest_cpu_node_id);
/* Confirm that the link to this CPU is same or better */
struct tp_iolink *dest_iolink =
node_get_iolink_to_node_id(dest_node, src_iolink->type, dest_cpu_node_id);
if (dest_iolink && iolink_match(src_iolink, dest_iolink))
matched_iolink = true;
}
if (!matched_iolink) {
pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id,
dest_node->gpu_id);
return false;
}
} else {
/* If GPUs have P2P-PCIe iolinks to this GPU, then at least one CPU will
* also have a P2P-PCIe iolink to this GPU, so it seems that we do not need
* to consider P2P-PCIe iolinks from GPU to GPU for now. Once P2P-PCIe
* iolinks are exposed via p2p_links we may have to add additional code here
* to validate P2P-PCIe links between GPUs.
*/
}
}
pr_debug("[0x%04X -> 0x%04X] Map is possible\n", src_node->gpu_id, dest_node->gpu_id);
if (!maps_add_gpu_entry(new_maps, src_node->gpu_id, dest_node->gpu_id)) {
/* This is a critical error because we are out of memory */
return false;
}
maps_print(new_maps);
return true;
}
/**
* @brief Determines whether list of GPUs in src_nodes are mappable to dest_nodes
*
* This function will pick the first node from src_nodes and iterate through all the nodes in
* dest_nodes and call map_device to determine whether the node is mappable.
* If a node from dest_nodes is mappable to the first node from src_nodes:
* 1. This function will remove the first node from src_nodes and the node from dest_nodes
* 2. Push sub-mappings (new_maps) generated by map_device into existing mappings (maps)
* 3. Recursively check whether remaining nodes in src_nodes and dest_nodes are mappable.
*
* Once src_nodes is empty then we have successfully mapped all the nodes and maps contains a full
* list of GPU mappings.
*
* If there are no nodes in dest_nodes that can be mapped to the first node in src_nodes, then this
* means we cannot build a full mapping list with the current list of mappings. We backtrack by
* popping the newly generated sub-mappings(new_maps) from existing mappings (maps) and add the two
* nodes back to src_nodes and dest_nodes and return false. When this function returns false, the
* caller function will try a different path by trying to map the first node from src_nodes to the
* next node in dest_nodes.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_nodes list of source GPUs that need to be mapped
* @param dest_nodes list of destination GPUs that need to be mapped
* @param maps list of device maps based on current map path
* @return true if all nodes from src_nodes and dest_nodes are mappable
*/
static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_nodes,
struct list_head *dest_nodes, struct device_maps *maps)
{
struct tp_node *src_node, *dest_node, *dest_node_tmp;
struct device_maps new_maps;
/* Pick the first src node from the list of nodes and look for a dest node that is mappable.
* If we find a mappable destination node, then we add src node and dest node mapping to
* device_maps and recursively try to map the remaining nodes in the list.
* If there are no more src nodes in the list, then we have found a successful combination
* of src to dest nodes that are mappable.
*/
if (list_empty(src_nodes)) {
pr_debug("All nodes mapped successfully\n");
return true;
}
pr_debug("Mapping list src nodes [%s]\n", mapping_list_to_str(src_nodes));
pr_debug("Mapping list dest nodes [%s]\n", mapping_list_to_str(dest_nodes));
src_node = list_first_entry(src_nodes, struct tp_node, listm_mapping);
pr_debug("Looking for match for node 0x%04X\n", src_node->gpu_id);
list_del(&src_node->listm_mapping);
list_for_each_entry_safe(dest_node, dest_node_tmp, dest_nodes, listm_mapping) {
maps_init(&new_maps);
if (map_device(src_sys, dest_sys, src_node, dest_node, maps, &new_maps)) {
pr_debug("Matched destination node 0x%04X\n", dest_node->gpu_id);
/* src node and dest node are mappable, add device_maps generated by
* map_device to list of current valid device_maps, and recursively try to
* map remaining nodes in the list.
*/
list_del(&dest_node->listm_mapping);
if (maps_push(maps, &new_maps))
return false;
if (map_devices(src_sys, dest_sys, src_nodes, dest_nodes, maps)) {
pr_debug("Matched nodes 0x%04X and after\n", dest_node->gpu_id);
return true;
} else {
/* We could not map remaining nodes in the list. Add dest node back
* to the list and try to map the next dest node in the list to the current src
* node.
*/
pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, "
"adding list back\n",
src_node->gpu_id, dest_node->gpu_id);
list_add(&dest_node->listm_mapping, dest_nodes);
maps_pop(maps, &new_maps);
}
}
}
pr_debug("Failed to map nodes 0x%04X and after\n", src_node->gpu_id);
/* Either: We could not find a mappable dest node for current node, or we could not build a
* combination from the remaining nodes in the lists. Add src node back to the list and
* caller function will try next possible combination.
*/
list_add(&src_node->listm_mapping, src_nodes);
return false;
}
/**
* @brief Determines whether list of GPUs in src_xgmi_groups are mappable to list of GPUs in
* dest_xgmi_groups
*
* This function will pick the first XGMI group (hive) from src_xgmi_groups and iterate through the
* XGMI groups in dest_xgmi_groups. If the group in dest_xgmi_groups is mappable then this function
* will remove the hives from src_xgmi_groups and dest_xgmi_groups and recursively try to map the
* remaining hives in src_xgmi_groups and dest_xgmi_groups.
*
* If src_xgmi_groups is empty, then this means that we have successfully mapped all the XGMI hives
* and we have a full list of GPU mappings in maps.
*
* If we cannot find a hive inside dest_xgmi_groups that is mappable to the first hive from
* src_xgmi_groups, then this means that this path is not valid and we need to backtrack. We
* backtrack by adding the hives back into src_xgmi_groups and dest_xgmi_groups and returning false.
* The caller function will then try a different path by trying to map the first hive in
* src_xgmi_groups to the next hive in dest_xgmi_groups.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_xgmi_groups list of source XGMI hives that need to be mapped
* @param dest_xgmi_groups list of destination XGMI hives that need to be mapped
* @param maps list of device maps based on current map path
* @return true if all hives from src_xgmi_groups and dest_xgmi_groups are mappable
*/
bool match_xgmi_groups(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_xgmi_groups,
struct list_head *dest_xgmi_groups, struct device_maps *maps)
{
struct tp_p2pgroup *src_group;
struct tp_p2pgroup *dest_group;
struct tp_p2pgroup *dest_group_tmp;
if (list_empty(src_xgmi_groups)) {
pr_debug("All groups matched successfully\n");
return true;
}
/* Pick the first src XGMI group from the list. Then try to match src XGMI group with a
* dest XGMI group. If we have a dest XGMI group that is mappable, then we try to
* recursively map the next src XGMI group in the list, with remaining dest XGMI groups.
* If there are no more src XGMI groups in the list, then this means we have successfully
* mapped all the groups and we have a valid device_maps
*/
src_group = list_first_entry(src_xgmi_groups, struct tp_p2pgroup, listm_system);
pr_debug("Looking for match for group [%s]\n", p2pgroup_to_str(src_group));
list_del(&src_group->listm_system);
list_for_each_entry_safe(dest_group, dest_group_tmp, dest_xgmi_groups, listm_system) {
struct tp_node *node;
LIST_HEAD(src_nodes);
LIST_HEAD(dest_nodes);
if (src_group->num_nodes > dest_group->num_nodes)
continue;
pr_debug("Trying destination group [%s]\n", p2pgroup_to_str(dest_group));
list_for_each_entry(node, &src_group->nodes, listm_p2pgroup)
list_add_tail(&node->listm_mapping, &src_nodes);
list_for_each_entry(node, &dest_group->nodes, listm_p2pgroup)
list_add_tail(&node->listm_mapping, &dest_nodes);
/* map_devices will populate maps if successful */
if (map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
/* All the nodes in current src XGMI group are mappable with nodes in
* current dest XGMI group. Remove the current groups from the lists
* and recursively try to match remaining groups
*/
list_del(&dest_group->listm_system);
pr_debug("Matched destination group [%s]\n", p2pgroup_to_str(dest_group));
if (match_xgmi_groups(src_sys, dest_sys, src_xgmi_groups, dest_xgmi_groups, maps)) {
pr_debug("Matched subgroups of [%s]\n", p2pgroup_to_str(dest_group));
xfree(src_group);
xfree(dest_group);
return true;
} else {
/* We were not able to map the remaining XGMI groups so we add the
* current dest XGMI group back to the list of unmapped groups, and
* try to map current src XGMI group with the next dest XGMI in the
* list of XGMI groups
*/
list_add(&dest_group->listm_system, dest_xgmi_groups);
}
}
}
/* We have not found a mappable dest XGMI group. We discard this combination. If this is
* the first src XGMI group in the list, then it is not possible to match the XGMI groups.
* If this was a recursive call, then the calling instance of function will try the next
* combination of XGMI groups
*/
pr_debug("Failed to match groups [%s]\n", p2pgroup_to_str(src_group));
list_add_tail(&src_group->listm_system, src_xgmi_groups);
return false;
}
/**
* @brief Builds a list of GPU mappings from source topology to destination topology
*
* The topology on the destination system may not be identical to the topology on the source
* system, e.g. there can be GPUs with different device IDs, and they may be enumerated in a
* different order. This function builds a list of GPU mappings from the source topology to the
* destination topology and stores it in maps.
*
* The function will first validate all the iolinks and determine XGMI groups (hives) by calling the
* topology_determine_iolinks(). It will then try to match the GPUs that belong to XGMI hives and
* after that, match the remaining GPUs.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param maps list of device maps that was generated by this function
* @return 0 if we were able to build a full list of GPU mappings, negative error code otherwise
*/
int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, struct device_maps *maps)
{
struct tp_node *node;
int ret = 0;
int src_num_gpus = 0;
int dest_num_gpus = 0;
maps_init(maps);
ret = topology_determine_iolinks(src_sys);
if (ret) {
pr_err("Failed to determine iolinks from source (checkpointed) topology\n");
return ret;
}
topology_print(src_sys, "Source ");
ret = topology_determine_iolinks(dest_sys);
if (ret) {
pr_err("Failed to determine iolinks from destination (local) topology\n");
return ret;
}
topology_print(dest_sys, "Destination");
/* Make sure we have same number of GPUs in src and dest */
list_for_each_entry(node, &src_sys->nodes, listm_system) {
if (NODE_IS_GPU(node))
src_num_gpus++;
}
list_for_each_entry(node, &dest_sys->nodes, listm_system) {
if (NODE_IS_GPU(node))
dest_num_gpus++;
}
if (src_num_gpus != dest_num_gpus) {
pr_err("Number of devices mismatch (checkpointed:%d local:%d)\n", src_num_gpus, dest_num_gpus);
return -EINVAL;
}
if (src_sys->num_xgmi_groups > dest_sys->num_xgmi_groups) {
pr_err("Number of xgmi groups mismatch (checkpointed:%d local:%d)\n", src_sys->num_xgmi_groups,
dest_sys->num_xgmi_groups);
return -EINVAL;
}
/* First try to match the XGMI hives */
if (src_sys->num_xgmi_groups) {
if (!match_xgmi_groups(src_sys, dest_sys, &src_sys->xgmi_groups, &dest_sys->xgmi_groups, maps)) {
pr_err("Failed to match all GPU groups\n");
return -EINVAL;
}
pr_info("Current maps after XGMI groups matched\n");
maps_print(maps);
}
/* We matched all the XGMI hives, now match remaining GPUs */
LIST_HEAD(src_nodes);
LIST_HEAD(dest_nodes);
list_for_each_entry(node, &src_sys->nodes, listm_system) {
if (NODE_IS_GPU(node) && !maps_get_dest_gpu(maps, node->gpu_id))
list_add(&node->listm_mapping, &src_nodes);
}
list_for_each_entry(node, &dest_sys->nodes, listm_system) {
if (NODE_IS_GPU(node) && !maps_dest_gpu_mapped(maps, node->gpu_id))
list_add(&node->listm_mapping, &dest_nodes);
}
if (!map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
pr_err("Failed to match remaining nodes\n");
return -EINVAL;
}
pr_info("Maps after all nodes matched\n");
maps_print(maps);
return ret;
}

View File

@@ -107,12 +107,21 @@ int topology_parse(struct tp_system *topology, const char *msg);
int topology_determine_iolinks(struct tp_system *sys);
void topology_print(const struct tp_system *sys, const char *msg);
struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id);
struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id);
struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id);
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id);
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
int node_get_drm_render_device(struct tp_node *node);
void sys_close_drm_render_devices(struct tp_system *sys);
int set_restore_gpu_maps(struct tp_system *tp_checkpoint, struct tp_system *tp_local, struct device_maps *maps);
uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id);
void maps_init(struct device_maps *maps);
void maps_free(struct device_maps *maps);

View File

@@ -61,5 +61,5 @@ message criu_kfd {
}
message criu_render_node {
required uint32 minor_number = 1;
required uint32 gpu_id = 1;
}