diff --git a/Documentation/amdgpu_plugin.txt b/Documentation/amdgpu_plugin.txt
index 4b731cf9a..8ba602cce 100644
--- a/Documentation/amdgpu_plugin.txt
+++ b/Documentation/amdgpu_plugin.txt
@@ -9,8 +9,8 @@ userspace for AMD GPUs.
 
 CURRENT SUPPORT
 ---------------
-Single GPU systems (Gfx9)
-Checkpoint / Restore on same system
+Single and Multi GPU systems (Gfx9)
+Checkpoint / Restore on a different system
 Checkpoint / Restore inside a docker container
 Pytorch
 
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 9c883d31a..235f839f8 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -53,6 +53,8 @@ struct vma_metadata {
 	uint64_t old_pgoff;
 	uint64_t new_pgoff;
 	uint64_t vma_entry;
+	uint32_t new_minor;
+	int fd;
 };
 
 /************************************ Global Variables ********************************************/
@@ -257,7 +259,9 @@ int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceE
 		devinfo->node_id = node->id;
 
 		if (NODE_IS_GPU(node)) {
-			devinfo->gpu_id = node->gpu_id;
+			devinfo->gpu_id = maps_get_dest_gpu(maps, node->gpu_id);
+			if (!devinfo->gpu_id)
+				return -EINVAL;
 
 			devinfo->simd_count = node->simd_count;
 			devinfo->mem_banks_count = node->mem_banks_count;
@@ -378,6 +382,8 @@ int amdgpu_plugin_init(int stage)
 
 	topology_init(&src_topology);
 	topology_init(&dest_topology);
+	maps_init(&checkpoint_maps);
+	maps_init(&restore_maps);
 
 	return 0;
 }
@@ -386,6 +392,9 @@ void amdgpu_plugin_fini(int stage, int ret)
 {
 	pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name);
 
+	maps_free(&checkpoint_maps);
+	maps_free(&restore_maps);
+
 	topology_free(&src_topology);
 	topology_free(&dest_topology);
 }
@@ -455,6 +464,15 @@ static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_cri
 
 	pr_debug("Dumping %d devices\n", args->num_devices);
 
+	/* When checkpointing on a node where there was already a checkpoint-restore before, the
+	 * user_gpu_id and actual_gpu_id will be different.
+	 *
+	 * We store the user_gpu_id in the image files so that the images always carry the
+	 * gpu_ids of the node where the application was first launched.
+	 */
+	for (int i = 0; i < args->num_devices; i++)
+		maps_add_gpu_entry(&checkpoint_maps, device_buckets[i].actual_gpu_id, device_buckets[i].user_gpu_id);
+
 	e->num_of_gpus = args->num_devices;
 	e->num_of_cpus = src_topology.num_nodes - args->num_devices;
 
@@ -638,11 +656,21 @@ int amdgpu_plugin_dump_file(int fd, int id)
 		 */
 		CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
+		struct tp_node *tp_node;
 
 		pr_info("amdgpu_plugin: Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n",
 			minor(st.st_rdev), fd, id);
 
-		rd.minor_number = minor(st.st_rdev);
+		tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
+		if (!tp_node) {
+			pr_err("amdgpu_plugin: Failed to find a device with minor number = %d\n", minor(st.st_rdev));
+
+			return -ENODEV;
+		}
+
+		rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
+		if (!rd.gpu_id)
+			return -ENODEV;
 
 		len = criu_render_node__get_packed_size(&rd);
 		buf = xmalloc(len);
@@ -787,6 +815,7 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 	for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) {
 		struct kfd_criu_device_bucket *device_bucket;
 		DeviceEntry *devinfo = e->device_entries[entries_i];
+		struct tp_node *tp_node;
 
 		if (!devinfo->gpu_id)
 			continue;
@@ -794,8 +823,19 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 		device_bucket = &device_buckets[bucket_index++];
 
 		device_bucket->user_gpu_id = devinfo->gpu_id;
+		device_bucket->actual_gpu_id = maps_get_dest_gpu(&restore_maps, devinfo->gpu_id);
+		if (!device_bucket->actual_gpu_id) {
+			ret = -ENODEV;
+			goto exit;
+		}
 
-		device_bucket->drm_fd = open_drm_render_device(bucket_index + DRM_FIRST_RENDER_NODE);
+		tp_node = sys_get_node_by_gpu_id(&dest_topology, device_bucket->actual_gpu_id);
+		if (!tp_node) {
+			ret = -ENODEV;
+			goto exit;
+		}
+
+		device_bucket->drm_fd = node_get_drm_render_device(tp_node);
 		if (device_bucket->drm_fd < 0) {
 			pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver");
 			goto exit;
@@ -842,25 +882,43 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 
 static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
 {
-	int mem_fd = -1;
+	int mem_fd = -1, ret = 0;
 
 	for (int i = 0; i < e->num_of_bos; i++) {
 		void *addr;
 		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
+		struct tp_node *tp_node;
 		BoEntry *bo_entry = e->bo_entries[i];
 
 		if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT |
 					      KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
 			struct vma_metadata *vma_md;
+			uint32_t target_gpu_id; /* actual gpu_id where the BO will be restored */
 
 			vma_md = xmalloc(sizeof(*vma_md));
-			if (!vma_md)
-				return -ENOMEM;
+			if (!vma_md) {
+				ret = -ENOMEM;
+				goto exit;
+			}
+
+			memset(vma_md, 0, sizeof(*vma_md));
 
 			vma_md->old_pgoff = bo_bucket->offset;
 			vma_md->vma_entry = bo_bucket->addr;
+
+			target_gpu_id = maps_get_dest_gpu(&restore_maps, bo_bucket->gpu_id);
+
+			tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
+			if (!tp_node) {
+				pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
+				ret = -ENODEV;
+				goto exit;
+			}
+
+			vma_md->new_minor = tp_node->drm_render_minor;
 			vma_md->new_pgoff = bo_bucket->restored_offset;
+			vma_md->fd = node_get_drm_render_device(tp_node);
 
 			plugin_log_msg("amdgpu_plugin: adding vma_entry:addr:0x%lx old-off:0x%lx "
 				       "new_off:0x%lx new_minor:%d\n",
@@ -948,7 +1005,7 @@ exit:
 	if (mem_fd > 0)
 		close(mem_fd);
 
-	return 0;
+	return ret;
 }
 
 int amdgpu_plugin_restore_file(int id)
@@ -966,15 +1023,16 @@ int amdgpu_plugin_restore_file(int id)
 	snprintf(img_path, sizeof(img_path), "amdgpu-kfd-%d.img", id);
 
 	if (stat(img_path, &filestat) == -1) {
+		struct tp_node *tp_node;
+		uint32_t target_gpu_id;
+
 		pr_perror("open(%s)", img_path);
 
-		/* This is restorer plugin for renderD nodes. Since criu doesn't
-		 * gurantee that they will be called before the plugin is called
-		 * for kfd file descriptor, we need to make sure we open the render
-		 * nodes only once and before /dev/kfd is open, the render nodes
-		 * are open too. Generally, it is seen that during checkpoint and
-		 * restore both, the kfd plugin gets called first.
+		/* This is the restorer plugin for renderD nodes. CRIU doesn't guarantee that they
+		 * will be called before the plugin is called for the kfd file descriptor.
+		 * TODO: Currently, this code will only work if this function is called for /dev/kfd
+		 * first, as we assume restore_maps is already filled. Need to fix this later.
 		 */
-		snprintf(img_path, sizeof(img_path), "amdgpu-renderD-%d.img", id);
+		snprintf(img_path, sizeof(img_path), "renderDXXX.%d.img", id);
 
 		if (stat(img_path, &filestat) == -1) {
 			pr_perror("Failed to read file stats");
@@ -1001,8 +1059,26 @@ int amdgpu_plugin_restore_file(int id)
 			return -1;
 		}
 
-		pr_info("amdgpu_plugin: render node minor num = %d\n", rd->minor_number);
-		fd = open_drm_render_device(rd->minor_number);
+		pr_info("amdgpu_plugin: render node gpu_id = 0x%04x\n", rd->gpu_id);
+
+		target_gpu_id = maps_get_dest_gpu(&restore_maps, rd->gpu_id);
+		if (!target_gpu_id) {
+			fd = -ENODEV;
+			goto fail;
+		}
+
+		tp_node = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
+		if (!tp_node) {
+			fd = -ENODEV;
+			goto fail;
+		}
+
+		pr_info("amdgpu_plugin: render node destination gpu_id = 0x%04x\n", tp_node->gpu_id);
+
+		fd = node_get_drm_render_device(tp_node);
+		if (fd < 0)
+			pr_err("amdgpu_plugin: Failed to open render device (minor:%d)\n", tp_node->drm_render_minor);
+fail:
 		criu_render_node__free_unpacked(rd, NULL);
 		xfree(buf);
 		return fd;
@@ -1054,6 +1130,12 @@ int amdgpu_plugin_restore_file(int id)
 		goto exit;
 	}
 
+	ret = set_restore_gpu_maps(&src_topology, &dest_topology, &restore_maps);
+	if (ret) {
+		pr_err("Failed to map GPUs\n");
+		goto exit;
+	}
+
 	ret = restore_devices(&args, e);
 	if (ret)
 		goto exit;
@@ -1078,6 +1160,8 @@ int amdgpu_plugin_restore_file(int id)
 		goto exit;
 
 exit:
+	sys_close_drm_render_devices(&dest_topology);
+
 	if (e)
 		criu_kfd__free_unpacked(e, NULL);
 
@@ -1100,25 +1184,55 @@ CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESTORE_EXT_FILE, amdgpu_plugin_restore_
  * return -1 for error.
  * return 1 if vmap map must be adjusted.
  */
-int amdgpu_plugin_update_vmamap(const char *path, const uint64_t addr, const uint64_t old_offset, uint64_t *new_offset,
-				int *updated_fd)
+int amdgpu_plugin_update_vmamap(const char *in_path, const uint64_t addr, const uint64_t old_offset,
+				uint64_t *new_offset, int *updated_fd)
 {
 	struct vma_metadata *vma_md;
+	char path[PATH_MAX];
+	char *p_begin;
+	char *p_end;
+	bool is_kfd = false, is_renderD = false;
 
 	plugin_log_msg("amdgpu_plugin: Enter %s\n", __func__);
 
+	strncpy(path, in_path, sizeof(path));
+
+	p_begin = path;
+	p_end = p_begin + strlen(path);
+
 	/*
-	 * On newer versions of AMD KFD driver, only the file descriptor that was used to open the
-	 * device can be used for mmap, so we will have to return the proper file descriptor here
+	 * Paths sometimes have double forward slashes (e.g. //dev/dri/renderD*);
+	 * replace all '//' with '/'.
 	 */
-	*updated_fd = -1;
+	while (p_begin < p_end - 1) {
+		if (*p_begin == '/' && *(p_begin + 1) == '/')
+			memmove(p_begin, p_begin + 1, p_end - p_begin);
+		else
+			p_begin++;
+	}
+
+	if (!strncmp(path, "/dev/dri/renderD", strlen("/dev/dri/renderD")))
+		is_renderD = true;
+
+	if (!strcmp(path, AMDGPU_KFD_DEVICE))
+		is_kfd = true;
+
+	if (!is_renderD && !is_kfd) {
+		pr_info("Skipping unsupported path:%s addr:%lx old_offset:%lx\n", in_path, addr, old_offset);
+		return 0;
+	}
 
 	list_for_each_entry(vma_md, &update_vma_info_list, list) {
 		if (addr == vma_md->vma_entry && old_offset == vma_md->old_pgoff) {
 			*new_offset = vma_md->new_pgoff;
 
-			plugin_log_msg("amdgpu_plugin: old_pgoff= 0x%lx new_pgoff = 0x%lx path = %s\n",
-				       vma_md->old_pgoff, vma_md->new_pgoff, path);
+			if (is_renderD)
+				*updated_fd = vma_md->fd;
+			else
+				*updated_fd = -1;
+
+			plugin_log_msg("amdgpu_plugin: old_pgoff=0x%lx new_pgoff=0x%lx fd=%d\n", vma_md->old_pgoff,
+				       vma_md->new_pgoff, *updated_fd);
 
 			return 1;
 		}
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c
index 5ef98010f..b7a618631 100644
--- a/plugins/amdgpu/amdgpu_plugin_topology.c
+++ b/plugins/amdgpu/amdgpu_plugin_topology.c
@@ -32,6 +32,31 @@
 }
 #endif
 
+static int open_drm_render_device(int minor)
+{
+	char path[128];
+	int fd;
+
+	if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
+		pr_err("DRM render minor %d out of range [%d, %d]\n", minor, DRM_FIRST_RENDER_NODE,
+		       DRM_LAST_RENDER_NODE);
+		return -EINVAL;
+	}
+
+	snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);
+	fd = open(path, O_RDWR | O_CLOEXEC);
+	if (fd < 0) {
+		if (errno != ENOENT && errno != EPERM) {
+			pr_err("Failed to open %s: %s\n", path, strerror(errno));
+			if (errno == EACCES)
+				pr_err("Check user is in \"video\" group\n");
+		}
+		return -EBADFD;
+	}
+
+	return fd;
+}
+
 static const char *link_type(uint32_t type)
 {
 	switch (type) {
@@ -54,6 +79,38 @@ static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *gro
 	return NULL;
 }
 
+int node_get_drm_render_device(struct tp_node *node)
+{
+	if (node->drm_fd < 0)
+		node->drm_fd = open_drm_render_device(node->drm_render_minor);
+
+	return node->drm_fd;
+}
+
+void sys_close_drm_render_devices(struct tp_system *sys)
+{
+	struct tp_node *node;
+
+	list_for_each_entry(node, &sys->nodes, listm_system) {
+		if (node->drm_fd >= 0) {
+			close(node->drm_fd);
+			node->drm_fd = -1;
+		}
+	}
+}
+
+static struct tp_iolink *node_get_iolink_to_node_id(const struct tp_node *node, const uint32_t type,
+						    const uint32_t node_id)
+{
+	struct tp_iolink *iolink;
+
+	list_for_each_entry(iolink, &node->iolinks, listm) {
+		if (iolink->node_to_id == node_id && iolink->type == type)
+			return iolink;
+	}
+	return NULL;
+}
+
 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor)
 {
 	struct tp_node *node;
@@ -114,6 +171,167 @@ static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, con
 	return NULL;
 }
 
+static bool maps_dest_cpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
+{
+	struct id_map *id_map;
+
+	list_for_each_entry(id_map, &maps->cpu_maps, listm) {
+		if (id_map->dest == dest_id)
+			return true;
+	}
+	return false;
+}
+
+static uint32_t maps_get_dest_cpu(const struct device_maps *maps, const uint32_t src_id)
+{
+	struct id_map *id_map;
+
+	list_for_each_entry(id_map, &maps->cpu_maps, listm) {
+		if (id_map->src == src_id)
+			return id_map->dest;
+	}
+	return INVALID_CPU_ID;
+}
+
+bool maps_dest_gpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
+{
+	struct id_map *id_map;
+
+	list_for_each_entry(id_map, &maps->gpu_maps, listm) {
+		if (id_map->dest == dest_id)
+			return true;
+	}
+	return false;
+}
+
+uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id)
+{
+	struct id_map *id_map;
+
+	list_for_each_entry(id_map, &maps->gpu_maps, listm) {
+		if (id_map->src == src_id)
+			return id_map->dest;
+	}
+	return 0;
+}
+
+static struct id_map *maps_add_cpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
+{
+	struct id_map *id_map = xzalloc(sizeof(*id_map));
+
+	if (!id_map) {
+		pr_err("Failed to allocate memory for id_map\n");
+		return NULL;
+	}
+
+	id_map->src = src_id;
+	id_map->dest = dest_id;
+
+	list_add_tail(&id_map->listm, &maps->cpu_maps);
+
+	maps->tail_cpu = &id_map->listm;
+
+	pr_debug("Added CPU mapping [%02d -> %02d]\n", src_id, dest_id);
+	return id_map;
+}
+
+struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
+{
+	struct id_map *id_map = xzalloc(sizeof(*id_map));
+
+	if (!id_map) {
+		pr_err("Failed to allocate memory for id_map\n");
+		return NULL;
+	}
+
+	id_map->src = src_id;
+	id_map->dest = dest_id;
+
+	list_add_tail(&id_map->listm, &maps->gpu_maps);
+
+	maps->tail_gpu = &id_map->listm;
+
+	pr_debug("Added GPU mapping [0x%04X -> 0x%04X]\n", src_id, dest_id);
+	return id_map;
+}
+
+static void maps_print(struct device_maps *maps)
+{
+	struct id_map *id_map;
+
+	pr_info("===Maps===============\n");
+	list_for_each_entry(id_map, &maps->gpu_maps, listm)
+		pr_info("GPU: 0x%04X -> 0x%04X\n", id_map->src, id_map->dest);
+
+	list_for_each_entry(id_map, &maps->cpu_maps, listm)
+		pr_info("CPU: %02d -> %02d\n", id_map->src, id_map->dest);
+	pr_info("======================\n");
+}
+
+void maps_init(struct device_maps *maps)
+{
+	INIT_LIST_HEAD(&maps->cpu_maps);
+	INIT_LIST_HEAD(&maps->gpu_maps);
+	maps->tail_cpu = NULL;
+	maps->tail_gpu = NULL;
+}
+
+void maps_free(struct device_maps *maps)
+{
+	while (!list_empty(&maps->cpu_maps)) {
+		struct id_map *map = list_first_entry(&maps->cpu_maps, struct id_map, listm);
+
+		list_del(&map->listm);
+		xfree(map);
+	}
+	while (!list_empty(&maps->gpu_maps)) {
+		struct id_map *map = list_first_entry(&maps->gpu_maps, struct id_map, listm);
+
+		list_del(&map->listm);
+		xfree(map);
+	}
+}
+
+static void maps_pop(struct device_maps *maps, struct device_maps *remove)
+{
+	if (remove->tail_cpu)
+		list_cut_position(&remove->cpu_maps, &maps->cpu_maps, remove->tail_cpu);
+
+	if (remove->tail_gpu)
+		list_cut_position(&remove->gpu_maps, &maps->gpu_maps, remove->tail_gpu);
+
+	maps_free(remove);
+}
+
+static int maps_push(struct device_maps *maps, struct device_maps *new)
+{
+	struct id_map *src_id_map, *dest_id_map;
+
+	list_for_each_entry(src_id_map, &new->cpu_maps, listm) {
+		list_for_each_entry(dest_id_map, &maps->cpu_maps, listm) {
+			if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
+				pr_err("CPU mapping already exists: new [%02d -> %02d] existing [%02d -> %02d]\n",
				       src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
+				return -EINVAL;
+			}
+		}
+	}
+	list_for_each_entry(src_id_map, &new->gpu_maps, listm) {
+		list_for_each_entry(dest_id_map, &maps->gpu_maps, listm) {
+			if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
+				pr_err("GPU mapping already exists: new [0x%04X -> 0x%04X] existing [0x%04X -> 0x%04X]\n",
+				       src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
+				return -EINVAL;
+			}
+		}
+	}
+
+	list_splice(&new->cpu_maps, &maps->cpu_maps);
+	list_splice(&new->gpu_maps, &maps->gpu_maps);
+
+	return 0;
+}
+
 struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id)
 {
 	struct tp_iolink *iolink = xzalloc(sizeof(*iolink));
@@ -475,6 +693,19 @@ static const char *p2pgroup_to_str(struct tp_p2pgroup *group)
 	return topology_printstr;
 }
 
+static const char *mapping_list_to_str(struct list_head *node_list)
+{
+	static char topology_printstr[200];
+	struct tp_node *node;
+	size_t str_len = 0;
+
+	topology_printstr[0] = '\0';
+	list_for_each_entry(node, node_list, listm_mapping)
+		str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id);
+
+	return topology_printstr;
+}
+
 void topology_print(const struct tp_system *sys, const char *message)
 {
 	struct tp_node *node;
@@ -717,4 +948,449 @@ int topology_parse(struct tp_system *sys, const char *message)
 fail:
 	topology_free(sys);
 	return ret;
-}
\ No newline at end of file
+}
+
+static bool device_properties_match(struct tp_node *src, struct tp_node *dest)
+{
+	if (src->simd_count == dest->simd_count && src->mem_banks_count == dest->mem_banks_count &&
+	    src->io_links_count == dest->io_links_count && src->max_waves_per_simd == dest->max_waves_per_simd &&
+	    src->lds_size_in_kb == dest->lds_size_in_kb && src->wave_front_size == dest->wave_front_size &&
+	    src->array_count == dest->array_count && src->simd_arrays_per_engine == dest->simd_arrays_per_engine &&
+	    src->cu_per_simd_array == dest->cu_per_simd_array && src->simd_per_cu == dest->simd_per_cu &&
+	    src->max_slots_scratch_cu == dest->max_slots_scratch_cu && src->vendor_id == dest->vendor_id &&
+	    src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines &&
+	    src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines &&
+	    src->num_sdma_queues_per_engine == dest->num_sdma_queues_per_engine &&
+	    src->num_cp_queues == dest->num_cp_queues && src->capability == dest->capability &&
+	    src->vram_public == dest->vram_public && src->vram_size <= dest->vram_size &&
+	    src->num_gws <= dest->num_gws && src->caches_count <= dest->caches_count &&
+	    src->fw_version <= dest->fw_version && src->sdma_fw_version <= dest->sdma_fw_version) {
+		return true;
+	}
+	return false;
+}
+
+/**
+ * @brief Determines whether iolink dest can be used to replace src
+ *
+ * @param src source iolink
+ * @param dest destination iolink
+ * @return true if dest can replace src
+ */
+static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest)
+{
+	if (!src->valid)
+		return true;
+
+	if (!dest->valid)
+		return false;
+
+	if (NODE_IS_GPU(src->node_to) != NODE_IS_GPU(dest->node_to))
+		return false;
+
+	/* An XGMI link can replace a PCIe link, but a PCIe link cannot replace an XGMI link */
+	if (src->type == TOPO_IOLINK_TYPE_XGMI && dest->type == TOPO_IOLINK_TYPE_PCIE)
+		return false;
+
+	/* A bi-directional link can replace a uni-directional link, but not the other way around */
+	if (src->peer != NULL && dest->peer == NULL)
+		return false;
+
+	return true;
+}
+
+/**
+ * @brief Determines whether src_node can be mapped to dest_node
+ *
+ * Node compatibility is determined by:
+ * 1. Comparing the node properties
+ * 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps
+ *
+ * If src_node and dest_node are mappable, then map_device will push the new mapping
+ * for src_node -> dest_node into new_maps.
+ * @param src_sys system topology information on source system
+ * @param dest_sys system topology information on destination system
+ * @param src_node source GPU
+ * @param dest_node destination GPU
+ * @param maps list of existing device maps
+ * @param new_maps if nodes are mappable, then GPU and CPU mappings will be added to this list
+ * @return true if src_node and dest_node are mappable
+ */
+static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, struct tp_node *src_node,
+		       struct tp_node *dest_node, struct device_maps *maps, struct device_maps *new_maps)
+{
+	struct tp_iolink *src_iolink;
+
+	pr_debug("Evaluating mapping nodes [0x%04X -> 0x%04X]\n", src_node->gpu_id, dest_node->gpu_id);
+
+	/* Compare GPU properties from /sys/class/kfd/kfd/topology/nodes/N/properties */
+	if (!device_properties_match(src_node, dest_node)) {
+		pr_debug("[0x%04X -> 0x%04X] Device properties do not match\n", src_node->gpu_id, dest_node->gpu_id);
+		return false;
+	}
+
+	if (src_node->num_valid_iolinks > dest_node->num_valid_iolinks) {
+		pr_debug("[0x%04X -> 0x%04X] Mismatch between number of iolinks\n", src_node->gpu_id,
+			 dest_node->gpu_id);
+		return false;
+	}
+
+	list_for_each_entry(src_iolink, &src_node->iolinks, listm) {
+		/* Go through the list of iolinks to CPUs and compare them */
+
+		if (!NODE_IS_GPU(src_iolink->node_to)) {
+			bool matched_iolink = false;
+			uint32_t dest_cpu_node_id;
+
+			/* This is an iolink to a CPU */
+			pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id);
+
+			dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id);
+			if (dest_cpu_node_id == INVALID_CPU_ID)
+				dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id);
+
+			if (dest_cpu_node_id == INVALID_CPU_ID) {
+				struct tp_iolink *dest_iolink;
+
+				list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
+					if (iolink_match(src_iolink, dest_iolink) &&
+					    !maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) &&
+					    !maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) {
+						if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id,
+									dest_iolink->node_to->id))
+							/* This is a critical error because we
+							 * are out of memory
+							 */
+							return false;
+
+						matched_iolink = true;
+						break;
+					}
+				}
+			} else {
+				struct tp_iolink *dest_iolink;
+
+				pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id,
+					 dest_cpu_node_id);
+
+				/* Confirm that the link to this CPU is the same or better */
+				dest_iolink = node_get_iolink_to_node_id(dest_node, src_iolink->type,
+									 dest_cpu_node_id);
+
+				if (dest_iolink && iolink_match(src_iolink, dest_iolink))
+					matched_iolink = true;
+			}
+			if (!matched_iolink) {
+				pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id,
+					 dest_node->gpu_id);
+
+				return false;
+			}
+		} else {
+			/* If GPUs have P2P-PCIe iolinks to this GPU, then at least one CPU will
+			 * also have a P2P-PCIe iolink to this GPU, so it seems that we do not need
+			 * to consider P2P-PCIe iolinks from GPU to GPU for now. Once P2P-PCIe
+			 * iolinks are exposed via p2p_links we may have to add additional code here
+			 * to validate P2P-PCIe links between GPUs.
+			 */
+		}
+	}
+	pr_debug("[0x%04X -> 0x%04X] Map is possible\n", src_node->gpu_id, dest_node->gpu_id);
+
+	if (!maps_add_gpu_entry(new_maps, src_node->gpu_id, dest_node->gpu_id)) {
+		/* This is a critical error because we are out of memory */
+		return false;
+	}
+	maps_print(new_maps);
+	return true;
+}
+
+/**
+ * @brief Determines whether the list of GPUs in src_nodes is mappable to dest_nodes
+ *
+ * This function will pick the first node from src_nodes and iterate through all the nodes in
+ * dest_nodes and call map_device to determine whether the node is mappable.
+ * If a node from dest_nodes is mappable to the first node from src_nodes:
+ * 1. This function will remove the first node from src_nodes and the node from dest_nodes
+ * 2. Push the sub-mappings (new_maps) generated by map_device into the existing mappings (maps)
+ * 3. Recursively check whether the remaining nodes in src_nodes and dest_nodes are mappable
+ *
+ * Once src_nodes is empty, we have successfully mapped all the nodes and maps contains a full
+ * list of GPU mappings.
+ *
+ * If there are no nodes in dest_nodes that can be mapped to the first node in src_nodes, then this
+ * means we cannot build a full mapping list with the current list of mappings. We backtrack by
+ * popping the newly generated sub-mappings (new_maps) from the existing mappings (maps), add the
+ * two nodes back to src_nodes and dest_nodes and return false. When this function returns false,
+ * the caller will try a different path by trying to map the first node from src_nodes to the
+ * next node in dest_nodes.
+ *
+ * @param src_sys system topology information on source system
+ * @param dest_sys system topology information on destination system
+ * @param src_nodes list of source GPUs that need to be mapped
+ * @param dest_nodes list of destination GPUs that need to be mapped
+ * @param maps list of device maps based on current map path
+ * @return true if all nodes from src_nodes and dest_nodes are mappable
+ */
+static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_nodes,
+			struct list_head *dest_nodes, struct device_maps *maps)
+{
+	struct tp_node *src_node, *dest_node, *dest_node_tmp;
+	struct device_maps new_maps;
+
+	/* Pick the first src node from the list of nodes and look for a dest node that is mappable.
+	 * If we find a mappable destination node, then we add the src node and dest node mapping to
+	 * device_maps and recursively try to map the remaining nodes in the list.
+	 * If there are no more src nodes in the list, then we have found a successful combination
+	 * of src to dest nodes that are mappable.
+	 */
+	if (list_empty(src_nodes)) {
+		pr_debug("All nodes mapped successfully\n");
+		return true;
+	}
+
+	pr_debug("Mapping list src nodes [%s]\n", mapping_list_to_str(src_nodes));
+	pr_debug("Mapping list dest nodes [%s]\n", mapping_list_to_str(dest_nodes));
+
+	src_node = list_first_entry(src_nodes, struct tp_node, listm_mapping);
+	pr_debug("Looking for match for node 0x%04X\n", src_node->gpu_id);
+
+	list_del(&src_node->listm_mapping);
+
+	list_for_each_entry_safe(dest_node, dest_node_tmp, dest_nodes, listm_mapping) {
+		maps_init(&new_maps);
+		if (map_device(src_sys, dest_sys, src_node, dest_node, maps, &new_maps)) {
+			pr_debug("Matched destination node 0x%04X\n", dest_node->gpu_id);
+
+			/* src node and dest node are mappable; add the device_maps generated by
+			 * map_device to the list of current valid device_maps, and recursively
+			 * try to map the remaining nodes in the list.
+			 */
+			list_del(&dest_node->listm_mapping);
+			if (maps_push(maps, &new_maps))
+				return false;
+
+			if (map_devices(src_sys, dest_sys, src_nodes, dest_nodes, maps)) {
+				pr_debug("Matched nodes 0x%04X and after\n", dest_node->gpu_id);
+				return true;
+			} else {
+				/* We could not map the remaining nodes in the list. Add the dest
+				 * node back to the list and try to map the next dest node in the
+				 * list to the current src node.
+				 */
+				pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, "
+					 "adding list back\n",
+					 src_node->gpu_id, dest_node->gpu_id);
+
+				list_add(&dest_node->listm_mapping, dest_nodes);
+				maps_pop(maps, &new_maps);
+			}
+		}
+	}
+	pr_debug("Failed to map nodes 0x%04X and after\n", src_node->gpu_id);
+
+	/* Either we could not find a mappable dest node for the current node, or we could not
+	 * build a combination from the remaining nodes in the lists. Add the src node back to the
+	 * list; the caller will try the next possible combination.
+	 */
+	list_add(&src_node->listm_mapping, src_nodes);
+
+	return false;
+}
+
+/**
+ * @brief Determines whether the list of GPUs in src_xgmi_groups is mappable to the list of GPUs
+ * in dest_xgmi_groups
+ *
+ * This function will pick the first XGMI group (hive) from src_xgmi_groups and iterate through the
+ * XGMI groups in dest_xgmi_groups. If the group in dest_xgmi_groups is mappable, then this function
+ * will remove the hives from src_xgmi_groups and dest_xgmi_groups and recursively try to map the
+ * remaining hives in src_xgmi_groups and dest_xgmi_groups.
+ *
+ * If src_xgmi_groups is empty, then this means that we have successfully mapped all the XGMI hives
+ * and we have a full list of GPU mappings in maps.
+ *
+ * If we cannot find a hive inside dest_xgmi_groups that is mappable to the first hive from
+ * src_xgmi_groups, then this means that this path is not valid and we need to backtrack. We
+ * backtrack by adding the hives back into src_xgmi_groups and dest_xgmi_groups and returning false.
+ * The caller will then try a different path by trying to map the first hive in
+ * src_xgmi_groups to the next hive in dest_xgmi_groups.
+ *
+ * @param src_sys system topology information on source system
+ * @param dest_sys system topology information on destination system
+ * @param src_xgmi_groups list of source XGMI hives that need to be mapped
+ * @param dest_xgmi_groups list of destination XGMI hives that need to be mapped
+ * @param maps list of device maps based on current map path
+ * @return true if all hives from src_xgmi_groups and dest_xgmi_groups are mappable
+ */
+bool match_xgmi_groups(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_xgmi_groups,
+		       struct list_head *dest_xgmi_groups, struct device_maps *maps)
+{
+	struct tp_p2pgroup *src_group;
+	struct tp_p2pgroup *dest_group;
+	struct tp_p2pgroup *dest_group_tmp;
+
+	if (list_empty(src_xgmi_groups)) {
+		pr_debug("All groups matched successfully\n");
+		return true;
+	}
+
+	/* Pick the first src XGMI group from the list. Then try to match the src XGMI group with a
+	 * dest XGMI group. If we have a dest XGMI group that is mappable, then we try to
+	 * recursively map the next src XGMI group in the list with the remaining dest XGMI groups.
+	 * If there are no more src XGMI groups in the list, then this means we have successfully
+	 * mapped all the groups and we have a valid device_maps.
+	 */
+	src_group = list_first_entry(src_xgmi_groups, struct tp_p2pgroup, listm_system);
+	pr_debug("Looking for match for group [%s]\n", p2pgroup_to_str(src_group));
+
+	list_del(&src_group->listm_system);
+
+	list_for_each_entry_safe(dest_group, dest_group_tmp, dest_xgmi_groups, listm_system) {
+		struct tp_node *node;
+
+		LIST_HEAD(src_nodes);
+		LIST_HEAD(dest_nodes);
+
+		if (src_group->num_nodes > dest_group->num_nodes)
+			continue;
+
+		pr_debug("Trying destination group [%s]\n", p2pgroup_to_str(dest_group));
+
+		list_for_each_entry(node, &src_group->nodes, listm_p2pgroup)
+			list_add_tail(&node->listm_mapping, &src_nodes);
+
+		list_for_each_entry(node, &dest_group->nodes, listm_p2pgroup)
+			list_add_tail(&node->listm_mapping, &dest_nodes);
+
+		/* map_devices will populate maps if successful */
+		if (map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
+			/* All the nodes in the current src XGMI group are mappable to nodes in the
+			 * current dest XGMI group. Remove the current groups from the lists
+			 * and recursively try to match the remaining groups.
+			 */
+			list_del(&dest_group->listm_system);
+			pr_debug("Matched destination group [%s]\n", p2pgroup_to_str(dest_group));
+			if (match_xgmi_groups(src_sys, dest_sys, src_xgmi_groups, dest_xgmi_groups, maps)) {
+				pr_debug("Matched subgroups of [%s]\n", p2pgroup_to_str(dest_group));
+
+				xfree(src_group);
+				xfree(dest_group);
+				return true;
+			} else {
+				/* We were not able to map the remaining XGMI groups, so we add the
+				 * current dest XGMI group back to the list of unmapped groups and
+				 * try to map the current src XGMI group with the next dest XGMI
+				 * group in the list.
+				 */
+				list_add(&dest_group->listm_system, dest_xgmi_groups);
+			}
+		}
+	}
+
+	/* We have not found a mappable dest XGMI group, so we discard this combination. If this is
+	 * the first src XGMI group in the list, then it is not possible to match the XGMI groups.
+	 * If this was a recursive call, then the calling instance of this function will try the
+	 * next combination of XGMI groups.
+	 */
+	pr_debug("Failed to match groups [%s]\n", p2pgroup_to_str(src_group));
+	list_add_tail(&src_group->listm_system, src_xgmi_groups);
+
+	return false;
+}
+
+/**
+ * @brief Builds a list of GPU mappings from the source topology to the destination topology
+ *
+ * The topology on the destination system may not be identical to the topology on the source
+ * system, e.g. there can be GPUs with different device IDs, and they may be enumerated in a
+ * different order. This function builds a list of GPU mappings from the source topology to the
+ * destination topology and stores it in maps.
+ *
+ * The function will first validate all the iolinks and determine XGMI groups (hives) by calling
+ * topology_determine_iolinks(). It will then try to match the GPUs that belong to XGMI hives and,
+ * after that, match the remaining GPUs.
+ *
+ * @param src_sys system topology information on source system
+ * @param dest_sys system topology information on destination system
+ * @param maps list of device maps populated by this function
+ * @return 0 if we were able to build a full list of GPU mappings, negative errno otherwise
+ */
+int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, struct device_maps *maps)
+{
+	struct tp_node *node;
+	int ret = 0;
+	int src_num_gpus = 0;
+	int dest_num_gpus = 0;
+
+	maps_init(maps);
+
+	ret = topology_determine_iolinks(src_sys);
+	if (ret) {
+		pr_err("Failed to determine iolinks from source (checkpointed) topology\n");
+		return ret;
+	}
+	topology_print(src_sys, "Source ");
+
+	ret = topology_determine_iolinks(dest_sys);
+	if (ret) {
+		pr_err("Failed to determine iolinks from destination (local) topology\n");
+		return ret;
+	}
+	topology_print(dest_sys, "Destination");
+
+	/* Make sure we have the same number of GPUs in src and dest */
+	list_for_each_entry(node, &src_sys->nodes, listm_system) {
+		if (NODE_IS_GPU(node))
+			src_num_gpus++;
+	}
+	list_for_each_entry(node, &dest_sys->nodes, listm_system) {
+		if (NODE_IS_GPU(node))
+			dest_num_gpus++;
+	}
+
+	if (src_num_gpus != dest_num_gpus) {
+		pr_err("Number of devices mismatch (checkpointed:%d local:%d)\n", src_num_gpus, dest_num_gpus);
+		return -EINVAL;
+	}
+
+	if (src_sys->num_xgmi_groups > dest_sys->num_xgmi_groups) {
+		pr_err("Number of XGMI groups mismatch (checkpointed:%d local:%d)\n", src_sys->num_xgmi_groups,
+		       dest_sys->num_xgmi_groups);
+		return -EINVAL;
+	}
+
+	/* First try to match the XGMI hives */
+	if (src_sys->num_xgmi_groups) {
+		if (!match_xgmi_groups(src_sys, dest_sys, &src_sys->xgmi_groups, &dest_sys->xgmi_groups, maps)) {
+			pr_err("Failed to match all GPU groups\n");
+			return -EINVAL;
+		}
+		pr_info("Current maps after XGMI groups matched\n");
+		maps_print(maps);
+	}
+
+	/* We matched all the XGMI hives, now match the remaining GPUs */
+	LIST_HEAD(src_nodes);
+	LIST_HEAD(dest_nodes);
+
+	list_for_each_entry(node, &src_sys->nodes, listm_system) {
+		if (NODE_IS_GPU(node) && !maps_get_dest_gpu(maps, node->gpu_id))
+			list_add(&node->listm_mapping, &src_nodes);
+	}
+
+	list_for_each_entry(node, &dest_sys->nodes, listm_system) {
+		if (NODE_IS_GPU(node) && !maps_dest_gpu_mapped(maps, node->gpu_id))
+			list_add(&node->listm_mapping, &dest_nodes);
+	}
+
+	if (!map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
+		pr_err("Failed to match remaining nodes\n");
+		return -EINVAL;
+	}
+
+	pr_info("Maps after all nodes matched\n");
+	maps_print(maps);
+
+	return ret;
+}
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h
index 434956e9d..c032af696 100644
--- a/plugins/amdgpu/amdgpu_plugin_topology.h
+++ b/plugins/amdgpu/amdgpu_plugin_topology.h
@@ -107,12 +107,21 @@ int topology_parse(struct tp_system *topology, const char *msg);
 int topology_determine_iolinks(struct tp_system *sys);
 void topology_print(const struct tp_system *sys, const char *msg);
 
+struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id);
+
 struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id);
 struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id);
 
 struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id);
 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
 
+int node_get_drm_render_device(struct tp_node *node);
+void sys_close_drm_render_devices(struct tp_system *sys);
+
+int set_restore_gpu_maps(struct tp_system *tp_checkpoint, struct tp_system *tp_local, struct device_maps *maps);
+
+uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id);
+
 void maps_init(struct device_maps *maps);
 void maps_free(struct device_maps *maps);
diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto
index 140b2951b..498321738 100644
--- a/plugins/amdgpu/criu-amdgpu.proto
+++ b/plugins/amdgpu/criu-amdgpu.proto
@@ -61,5 +61,5 @@ message criu_kfd {
 }
 
 message criu_render_node {
-	required uint32 minor_number = 1;
+	required uint32 gpu_id = 1;
 }
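
Note on the matching algorithm (not part of the patch): map_devices() and match_xgmi_groups()
implement a recursive backtracking search over candidate GPU pairings. Below is a minimal,
self-contained sketch of that idea only, with the topology, iolink and XGMI-hive checks stripped
out; the names used here (toy_gpu, compatible(), map_remaining()) are invented for illustration
and are not plugin APIs.

/* Pair every source GPU with a compatible destination GPU, undoing a
 * tentative pairing whenever the rest of the list cannot be matched.
 * Build with: cc -std=c99 -Wall sketch.c
 */
#include <stdbool.h>
#include <stdio.h>

struct toy_gpu {
	unsigned int gpu_id;    /* KFD-style GPU id */
	unsigned int device_id; /* stand-in for device_properties_match() */
	bool mapped;            /* destination already claimed by a pairing */
};

static bool compatible(const struct toy_gpu *src, const struct toy_gpu *dest)
{
	return src->device_id == dest->device_id;
}

/* Recursively map src[i..n-1]; pairing[i] records the chosen dest index. */
static bool map_remaining(struct toy_gpu *src, struct toy_gpu *dest, int i, int n, int *pairing)
{
	if (i == n)
		return true; /* every source GPU found a destination */

	for (int j = 0; j < n; j++) {
		if (dest[j].mapped || !compatible(&src[i], &dest[j]))
			continue;

		dest[j].mapped = true; /* tentative pairing, like maps_push() */
		pairing[i] = j;

		if (map_remaining(src, dest, i + 1, n, pairing))
			return true;

		dest[j].mapped = false; /* backtrack, like maps_pop() */
	}
	return false; /* caller will try a different pairing for src[i-1] */
}

int main(void)
{
	struct toy_gpu src[] = { { 0x11, 0x738c, false }, { 0x22, 0x66a1, false } };
	struct toy_gpu dest[] = { { 0xaa, 0x66a1, false }, { 0xbb, 0x738c, false } };
	int pairing[2];

	if (map_remaining(src, dest, 0, 2, pairing))
		for (int i = 0; i < 2; i++)
			printf("GPU 0x%04X -> 0x%04X\n", src[i].gpu_id, dest[pairing[i]].gpu_id);
	return 0;
}

As in the plugin, a failed branch restores the lists and tries the next candidate, so the worst
case is factorial in the number of GPUs; that is acceptable here because a node hosts at most a
handful of devices, and the property and iolink filters prune most branches early.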