diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile index 0d802d096..abcf888d0 100644 --- a/plugins/amdgpu/Makefile +++ b/plugins/amdgpu/Makefile @@ -22,7 +22,7 @@ endif criu-amdgpu.pb-c.c: criu-amdgpu.proto protoc-c --proto_path=. --c_out=. criu-amdgpu.proto -amdgpu_plugin.so: amdgpu_plugin.c criu-amdgpu.pb-c.c +amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c $(CC) $(PLUGIN_CFLAGS) $^ -o $@ $(PLUGIN_INCLUDE) amdgpu_plugin_clean: diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c index 58ef05291..9c883d31a 100644 --- a/plugins/amdgpu/amdgpu_plugin.c +++ b/plugins/amdgpu/amdgpu_plugin.c @@ -23,9 +23,7 @@ #include "criu-log.h" #include "common/list.h" - -#define DRM_FIRST_RENDER_NODE 128 -#define DRM_LAST_RENDER_NODE 255 +#include "amdgpu_plugin_topology.h" #define AMDGPU_KFD_DEVICE "/dev/kfd" #define PROCPIDMEM "/proc/%d/mem" @@ -57,7 +55,15 @@ struct vma_metadata { uint64_t vma_entry; }; +/************************************ Global Variables ********************************************/ +struct tp_system src_topology; +struct tp_system dest_topology; + +struct device_maps checkpoint_maps; +struct device_maps restore_maps; + static LIST_HEAD(update_vma_info_list); +/**************************************************************************************************/ int open_drm_render_device(int minor) { @@ -70,7 +76,7 @@ int open_drm_render_device(int minor) return -EINVAL; } - sprintf(path, "/dev/dri/renderD%d", minor); + snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor); fd = open(path, O_RDWR | O_CLOEXEC); if (fd < 0) { if (errno != ENOENT && errno != EPERM) { @@ -176,8 +182,12 @@ static void free_e(CriuKfd *e) } for (int i = 0; i < e->n_device_entries; i++) { - if (e->device_entries[i]) + if (e->device_entries[i]) { + for (int j = 0; j < e->device_entries[i]->n_iolinks; j++) + xfree(e->device_entries[i]->iolinks[j]); + xfree(e->device_entries[i]); + } } xfree(e); } @@ -236,16 +246,148 @@ 
static int allocate_bo_entries(CriuKfd *e, int num_bos, struct kfd_criu_bo_bucke return 0; } +int topology_to_devinfo(struct tp_system *sys, struct device_maps *maps, DeviceEntry **deviceEntries) +{ + uint32_t devinfo_index = 0; + struct tp_node *node; + + list_for_each_entry(node, &sys->nodes, listm_system) { + DeviceEntry *devinfo = deviceEntries[devinfo_index++]; + + devinfo->node_id = node->id; + + if (NODE_IS_GPU(node)) { + devinfo->gpu_id = node->gpu_id; + + devinfo->simd_count = node->simd_count; + devinfo->mem_banks_count = node->mem_banks_count; + devinfo->caches_count = node->caches_count; + devinfo->io_links_count = node->io_links_count; + devinfo->max_waves_per_simd = node->max_waves_per_simd; + devinfo->lds_size_in_kb = node->lds_size_in_kb; + devinfo->num_gws = node->num_gws; + devinfo->wave_front_size = node->wave_front_size; + devinfo->array_count = node->array_count; + devinfo->simd_arrays_per_engine = node->simd_arrays_per_engine; + devinfo->cu_per_simd_array = node->cu_per_simd_array; + devinfo->simd_per_cu = node->simd_per_cu; + devinfo->max_slots_scratch_cu = node->max_slots_scratch_cu; + devinfo->vendor_id = node->vendor_id; + devinfo->device_id = node->device_id; + devinfo->domain = node->domain; + devinfo->drm_render_minor = node->drm_render_minor; + devinfo->hive_id = node->hive_id; + devinfo->num_sdma_engines = node->num_sdma_engines; + devinfo->num_sdma_xgmi_engines = node->num_sdma_xgmi_engines; + devinfo->num_sdma_queues_per_engine = node->num_sdma_queues_per_engine; + devinfo->num_cp_queues = node->num_cp_queues; + devinfo->fw_version = node->fw_version; + devinfo->capability = node->capability; + devinfo->sdma_fw_version = node->sdma_fw_version; + devinfo->vram_public = node->vram_public; + devinfo->vram_size = node->vram_size; + } else { + devinfo->cpu_cores_count = node->cpu_cores_count; + } + + if (node->num_valid_iolinks) { + struct tp_iolink *iolink; + uint32_t iolink_index = 0; + + devinfo->iolinks = xmalloc(sizeof(DevIolink *) 
* node->num_valid_iolinks); + if (!devinfo->iolinks) + return -ENOMEM; + + list_for_each_entry(iolink, &node->iolinks, listm) { + if (!iolink->valid) + continue; + + devinfo->iolinks[iolink_index] = xmalloc(sizeof(DevIolink)); + if (!devinfo->iolinks[iolink_index]) + return -ENOMEM; + + dev_iolink__init(devinfo->iolinks[iolink_index]); + + devinfo->iolinks[iolink_index]->type = iolink->type; + devinfo->iolinks[iolink_index]->node_to_id = iolink->node_to_id; + iolink_index++; + } + devinfo->n_iolinks = iolink_index; + } + } + return 0; +} + +int devinfo_to_topology(DeviceEntry *devinfos[], uint32_t num_devices, struct tp_system *sys) +{ + for (int i = 0; i < num_devices; i++) { + struct tp_node *node; + DeviceEntry *devinfo = devinfos[i]; + + node = sys_add_node(sys, devinfo->node_id, devinfo->gpu_id); + if (!node) + return -ENOMEM; + + if (devinfo->cpu_cores_count) { + node->cpu_cores_count = devinfo->cpu_cores_count; + } else { + node->simd_count = devinfo->simd_count; + node->mem_banks_count = devinfo->mem_banks_count; + node->caches_count = devinfo->caches_count; + node->io_links_count = devinfo->io_links_count; + node->max_waves_per_simd = devinfo->max_waves_per_simd; + node->lds_size_in_kb = devinfo->lds_size_in_kb; + node->num_gws = devinfo->num_gws; + node->wave_front_size = devinfo->wave_front_size; + node->array_count = devinfo->array_count; + node->simd_arrays_per_engine = devinfo->simd_arrays_per_engine; + node->cu_per_simd_array = devinfo->cu_per_simd_array; + node->simd_per_cu = devinfo->simd_per_cu; + node->max_slots_scratch_cu = devinfo->max_slots_scratch_cu; + node->vendor_id = devinfo->vendor_id; + node->device_id = devinfo->device_id; + node->domain = devinfo->domain; + node->drm_render_minor = devinfo->drm_render_minor; + node->hive_id = devinfo->hive_id; + node->num_sdma_engines = devinfo->num_sdma_engines; + node->num_sdma_xgmi_engines = devinfo->num_sdma_xgmi_engines; + node->num_sdma_queues_per_engine = devinfo->num_sdma_queues_per_engine; + 
node->num_cp_queues = devinfo->num_cp_queues; + node->fw_version = devinfo->fw_version; + node->capability = devinfo->capability; + node->sdma_fw_version = devinfo->sdma_fw_version; + node->vram_public = devinfo->vram_public; + node->vram_size = devinfo->vram_size; + } + + for (int j = 0; j < devinfo->n_iolinks; j++) { + struct tp_iolink *iolink; + DevIolink *devlink = (devinfo->iolinks[j]); + + iolink = node_add_iolink(node, devlink->type, devlink->node_to_id); + if (!iolink) + return -ENOMEM; + } + } + return 0; +} + int amdgpu_plugin_init(int stage) { pr_info("amdgpu_plugin: initialized: %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + topology_init(&src_topology); + topology_init(&dest_topology); + return 0; } void amdgpu_plugin_fini(int stage, int ret) { pr_info("amdgpu_plugin: finished %s (AMDGPU/KFD)\n", CR_PLUGIN_DESC.name); + + topology_free(&src_topology); + topology_free(&dest_topology); } CR_PLUGIN_REGISTER("amdgpu_plugin", amdgpu_plugin_init, amdgpu_plugin_fini) @@ -314,14 +456,21 @@ static int save_devices(int fd, struct kfd_ioctl_criu_args *args, struct kfd_cri pr_debug("Dumping %d devices\n", args->num_devices); e->num_of_gpus = args->num_devices; + e->num_of_cpus = src_topology.num_nodes - args->num_devices; - ret = allocate_device_entries(e, e->num_of_gpus); - if (ret) { - ret = -ENOMEM; + /* The ioctl will only return entries for GPUs, but we also store entries for CPUs and the + * information for CPUs is obtained from parsing system topology + */ + ret = allocate_device_entries(e, src_topology.num_nodes); + if (ret) goto exit; - } - plugin_log_msg("Number of GPUs:%d\n", e->num_of_gpus); + pr_debug("Number of CPUs:%d GPUs:%d\n", e->num_of_cpus, e->num_of_gpus); + + /* Store topology information that was obtained from parsing /sys/class/kfd/kfd/topology/ */ + ret = topology_to_devinfo(&src_topology, &checkpoint_maps, e->device_entries); + if (ret) + goto exit; exit: pr_info("Dumped devices %s (ret:%d)\n", ret ? 
"Failed" : "Ok", ret);
@@ -470,6 +619,17 @@ int amdgpu_plugin_dump_file(int fd, int id)
 		return -1;
 	}
 
+	if (topology_parse(&src_topology, "Checkpoint"))
+		return -1;
+
+	/* We call topology_determine_iolinks to validate io_links. If io_links are not valid
+	 * we do not store them inside the checkpointed images
+	 */
+	if (topology_determine_iolinks(&src_topology)) {
+		pr_err("Failed to determine iolinks from topology\n");
+		return -1;
+	}
+
 	/* Check whether this plugin was called for kfd or render nodes */
 	if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
 		/* This is RenderD dumper plugin, for now just save renderD
@@ -624,15 +784,18 @@ static int restore_devices(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 
 	args->devices = (uintptr_t)device_buckets;
 
-	for (int i = 0; i < e->num_of_gpus; i++) {
+	for (int entries_i = 0; entries_i < e->num_of_cpus + e->num_of_gpus; entries_i++) {
 		struct kfd_criu_device_bucket *device_bucket;
-		DeviceEntry *devinfo = e->device_entries[i];
+		DeviceEntry *devinfo = e->device_entries[entries_i];
+
+		if (!devinfo->gpu_id)
+			continue;
 
 		device_bucket = &device_buckets[bucket_index++];
 
 		device_bucket->user_gpu_id = devinfo->gpu_id;
 
-		device_bucket->drm_fd = open_drm_render_device(i + DRM_FIRST_RENDER_NODE);
+		/* bucket_index was already post-incremented above, so the current GPU's
+		 * zero-based index is (bucket_index - 1); using bucket_index directly
+		 * would skip renderD128 and open the wrong render node for every GPU.
+		 */
+		device_bucket->drm_fd = open_drm_render_device((bucket_index - 1) + DRM_FIRST_RENDER_NODE);
 		if (device_bucket->drm_fd < 0) {
 			pr_perror("amdgpu_plugin: Can't pass NULL drm render fd to driver");
 			goto exit;
@@ -878,6 +1041,19 @@ int amdgpu_plugin_restore_file(int id)
 
 	plugin_log_msg("amdgpu_plugin: read image file data\n");
 
+	ret = devinfo_to_topology(e->device_entries, e->num_of_gpus + e->num_of_cpus, &src_topology);
+	if (ret) {
+		pr_err("Failed to convert stored device information to topology\n");
+		ret = -EINVAL;
+		goto exit;
+	}
+
+	ret = topology_parse(&dest_topology, "Local");
+	if (ret) {
+		pr_err("Failed to parse local system topology\n");
+		goto exit;
+	}
+
 	ret = restore_devices(&args, e);
 	if (ret)
 		goto exit;
diff 
--git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c
new file mode 100644
index 000000000..5ef98010f
--- /dev/null
+++ b/plugins/amdgpu/amdgpu_plugin_topology.c
@@ -0,0 +1,720 @@
+
+/* _GNU_SOURCE must be defined before any system header is included to take
+ * effect; the original placement after the includes was a no-op.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE 1
+#endif
+
+/* NOTE(review): the header names below were lost to angle-bracket stripping in
+ * the original patch text; reconstructed from the symbols this file uses
+ * (FILE, DIR/dirent, stat, open/read/close, PATH_MAX, uint32_t, bool) —
+ * confirm against the upstream CRIU tree.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <limits.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <dirent.h>
+#include "common/list.h"
+
+#include "xmalloc.h"
+#include "kfd_ioctl.h"
+#include "amdgpu_plugin_topology.h"
+
+#define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/"
+
+#ifdef DEBUG
+#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
+#else
+#define plugin_log_msg(fmt, ...) \
+	{                        \
+	}
+#endif
+
+/* Human-readable name for a supported iolink type */
+static const char *link_type(uint32_t type)
+{
+	switch (type) {
+	case TOPO_IOLINK_TYPE_PCIE:
+		return "PCIe";
+	case TOPO_IOLINK_TYPE_XGMI:
+		return "XGMI";
+	}
+	return "Unsupported";
+}
+
+/* Find a node with the given gpu_id inside one P2P group, or NULL */
+static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *group, const uint32_t gpu_id)
+{
+	struct tp_node *node;
+
+	list_for_each_entry(node, &group->nodes, listm_p2pgroup) {
+		if (node->gpu_id == gpu_id)
+			return node;
+	}
+	return NULL;
+}
+
+/* Find the system node whose DRM render minor matches, or NULL */
+struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor)
+{
+	struct tp_node *node;
+
+	list_for_each_entry(node, &sys->nodes, listm_system) {
+		if (node->drm_render_minor == drm_render_minor)
+			return node;
+	}
+	return NULL;
+}
+
+/* Find the system node with the given KFD gpu_id, or NULL */
+struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id)
+{
+	struct tp_node *node;
+
+	list_for_each_entry(node, &sys->nodes, listm_system) {
+		if (node->gpu_id == gpu_id)
+			return node;
+	}
+	return NULL;
+}
+
+/* Find the system node with the given topology node id, or NULL */
+static struct tp_node *sys_get_node_by_node_id(const struct tp_system *sys, const uint32_t node_id)
+{
+	struct tp_node *node;
+
+	list_for_each_entry(node, &sys->nodes, listm_system) {
+		if (node->id == node_id)
+			return node;
+	}
+	return NULL;
+}
+
+static struct tp_p2pgroup 
*sys_get_p2pgroup_with_gpu_id(const struct tp_system *sys, const int type, + const uint32_t gpu_id) +{ + struct tp_p2pgroup *p2pgroup; + + list_for_each_entry(p2pgroup, &sys->xgmi_groups, listm_system) { + if (p2pgroup->type != type) + continue; + + if (p2pgroup_get_node_by_gpu_id(p2pgroup, gpu_id)) + return p2pgroup; + } + return NULL; +} + +static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, const struct tp_node *to_node, + const uint8_t type) +{ + struct tp_iolink *iolink; + + list_for_each_entry(iolink, &from_node->iolinks, listm) { + if (iolink->node_to_id == to_node->id && iolink->type == type) + return iolink; + } + return NULL; +} + +struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id) +{ + struct tp_iolink *iolink = xzalloc(sizeof(*iolink)); + + if (!iolink) + return NULL; + + iolink->type = type; + /* iolink->node_to will be filled in topology_determine_iolinks */ + iolink->node_to_id = node_to_id; + iolink->node_from = node; + + list_add_tail(&iolink->listm, &node->iolinks); + return iolink; +} + +struct tp_p2pgroup *sys_add_group(struct tp_system *sys, uint32_t type) +{ + struct tp_p2pgroup *group; + + group = xzalloc(sizeof(*group)); + if (!group) + return NULL; + + INIT_LIST_HEAD(&group->nodes); + group->type = type; + list_add_tail(&group->listm_system, &sys->xgmi_groups); + if (type == TOPO_IOLINK_TYPE_XGMI) + sys->num_xgmi_groups++; + + return group; +} + +struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id) +{ + struct tp_node *node = NULL; + + node = xzalloc(sizeof(*node)); + if (!node) + return NULL; + + node->id = id; + node->gpu_id = gpu_id; + node->drm_fd = -1; + INIT_LIST_HEAD(&node->iolinks); + list_add_tail(&node->listm_system, &sys->nodes); + sys->num_nodes++; + + return node; +} + +static bool get_prop(char *line, char *name, uint64_t *value) +{ + if (sscanf(line, " %29s %lu", name, value) != 2) + return false; + return true; +} + +/* Parse node 
properties in /sys/class/kfd/kfd/topology/nodes/N/properties */ +static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path) +{ + FILE *file; + char path[300]; + char line[300]; + + sprintf(path, "%s/properties", dir_path); + file = fopen(path, "r"); + if (!file) { + pr_perror("Failed to access %s", path); + return -EFAULT; + } + + while (fgets(line, sizeof(line), file)) { + char name[30]; + uint64_t value; + + memset(name, 0, sizeof(name)); + if (!get_prop(line, name, &value)) + goto fail; + + if (!strcmp(name, "cpu_cores_count")) + dev->cpu_cores_count = (uint32_t)value; + else if (!strcmp(name, "simd_count")) + dev->simd_count = (uint32_t)value; + else if (!strcmp(name, "mem_banks_count")) + dev->mem_banks_count = (uint32_t)value; + else if (!strcmp(name, "caches_count")) + dev->caches_count = (uint32_t)value; + else if (!strcmp(name, "io_links_count")) + dev->io_links_count = (uint32_t)value; + else if (!strcmp(name, "max_waves_per_simd")) + dev->max_waves_per_simd = (uint32_t)value; + else if (!strcmp(name, "lds_size_in_kb")) + dev->lds_size_in_kb = (uint32_t)value; + else if (!strcmp(name, "num_gws")) + dev->num_gws = (uint32_t)value; + else if (!strcmp(name, "wave_front_size")) + dev->wave_front_size = (uint32_t)value; + else if (!strcmp(name, "array_count")) + dev->array_count = (uint32_t)value; + else if (!strcmp(name, "simd_arrays_per_engine")) + dev->simd_arrays_per_engine = (uint32_t)value; + else if (!strcmp(name, "cu_per_simd_array")) + dev->cu_per_simd_array = (uint32_t)value; + else if (!strcmp(name, "simd_per_cu")) + dev->simd_per_cu = (uint32_t)value; + else if (!strcmp(name, "max_slots_scratch_cu")) + dev->max_slots_scratch_cu = (uint32_t)value; + else if (!strcmp(name, "vendor_id")) + dev->vendor_id = (uint32_t)value; + else if (!strcmp(name, "device_id")) + dev->device_id = (uint32_t)value; + else if (!strcmp(name, "domain")) + dev->domain = (uint32_t)value; + else if (!strcmp(name, "drm_render_minor")) + 
dev->drm_render_minor = (uint32_t)value; + else if (!strcmp(name, "hive_id")) + dev->hive_id = value; + else if (!strcmp(name, "num_sdma_engines")) + dev->num_sdma_engines = (uint32_t)value; + else if (!strcmp(name, "num_sdma_xgmi_engines")) + dev->num_sdma_xgmi_engines = (uint32_t)value; + else if (!strcmp(name, "num_sdma_queues_per_engine")) + dev->num_sdma_queues_per_engine = (uint32_t)value; + else if (!strcmp(name, "num_cp_queues")) + dev->num_cp_queues = (uint32_t)value; + else if (!strcmp(name, "fw_version")) + dev->fw_version = (uint32_t)value; + else if (!strcmp(name, "capability")) + dev->capability = (uint32_t)value; + else if (!strcmp(name, "sdma_fw_version")) + dev->sdma_fw_version = (uint32_t)value; + + if (!dev->gpu_id && dev->cpu_cores_count >= 1) { + /* This is a CPU - we do not need to parse the other information */ + break; + } + } + + fclose(file); + return 0; +fail: + pr_err("Failed to parse line = %s\n", line); + fclose(file); + return -EINVAL; +} + +/* Parse node memory properties in /sys/class/kfd/kfd/topology/nodes/N/mem_banks */ +static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path) +{ + struct dirent *dirent_node; + DIR *d_node; + char path[300]; + FILE *file = NULL; + uint32_t heap_type = 0; + uint64_t mem_size = 0; + int ret; + + if (!NODE_IS_GPU(node)) + return 0; + + sprintf(path, "%s/mem_banks", dir_path); + + d_node = opendir(path); + if (!d_node) { + pr_perror("Can't open %s", path); + return -EACCES; + } + + while ((dirent_node = readdir(d_node)) != NULL) { + char line[300]; + char bank_path[1024]; + struct stat st; + int id; + + heap_type = 0; + mem_size = 0; + + /* Only parse numeric directories */ + if (sscanf(dirent_node->d_name, "%d", &id) != 1) + continue; + + snprintf(bank_path, sizeof(bank_path), "%s/%s", path, dirent_node->d_name); + if (stat(bank_path, &st)) { + pr_err("Cannot to access %s\n", path); + ret = -EACCES; + goto fail; + } + if ((st.st_mode & S_IFMT) == S_IFDIR) { + char 
properties_path[PATH_MAX]; + + snprintf(properties_path, sizeof(properties_path), "%s/properties", bank_path); + + file = fopen(properties_path, "r"); + if (!file) { + pr_perror("Failed to access %s", properties_path); + ret = -EACCES; + goto fail; + } + + while (fgets(line, sizeof(line), file)) { + char name[30]; + uint64_t value; + + memset(name, 0, sizeof(name)); + if (!get_prop(line, name, &value)) { + ret = -EINVAL; + goto fail; + } + + if (!strcmp(name, "heap_type")) + heap_type = (uint32_t)value; + if (!strcmp(name, "size_in_bytes")) + mem_size = value; + } + + fclose(file); + file = NULL; + } + + if (heap_type == TOPO_HEAP_TYPE_PUBLIC || heap_type == TOPO_HEAP_TYPE_PRIVATE) + break; + } + + if ((heap_type != TOPO_HEAP_TYPE_PUBLIC && heap_type != TOPO_HEAP_TYPE_PRIVATE) || !mem_size) { + pr_err("Failed to determine memory type and size for device in %s\n", dir_path); + ret = -EINVAL; + goto fail; + } + + node->vram_public = (heap_type == TOPO_HEAP_TYPE_PUBLIC); + node->vram_size = mem_size; + closedir(d_node); + return 0; +fail: + if (file) + fclose(file); + closedir(d_node); + return ret; +} + +/* Parse node iolinks properties in /sys/class/kfd/kfd/topology/nodes/N/io_links */ +static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path) +{ + struct dirent *dirent_node; + DIR *d_node; + char path[300]; + FILE *file = NULL; + int ret = 0; + + snprintf(path, sizeof(path), "%s/io_links", dir_path); + + d_node = opendir(path); + if (!d_node) { + pr_perror("Can't open %s", path); + return -EACCES; + } + + while ((dirent_node = readdir(d_node)) != NULL) { + char line[300]; + char iolink_path[1024]; + struct stat st; + int id; + + uint32_t iolink_type = 0; + uint32_t node_to_id = 0; + + /* Only parse numeric directories */ + if (sscanf(dirent_node->d_name, "%d", &id) != 1) + continue; + + snprintf(iolink_path, sizeof(iolink_path), "%s/%s", path, dirent_node->d_name); + if (stat(iolink_path, &st)) { + pr_err("Cannot to access %s\n", path); + ret = 
-EACCES; + goto fail; + } + if ((st.st_mode & S_IFMT) == S_IFDIR) { + char properties_path[PATH_MAX]; + + snprintf(properties_path, sizeof(properties_path), "%s/properties", iolink_path); + + file = fopen(properties_path, "r"); + if (!file) { + pr_perror("Failed to access %s", properties_path); + ret = -EACCES; + goto fail; + } + + while (fgets(line, sizeof(line), file)) { + char name[30]; + uint64_t value; + + memset(name, 0, sizeof(name)); + if (!get_prop(line, name, &value)) { + ret = -EINVAL; + goto fail; + } + + if (!strcmp(name, "type")) + iolink_type = (uint32_t)value; + if (!strcmp(name, "node_to")) + node_to_id = (uint32_t)value; + } + fclose(file); + file = NULL; + } + + /* We only store the link information for now, then once all topology parsing is + * finished we will confirm iolinks + */ + if (iolink_type == TOPO_IOLINK_TYPE_PCIE || iolink_type == TOPO_IOLINK_TYPE_XGMI) { + if (!node_add_iolink(node, iolink_type, node_to_id)) { + ret = -ENOMEM; + goto fail; + } + } + } + closedir(d_node); + return 0; +fail: + if (file) + fclose(file); + + closedir(d_node); + return ret; +} + +/* Parse a node (CPU or GPU) in /sys/class/kfd/kfd/topology/nodes/N */ +static int parse_topo_node(struct tp_node *node, const char *dir_path) +{ + if (parse_topo_node_properties(node, dir_path)) { + pr_err("Failed to parse node properties\n"); + return -EINVAL; + } + if (parse_topo_node_mem_banks(node, dir_path)) { + pr_err("Failed to parse node mem_banks\n"); + return -EINVAL; + } + if (parse_topo_node_iolinks(node, dir_path)) { + pr_err("Failed to parse node iolinks\n"); + return -EINVAL; + } + return 0; +} + +static const char *p2pgroup_to_str(struct tp_p2pgroup *group) +{ + static char topology_printstr[200]; + struct tp_node *node; + size_t str_len = 0; + + topology_printstr[0] = '\0'; + str_len += sprintf(&topology_printstr[str_len], "type:%s:", link_type(group->type)); + + list_for_each_entry(node, &group->nodes, listm_p2pgroup) { + str_len += 
sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id); + } + return topology_printstr; +} + +void topology_print(const struct tp_system *sys, const char *message) +{ + struct tp_node *node; + struct tp_p2pgroup *xgmi_group; + + pr_info("===System Topology=[%12s]==================================\n", message); + list_for_each_entry(node, &sys->nodes, listm_system) { + struct tp_iolink *iolink; + + if (!NODE_IS_GPU(node)) { + pr_info("[%d] CPU\n", node->id); + pr_info(" cpu_cores_count:%u\n", node->cpu_cores_count); + } else { + pr_info("[%d] GPU gpu_id:0x%04X\n", node->id, node->gpu_id); + pr_info(" vendor_id:%u device_id:%u\n", node->vendor_id, node->device_id); + pr_info(" vram_public:%c vram_size:%lu\n", node->vram_public ? 'Y' : 'N', node->vram_size); + pr_info(" io_links_count:%u capability:%u\n", node->io_links_count, node->capability); + pr_info(" mem_banks_count:%u caches_count:%d lds_size_in_kb:%u\n", node->mem_banks_count, + node->caches_count, node->lds_size_in_kb); + pr_info(" simd_count:%u max_waves_per_simd:%u\n", node->simd_count, + node->max_waves_per_simd); + pr_info(" num_gws:%u wave_front_size:%u array_count:%u\n", node->num_gws, + node->wave_front_size, node->array_count); + pr_info(" simd_arrays_per_engine:%u simd_per_cu:%u\n", node->simd_arrays_per_engine, + node->simd_per_cu); + pr_info(" max_slots_scratch_cu:%u cu_per_simd_array:%u\n", node->max_slots_scratch_cu, + node->cu_per_simd_array); + pr_info(" num_sdma_engines:%u\n", node->num_sdma_engines); + pr_info(" num_sdma_xgmi_engines:%u num_sdma_queues_per_engine:%u\n", + node->num_sdma_xgmi_engines, node->num_sdma_queues_per_engine); + pr_info(" num_cp_queues:%u fw_version:%u sdma_fw_version:%u\n", node->num_cp_queues, + node->fw_version, node->sdma_fw_version); + } + list_for_each_entry(iolink, &node->iolinks, listm) { + if (!iolink->valid) + continue; + + pr_info(" iolink type:%s node-to:%d (0x%04X) node-from:%d bi-dir:%s\n", + link_type(iolink->type), iolink->node_to_id, 
iolink->node_to->gpu_id, + iolink->node_from->id, iolink->peer ? "Y" : "N"); + } + } + + pr_info("===Groups==========================================================\n"); + list_for_each_entry(xgmi_group, &sys->xgmi_groups, listm_system) + pr_info("%s\n", p2pgroup_to_str(xgmi_group)); + pr_info("===================================================================\n"); +} + +void topology_init(struct tp_system *sys) +{ + memset(sys, 0, sizeof(*sys)); + INIT_LIST_HEAD(&sys->nodes); + INIT_LIST_HEAD(&sys->xgmi_groups); +} + +void topology_free(struct tp_system *sys) +{ + while (!list_empty(&sys->nodes)) { + struct tp_node *node = list_first_entry(&sys->nodes, struct tp_node, listm_system); + + list_del(&node->listm_system); + + while (!list_empty(&node->iolinks)) { + struct tp_iolink *iolink = list_first_entry(&node->iolinks, struct tp_iolink, listm); + + list_del(&iolink->listm); + xfree(iolink); + } + xfree(node); + } + + while (!list_empty(&sys->xgmi_groups)) { + struct tp_p2pgroup *p2pgroup = list_first_entry(&sys->xgmi_groups, struct tp_p2pgroup, listm_system); + + list_del(&p2pgroup->listm_system); + xfree(p2pgroup); + } +} + +/** + * @brief Validates iolinks and determine XGMI hives in a system topology + * + * On some systems, some GPUs may not be accessible because they are masked by cgroups, but the + * iolinks to these GPUs are still visible. If the peer GPU is not accessible, we consider that link + * invalid. + * In a XGMI hive, each GPU will have a bi-directional iolink to every other GPU. So we create a + * XGMI group (hive) and add all the GPUs in that hive to the group when iterating over the first + * GPU in that group. + * + * @param sys system topology + * @return 0 if successful, errno if failed. 
+ */ +int topology_determine_iolinks(struct tp_system *sys) +{ + int ret = 0; + struct tp_node *node; + + list_for_each_entry(node, &sys->nodes, listm_system) { + struct tp_iolink *iolink; + + list_for_each_entry(iolink, &node->iolinks, listm) { + struct tp_p2pgroup *group = NULL; + struct tp_node *peer_node = NULL; + struct tp_iolink *peer_iolink = NULL; + + peer_node = sys_get_node_by_node_id(sys, iolink->node_to_id); + if (!peer_node) { + /* node not accessible, usually because it is masked by cgroups */ + iolink->valid = false; + continue; + } + iolink->valid = true; + node->num_valid_iolinks++; + + iolink->node_to = peer_node; + peer_iolink = get_tp_peer_iolink(peer_node, node, iolink->type); + if (!peer_iolink) + continue; /* This is a one-dir link */ + + /* We confirmed both sides have same type of iolink */ + iolink->peer = peer_iolink; + peer_iolink->peer = iolink; + + if (iolink->type == TOPO_IOLINK_TYPE_XGMI) { + group = sys_get_p2pgroup_with_gpu_id(sys, iolink->type, node->gpu_id); + if (!group) { + /* This GPU does not already belong to a group so we create + * a new group + */ + group = sys_add_group(sys, iolink->type); + if (!group) { + ret = -ENOMEM; + goto fail; + } + list_add_tail(&node->listm_p2pgroup, &group->nodes); + } + + /* Also add peer GPU to this group */ + if (!p2pgroup_get_node_by_gpu_id(group, peer_node->gpu_id)) + list_add_tail(&peer_node->listm_p2pgroup, &group->nodes); + } + } + } + +fail: + /* In case of failure, caller function will call topology_free which will free groups that + * were successfully allocated + */ + return ret; +} + +/** + * @brief Parse system topology + * + * Parse system topology exposed by the drivers in /sys/class/kfd/kfd/topology and fill in the + * system topology structure. + * + * @param sys system topology structure to be filled by this function + * @param message print this message when printing the topology to logs + * @return 0 if successful, errno if failed. 
+ */
+int topology_parse(struct tp_system *sys, const char *message)
+{
+	struct dirent *dirent_system;
+	DIR *d_system;
+	char path[300];
+	int ret;
+
+	/* Topology only needs to be parsed once per process lifetime */
+	if (sys->parsed)
+		return 0;
+
+	sys->parsed = true;
+	INIT_LIST_HEAD(&sys->nodes);
+	INIT_LIST_HEAD(&sys->xgmi_groups);
+
+	d_system = opendir(TOPOLOGY_PATH);
+	if (!d_system) {
+		pr_perror("Can't open %s", TOPOLOGY_PATH);
+		return -EACCES;
+	}
+
+	while ((dirent_system = readdir(d_system)) != NULL) {
+		struct stat stbuf;
+		int id, fd;
+
+		/* Only parse numeric directories */
+		if (sscanf(dirent_system->d_name, "%d", &id) != 1)
+			continue;
+
+		snprintf(path, sizeof(path), "%s%s", TOPOLOGY_PATH, dirent_system->d_name);
+		if (stat(path, &stbuf)) {
+			/* When cgroup is masking some devices, the path exists, but it is not
+			 * accessible, this is not an error
+			 */
+			pr_info("Cannot to access %s\n", path);
+			continue;
+		}
+
+		if ((stbuf.st_mode & S_IFMT) == S_IFDIR) {
+			struct tp_node *node;
+			int len;
+			char gpu_id_path[300];
+			char read_buf[7]; /* Max gpu_id len is 6 chars */
+			unsigned int gpu_id;
+
+			snprintf(gpu_id_path, sizeof(gpu_id_path), "%s/%s/gpu_id", TOPOLOGY_PATH, dirent_system->d_name);
+			fd = open(gpu_id_path, O_RDONLY);
+			if (fd < 0) {
+				pr_perror("Failed to access %s", gpu_id_path);
+				continue;
+			}
+
+			len = read(fd, read_buf, sizeof(read_buf) - 1);
+			close(fd);
+			if (len < 0)
+				continue;
+
+			read_buf[len] = '\0';
+
+			/* gpu_id is unsigned; "%u" avoids the %d/unsigned-int
+			 * format mismatch (undefined behavior per C11 7.21.6.2)
+			 */
+			if (sscanf(read_buf, "%u", &gpu_id) != 1)
+				continue;
+
+			node = sys_add_node(sys, id, gpu_id);
+			if (!node) {
+				ret = -ENOMEM;
+				goto fail;
+			}
+
+			if (parse_topo_node(node, path)) {
+				pr_err("Failed to parse node %s\n", path);
+				ret = -EINVAL;
+				goto fail;
+			}
+		}
+	}
+	closedir(d_system);
+	return 0;
+
+fail:
+	topology_free(sys);
+	return ret;
+}
\ No newline at end of file
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h
new file mode 100644
index 000000000..434956e9d
--- /dev/null
+++ b/plugins/amdgpu/amdgpu_plugin_topology.h
@@ -0,0 +1,119 @@
+#ifndef 
__KFD_PLUGIN_TOPOLOGY_H__ +#define __KFD_PLUGIN_TOPOLOGY_H__ + +#define DRM_FIRST_RENDER_NODE 128 +#define DRM_LAST_RENDER_NODE 255 + +#define TOPO_HEAP_TYPE_PUBLIC 1 /* HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC */ +#define TOPO_HEAP_TYPE_PRIVATE 2 /* HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE */ + +#define TOPO_IOLINK_TYPE_ANY 0 /* HSA_IOLINKTYPE_UNDEFINED */ +#define TOPO_IOLINK_TYPE_PCIE 2 /* HSA_IOLINKTYPE_PCIEXPRESS */ +#define TOPO_IOLINK_TYPE_XGMI 11 /* HSA_IOLINK_TYPE_XGMI */ + +#define NODE_IS_GPU(node) ((node)->gpu_id != 0) +#define INVALID_CPU_ID 0xFFFF + +/*************************************** Structures ***********************************************/ +struct tp_node; + +struct tp_iolink { + struct list_head listm; + uint32_t type; + uint32_t node_to_id; + struct tp_node *node_to; + struct tp_node *node_from; + bool valid; /* Set to false if target node is not accessible */ + struct tp_iolink *peer; /* If link is bi-directional, peer link */ +}; + +struct tp_node { + uint32_t id; + uint32_t gpu_id; + uint32_t cpu_cores_count; + uint32_t simd_count; + uint32_t mem_banks_count; + uint32_t caches_count; + uint32_t io_links_count; + uint32_t max_waves_per_simd; + uint32_t lds_size_in_kb; + uint32_t num_gws; + uint32_t wave_front_size; + uint32_t array_count; + uint32_t simd_arrays_per_engine; + uint32_t cu_per_simd_array; + uint32_t simd_per_cu; + uint32_t max_slots_scratch_cu; + uint32_t vendor_id; + uint32_t device_id; + uint32_t domain; + uint32_t drm_render_minor; + uint64_t hive_id; + uint32_t num_sdma_engines; + uint32_t num_sdma_xgmi_engines; + uint32_t num_sdma_queues_per_engine; + uint32_t num_cp_queues; + uint32_t fw_version; + uint32_t capability; + uint32_t sdma_fw_version; + bool vram_public; + uint64_t vram_size; + + struct list_head listm_system; + struct list_head listm_p2pgroup; + struct list_head listm_mapping; /* Used only during device mapping */ + + uint32_t num_valid_iolinks; + struct list_head iolinks; + + int drm_fd; +}; + +struct tp_p2pgroup { + 
uint32_t type; + uint32_t num_nodes; + struct list_head listm_system; + struct list_head nodes; +}; + +struct tp_system { + bool parsed; + uint32_t num_nodes; + struct list_head nodes; + uint32_t num_xgmi_groups; + struct list_head xgmi_groups; +}; + +struct id_map { + uint32_t src; + uint32_t dest; + + struct list_head listm; +}; + +struct device_maps { + struct list_head cpu_maps; /* CPUs are mapped using node_id */ + struct list_head gpu_maps; + + struct list_head *tail_cpu; /* GPUs are mapped using gpu_id */ + struct list_head *tail_gpu; +}; + +/**************************************** Functions ***********************************************/ +void topology_init(struct tp_system *sys); +void topology_free(struct tp_system *topology); + +int topology_parse(struct tp_system *topology, const char *msg); +int topology_determine_iolinks(struct tp_system *sys); +void topology_print(const struct tp_system *sys, const char *msg); + +struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id); +struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id); + +struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id); +struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor); + +void maps_init(struct device_maps *maps); +void maps_free(struct device_maps *maps); + +#endif /* __KFD_PLUGIN_TOPOLOGY_H__ */ diff --git a/plugins/amdgpu/criu-amdgpu.proto b/plugins/amdgpu/criu-amdgpu.proto index 308f768f9..140b2951b 100644 --- a/plugins/amdgpu/criu-amdgpu.proto +++ b/plugins/amdgpu/criu-amdgpu.proto @@ -1,7 +1,43 @@ syntax = "proto2"; +message dev_iolink { + required uint32 type = 1; + required uint32 node_to_id = 2; +} + message device_entry { - required uint32 gpu_id = 1; + required uint32 node_id = 1; + required uint32 gpu_id = 2; + required uint32 cpu_cores_count = 3; + required uint32 simd_count = 4; + required uint32 mem_banks_count = 5; + 
required uint32 caches_count = 6; + required uint32 io_links_count = 7; + required uint32 max_waves_per_simd = 8; + required uint32 lds_size_in_kb = 9; + required uint32 gds_size_in_kb = 10; + required uint32 num_gws = 11; + required uint32 wave_front_size = 12; + required uint32 array_count = 13; + required uint32 simd_arrays_per_engine = 14; + required uint32 cu_per_simd_array = 15; + required uint32 simd_per_cu = 16; + required uint32 max_slots_scratch_cu = 17; + required uint32 vendor_id = 18; + required uint32 device_id = 19; + required uint32 domain = 20; + required uint32 drm_render_minor = 21; + required uint64 hive_id = 22; + required uint32 num_sdma_engines = 23; + required uint32 num_sdma_xgmi_engines = 24; + required uint32 num_sdma_queues_per_engine = 25; + required uint32 num_cp_queues = 26; + required uint32 fw_version = 27; + required uint32 capability = 28; + required uint32 sdma_fw_version = 29; + required uint32 vram_public = 30; + required uint64 vram_size = 31; + repeated dev_iolink iolinks = 32; } message bo_entry { @@ -16,11 +52,12 @@ message bo_entry { message criu_kfd { required uint32 pid = 1; required uint32 num_of_gpus = 2; - repeated device_entry device_entries = 3; - required uint64 num_of_bos = 4; - repeated bo_entry bo_entries = 5; - required uint32 num_of_objects = 6; - required bytes priv_data = 7; + required uint32 num_of_cpus = 3; + repeated device_entry device_entries = 4; + required uint64 num_of_bos = 5; + repeated bo_entry bo_entries = 6; + required uint32 num_of_objects = 7; + required bytes priv_data = 8; } message criu_render_node {