criu/plugins/amdgpu/amdgpu_plugin_topology.c
Commit 733ef96315 by Ramesh Errabolu: amdgpu_plugin: Refactor code in preparation to support C&R for DRM devices
Add a new compilation unit to host symbols and methods that will be
needed to C&R DRM devices. Refactor the code that indicates support for
C&R and that checkpoints KFD and DRM devices.

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
2024-09-11 16:02:11 -07:00

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/limits.h>
#include <dirent.h>
#include "common/list.h"
#include "xmalloc.h"
#include "kfd_ioctl.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/"
/* User override options: each flag defaults to true (check enforced) and can
* be cleared by a user override to skip the corresponding check.
*/
/* Enforce firmware version check */
bool kfd_fw_version_check = true;
/* Enforce SDMA firmware version check */
bool kfd_sdma_fw_version_check = true;
/* Enforce caches count check */
bool kfd_caches_count_check = true;
/* Enforce num_gws check */
bool kfd_num_gws_check = true;
/* Enforce VRAM size check */
bool kfd_vram_size_check = true;
/* Preserve NUMA region mappings */
bool kfd_numa_check = true;
/* Enforce capability check */
bool kfd_capability_check = true;
/*
* During dump, we can use any fd value so fd_next is always -1.
During restore, we have to use an fd value that does not conflict with fd values in use by the target restore process.
* fd_next is initialized as 1 greater than the highest-numbered file descriptor used by the target restore process.
*/
int fd_next = -1;
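/*
* Restore-time setup sketch (hypothetical caller shown for illustration; the
* real initialization lives in the plugin's restore path):
*
*	fd_next = max_fd_used_by_target + 1;	// first guaranteed-free fd value
*	...
*	drm_fd = node_get_drm_render_device(node);	// dup'ed to a value >= fd_next
*/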
static int open_drm_render_device(int minor)
{
char path[128];
int fd, ret_fd;
if (minor < DRM_FIRST_RENDER_NODE || minor > DRM_LAST_RENDER_NODE) {
pr_perror("DRM render minor %d out of range [%d, %d]", minor, DRM_FIRST_RENDER_NODE,
DRM_LAST_RENDER_NODE);
return -EINVAL;
}
snprintf(path, sizeof(path), "/dev/dri/renderD%d", minor);
fd = open(path, O_RDWR | O_CLOEXEC);
if (fd < 0) {
if (errno != ENOENT && errno != EPERM) {
pr_err("Failed to open %s: %s\n", path, strerror(errno));
if (errno == EACCES)
pr_err("Check user is in \"video\" group\n");
}
return -EBADFD;
}
if (fd_next < 0)
return fd;
ret_fd = fcntl(fd, F_DUPFD, fd_next++);
close(fd);
if (ret_fd < 0)
pr_perror("Failed to duplicate fd for minor:%d (fd_next:%d)", minor, fd_next);
return ret_fd;
}
static const char *link_type(uint32_t type)
{
switch (type) {
case TOPO_IOLINK_TYPE_PCIE:
return "PCIe";
case TOPO_IOLINK_TYPE_XGMI:
return "XGMI";
}
return "Unsupported";
}
static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *group, const uint32_t gpu_id)
{
struct tp_node *node;
list_for_each_entry(node, &group->nodes, listm_p2pgroup) {
if (node->gpu_id == gpu_id)
return node;
}
return NULL;
}
int node_get_drm_render_device(struct tp_node *node)
{
if (node->drm_fd < 0)
node->drm_fd = open_drm_render_device(node->drm_render_minor);
return node->drm_fd;
}
void sys_close_drm_render_devices(struct tp_system *sys)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (node->drm_fd >= 0) {
close(node->drm_fd);
node->drm_fd = -1;
}
}
}
static struct tp_iolink *node_get_iolink_to_node_id(const struct tp_node *node, const uint32_t type,
const uint32_t node_id)
{
struct tp_iolink *iolink;
list_for_each_entry(iolink, &node->iolinks, listm) {
if (iolink->node_to_id == node_id && iolink->type == type)
return iolink;
}
return NULL;
}
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (node->drm_render_minor == drm_render_minor)
return node;
}
return NULL;
}
struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (NODE_IS_GPU(node) && index-- == 0)
return node;
}
return NULL;
}
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (node->gpu_id == gpu_id)
return node;
}
return NULL;
}
static struct tp_node *sys_get_node_by_node_id(const struct tp_system *sys, const uint32_t node_id)
{
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
if (node->id == node_id)
return node;
}
return NULL;
}
static struct tp_p2pgroup *sys_get_p2pgroup_with_gpu_id(const struct tp_system *sys, const int type,
const uint32_t gpu_id)
{
struct tp_p2pgroup *p2pgroup;
list_for_each_entry(p2pgroup, &sys->xgmi_groups, listm_system) {
if (p2pgroup->type != type)
continue;
if (p2pgroup_get_node_by_gpu_id(p2pgroup, gpu_id))
return p2pgroup;
}
return NULL;
}
static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, const struct tp_node *to_node,
const uint8_t type)
{
struct tp_iolink *iolink;
list_for_each_entry(iolink, &from_node->iolinks, listm) {
if (iolink->node_to_id == to_node->id && iolink->type == type)
return iolink;
}
return NULL;
}
static bool maps_dest_cpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->cpu_maps, listm) {
if (id_map->dest == dest_id)
return true;
}
return false;
}
static uint32_t maps_get_dest_cpu(const struct device_maps *maps, const uint32_t src_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->cpu_maps, listm) {
if (id_map->src == src_id)
return id_map->dest;
}
return INVALID_CPU_ID;
}
bool maps_dest_gpu_mapped(const struct device_maps *maps, const uint32_t dest_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->gpu_maps, listm) {
if (id_map->dest == dest_id)
return true;
}
return false;
}
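/*
* Returns the destination GPU ID mapped to src_id, or 0 when no mapping
* exists. KFD gpu_ids are non-zero (CPU nodes have gpu_id 0), so 0 is a safe
* "not mapped" sentinel; set_restore_gpu_maps() below relies on this.
*/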
uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id)
{
struct id_map *id_map;
list_for_each_entry(id_map, &maps->gpu_maps, listm) {
if (id_map->src == src_id)
return id_map->dest;
}
return 0;
}
static struct id_map *maps_add_cpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
{
struct id_map *id_map = xzalloc(sizeof(*id_map));
if (!id_map) {
pr_err("Failed to allocate memory for id_map\n");
return NULL;
}
id_map->src = src_id;
id_map->dest = dest_id;
list_add_tail(&id_map->listm, &maps->cpu_maps);
maps->tail_cpu = &id_map->listm;
pr_debug("Added CPU mapping [%02d -> %02d]\n", src_id, dest_id);
return id_map;
}
struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id)
{
struct id_map *id_map = xzalloc(sizeof(*id_map));
if (!id_map) {
pr_err("Failed to allocate memory for id_map\n");
return NULL;
}
id_map->src = src_id;
id_map->dest = dest_id;
list_add_tail(&id_map->listm, &maps->gpu_maps);
maps->tail_gpu = &id_map->listm;
pr_debug("Added GPU mapping [0x%04X -> 0x%04X]\n", src_id, dest_id);
return id_map;
}
static void maps_print(struct device_maps *maps)
{
struct id_map *id_map;
pr_info("===Maps===============\n");
list_for_each_entry(id_map, &maps->gpu_maps, listm)
pr_info("GPU: 0x%04X -> 0x%04X\n", id_map->src, id_map->dest);
list_for_each_entry(id_map, &maps->cpu_maps, listm)
pr_info("CPU: %02d -> %02d\n", id_map->src, id_map->dest);
pr_info("======================\n");
}
void maps_init(struct device_maps *maps)
{
INIT_LIST_HEAD(&maps->cpu_maps);
INIT_LIST_HEAD(&maps->gpu_maps);
maps->tail_cpu = NULL;
maps->tail_gpu = NULL;
}
void maps_free(struct device_maps *maps)
{
while (!list_empty(&maps->cpu_maps)) {
struct id_map *map = list_first_entry(&maps->cpu_maps, struct id_map, listm);
list_del(&map->listm);
xfree(map);
}
while (!list_empty(&maps->gpu_maps)) {
struct id_map *map = list_first_entry(&maps->gpu_maps, struct id_map, listm);
list_del(&map->listm);
xfree(map);
}
}
static void maps_pop(struct device_maps *maps, struct device_maps *remove)
{
if (remove->tail_cpu)
list_cut_position(&remove->cpu_maps, &maps->cpu_maps, remove->tail_cpu);
if (remove->tail_gpu)
list_cut_position(&remove->gpu_maps, &maps->gpu_maps, remove->tail_gpu);
maps_free(remove);
}
static int maps_push(struct device_maps *maps, struct device_maps *new)
{
struct id_map *src_id_map, *dest_id_map;
list_for_each_entry(src_id_map, &new->cpu_maps, listm) {
list_for_each_entry(dest_id_map, &maps->cpu_maps, listm) {
if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
pr_err("CPU mapping already exists src [%02d->%02d] new [%02d->%02d]\n",
src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
return -EINVAL;
}
}
}
list_for_each_entry(src_id_map, &new->gpu_maps, listm) {
list_for_each_entry(dest_id_map, &maps->gpu_maps, listm) {
if (src_id_map->src == dest_id_map->src || src_id_map->dest == dest_id_map->dest) {
pr_err("GPU mapping already exists src [0x%04X -> 0x%04X] new [0x%04X -> 0x%04X]\n",
src_id_map->src, src_id_map->dest, dest_id_map->src, dest_id_map->dest);
return -EINVAL;
}
}
}
list_splice(&new->cpu_maps, &maps->cpu_maps);
list_splice(&new->gpu_maps, &maps->gpu_maps);
return 0;
}
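/*
* maps_push()/maps_pop() bracket one backtracking step: map_device() collects
* candidate mappings in a scratch device_maps, maps_push() splices them into
* the accepted set, and maps_pop() uses the tail_cpu/tail_gpu markers recorded
* by maps_add_*_entry() to cut exactly those entries back out when a mapping
* path turns out to be a dead end.
*/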
struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id)
{
struct tp_iolink *iolink = xzalloc(sizeof(*iolink));
if (!iolink)
return NULL;
iolink->type = type;
/* iolink->node_to will be filled in topology_determine_iolinks */
iolink->node_to_id = node_to_id;
iolink->node_from = node;
list_add_tail(&iolink->listm, &node->iolinks);
return iolink;
}
struct tp_p2pgroup *sys_add_group(struct tp_system *sys, uint32_t type)
{
struct tp_p2pgroup *group;
group = xzalloc(sizeof(*group));
if (!group)
return NULL;
INIT_LIST_HEAD(&group->nodes);
group->type = type;
list_add_tail(&group->listm_system, &sys->xgmi_groups);
if (type == TOPO_IOLINK_TYPE_XGMI)
sys->num_xgmi_groups++;
return group;
}
struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id)
{
struct tp_node *node = NULL;
node = xzalloc(sizeof(*node));
if (!node)
return NULL;
node->id = id;
node->gpu_id = gpu_id;
node->drm_fd = -1;
INIT_LIST_HEAD(&node->iolinks);
list_add_tail(&node->listm_system, &sys->nodes);
sys->num_nodes++;
return node;
}
static bool get_prop(char *line, char *name, uint64_t *value)
{
if (sscanf(line, " %29s %lu", name, value) != 2)
return false;
return true;
}
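/*
* Example: for the properties line "simd_count 256\n", get_prop() stores
* "simd_count" in name (at most 29 characters plus the terminator) and 256
* in *value.
*/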
/* Parse node properties in /sys/class/kfd/kfd/topology/nodes/N/properties */
static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path)
{
FILE *file;
char path[300];
char line[300];
sprintf(path, "%s/properties", dir_path);
file = fopen(path, "r");
if (!file) {
pr_perror("Failed to access %s", path);
return -EFAULT;
}
while (fgets(line, sizeof(line), file)) {
char name[30];
uint64_t value;
memset(name, 0, sizeof(name));
if (!get_prop(line, name, &value))
goto fail;
if (!strcmp(name, "cpu_cores_count"))
dev->cpu_cores_count = (uint32_t)value;
else if (!strcmp(name, "simd_count"))
dev->simd_count = (uint32_t)value;
else if (!strcmp(name, "mem_banks_count"))
dev->mem_banks_count = (uint32_t)value;
else if (!strcmp(name, "caches_count"))
dev->caches_count = (uint32_t)value;
else if (!strcmp(name, "io_links_count"))
dev->io_links_count = (uint32_t)value;
else if (!strcmp(name, "max_waves_per_simd"))
dev->max_waves_per_simd = (uint32_t)value;
else if (!strcmp(name, "lds_size_in_kb"))
dev->lds_size_in_kb = (uint32_t)value;
else if (!strcmp(name, "num_gws"))
dev->num_gws = (uint32_t)value;
else if (!strcmp(name, "wave_front_size"))
dev->wave_front_size = (uint32_t)value;
else if (!strcmp(name, "array_count"))
dev->array_count = (uint32_t)value;
else if (!strcmp(name, "simd_arrays_per_engine"))
dev->simd_arrays_per_engine = (uint32_t)value;
else if (!strcmp(name, "cu_per_simd_array"))
dev->cu_per_simd_array = (uint32_t)value;
else if (!strcmp(name, "simd_per_cu"))
dev->simd_per_cu = (uint32_t)value;
else if (!strcmp(name, "max_slots_scratch_cu"))
dev->max_slots_scratch_cu = (uint32_t)value;
else if (!strcmp(name, "vendor_id"))
dev->vendor_id = (uint32_t)value;
else if (!strcmp(name, "device_id"))
dev->device_id = (uint32_t)value;
else if (!strcmp(name, "domain"))
dev->domain = (uint32_t)value;
else if (!strcmp(name, "drm_render_minor"))
dev->drm_render_minor = (uint32_t)value;
else if (!strcmp(name, "hive_id"))
dev->hive_id = value;
else if (!strcmp(name, "num_sdma_engines"))
dev->num_sdma_engines = (uint32_t)value;
else if (!strcmp(name, "num_sdma_xgmi_engines"))
dev->num_sdma_xgmi_engines = (uint32_t)value;
else if (!strcmp(name, "num_sdma_queues_per_engine"))
dev->num_sdma_queues_per_engine = (uint32_t)value;
else if (!strcmp(name, "num_cp_queues"))
dev->num_cp_queues = (uint32_t)value;
else if (!strcmp(name, "fw_version"))
dev->fw_version = (uint32_t)value;
else if (!strcmp(name, "capability"))
dev->capability = (uint32_t)value;
else if (!strcmp(name, "sdma_fw_version"))
dev->sdma_fw_version = (uint32_t)value;
if (!dev->gpu_id && dev->cpu_cores_count >= 1) {
/* This is a CPU - we do not need to parse the other information */
break;
}
}
fclose(file);
return 0;
fail:
pr_err("Failed to parse line = %s\n", line);
fclose(file);
return -EINVAL;
}
/* Parse node memory properties in /sys/class/kfd/kfd/topology/nodes/N/mem_banks */
static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path)
{
struct dirent *dirent_node;
DIR *d_node;
char path[300];
FILE *file = NULL;
uint32_t heap_type = 0;
uint64_t mem_size = 0;
int ret;
if (!NODE_IS_GPU(node))
return 0;
sprintf(path, "%s/mem_banks", dir_path);
d_node = opendir(path);
if (!d_node) {
pr_perror("Can't open %s", path);
return -EACCES;
}
while ((dirent_node = readdir(d_node)) != NULL) {
char line[300];
char bank_path[1024];
struct stat st;
int id;
heap_type = 0;
mem_size = 0;
/* Only parse numeric directories */
if (sscanf(dirent_node->d_name, "%d", &id) != 1)
continue;
snprintf(bank_path, sizeof(bank_path), "%s/%s", path, dirent_node->d_name);
if (stat(bank_path, &st)) {
pr_err("Cannot to access %s\n", path);
ret = -EACCES;
goto fail;
}
if ((st.st_mode & S_IFMT) == S_IFDIR) {
char properties_path[PATH_MAX];
snprintf(properties_path, sizeof(properties_path), "%s/properties", bank_path);
file = fopen(properties_path, "r");
if (!file) {
pr_perror("Failed to access %s", properties_path);
ret = -EACCES;
goto fail;
}
while (fgets(line, sizeof(line), file)) {
char name[30];
uint64_t value;
memset(name, 0, sizeof(name));
if (!get_prop(line, name, &value)) {
ret = -EINVAL;
goto fail;
}
if (!strcmp(name, "heap_type"))
heap_type = (uint32_t)value;
if (!strcmp(name, "size_in_bytes"))
mem_size = value;
}
fclose(file);
file = NULL;
}
if (heap_type == TOPO_HEAP_TYPE_PUBLIC || heap_type == TOPO_HEAP_TYPE_PRIVATE)
break;
}
if ((heap_type != TOPO_HEAP_TYPE_PUBLIC && heap_type != TOPO_HEAP_TYPE_PRIVATE) || !mem_size) {
pr_err("Failed to determine memory type and size for device in %s\n", dir_path);
ret = -EINVAL;
goto fail;
}
node->vram_public = (heap_type == TOPO_HEAP_TYPE_PUBLIC);
node->vram_size = mem_size;
closedir(d_node);
return 0;
fail:
if (file)
fclose(file);
closedir(d_node);
return ret;
}
/* Parse node iolinks properties in /sys/class/kfd/kfd/topology/nodes/N/io_links */
static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path)
{
struct dirent *dirent_node;
DIR *d_node;
char path[300];
FILE *file = NULL;
int ret = 0;
snprintf(path, sizeof(path), "%s/io_links", dir_path);
d_node = opendir(path);
if (!d_node) {
pr_perror("Can't open %s", path);
return -EACCES;
}
while ((dirent_node = readdir(d_node)) != NULL) {
char line[300];
char iolink_path[1024];
struct stat st;
int id;
uint32_t iolink_type = 0;
uint32_t node_to_id = 0;
/* Only parse numeric directories */
if (sscanf(dirent_node->d_name, "%d", &id) != 1)
continue;
snprintf(iolink_path, sizeof(iolink_path), "%s/%s", path, dirent_node->d_name);
if (stat(iolink_path, &st)) {
pr_err("Cannot to access %s\n", path);
ret = -EACCES;
goto fail;
}
if ((st.st_mode & S_IFMT) == S_IFDIR) {
char properties_path[PATH_MAX];
snprintf(properties_path, sizeof(properties_path), "%s/properties", iolink_path);
file = fopen(properties_path, "r");
if (!file) {
pr_perror("Failed to access %s", properties_path);
ret = -EACCES;
goto fail;
}
while (fgets(line, sizeof(line), file)) {
char name[30];
uint64_t value;
memset(name, 0, sizeof(name));
if (!get_prop(line, name, &value)) {
ret = -EINVAL;
goto fail;
}
if (!strcmp(name, "type"))
iolink_type = (uint32_t)value;
if (!strcmp(name, "node_to"))
node_to_id = (uint32_t)value;
}
fclose(file);
file = NULL;
}
/* We only store the link information for now; once all topology parsing is
* finished, topology_determine_iolinks() will validate the links
*/
if (iolink_type == TOPO_IOLINK_TYPE_PCIE || iolink_type == TOPO_IOLINK_TYPE_XGMI) {
if (!node_add_iolink(node, iolink_type, node_to_id)) {
ret = -ENOMEM;
goto fail;
}
}
}
closedir(d_node);
return 0;
fail:
if (file)
fclose(file);
closedir(d_node);
return ret;
}
/* Parse a node (CPU or GPU) in /sys/class/kfd/kfd/topology/nodes/N */
static int parse_topo_node(struct tp_node *node, const char *dir_path)
{
if (parse_topo_node_properties(node, dir_path)) {
pr_err("Failed to parse node properties\n");
return -EINVAL;
}
if (parse_topo_node_mem_banks(node, dir_path)) {
pr_err("Failed to parse node mem_banks\n");
return -EINVAL;
}
if (parse_topo_node_iolinks(node, dir_path)) {
pr_err("Failed to parse node iolinks\n");
return -EINVAL;
}
return 0;
}
static const char *p2pgroup_to_str(struct tp_p2pgroup *group)
{
static char topology_printstr[200];
struct tp_node *node;
size_t str_len = 0;
topology_printstr[0] = '\0';
str_len += sprintf(&topology_printstr[str_len], "type:%s:", link_type(group->type));
list_for_each_entry(node, &group->nodes, listm_p2pgroup) {
str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id);
}
return topology_printstr;
}
static const char *mapping_list_to_str(struct list_head *node_list)
{
static char topology_printstr[200];
struct tp_node *node;
size_t str_len = 0;
topology_printstr[0] = '\0';
list_for_each_entry(node, node_list, listm_mapping)
str_len += sprintf(&topology_printstr[str_len], "0x%04X ", node->gpu_id);
return topology_printstr;
}
void topology_print(const struct tp_system *sys, const char *message)
{
struct tp_node *node;
struct tp_p2pgroup *xgmi_group;
pr_info("===System Topology=[%12s]==================================\n", message);
list_for_each_entry(node, &sys->nodes, listm_system) {
struct tp_iolink *iolink;
if (!NODE_IS_GPU(node)) {
pr_info("[%d] CPU\n", node->id);
pr_info(" cpu_cores_count:%u\n", node->cpu_cores_count);
} else {
pr_info("[%d] GPU gpu_id:0x%04X\n", node->id, node->gpu_id);
pr_info(" vendor_id:%u device_id:%u\n", node->vendor_id, node->device_id);
pr_info(" vram_public:%c vram_size:%lu\n", node->vram_public ? 'Y' : 'N', node->vram_size);
pr_info(" io_links_count:%u capability:%u\n", node->io_links_count, node->capability);
pr_info(" mem_banks_count:%u caches_count:%d lds_size_in_kb:%u\n", node->mem_banks_count,
node->caches_count, node->lds_size_in_kb);
pr_info(" simd_count:%u max_waves_per_simd:%u\n", node->simd_count,
node->max_waves_per_simd);
pr_info(" num_gws:%u wave_front_size:%u array_count:%u\n", node->num_gws,
node->wave_front_size, node->array_count);
pr_info(" simd_arrays_per_engine:%u simd_per_cu:%u\n", node->simd_arrays_per_engine,
node->simd_per_cu);
pr_info(" max_slots_scratch_cu:%u cu_per_simd_array:%u\n", node->max_slots_scratch_cu,
node->cu_per_simd_array);
pr_info(" num_sdma_engines:%u\n", node->num_sdma_engines);
pr_info(" num_sdma_xgmi_engines:%u num_sdma_queues_per_engine:%u\n",
node->num_sdma_xgmi_engines, node->num_sdma_queues_per_engine);
pr_info(" num_cp_queues:%u fw_version:%u sdma_fw_version:%u\n", node->num_cp_queues,
node->fw_version, node->sdma_fw_version);
}
list_for_each_entry(iolink, &node->iolinks, listm) {
if (!iolink->valid)
continue;
pr_info(" iolink type:%s node-to:%d (0x%04X) node-from:%d bi-dir:%s\n",
link_type(iolink->type), iolink->node_to_id, iolink->node_to->gpu_id,
iolink->node_from->id, iolink->peer ? "Y" : "N");
}
}
pr_info("===Groups==========================================================\n");
list_for_each_entry(xgmi_group, &sys->xgmi_groups, listm_system)
pr_info("%s\n", p2pgroup_to_str(xgmi_group));
pr_info("===================================================================\n");
}
void topology_init(struct tp_system *sys)
{
memset(sys, 0, sizeof(*sys));
INIT_LIST_HEAD(&sys->nodes);
INIT_LIST_HEAD(&sys->xgmi_groups);
}
void topology_free(struct tp_system *sys)
{
while (!list_empty(&sys->nodes)) {
struct tp_node *node = list_first_entry(&sys->nodes, struct tp_node, listm_system);
list_del(&node->listm_system);
while (!list_empty(&node->iolinks)) {
struct tp_iolink *iolink = list_first_entry(&node->iolinks, struct tp_iolink, listm);
list_del(&iolink->listm);
xfree(iolink);
}
xfree(node);
}
while (!list_empty(&sys->xgmi_groups)) {
struct tp_p2pgroup *p2pgroup = list_first_entry(&sys->xgmi_groups, struct tp_p2pgroup, listm_system);
list_del(&p2pgroup->listm_system);
xfree(p2pgroup);
}
/* Update Topology as being freed */
sys->parsed = false;
}
/**
* @brief Validates iolinks and determines XGMI hives in a system topology
*
* On some systems, some GPUs may not be accessible because they are masked by cgroups, but the
* iolinks to these GPUs are still visible. If the peer GPU is not accessible, we consider that link
* invalid.
* In an XGMI hive, each GPU has a bi-directional iolink to every other GPU. So we create an
* XGMI group (hive) and add all the GPUs in that hive to the group when iterating over the first
* GPU in that group.
*
* @param sys system topology
* @return 0 if successful, errno if failed.
*/
int topology_determine_iolinks(struct tp_system *sys)
{
int ret = 0;
struct tp_node *node;
list_for_each_entry(node, &sys->nodes, listm_system) {
struct tp_iolink *iolink;
list_for_each_entry(iolink, &node->iolinks, listm) {
struct tp_p2pgroup *group = NULL;
struct tp_node *peer_node = NULL;
struct tp_iolink *peer_iolink = NULL;
peer_node = sys_get_node_by_node_id(sys, iolink->node_to_id);
if (!peer_node) {
/* node not accessible, usually because it is masked by cgroups */
iolink->valid = false;
continue;
}
iolink->valid = true;
node->num_valid_iolinks++;
iolink->node_to = peer_node;
peer_iolink = get_tp_peer_iolink(peer_node, node, iolink->type);
if (!peer_iolink)
continue; /* This is a one-dir link */
/* We confirmed both sides have same type of iolink */
iolink->peer = peer_iolink;
peer_iolink->peer = iolink;
if (iolink->type == TOPO_IOLINK_TYPE_XGMI) {
group = sys_get_p2pgroup_with_gpu_id(sys, iolink->type, node->gpu_id);
if (!group) {
/* This GPU does not already belong to a group so we create
* a new group
*/
group = sys_add_group(sys, iolink->type);
if (!group) {
ret = -ENOMEM;
goto fail;
}
list_add_tail(&node->listm_p2pgroup, &group->nodes);
group->num_nodes++;
}
/* Also add peer GPU to this group */
if (!p2pgroup_get_node_by_gpu_id(group, peer_node->gpu_id)) {
list_add_tail(&peer_node->listm_p2pgroup, &group->nodes);
group->num_nodes++;
}
}
}
}
fail:
/* In case of failure, caller function will call topology_free which will free groups that
* were successfully allocated
*/
return ret;
}
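/*
* Example: on a node with four GPUs fully connected over XGMI, the first GPU
* visited creates the group and every validated bi-directional XGMI iolink
* pulls its peer in, so the loop above ends with a single xgmi_group holding
* all four GPUs (sys->num_xgmi_groups == 1).
*/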
/**
* @brief Parse system topology
*
* Parse system topology exposed by the drivers in /sys/class/kfd/kfd/topology and fill in the
* system topology structure.
*
* @param sys system topology structure to be filled by this function
* @param message print this message when printing the topology to logs
* @return 0 if successful, errno if failed.
*/
int topology_parse(struct tp_system *sys, const char *message)
{
struct dirent *dirent_system;
DIR *d_system;
char path[300];
int ret;
if (sys->parsed)
return 0;
sys->parsed = true;
INIT_LIST_HEAD(&sys->nodes);
INIT_LIST_HEAD(&sys->xgmi_groups);
d_system = opendir(TOPOLOGY_PATH);
if (!d_system) {
pr_perror("Can't open %s", TOPOLOGY_PATH);
return -EACCES;
}
while ((dirent_system = readdir(d_system)) != NULL) {
struct stat stbuf;
int id, fd;
/* Only parse numeric directories */
if (sscanf(dirent_system->d_name, "%d", &id) != 1)
continue;
sprintf(path, "%s%s", TOPOLOGY_PATH, dirent_system->d_name);
if (stat(path, &stbuf)) {
/* When a cgroup is masking some devices, the path exists but it is not
* accessible; this is not an error
*/
pr_info("Cannot access %s\n", path);
continue;
}
if ((stbuf.st_mode & S_IFMT) == S_IFDIR) {
struct tp_node *node;
int len;
char gpu_id_path[300];
char read_buf[7]; /* Max gpu_id len is 6 chars */
unsigned int gpu_id;
snprintf(gpu_id_path, sizeof(gpu_id_path), "%s/%s/gpu_id", TOPOLOGY_PATH, dirent_system->d_name);
fd = open(gpu_id_path, O_RDONLY);
if (fd < 0) {
pr_perror("Failed to access %s", gpu_id_path);
continue;
}
len = read(fd, read_buf, sizeof(read_buf) - 1);
close(fd);
if (len < 0)
continue;
read_buf[len] = '\0';
if (sscanf(read_buf, "%u", &gpu_id) != 1)
continue;
node = sys_add_node(sys, id, gpu_id);
if (!node) {
ret = -ENOMEM;
goto fail;
}
if (parse_topo_node(node, path)) {
pr_err("Failed to parse node %s\n", path);
ret = -EINVAL;
goto fail;
}
}
}
closedir(d_system);
return 0;
fail:
topology_free(sys);
return ret;
}
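/*
* Typical call sequence (a sketch; error handling elided, and the "Local"
* label is only an example):
*
*	struct tp_system sys;
*
*	topology_init(&sys);
*	if (topology_parse(&sys, "Local"))
*		return -1;
*	if (topology_determine_iolinks(&sys))
*		return -1;
*	topology_print(&sys, "Local");
*	...
*	topology_free(&sys);
*/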
static bool device_properties_match(struct tp_node *src, struct tp_node *dest)
{
if (src->simd_count == dest->simd_count && src->mem_banks_count == dest->mem_banks_count &&
src->io_links_count == dest->io_links_count && src->max_waves_per_simd == dest->max_waves_per_simd &&
src->lds_size_in_kb == dest->lds_size_in_kb && src->wave_front_size == dest->wave_front_size &&
src->array_count == dest->array_count && src->simd_arrays_per_engine == dest->simd_arrays_per_engine &&
src->cu_per_simd_array == dest->cu_per_simd_array && src->simd_per_cu == dest->simd_per_cu &&
src->max_slots_scratch_cu == dest->max_slots_scratch_cu && src->vendor_id == dest->vendor_id &&
src->device_id == dest->device_id && src->num_sdma_engines == dest->num_sdma_engines &&
src->num_sdma_xgmi_engines == dest->num_sdma_xgmi_engines &&
src->num_sdma_queues_per_engine == dest->num_sdma_queues_per_engine &&
src->num_cp_queues == dest->num_cp_queues && src->vram_public == dest->vram_public &&
(!kfd_capability_check || (src->capability == dest->capability)) &&
(!kfd_vram_size_check || (src->vram_size <= dest->vram_size)) &&
(!kfd_num_gws_check || (src->num_gws <= dest->num_gws)) &&
(!kfd_caches_count_check || (src->caches_count <= dest->caches_count)) &&
(!kfd_fw_version_check || (src->fw_version <= dest->fw_version)) &&
(!kfd_sdma_fw_version_check || (src->sdma_fw_version <= dest->sdma_fw_version))) {
return true;
}
return false;
}
/**
* @brief Determines whether iolink dest can be used to replace src
*
* @param src source iolink
* @param dest destination iolink
* @return true if dest can replace src
*/
static bool iolink_match(struct tp_iolink *src, struct tp_iolink *dest)
{
if (!src->valid)
return true;
if (!dest->valid)
return false;
if (NODE_IS_GPU(src->node_to) != NODE_IS_GPU(dest->node_to))
return false;
/* An XGMI link can replace a PCIe link, but a PCIe link cannot replace XGMI */
if (src->type == TOPO_IOLINK_TYPE_XGMI && dest->type == TOPO_IOLINK_TYPE_PCIE)
return false;
/* A bi-directional link can replace a uni-directional link, but not the reverse */
if (src->peer != NULL && dest->peer == NULL)
return false;
return true;
}
/**
* @brief Determines whether src_node can be mapped to dest_node
*
* Node compatibility is determined by:
* 1. Comparing the node properties
* 2. Making sure iolink mappings to CPUs would be compatible with existing iolink mappings in maps
*
* If src_node and dest_node are mappable, then map_device will push the new mapping
* for src_node -> dest_node into new_maps.
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_node source GPU
* @param dest_node destination GPU
* @param maps list of existing device maps
* @param new_maps if nodes are mappable, then GPU and CPU mappings will be added to this list
* @return true if src_node and dest_node are mappable
*/
static bool map_device(struct tp_system *src_sys, struct tp_system *dest_sys, struct tp_node *src_node,
struct tp_node *dest_node, struct device_maps *maps, struct device_maps *new_maps)
{
struct tp_iolink *src_iolink;
pr_debug("Evaluating mapping nodes [0x%04X -> 0x%04X]\n", src_node->gpu_id, dest_node->gpu_id);
/* Compare GPU properties from /sys/class/kfd/kfd/topology/nodes/N/properties */
if (!device_properties_match(src_node, dest_node)) {
pr_debug("[0x%04X -> 0x%04X] Device properties do not match\n", src_node->gpu_id, dest_node->gpu_id);
return false;
}
if (src_node->num_valid_iolinks > dest_node->num_valid_iolinks) {
pr_debug("[0x%04X -> 0x%04X] Mismatch between number of iolinks\n", src_node->gpu_id,
dest_node->gpu_id);
return false;
}
list_for_each_entry(src_iolink, &src_node->iolinks, listm) {
/* Go through list of iolinks to CPU and compare them */
if (!NODE_IS_GPU(src_iolink->node_to)) {
bool matched_iolink = false;
/* This is an iolink to a CPU */
pr_debug("Found link to CPU node:%02d\n", src_iolink->node_to->id);
if (!kfd_numa_check) {
struct tp_iolink *dest_iolink;
list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
if (iolink_match(src_iolink, dest_iolink))
matched_iolink = true;
}
} else {
uint32_t dest_cpu_node_id;
dest_cpu_node_id = maps_get_dest_cpu(maps, src_iolink->node_to->id);
if (dest_cpu_node_id == INVALID_CPU_ID)
dest_cpu_node_id = maps_get_dest_cpu(new_maps, src_iolink->node_to->id);
if (dest_cpu_node_id == INVALID_CPU_ID) {
struct tp_iolink *dest_iolink;
list_for_each_entry(dest_iolink, &dest_node->iolinks, listm) {
if (iolink_match(src_iolink, dest_iolink) &&
!maps_dest_cpu_mapped(maps, dest_iolink->node_to->id) &&
!maps_dest_cpu_mapped(new_maps, dest_iolink->node_to->id)) {
if (!maps_add_cpu_entry(new_maps, src_iolink->node_to->id,
dest_iolink->node_to->id))
/* This is a critical error because
* we are out of memory
*/
return false;
matched_iolink = true;
break;
}
}
} else {
pr_debug("Existing CPU mapping found [%02d-%02d]\n", src_iolink->node_to->id,
dest_cpu_node_id);
/* Confirm that the link to this CPU is same or better */
struct tp_iolink *dest_iolink = node_get_iolink_to_node_id(
dest_node, src_iolink->type, dest_cpu_node_id);
if (dest_iolink && iolink_match(src_iolink, dest_iolink))
matched_iolink = true;
}
}
if (!matched_iolink) {
pr_debug("[0x%04X -> 0x%04X] Mismatch between iolink to CPU\n", src_node->gpu_id,
dest_node->gpu_id);
return false;
}
} else {
/* If GPUs have P2P-PCIe iolinks to this GPU, then at least one CPU will
* also have a P2P-PCIe iolink to this GPU, so it seems that we do not need
* to consider P2P-PCIe iolinks from GPU to GPU for now. Once P2P-PCIe
* iolinks are exposed via p2p_links we may have to add additional code here
* to validate P2P-PCIe links between GPUs.
*/
}
}
pr_debug("[0x%04X -> 0x%04X] Map is possible\n", src_node->gpu_id, dest_node->gpu_id);
if (!maps_add_gpu_entry(new_maps, src_node->gpu_id, dest_node->gpu_id)) {
/* This is a critical error because we are out of memory */
return false;
}
maps_print(new_maps);
return true;
}
/**
* @brief Determines whether list of GPUs in src_nodes are mappable to dest_nodes
*
* This function will pick the first node from src_nodes and iterate through all the nodes in
* dest_nodes and call map_device to determine whether the node is mappable.
* If a node from dest_nodes is mappable to the first node from src_nodes:
* 1. This function will remove the first node from src_nodes and the node from dest_nodes
* 2. Push sub-mappings (new_maps) generated by map_device into existing mappings (maps)
* 3. Recursively check whether remaining nodes in src_nodes and dest_nodes are mappable.
*
* Once src_nodes is empty then we have successfully mapped all the nodes and maps contains a full
* list of GPU mappings.
*
* If there are no nodes in dest_nodes that can be mapped to the first node in src_nodes, then this
* means we cannot build a full mapping list with the current list of mappings. We backtrack by
* popping the newly generated sub-mappings(new_maps) from existing mappings (maps) and add the two
* nodes back to src_nodes and dest_nodes and return false. When this function returns false, the
* caller function will try a different path by trying to map the first node from src_nodes to the
* next node in dest_nodes.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_nodes list of source GPUs that need to be mapped
* @param dest_nodes list of destination GPUs that need to be mapped
* @param maps list of device maps based on current map path
* @return true if all nodes from src_nodes and dest_nodes are mappable
*/
static bool map_devices(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_nodes,
struct list_head *dest_nodes, struct device_maps *maps)
{
struct tp_node *src_node, *dest_node, *dest_node_tmp;
struct device_maps new_maps;
/* Pick the first src node from the list of nodes and look for a dest node that is mappable.
* If we find a mappable destination node, then we add src node and dest node mapping to
* device_maps and recursively try to map the remaining nodes in the list.
* If there are no more src nodes in the list, then we have found a successful combination
* of src to dest nodes that are mappable.
*/
if (list_empty(src_nodes)) {
pr_debug("All nodes mapped successfully\n");
return true;
}
pr_debug("Mapping list src nodes [%s]\n", mapping_list_to_str(src_nodes));
pr_debug("Mapping list dest nodes [%s]\n", mapping_list_to_str(dest_nodes));
src_node = list_first_entry(src_nodes, struct tp_node, listm_mapping);
pr_debug("Looking for match for node 0x%04X\n", src_node->gpu_id);
list_del(&src_node->listm_mapping);
list_for_each_entry_safe(dest_node, dest_node_tmp, dest_nodes, listm_mapping) {
maps_init(&new_maps);
if (map_device(src_sys, dest_sys, src_node, dest_node, maps, &new_maps)) {
pr_debug("Matched destination node 0x%04X\n", dest_node->gpu_id);
/* src node and dest node are mappable, add device_maps generated by
* map_device to list of current valid device_maps, and recursively try to
* map remaining nodes in the list.
*/
list_del(&dest_node->listm_mapping);
if (maps_push(maps, &new_maps))
return false;
if (map_devices(src_sys, dest_sys, src_nodes, dest_nodes, maps)) {
pr_debug("Matched nodes 0x%04X and after\n", dest_node->gpu_id);
return true;
} else {
/* We could not map remaining nodes in the list. Add dest node back
* to list and try to map next dest node in list to current src
* node.
*/
pr_debug("Nodes after [0x%04X -> 0x%04X] did not match, "
"adding list back\n",
src_node->gpu_id, dest_node->gpu_id);
list_add(&dest_node->listm_mapping, dest_nodes);
maps_pop(maps, &new_maps);
}
}
}
pr_debug("Failed to map nodes 0x%04X and after\n", src_node->gpu_id);
/* Either we could not find a mappable dest node for the current src node, or we could not
* build a combination from the remaining nodes in the lists. Add the src node back to the
* list; the caller will try the next possible combination.
*/
list_add(&src_node->listm_mapping, src_nodes);
return false;
}
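/*
* Worked example (sketch): src_nodes = {A, B}, dest_nodes = {X, Y}.
* map_devices() first tries A->X; if the recursive call then fails to map
* B->Y, the A->X sub-mappings are popped, X is put back on dest_nodes and
* A->Y is tried instead (followed by B->X). Only when no ordering works does
* the function return false.
*/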
/**
* @brief Determines whether list of GPUs in src_xgmi_groups are mappable to list of GPUs in
* dest_xgmi_groups
*
* This function will pick the first XGMI group (hive) from src_xgmi_groups and iterate through the
* XGMI groups in dest_xgmi_groups. If the group in dest_xgmi_groups is mappable then this function
* will remove the hives from src_xgmi_groups and dest_xgmi_groups and recursively try to map the
* remaining hives in src_xgmi_groups and dest_xgmi_groups.
*
* If src_xgmi_groups is empty, then this means that we have successfully mapped all the XGMI hives
* and we have a full list of GPU mappings in maps.
*
* If we cannot find a hive inside dest_xgmi_groups that is mappable to the first hive from
* src_xgmi_groups, then this means that this path is not valid and we need to backtrack. We
* backtrack by adding the hives back into src_xgmi_groups and dest_xgmi_groups and returning false.
* The caller function will then try a different path by trying to map the first hive in
* src_xgmi_groups to the next hive in dest_xgmi_groups.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param src_xgmi_groups list of source XGMI hives that need to be mapped
* @param dest_xgmi_groups list of destination XGMI hives that need to be mapped
* @param maps list of device maps based on current map path
* @return true if all hives from src_xgmi_groups and dest_xgmi_groups are mappable
*/
bool match_xgmi_groups(struct tp_system *src_sys, struct tp_system *dest_sys, struct list_head *src_xgmi_groups,
struct list_head *dest_xgmi_groups, struct device_maps *maps)
{
struct tp_p2pgroup *src_group;
struct tp_p2pgroup *dest_group;
struct tp_p2pgroup *dest_group_tmp;
if (list_empty(src_xgmi_groups)) {
pr_debug("All groups matched successfully\n");
return true;
}
/* Pick the first src XGMI group from the list. Then try to match src XGMI group with a
* dest XGMI group. If we have a dest XGMI group that is mappable, then we try to
* recursively map the next src XGMI group in the list, with remaining dest XGMI groups.
* If there are no more src XGMI groups in the list, then this means we have successfully
* mapped all the groups and we have a valid device_maps
*/
src_group = list_first_entry(src_xgmi_groups, struct tp_p2pgroup, listm_system);
pr_debug("Looking for match for group [%s]\n", p2pgroup_to_str(src_group));
list_del(&src_group->listm_system);
list_for_each_entry_safe(dest_group, dest_group_tmp, dest_xgmi_groups, listm_system) {
struct tp_node *node;
LIST_HEAD(src_nodes);
LIST_HEAD(dest_nodes);
if (src_group->num_nodes > dest_group->num_nodes)
continue;
pr_debug("Trying destination group [%s]\n", p2pgroup_to_str(dest_group));
list_for_each_entry(node, &src_group->nodes, listm_p2pgroup)
list_add_tail(&node->listm_mapping, &src_nodes);
list_for_each_entry(node, &dest_group->nodes, listm_p2pgroup)
list_add_tail(&node->listm_mapping, &dest_nodes);
/* map_devices will populate maps if successful */
if (map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
/* All the nodes in current src XGMI group are mappable with nodes in
* current dest XGMI group. Remove the current groups from the lists
* and recursively try to match remaining groups
*/
list_del(&dest_group->listm_system);
pr_debug("Matched destination group [%s]\n", p2pgroup_to_str(dest_group));
if (match_xgmi_groups(src_sys, dest_sys, src_xgmi_groups, dest_xgmi_groups, maps)) {
pr_debug("Matched subgroups of [%s]\n", p2pgroup_to_str(dest_group));
xfree(src_group);
xfree(dest_group);
return true;
} else {
/* We were not able to map the remaining XGMI groups so we add the
* current dest XGMI group back to the list of unmapped groups, and
* try to map current src XGMI group with the next dest XGMI in the
* list of XGMI groups
*/
list_add(&dest_group->listm_system, dest_xgmi_groups);
}
}
}
/* We have not found a mappable dest XGMI group. We discard this combination. If this is
* the first src XGMI group in the list, then it is not possible to match the XGMI groups.
* If this was a recursive call, then the calling instance of the function will try the next
* combination of XGMI groups
*/
pr_debug("Failed to match groups [%s]\n", p2pgroup_to_str(src_group));
list_add_tail(&src_group->listm_system, src_xgmi_groups);
return false;
}
/**
* @brief Builds a list of GPU mappings from source topology to destination topology
*
* The topology on the destination system may not be identical to the topology on the source
* system, e.g. there can be GPUs with different device IDs, and they may be enumerated in a
* different order. This function builds a list of GPU mappings from the source topology to the
* destination topology and stores it in maps.
*
* The function will first validate all the iolinks and determine XGMI groups (hives) by calling the
* topology_determine_iolinks(). It will then try to match the GPUs that belong to XGMI hives and
* after that, match the remaining GPUs.
*
* @param src_sys system topology information on source system
* @param dest_sys system topology information on destination system
* @param maps list of device maps generated by this function
* @return 0 if a full list of GPU mappings was built, errno otherwise.
*/
int set_restore_gpu_maps(struct tp_system *src_sys, struct tp_system *dest_sys, struct device_maps *maps)
{
struct tp_node *node;
int ret = 0;
int src_num_gpus = 0;
int dest_num_gpus = 0;
maps_init(maps);
ret = topology_determine_iolinks(src_sys);
if (ret) {
pr_err("Failed to determine iolinks from source (checkpointed) topology\n");
return ret;
}
topology_print(src_sys, "Source ");
ret = topology_determine_iolinks(dest_sys);
if (ret) {
pr_err("Failed to determine iolinks from destination (local) topology\n");
return ret;
}
topology_print(dest_sys, "Destination");
/* Make sure we have same number of GPUs in src and dest */
list_for_each_entry(node, &src_sys->nodes, listm_system) {
if (NODE_IS_GPU(node))
src_num_gpus++;
}
list_for_each_entry(node, &dest_sys->nodes, listm_system) {
if (NODE_IS_GPU(node))
dest_num_gpus++;
}
if (src_num_gpus != dest_num_gpus) {
pr_err("Number of devices mismatch (checkpointed:%d local:%d)\n", src_num_gpus, dest_num_gpus);
return -EINVAL;
}
if (src_sys->num_xgmi_groups > dest_sys->num_xgmi_groups) {
pr_err("Number of xgmi groups mismatch (checkpointed:%d local:%d)\n", src_sys->num_xgmi_groups,
dest_sys->num_xgmi_groups);
return -EINVAL;
}
/* First try to match the XGMI hives */
if (src_sys->num_xgmi_groups) {
if (!match_xgmi_groups(src_sys, dest_sys, &src_sys->xgmi_groups, &dest_sys->xgmi_groups, maps)) {
pr_err("Failed to match all GPU groups\n");
return -EINVAL;
}
pr_info("Current maps after XGMI groups matched\n");
maps_print(maps);
}
/* We matched all the XGMI hives, now match remaining GPUs */
LIST_HEAD(src_nodes);
LIST_HEAD(dest_nodes);
list_for_each_entry(node, &src_sys->nodes, listm_system) {
if (NODE_IS_GPU(node) && !maps_get_dest_gpu(maps, node->gpu_id))
list_add(&node->listm_mapping, &src_nodes);
}
list_for_each_entry(node, &dest_sys->nodes, listm_system) {
if (NODE_IS_GPU(node) && !maps_dest_gpu_mapped(maps, node->gpu_id))
list_add(&node->listm_mapping, &dest_nodes);
}
if (!map_devices(src_sys, dest_sys, &src_nodes, &dest_nodes, maps)) {
pr_err("Failed to match remaining nodes\n");
return -EINVAL;
}
pr_info("Maps after all nodes matched\n");
maps_print(maps);
return ret;
}
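/*
* Restore-side usage sketch (hypothetical caller; error handling elided):
*
*	struct device_maps maps;
*
*	if (set_restore_gpu_maps(&src_sys, &dest_sys, &maps))
*		return -1;
*	dest_gpu_id = maps_get_dest_gpu(&maps, src_gpu_id);
*	...
*	maps_free(&maps);
*/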
int topology_gpu_count(struct tp_system *sys)
{
struct tp_node *node;
int count = 0;
list_for_each_entry(node, &sys->nodes, listm_system)
if (NODE_IS_GPU(node))
count++;
return count;
}