2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 18:07:57 +00:00
criu/plugins/amdgpu/amdgpu_plugin_topology.c
David Yat Sin 6e99fea2fa criu/plugin: Implement system topology parsing
Parse local system topology in /sys/class/kfd/kfd/topology/nodes/ and
store properties for each gpu in the CRIU image files. The gpu
properties can then be used later during restore to make sure the process is
restored on GPUs with similar properties.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
2022-04-28 17:53:52 -07:00

720 lines
19 KiB
C

#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <linux/limits.h>
#include <dirent.h>
#include "common/list.h"
#include "xmalloc.h"
#include "kfd_ioctl.h"
#include "amdgpu_plugin_topology.h"
/* sysfs directory scanned by topology_parse(); its numeric sub-directories are parsed as nodes */
#define TOPOLOGY_PATH "/sys/class/kfd/kfd/topology/nodes/"
/*
 * NOTE(review): feature-test macros only influence headers included after them,
 * and this definition follows every #include in this file - confirm whether it
 * is still needed here or should move above the first #include.
 */
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
/*
 * Verbose logging helper: forwards to pr_debug() in DEBUG builds and expands
 * to a no-op otherwise.
 *
 * The no-op form is written as do { } while (0) so that the macro followed by
 * a semicolon is exactly one statement; a bare { } block would break
 * "if (cond) plugin_log_msg(...); else ...".
 */
#ifdef DEBUG
#define plugin_log_msg(fmt, ...) pr_debug(fmt, ##__VA_ARGS__)
#else
#define plugin_log_msg(fmt, ...) \
	do {                     \
	} while (0)
#endif
/* Translate a TOPO_IOLINK_TYPE_* value into the short label used in log output. */
static const char *link_type(uint32_t type)
{
	if (type == TOPO_IOLINK_TYPE_PCIE)
		return "PCIe";

	if (type == TOPO_IOLINK_TYPE_XGMI)
		return "XGMI";

	/* Every other link type is reported with a generic label */
	return "Unsupported";
}
/* Return the member of @group whose gpu_id equals @gpu_id, or NULL if there is none. */
static struct tp_node *p2pgroup_get_node_by_gpu_id(const struct tp_p2pgroup *group, const uint32_t gpu_id)
{
	struct tp_node *member;

	list_for_each_entry(member, &group->nodes, listm_p2pgroup) {
		if (member->gpu_id != gpu_id)
			continue;

		return member;
	}

	return NULL;
}
/* Return the node of @sys whose drm_render_minor equals @drm_render_minor, or NULL. */
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor)
{
	struct tp_node *cur;
	struct tp_node *match = NULL;

	list_for_each_entry(cur, &sys->nodes, listm_system) {
		if (cur->drm_render_minor == drm_render_minor) {
			match = cur;
			break;
		}
	}

	return match;
}
/* Return the node of @sys whose gpu_id equals @gpu_id, or NULL when no node matches. */
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id)
{
	struct tp_node *cur;

	list_for_each_entry(cur, &sys->nodes, listm_system) {
		if (cur->gpu_id != gpu_id)
			continue;

		return cur;
	}

	return NULL;
}
/* Return the node of @sys whose topology id equals @node_id, or NULL when no node matches. */
static struct tp_node *sys_get_node_by_node_id(const struct tp_system *sys, const uint32_t node_id)
{
	struct tp_node *cur;
	struct tp_node *match = NULL;

	list_for_each_entry(cur, &sys->nodes, listm_system) {
		if (cur->id == node_id) {
			match = cur;
			break;
		}
	}

	return match;
}
/*
 * Find the group of @sys that has the given link @type and already contains a
 * node with @gpu_id. Returns NULL when no such group exists.
 */
static struct tp_p2pgroup *sys_get_p2pgroup_with_gpu_id(const struct tp_system *sys, const int type,
							 const uint32_t gpu_id)
{
	struct tp_p2pgroup *grp;

	list_for_each_entry(grp, &sys->xgmi_groups, listm_system) {
		/* Only groups of the requested type are searched for the GPU */
		if (grp->type == type && p2pgroup_get_node_by_gpu_id(grp, gpu_id))
			return grp;
	}

	return NULL;
}
/*
 * Search the iolinks of @from_node for one of the given @type that targets
 * @to_node (matched by node id). Returns NULL when there is no such link.
 */
static struct tp_iolink *get_tp_peer_iolink(const struct tp_node *from_node, const struct tp_node *to_node,
					    const uint8_t type)
{
	struct tp_iolink *link;

	list_for_each_entry(link, &from_node->iolinks, listm) {
		if (link->type != type)
			continue;
		if (link->node_to_id != to_node->id)
			continue;

		return link;
	}

	return NULL;
}
/*
 * Allocate a zeroed iolink of @type from @node towards the node with id
 * @node_to_id and append it to @node's iolink list.
 *
 * Returns the new iolink, or NULL if the allocation failed.
 */
struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id)
{
	struct tp_iolink *new_link;

	new_link = xzalloc(sizeof(*new_link));
	if (new_link) {
		new_link->node_from = node;
		new_link->type = type;
		/* Only the id is stored here; node_to is set by topology_determine_iolinks */
		new_link->node_to_id = node_to_id;

		list_add_tail(&new_link->listm, &node->iolinks);
	}

	return new_link;
}
/*
 * Allocate an empty group of link @type and append it to @sys->xgmi_groups.
 * The XGMI group counter of @sys is bumped only for TOPO_IOLINK_TYPE_XGMI.
 *
 * Returns the new group, or NULL if the allocation failed.
 */
struct tp_p2pgroup *sys_add_group(struct tp_system *sys, uint32_t type)
{
	struct tp_p2pgroup *new_group = xzalloc(sizeof(*new_group));

	if (!new_group)
		return NULL;

	new_group->type = type;
	INIT_LIST_HEAD(&new_group->nodes);

	if (type == TOPO_IOLINK_TYPE_XGMI)
		sys->num_xgmi_groups++;
	list_add_tail(&new_group->listm_system, &sys->xgmi_groups);

	return new_group;
}
/*
 * Allocate a zeroed node with topology id @id and @gpu_id, append it to
 * @sys->nodes and increment @sys->num_nodes.
 *
 * Returns the new node, or NULL if the allocation failed.
 */
struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id)
{
	struct tp_node *new_node = xzalloc(sizeof(*new_node));

	if (new_node) {
		new_node->gpu_id = gpu_id;
		new_node->id = id;
		/* presumably marks "no DRM fd opened yet" - confirm against users of drm_fd */
		new_node->drm_fd = -1;
		INIT_LIST_HEAD(&new_node->iolinks);

		sys->num_nodes++;
		list_add_tail(&new_node->listm_system, &sys->nodes);
	}

	return new_node;
}
/*
 * Split one "<name> <value>" line into its two fields.
 *
 * @line:  NUL-terminated input line
 * @name:  output buffer for the property name; at most 29 characters plus the
 *         terminating NUL are written, so it must hold at least 30 bytes
 * @value: receives the unsigned decimal value
 *
 * Returns true when both fields were parsed, false otherwise (in which case
 * *value is left untouched).
 */
static bool get_prop(char *line, char *name, uint64_t *value)
{
	/*
	 * Scan through unsigned long long: "%lu" expects unsigned long *, which
	 * is not the same type as uint64_t * on every platform.
	 */
	unsigned long long parsed;

	if (sscanf(line, " %29s %llu", name, &parsed) != 2)
		return false;

	*value = (uint64_t)parsed;
	return true;
}
/*
 * Parse node properties in /sys/class/kfd/kfd/topology/nodes/N/properties
 *
 * @dev:      node whose fields are filled from the recognised properties
 * @dir_path: directory of the node; "/properties" is appended to it
 *
 * Property names that are not recognised are ignored. Parsing stops early
 * once the node is identified as a CPU (gpu_id == 0 and at least one CPU core).
 *
 * Returns 0 on success, -ENAMETOOLONG if the resulting path does not fit the
 * local buffer, -EFAULT if the file cannot be opened, -EINVAL if a line is
 * not of the form "<name> <value>".
 */
static int parse_topo_node_properties(struct tp_node *dev, const char *dir_path)
{
	FILE *file;
	char path[300];
	char line[300];
	int n;

	/* Bounded formatting: dir_path is caller supplied and may not fit */
	n = snprintf(path, sizeof(path), "%s/properties", dir_path);
	if (n < 0 || (size_t)n >= sizeof(path)) {
		pr_err("Path too long: %s/properties\n", dir_path);
		return -ENAMETOOLONG;
	}

	file = fopen(path, "r");
	if (!file) {
		pr_perror("Failed to access %s", path);
		return -EFAULT;
	}

	while (fgets(line, sizeof(line), file)) {
		char name[30];
		uint64_t value;

		memset(name, 0, sizeof(name));
		if (!get_prop(line, name, &value))
			goto fail;

		if (!strcmp(name, "cpu_cores_count"))
			dev->cpu_cores_count = (uint32_t)value;
		else if (!strcmp(name, "simd_count"))
			dev->simd_count = (uint32_t)value;
		else if (!strcmp(name, "mem_banks_count"))
			dev->mem_banks_count = (uint32_t)value;
		else if (!strcmp(name, "caches_count"))
			dev->caches_count = (uint32_t)value;
		else if (!strcmp(name, "io_links_count"))
			dev->io_links_count = (uint32_t)value;
		else if (!strcmp(name, "max_waves_per_simd"))
			dev->max_waves_per_simd = (uint32_t)value;
		else if (!strcmp(name, "lds_size_in_kb"))
			dev->lds_size_in_kb = (uint32_t)value;
		else if (!strcmp(name, "num_gws"))
			dev->num_gws = (uint32_t)value;
		else if (!strcmp(name, "wave_front_size"))
			dev->wave_front_size = (uint32_t)value;
		else if (!strcmp(name, "array_count"))
			dev->array_count = (uint32_t)value;
		else if (!strcmp(name, "simd_arrays_per_engine"))
			dev->simd_arrays_per_engine = (uint32_t)value;
		else if (!strcmp(name, "cu_per_simd_array"))
			dev->cu_per_simd_array = (uint32_t)value;
		else if (!strcmp(name, "simd_per_cu"))
			dev->simd_per_cu = (uint32_t)value;
		else if (!strcmp(name, "max_slots_scratch_cu"))
			dev->max_slots_scratch_cu = (uint32_t)value;
		else if (!strcmp(name, "vendor_id"))
			dev->vendor_id = (uint32_t)value;
		else if (!strcmp(name, "device_id"))
			dev->device_id = (uint32_t)value;
		else if (!strcmp(name, "domain"))
			dev->domain = (uint32_t)value;
		else if (!strcmp(name, "drm_render_minor"))
			dev->drm_render_minor = (uint32_t)value;
		else if (!strcmp(name, "hive_id"))
			dev->hive_id = value; /* kept at full 64-bit width */
		else if (!strcmp(name, "num_sdma_engines"))
			dev->num_sdma_engines = (uint32_t)value;
		else if (!strcmp(name, "num_sdma_xgmi_engines"))
			dev->num_sdma_xgmi_engines = (uint32_t)value;
		else if (!strcmp(name, "num_sdma_queues_per_engine"))
			dev->num_sdma_queues_per_engine = (uint32_t)value;
		else if (!strcmp(name, "num_cp_queues"))
			dev->num_cp_queues = (uint32_t)value;
		else if (!strcmp(name, "fw_version"))
			dev->fw_version = (uint32_t)value;
		else if (!strcmp(name, "capability"))
			dev->capability = (uint32_t)value;
		else if (!strcmp(name, "sdma_fw_version"))
			dev->sdma_fw_version = (uint32_t)value;

		if (!dev->gpu_id && dev->cpu_cores_count >= 1) {
			/* This is a CPU - we do not need to parse the other information */
			break;
		}
	}

	fclose(file);
	return 0;

fail:
	pr_err("Failed to parse line = %s\n", line);
	fclose(file);
	return -EINVAL;
}
/*
 * Parse node memory properties in /sys/class/kfd/kfd/topology/nodes/N/mem_banks
 *
 * @node:     node to update; nodes for which NODE_IS_GPU() is false are skipped
 * @dir_path: directory of the node; "/mem_banks" is appended to it
 *
 * Walks the numeric entries of the mem_banks directory and stops at the first
 * bank whose heap_type is TOPO_HEAP_TYPE_PUBLIC or TOPO_HEAP_TYPE_PRIVATE. That
 * bank's heap type and size_in_bytes are stored in node->vram_public and
 * node->vram_size.
 *
 * Returns 0 on success, -ENAMETOOLONG if the directory path does not fit the
 * local buffer, -EACCES if a directory or file cannot be accessed, -EINVAL on
 * a malformed property line or when no suitable bank with a non-zero size is
 * found.
 */
static int parse_topo_node_mem_banks(struct tp_node *node, const char *dir_path)
{
	struct dirent *dirent_node;
	DIR *d_node;
	char path[300];
	FILE *file = NULL;
	uint32_t heap_type = 0;
	uint64_t mem_size = 0;
	int ret = 0;
	int n;

	if (!NODE_IS_GPU(node))
		return 0;

	/* Bounded formatting: dir_path is caller supplied and may not fit */
	n = snprintf(path, sizeof(path), "%s/mem_banks", dir_path);
	if (n < 0 || (size_t)n >= sizeof(path)) {
		pr_err("Path too long: %s/mem_banks\n", dir_path);
		return -ENAMETOOLONG;
	}

	d_node = opendir(path);
	if (!d_node) {
		pr_perror("Can't open %s", path);
		return -EACCES;
	}

	while ((dirent_node = readdir(d_node)) != NULL) {
		char line[300];
		char bank_path[1024];
		struct stat st;
		int id;

		/* Each bank starts from a clean state so values never leak between banks */
		heap_type = 0;
		mem_size = 0;

		/* Only parse numeric directories */
		if (sscanf(dirent_node->d_name, "%d", &id) != 1)
			continue;

		snprintf(bank_path, sizeof(bank_path), "%s/%s", path, dirent_node->d_name);
		if (stat(bank_path, &st)) {
			/* Report the entry that actually failed, not its parent directory */
			pr_err("Cannot to access %s\n", bank_path);
			ret = -EACCES;
			goto fail;
		}

		if ((st.st_mode & S_IFMT) == S_IFDIR) {
			char properties_path[PATH_MAX];

			snprintf(properties_path, sizeof(properties_path), "%s/properties", bank_path);
			file = fopen(properties_path, "r");
			if (!file) {
				pr_perror("Failed to access %s", properties_path);
				ret = -EACCES;
				goto fail;
			}

			while (fgets(line, sizeof(line), file)) {
				char name[30];
				uint64_t value;

				memset(name, 0, sizeof(name));
				if (!get_prop(line, name, &value)) {
					ret = -EINVAL;
					goto fail;
				}

				if (!strcmp(name, "heap_type"))
					heap_type = (uint32_t)value;
				if (!strcmp(name, "size_in_bytes"))
					mem_size = value;
			}

			fclose(file);
			file = NULL;
		}

		/* The first public or private heap found is the one recorded */
		if (heap_type == TOPO_HEAP_TYPE_PUBLIC || heap_type == TOPO_HEAP_TYPE_PRIVATE)
			break;
	}

	if ((heap_type != TOPO_HEAP_TYPE_PUBLIC && heap_type != TOPO_HEAP_TYPE_PRIVATE) || !mem_size) {
		pr_err("Failed to determine memory type and size for device in %s\n", dir_path);
		ret = -EINVAL;
		goto fail;
	}

	node->vram_public = (heap_type == TOPO_HEAP_TYPE_PUBLIC);
	node->vram_size = mem_size;
	closedir(d_node);
	return 0;

fail:
	if (file)
		fclose(file);
	closedir(d_node);
	return ret;
}
/*
 * Parse node iolinks properties in /sys/class/kfd/kfd/topology/nodes/N/io_links
 *
 * @node:     node that receives one tp_iolink per PCIe or XGMI link found
 * @dir_path: directory of the node; "/io_links" is appended to it
 *
 * Links of any other type are ignored. Only the type and the target node id
 * are recorded here.
 *
 * Returns 0 on success, -ENAMETOOLONG if the directory path does not fit the
 * local buffer, -EACCES if a directory or file cannot be accessed, -EINVAL on
 * a malformed property line, -ENOMEM if a link cannot be allocated.
 */
static int parse_topo_node_iolinks(struct tp_node *node, const char *dir_path)
{
	struct dirent *dirent_node;
	DIR *d_node;
	char path[300];
	FILE *file = NULL;
	int ret = 0;
	int n;

	/* A silently truncated path would make opendir() look at the wrong place */
	n = snprintf(path, sizeof(path), "%s/io_links", dir_path);
	if (n < 0 || (size_t)n >= sizeof(path)) {
		pr_err("Path too long: %s/io_links\n", dir_path);
		return -ENAMETOOLONG;
	}

	d_node = opendir(path);
	if (!d_node) {
		pr_perror("Can't open %s", path);
		return -EACCES;
	}

	while ((dirent_node = readdir(d_node)) != NULL) {
		char line[300];
		char iolink_path[1024];
		struct stat st;
		int id;
		uint32_t iolink_type = 0;
		uint32_t node_to_id = 0;

		/* Only parse numeric directories */
		if (sscanf(dirent_node->d_name, "%d", &id) != 1)
			continue;

		snprintf(iolink_path, sizeof(iolink_path), "%s/%s", path, dirent_node->d_name);
		if (stat(iolink_path, &st)) {
			/* Report the entry that actually failed, not its parent directory */
			pr_err("Cannot to access %s\n", iolink_path);
			ret = -EACCES;
			goto fail;
		}

		if ((st.st_mode & S_IFMT) == S_IFDIR) {
			char properties_path[PATH_MAX];

			snprintf(properties_path, sizeof(properties_path), "%s/properties", iolink_path);
			file = fopen(properties_path, "r");
			if (!file) {
				pr_perror("Failed to access %s", properties_path);
				ret = -EACCES;
				goto fail;
			}

			while (fgets(line, sizeof(line), file)) {
				char name[30];
				uint64_t value;

				memset(name, 0, sizeof(name));
				if (!get_prop(line, name, &value)) {
					ret = -EINVAL;
					goto fail;
				}

				if (!strcmp(name, "type"))
					iolink_type = (uint32_t)value;
				if (!strcmp(name, "node_to"))
					node_to_id = (uint32_t)value;
			}

			fclose(file);
			file = NULL;
		}

		/* We only store the link information for now, then once all topology parsing is
		 * finished we will confirm iolinks
		 */
		if (iolink_type == TOPO_IOLINK_TYPE_PCIE || iolink_type == TOPO_IOLINK_TYPE_XGMI) {
			if (!node_add_iolink(node, iolink_type, node_to_id)) {
				ret = -ENOMEM;
				goto fail;
			}
		}
	}

	closedir(d_node);
	return 0;

fail:
	if (file)
		fclose(file);
	closedir(d_node);
	return ret;
}
/*
 * Parse a node (CPU or GPU) in /sys/class/kfd/kfd/topology/nodes/N
 *
 * Runs the properties, mem_banks and iolinks parsers in that order and stops at
 * the first one that fails. Returns 0 on success; every failure is reported as
 * -EINVAL regardless of the code returned by the failing parser.
 */
static int parse_topo_node(struct tp_node *node, const char *dir_path)
{
	int err;

	err = parse_topo_node_properties(node, dir_path);
	if (err) {
		pr_err("Failed to parse node properties\n");
		return -EINVAL;
	}

	err = parse_topo_node_mem_banks(node, dir_path);
	if (err) {
		pr_err("Failed to parse node mem_banks\n");
		return -EINVAL;
	}

	err = parse_topo_node_iolinks(node, dir_path);
	if (err) {
		pr_err("Failed to parse node iolinks\n");
		return -EINVAL;
	}

	return 0;
}
/*
 * Render @group as "type:<link type>:" followed by the gpu_id of each member
 * in hex.
 *
 * The result lives in a static buffer, so it is overwritten by the next call.
 * Every write is bounded by the buffer size; members that no longer fit are
 * left out rather than written past the end of the buffer.
 */
static const char *p2pgroup_to_str(struct tp_p2pgroup *group)
{
	static char topology_printstr[200];
	struct tp_node *node;
	size_t str_len;
	int n;

	topology_printstr[0] = '\0';

	n = snprintf(topology_printstr, sizeof(topology_printstr), "type:%s:", link_type(group->type));
	if (n < 0 || (size_t)n >= sizeof(topology_printstr))
		return topology_printstr;
	str_len = (size_t)n;

	list_for_each_entry(node, &group->nodes, listm_p2pgroup) {
		size_t avail = sizeof(topology_printstr) - str_len;

		n = snprintf(&topology_printstr[str_len], avail, "0x%04X ", node->gpu_id);
		if (n < 0 || (size_t)n >= avail) {
			/* Out of room: drop the partially written entry and stop */
			topology_printstr[str_len] = '\0';
			break;
		}
		str_len += (size_t)n;
	}

	return topology_printstr;
}
/*
 * Dump the topology held in @sys to the info log.
 *
 * @message is printed in the banner line. Each node is listed with its
 * properties (a short form for CPUs, the full set for GPUs) followed by its
 * valid iolinks; the groups of @sys are listed last.
 */
void topology_print(const struct tp_system *sys, const char *message)
{
	struct tp_node *cur;
	struct tp_p2pgroup *grp;

	pr_info("===System Topology=[%12s]==================================\n", message);

	list_for_each_entry(cur, &sys->nodes, listm_system) {
		struct tp_iolink *link;

		if (NODE_IS_GPU(cur)) {
			pr_info("[%d] GPU gpu_id:0x%04X\n", cur->id, cur->gpu_id);
			pr_info(" vendor_id:%u device_id:%u\n", cur->vendor_id, cur->device_id);
			pr_info(" vram_public:%c vram_size:%lu\n", cur->vram_public ? 'Y' : 'N', cur->vram_size);
			pr_info(" io_links_count:%u capability:%u\n", cur->io_links_count, cur->capability);
			pr_info(" mem_banks_count:%u caches_count:%d lds_size_in_kb:%u\n", cur->mem_banks_count,
				cur->caches_count, cur->lds_size_in_kb);
			pr_info(" simd_count:%u max_waves_per_simd:%u\n", cur->simd_count,
				cur->max_waves_per_simd);
			pr_info(" num_gws:%u wave_front_size:%u array_count:%u\n", cur->num_gws,
				cur->wave_front_size, cur->array_count);
			pr_info(" simd_arrays_per_engine:%u simd_per_cu:%u\n", cur->simd_arrays_per_engine,
				cur->simd_per_cu);
			pr_info(" max_slots_scratch_cu:%u cu_per_simd_array:%u\n", cur->max_slots_scratch_cu,
				cur->cu_per_simd_array);
			pr_info(" num_sdma_engines:%u\n", cur->num_sdma_engines);
			pr_info(" num_sdma_xgmi_engines:%u num_sdma_queues_per_engine:%u\n",
				cur->num_sdma_xgmi_engines, cur->num_sdma_queues_per_engine);
			pr_info(" num_cp_queues:%u fw_version:%u sdma_fw_version:%u\n", cur->num_cp_queues,
				cur->fw_version, cur->sdma_fw_version);
		} else {
			pr_info("[%d] CPU\n", cur->id);
			pr_info(" cpu_cores_count:%u\n", cur->cpu_cores_count);
		}

		list_for_each_entry(link, &cur->iolinks, listm) {
			/* Links that are not marked valid are not printed */
			if (link->valid) {
				pr_info(" iolink type:%s node-to:%d (0x%04X) node-from:%d bi-dir:%s\n",
					link_type(link->type), link->node_to_id, link->node_to->gpu_id,
					link->node_from->id, link->peer ? "Y" : "N");
			}
		}
	}

	pr_info("===Groups==========================================================\n");
	list_for_each_entry(grp, &sys->xgmi_groups, listm_system) {
		pr_info("%s\n", p2pgroup_to_str(grp));
	}
	pr_info("===================================================================\n");
}
/* Reset @sys to an empty topology: all fields zeroed, both lists empty. */
void topology_init(struct tp_system *sys)
{
	/* Zero every field first, then turn the two list heads into valid empty lists */
	memset(sys, 0, sizeof(*sys));

	INIT_LIST_HEAD(&sys->xgmi_groups);
	INIT_LIST_HEAD(&sys->nodes);
}
/*
 * Release every node (together with its iolinks) and every group owned by
 * @sys. The structure pointed to by @sys itself is not freed, and fields other
 * than the two lists are not modified.
 */
void topology_free(struct tp_system *sys)
{
	struct tp_node *cur_node;
	struct tp_iolink *cur_link;
	struct tp_p2pgroup *cur_group;

	while (!list_empty(&sys->nodes)) {
		cur_node = list_first_entry(&sys->nodes, struct tp_node, listm_system);

		/* Drain the node's links before releasing the node itself */
		while (!list_empty(&cur_node->iolinks)) {
			cur_link = list_first_entry(&cur_node->iolinks, struct tp_iolink, listm);
			list_del(&cur_link->listm);
			xfree(cur_link);
		}

		list_del(&cur_node->listm_system);
		xfree(cur_node);
	}

	while (!list_empty(&sys->xgmi_groups)) {
		cur_group = list_first_entry(&sys->xgmi_groups, struct tp_p2pgroup, listm_system);
		list_del(&cur_group->listm_system);
		xfree(cur_group);
	}
}
/**
 * @brief Validates iolinks and determines XGMI hives in a system topology
 *
 * On some systems, some GPUs may not be accessible because they are masked by cgroups, but the
 * iolinks to these GPUs are still visible. A link whose target node is not present in the
 * topology is marked invalid.
 * For a valid link, the matching link of the same type in the opposite direction (if any) is
 * recorded as its peer. For XGMI links that have such a peer, both end points are placed in the
 * same group (hive); the group is created when the first of its GPUs is encountered.
 *
 * @param sys system topology
 * @return 0 if successful, -ENOMEM if a group could not be allocated. Groups allocated before the
 *         failure stay attached to @sys.
 */
int topology_determine_iolinks(struct tp_system *sys)
{
	struct tp_node *src;

	list_for_each_entry(src, &sys->nodes, listm_system) {
		struct tp_iolink *link;

		list_for_each_entry(link, &src->iolinks, listm) {
			struct tp_node *dst;
			struct tp_iolink *reverse;
			struct tp_p2pgroup *hive;

			dst = sys_get_node_by_node_id(sys, link->node_to_id);
			if (!dst) {
				/* node not accessible, usually because it is masked by cgroups */
				link->valid = false;
				continue;
			}

			link->valid = true;
			src->num_valid_iolinks++;
			link->node_to = dst;

			reverse = get_tp_peer_iolink(dst, src, link->type);
			if (!reverse) {
				/* This is a one-dir link */
				continue;
			}

			/* Both sides have the same type of iolink: cross-reference them */
			link->peer = reverse;
			reverse->peer = link;

			if (link->type != TOPO_IOLINK_TYPE_XGMI)
				continue;

			hive = sys_get_p2pgroup_with_gpu_id(sys, link->type, src->gpu_id);
			if (!hive) {
				/* This GPU does not belong to a group yet, so start a new one with it */
				hive = sys_add_group(sys, link->type);
				if (!hive)
					return -ENOMEM;

				list_add_tail(&src->listm_p2pgroup, &hive->nodes);
			}

			/* Also add peer GPU to this group, unless it is already a member */
			if (!p2pgroup_get_node_by_gpu_id(hive, dst->gpu_id))
				list_add_tail(&dst->listm_p2pgroup, &hive->nodes);
		}
	}

	return 0;
}
/**
 * @brief Parse system topology
 *
 * Parse system topology exposed by the drivers in /sys/class/kfd/kfd/topology and fill in the
 * system topology structure. Each numeric sub-directory that can be accessed and has a readable
 * gpu_id file becomes one node. The call is a no-op returning 0 when sys->parsed is already set.
 *
 * @param sys system topology structure to be filled by this function
 * @param message label for the topology; not referenced by this function
 * @return 0 if successful, -EACCES if the topology directory cannot be opened, -ENOMEM if a node
 *         cannot be allocated, -EINVAL if a node fails to parse. On -ENOMEM/-EINVAL everything
 *         added so far is released with topology_free().
 */
int topology_parse(struct tp_system *sys, const char *message)
{
	struct dirent *dirent_system;
	DIR *d_system;
	char path[300];
	int ret = 0;

	if (sys->parsed)
		return 0;

	/*
	 * NOTE(review): parsed stays true when this function fails, so a later call returns 0
	 * with an empty topology - confirm whether callers rely on that.
	 */
	sys->parsed = true;
	INIT_LIST_HEAD(&sys->nodes);
	INIT_LIST_HEAD(&sys->xgmi_groups);

	d_system = opendir(TOPOLOGY_PATH);
	if (!d_system) {
		pr_perror("Can't open %s", TOPOLOGY_PATH);
		return -EACCES;
	}

	while ((dirent_system = readdir(d_system)) != NULL) {
		struct stat stbuf;
		struct tp_node *node;
		char gpu_id_path[300];
		char read_buf[7]; /* Max gpu_id len is 6 chars */
		unsigned int gpu_id;
		int id, fd, len;

		/* Only parse numeric directories */
		if (sscanf(dirent_system->d_name, "%d", &id) != 1)
			continue;

		snprintf(path, sizeof(path), "%s%s", TOPOLOGY_PATH, dirent_system->d_name);
		if (stat(path, &stbuf)) {
			/* When cgroup is masking some devices, the path exists, but it is not
			 * accessible, this is not an error
			 */
			pr_info("Cannot to access %s\n", path);
			continue;
		}

		if ((stbuf.st_mode & S_IFMT) != S_IFDIR)
			continue;

		snprintf(gpu_id_path, sizeof(gpu_id_path), "%s/%s/gpu_id", TOPOLOGY_PATH, dirent_system->d_name);
		fd = open(gpu_id_path, O_RDONLY);
		if (fd < 0) {
			pr_perror("Failed to access %s", gpu_id_path);
			continue;
		}

		len = read(fd, read_buf, sizeof(read_buf) - 1);
		close(fd);
		if (len < 0)
			continue;

		read_buf[len] = '\0';
		/* %u matches the unsigned destination */
		if (sscanf(read_buf, "%u", &gpu_id) != 1)
			continue;

		node = sys_add_node(sys, id, gpu_id);
		if (!node) {
			ret = -ENOMEM;
			break;
		}

		if (parse_topo_node(node, path)) {
			pr_err("Failed to parse node %s\n", path);
			ret = -EINVAL;
			break;
		}
	}

	/* The directory handle is released on the success and the failure path alike */
	closedir(d_system);
	if (ret)
		topology_free(sys);

	return ret;
}