2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00
criu/plugins/amdgpu/amdgpu_plugin_topology.h
David Yat Sin 72905c9c9b criu/plugin: Remap GPUs on checkpoint restore
The device topology on the restore node can be different from the
topology on the checkpointed node. The GPUs on the restore node may
have different gpu_ids, minor number. or some GPUs may have different
properties as checkpointed node. During restore, the CRIU plugin
determines the target GPUs to avoid restore failures caused by trying
to restore a process on a gpu that is different.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
2022-04-28 17:53:52 -07:00

129 lines
3.7 KiB
C

#ifndef __KFD_PLUGIN_TOPOLOGY_H__
#define __KFD_PLUGIN_TOPOLOGY_H__
#define DRM_FIRST_RENDER_NODE 128
#define DRM_LAST_RENDER_NODE 255
#define TOPO_HEAP_TYPE_PUBLIC 1 /* HSA_HEAPTYPE_FRAME_BUFFER_PUBLIC */
#define TOPO_HEAP_TYPE_PRIVATE 2 /* HSA_HEAPTYPE_FRAME_BUFFER_PRIVATE */
#define TOPO_IOLINK_TYPE_ANY 0 /* HSA_IOLINKTYPE_UNDEFINED */
#define TOPO_IOLINK_TYPE_PCIE 2 /* HSA_IOLINKTYPE_PCIEXPRESS */
#define TOPO_IOLINK_TYPE_XGMI 11 /* HSA_IOLINK_TYPE_XGMI */
#define NODE_IS_GPU(node) ((node)->gpu_id != 0)
#define INVALID_CPU_ID 0xFFFF
/*************************************** Structures ***********************************************/
struct tp_node;
struct tp_iolink {
struct list_head listm;
uint32_t type;
uint32_t node_to_id;
struct tp_node *node_to;
struct tp_node *node_from;
bool valid; /* Set to false if target node is not accessible */
struct tp_iolink *peer; /* If link is bi-directional, peer link */
};
struct tp_node {
uint32_t id;
uint32_t gpu_id;
uint32_t cpu_cores_count;
uint32_t simd_count;
uint32_t mem_banks_count;
uint32_t caches_count;
uint32_t io_links_count;
uint32_t max_waves_per_simd;
uint32_t lds_size_in_kb;
uint32_t num_gws;
uint32_t wave_front_size;
uint32_t array_count;
uint32_t simd_arrays_per_engine;
uint32_t cu_per_simd_array;
uint32_t simd_per_cu;
uint32_t max_slots_scratch_cu;
uint32_t vendor_id;
uint32_t device_id;
uint32_t domain;
uint32_t drm_render_minor;
uint64_t hive_id;
uint32_t num_sdma_engines;
uint32_t num_sdma_xgmi_engines;
uint32_t num_sdma_queues_per_engine;
uint32_t num_cp_queues;
uint32_t fw_version;
uint32_t capability;
uint32_t sdma_fw_version;
bool vram_public;
uint64_t vram_size;
struct list_head listm_system;
struct list_head listm_p2pgroup;
struct list_head listm_mapping; /* Used only during device mapping */
uint32_t num_valid_iolinks;
struct list_head iolinks;
int drm_fd;
};
struct tp_p2pgroup {
uint32_t type;
uint32_t num_nodes;
struct list_head listm_system;
struct list_head nodes;
};
struct tp_system {
bool parsed;
uint32_t num_nodes;
struct list_head nodes;
uint32_t num_xgmi_groups;
struct list_head xgmi_groups;
};
struct id_map {
uint32_t src;
uint32_t dest;
struct list_head listm;
};
struct device_maps {
struct list_head cpu_maps; /* CPUs are mapped using node_id */
struct list_head gpu_maps;
struct list_head *tail_cpu; /* GPUs are mapped using gpu_id */
struct list_head *tail_gpu;
};
/**************************************** Functions ***********************************************/
void topology_init(struct tp_system *sys);
void topology_free(struct tp_system *topology);
int topology_parse(struct tp_system *topology, const char *msg);
int topology_determine_iolinks(struct tp_system *sys);
void topology_print(const struct tp_system *sys, const char *msg);
struct id_map *maps_add_gpu_entry(struct device_maps *maps, const uint32_t src_id, const uint32_t dest_id);
struct tp_node *sys_add_node(struct tp_system *sys, uint32_t id, uint32_t gpu_id);
struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t node_to_id);
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id);
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
int node_get_drm_render_device(struct tp_node *node);
void sys_close_drm_render_devices(struct tp_system *sys);
int set_restore_gpu_maps(struct tp_system *tp_checkpoint, struct tp_system *tp_local, struct device_maps *maps);
uint32_t maps_get_dest_gpu(const struct device_maps *maps, const uint32_t src_id);
void maps_init(struct device_maps *maps);
void maps_free(struct device_maps *maps);
#endif /* __KFD_PLUGIN_TOPOLOGY_H__ */