2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

amdgpu_plugin: Refactor code used to implement Checkpoint

Refactor code used to Checkpoint DRM devices. Code is moved
into amdgpu_plugin_drm.c file which hosts various methods to
checkpoint and restore a workload.

Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
This commit is contained in:
Ramesh Errabolu 2023-11-10 13:02:49 -06:00 committed by Andrei Vagin
parent 733ef96315
commit 0d5923c95e
3 changed files with 74 additions and 34 deletions

View File

@ -49,6 +49,13 @@ struct vma_metadata {
/************************************ Global Variables ********************************************/
/**
* FD of the KFD device used to checkpoint. In a multi-process
* tree, checkpointing proceeds from parent to child and so on,
* so the saved FD will not be overwritten.
*/
static int kfd_checkpoint_fd;
static LIST_HEAD(update_vma_info_list);
size_t kfd_max_buffer_size;
@ -990,6 +997,10 @@ static int unpause_process(int fd)
goto exit;
}
// Reset the KFD FD
kfd_checkpoint_fd = -1;
sys_close_drm_render_devices(&src_topology);
exit:
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id)
return -1;
}
/* Initialize number of device files that will be checkpointed */
init_gpu_count(&src_topology);
/* Check whether this plugin was called for kfd or render nodes */
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
/* This is RenderD dumper plugin, for now just save renderD
* minor number to be used during restore. In later phases this
* needs to save more data for video decode etc.
*/
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
struct tp_node *tp_node;
pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id);
tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
if (!tp_node) {
pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev));
return -ENODEV;
}
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
if (!rd.gpu_id)
return -ENODEV;
len = criu_render_node__get_packed_size(&rd);
buf = xmalloc(len);
if (!buf)
return -ENOMEM;
criu_render_node__pack(&rd, buf);
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
ret = write_img_file(img_path, buf, len);
if (ret) {
xfree(buf);
ret = amdgpu_plugin_drm_dump_file(fd, id, &st);
if (ret)
return ret;
}
xfree(buf);
/* Invoke unpause process if needed */
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(kfd_checkpoint_fd);
}
/* Need to return success here so that criu can call plugins for renderD nodes */
return ret;
@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id)
ret = write_img_file(img_path, buf, len);
xfree(buf);
exit:
/* Restore all queues */
unpause_process(fd);
sys_close_drm_render_devices(&src_topology);
exit:
/* Restore all queues if conditions permit */
kfd_checkpoint_fd = fd;
decrement_checkpoint_count();
if (checkpoint_is_complete()) {
ret = unpause_process(fd);
}
xfree((void *)args.devices);
xfree((void *)args.bos);
xfree((void *)args.priv_data);

View File

@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
}
/**
 * Serialize the metadata of one DRM render node into a checkpoint image.
 *
 * @fd:  file descriptor of the DRM device being dumped (not referenced by
 *       this function)
 * @id:  value substituted into IMG_DRM_FILE to build the image file name
 * @drm: stat of the DRM device; the minor number of its st_rdev is used to
 *       look up the matching node in src_topology
 *
 * Returns the result of write_img_file() on the normal path, -ENODEV when
 * the topology node or its gpu_id mapping cannot be found, and -ENOMEM when
 * the serialization buffer cannot be allocated.
 */
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
{
	CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
	struct tp_node *tp_node;
	char path[PATH_MAX];
	unsigned char *buf;
	int render_minor;
	int len;
	int ret;

	/* Get the topology node of the DRM device */
	render_minor = minor(drm->st_rdev);
	tp_node = sys_get_node_by_render_minor(&src_topology, render_minor);
	if (!tp_node) {
		pr_err("Failed to find a device with minor number = %d\n", render_minor);
		return -ENODEV;
	}

	/* Get the GPU_ID of the DRM device */
	rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
	if (!rd.gpu_id) {
		/*
		 * rd.gpu_id is always zero in this branch, so report the
		 * values that actually identify the device whose lookup failed.
		 */
		pr_err("Failed to find valid gpu_id for the device with minor number = %d (source gpu_id = 0x%x)\n",
		       render_minor, (unsigned int)tp_node->gpu_id);
		return -ENODEV;
	}

	/* Pack the render node message into a freshly allocated buffer */
	len = criu_render_node__get_packed_size(&rd);
	buf = xmalloc(len);
	if (!buf)
		return -ENOMEM;

	criu_render_node__pack(&rd, buf);

	/* The image file name embeds the caller-supplied id */
	snprintf(path, sizeof(path), IMG_DRM_FILE, id);
	ret = write_img_file(path, buf, len);

	/* The buffer is released whether or not the write succeeded */
	xfree(buf);
	return ret;
}

View File

@ -17,6 +17,12 @@
*/
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
/**
* Serialize meta-data about a particular DRM device into an image file.
* The implementation currently records the gpu_id returned by
* maps_get_dest_gpu() for the topology node matching the device.
*
* fd  - file descriptor of the DRM device (not referenced by the
*       current implementation)
* id  - value embedded in the serialized file name (IMG_DRM_FILE)
* drm - stat of the DRM device; the minor number of st_rdev selects
*       the topology node
*
* Returns the result of write_img_file() on the normal path, -ENODEV
* if the device or its gpu_id mapping is not found, -ENOMEM if the
* serialization buffer cannot be allocated.
*/
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
#endif /* __AMDGPU_PLUGIN_DRM_H__ */