mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-22 01:51:51 +00:00
amdgpu_plugin: Refactor code used to implement Checkpoint
Refactor the code used to checkpoint DRM devices. The code is moved into the amdgpu_plugin_drm.c file, which hosts the various methods used to checkpoint and restore a workload. Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
This commit is contained in:
parent
733ef96315
commit
0d5923c95e
@ -49,6 +49,13 @@ struct vma_metadata {
|
||||
|
||||
/************************************ Global Variables ********************************************/
|
||||
|
||||
/**
|
||||
* FD of KFD device used to checkpoint. On a multi-process
|
||||
* tree the order of checkpointing goes from parent to child
|
||||
* and so on - so the saved FD will not be overwritten
|
||||
*/
|
||||
static int kfd_checkpoint_fd;
|
||||
|
||||
static LIST_HEAD(update_vma_info_list);
|
||||
|
||||
size_t kfd_max_buffer_size;
|
||||
@ -990,6 +997,10 @@ static int unpause_process(int fd)
|
||||
goto exit;
|
||||
}
|
||||
|
||||
// Reset the KFD FD
|
||||
kfd_checkpoint_fd = -1;
|
||||
sys_close_drm_render_devices(&src_topology);
|
||||
|
||||
exit:
|
||||
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
||||
|
||||
@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* Initialize number of device files that will be checkpointed */
|
||||
init_gpu_count(&src_topology);
|
||||
|
||||
/* Check whether this plugin was called for kfd or render nodes */
|
||||
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
||||
|
||||
/* This is RenderD dumper plugin, for now just save renderD
|
||||
* minor number to be used during restore. In later phases this
|
||||
* needs to save more data for video decode etc.
|
||||
*/
|
||||
|
||||
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
|
||||
struct tp_node *tp_node;
|
||||
|
||||
pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id);
|
||||
|
||||
tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
|
||||
if (!tp_node) {
|
||||
pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev));
|
||||
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
|
||||
if (!rd.gpu_id)
|
||||
return -ENODEV;
|
||||
|
||||
len = criu_render_node__get_packed_size(&rd);
|
||||
buf = xmalloc(len);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
criu_render_node__pack(&rd, buf);
|
||||
|
||||
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
|
||||
ret = write_img_file(img_path, buf, len);
|
||||
if (ret) {
|
||||
xfree(buf);
|
||||
ret = amdgpu_plugin_drm_dump_file(fd, id, &st);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
xfree(buf);
|
||||
/* Invoke unpause process if needed */
|
||||
decrement_checkpoint_count();
|
||||
if (checkpoint_is_complete()) {
|
||||
ret = unpause_process(kfd_checkpoint_fd);
|
||||
}
|
||||
|
||||
/* Need to return success here so that criu can call plugins for renderD nodes */
|
||||
return ret;
|
||||
@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
||||
ret = write_img_file(img_path, buf, len);
|
||||
|
||||
xfree(buf);
|
||||
exit:
|
||||
/* Restore all queues */
|
||||
unpause_process(fd);
|
||||
|
||||
sys_close_drm_render_devices(&src_topology);
|
||||
exit:
|
||||
/* Restore all queues if conditions permit */
|
||||
kfd_checkpoint_fd = fd;
|
||||
decrement_checkpoint_count();
|
||||
if (checkpoint_is_complete()) {
|
||||
ret = unpause_process(fd);
|
||||
}
|
||||
|
||||
xfree((void *)args.devices);
|
||||
xfree((void *)args.bos);
|
||||
xfree((void *)args.priv_data);
|
||||
|
@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
|
||||
}
|
||||
|
||||
|
||||
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
|
||||
{
|
||||
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
|
||||
struct tp_node *tp_node;
|
||||
char path[PATH_MAX];
|
||||
unsigned char *buf;
|
||||
int minor;
|
||||
int len;
|
||||
int ret;
|
||||
|
||||
/* Get the topology node of the DRM device */
|
||||
minor = minor(drm->st_rdev);
|
||||
tp_node = sys_get_node_by_render_minor(&src_topology, minor);
|
||||
if (!tp_node) {
|
||||
pr_err("Failed to find a device with minor number = %d\n", minor);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
/* Get the GPU_ID of the DRM device */
|
||||
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
|
||||
if (!rd.gpu_id) {
|
||||
pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
len = criu_render_node__get_packed_size(&rd);
|
||||
buf = xmalloc(len);
|
||||
if (!buf)
|
||||
return -ENOMEM;
|
||||
|
||||
criu_render_node__pack(&rd, buf);
|
||||
|
||||
snprintf(path, sizeof(path), IMG_DRM_FILE, id);
|
||||
ret = write_img_file(path, buf, len);
|
||||
xfree(buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -17,6 +17,12 @@
|
||||
*/
|
||||
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
|
||||
|
||||
/**
|
||||
* Serialize meta-data about a particular DRM device, its number of BOs,
|
||||
* etc. into a file. The name of the serialized file contains the ID that
|
||||
* is passed in as a parameter
|
||||
*/
|
||||
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
|
||||
|
||||
#endif /* __AMDGPU_PLUGIN_DRM_H__ */
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user