mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-29 13:28:27 +00:00
amdgpu_plugin: Refactor code used to implement Checkpoint
Refactor the code used to checkpoint DRM devices. The code is moved into the amdgpu_plugin_drm.c file, which hosts various methods to checkpoint and restore a workload. Signed-off-by: Ramesh Errabolu <Ramesh.Errabolu@amd.com>
This commit is contained in:
parent
733ef96315
commit
0d5923c95e
@ -49,6 +49,13 @@ struct vma_metadata {
|
|||||||
|
|
||||||
/************************************ Global Variables ********************************************/
|
/************************************ Global Variables ********************************************/
|
||||||
|
|
||||||
|
/**
 * FD of the KFD device used to checkpoint. In a multi-process
 * tree, checkpointing proceeds from parent to child and so
 * on, so the saved FD will not be overwritten.
 */
|
||||||
|
static int kfd_checkpoint_fd;
|
||||||
|
|
||||||
static LIST_HEAD(update_vma_info_list);
|
static LIST_HEAD(update_vma_info_list);
|
||||||
|
|
||||||
size_t kfd_max_buffer_size;
|
size_t kfd_max_buffer_size;
|
||||||
@ -990,6 +997,10 @@ static int unpause_process(int fd)
|
|||||||
goto exit;
|
goto exit;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reset the KFD FD
|
||||||
|
kfd_checkpoint_fd = -1;
|
||||||
|
sys_close_drm_render_devices(&src_topology);
|
||||||
|
|
||||||
exit:
|
exit:
|
||||||
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
pr_info("Process unpaused %s (ret:%d)\n", ret ? "Failed" : "Ok", ret);
|
||||||
|
|
||||||
@ -1181,44 +1192,25 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Initialize number of device files that will be checkpointed */
|
||||||
|
init_gpu_count(&src_topology);
|
||||||
|
|
||||||
/* Check whether this plugin was called for kfd or render nodes */
|
/* Check whether this plugin was called for kfd or render nodes */
|
||||||
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
if (major(st.st_rdev) != major(st_kfd.st_rdev) || minor(st.st_rdev) != 0) {
|
||||||
|
|
||||||
/* This is RenderD dumper plugin, for now just save renderD
|
/* This is RenderD dumper plugin, for now just save renderD
|
||||||
* minor number to be used during restore. In later phases this
|
* minor number to be used during restore. In later phases this
|
||||||
* needs to save more data for video decode etc.
|
* needs to save more data for video decode etc.
|
||||||
*/
|
*/
|
||||||
|
ret = amdgpu_plugin_drm_dump_file(fd, id, &st);
|
||||||
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
|
if (ret)
|
||||||
struct tp_node *tp_node;
|
|
||||||
|
|
||||||
pr_info("Dumper called for /dev/dri/renderD%d, FD = %d, ID = %d\n", minor(st.st_rdev), fd, id);
|
|
||||||
|
|
||||||
tp_node = sys_get_node_by_render_minor(&src_topology, minor(st.st_rdev));
|
|
||||||
if (!tp_node) {
|
|
||||||
pr_err("Failed to find a device with minor number = %d\n", minor(st.st_rdev));
|
|
||||||
|
|
||||||
return -ENODEV;
|
|
||||||
}
|
|
||||||
|
|
||||||
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
|
|
||||||
if (!rd.gpu_id)
|
|
||||||
return -ENODEV;
|
|
||||||
|
|
||||||
len = criu_render_node__get_packed_size(&rd);
|
|
||||||
buf = xmalloc(len);
|
|
||||||
if (!buf)
|
|
||||||
return -ENOMEM;
|
|
||||||
|
|
||||||
criu_render_node__pack(&rd, buf);
|
|
||||||
|
|
||||||
snprintf(img_path, sizeof(img_path), IMG_DRM_FILE, id);
|
|
||||||
ret = write_img_file(img_path, buf, len);
|
|
||||||
if (ret) {
|
|
||||||
xfree(buf);
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
|
||||||
|
|
||||||
xfree(buf);
|
/* Invoke unpause process if needed */
|
||||||
|
decrement_checkpoint_count();
|
||||||
|
if (checkpoint_is_complete()) {
|
||||||
|
ret = unpause_process(kfd_checkpoint_fd);
|
||||||
|
}
|
||||||
|
|
||||||
/* Need to return success here so that criu can call plugins for renderD nodes */
|
/* Need to return success here so that criu can call plugins for renderD nodes */
|
||||||
return ret;
|
return ret;
|
||||||
@ -1315,11 +1307,15 @@ int amdgpu_plugin_dump_file(int fd, int id)
|
|||||||
ret = write_img_file(img_path, buf, len);
|
ret = write_img_file(img_path, buf, len);
|
||||||
|
|
||||||
xfree(buf);
|
xfree(buf);
|
||||||
exit:
|
|
||||||
/* Restore all queues */
|
|
||||||
unpause_process(fd);
|
|
||||||
|
|
||||||
sys_close_drm_render_devices(&src_topology);
|
exit:
|
||||||
|
/* Restore all queues if conditions permit */
|
||||||
|
kfd_checkpoint_fd = fd;
|
||||||
|
decrement_checkpoint_count();
|
||||||
|
if (checkpoint_is_complete()) {
|
||||||
|
ret = unpause_process(fd);
|
||||||
|
}
|
||||||
|
|
||||||
xfree((void *)args.devices);
|
xfree((void *)args.devices);
|
||||||
xfree((void *)args.bos);
|
xfree((void *)args.bos);
|
||||||
xfree((void *)args.priv_data);
|
xfree((void *)args.priv_data);
|
||||||
|
@ -61,3 +61,41 @@ int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *st)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm)
|
||||||
|
{
|
||||||
|
CriuRenderNode rd = CRIU_RENDER_NODE__INIT;
|
||||||
|
struct tp_node *tp_node;
|
||||||
|
char path[PATH_MAX];
|
||||||
|
unsigned char *buf;
|
||||||
|
int minor;
|
||||||
|
int len;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
/* Get the topology node of the DRM device */
|
||||||
|
minor = minor(drm->st_rdev);
|
||||||
|
tp_node = sys_get_node_by_render_minor(&src_topology, minor);
|
||||||
|
if (!tp_node) {
|
||||||
|
pr_err("Failed to find a device with minor number = %d\n", minor);
|
||||||
|
return -ENODEV;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the GPU_ID of the DRM device */
|
||||||
|
rd.gpu_id = maps_get_dest_gpu(&checkpoint_maps, tp_node->gpu_id);
|
||||||
|
if (!rd.gpu_id) {
|
||||||
|
pr_err("Failed to find valid gpu_id for the device = %d\n", rd.gpu_id);
|
||||||
|
return -ENODEV;
|
||||||
|
}
|
||||||
|
|
||||||
|
len = criu_render_node__get_packed_size(&rd);
|
||||||
|
buf = xmalloc(len);
|
||||||
|
if (!buf)
|
||||||
|
return -ENOMEM;
|
||||||
|
|
||||||
|
criu_render_node__pack(&rd, buf);
|
||||||
|
|
||||||
|
snprintf(path, sizeof(path), IMG_DRM_FILE, id);
|
||||||
|
ret = write_img_file(path, buf, len);
|
||||||
|
xfree(buf);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
@ -17,6 +17,12 @@
|
|||||||
*/
|
*/
|
||||||
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
|
int amdgpu_plugin_drm_handle_device_vma(int fd, const struct stat *drm);
|
||||||
|
|
||||||
|
/**
 * Serialize meta-data about a particular DRM device (its number of BOs,
 * etc.) into a file. The name of the serialized file contains the ID
 * value that is passed in as a parameter.
 */
|
||||||
|
int amdgpu_plugin_drm_dump_file(int fd, int id, struct stat *drm);
|
||||||
|
|
||||||
#endif /* __AMDGPU_PLUGIN_DRM_H__ */
|
#endif /* __AMDGPU_PLUGIN_DRM_H__ */
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user