mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

Compare commits


7 Commits

Author SHA1 Message Date
Yanning Yang
7c4bcdb2d4 plugins/amdgpu: Update README.md and criu-amdgpu-plugin.txt
Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
bfb4a3d842 plugins/amdgpu: Implement parallel restore
This patch implements the entire logic to enable the offloading of
buffer object content restoration.

The goal of this patch is to offload the buffer object content
restoration to the main CRIU process so that this restoration can occur
in parallel with other restoration logic (mainly the restoration of
memory state in the restore blob, which is time-consuming) to speed up
the restore phase. The restoration of buffer object content usually
takes a significant amount of time for GPU applications, so
parallelizing it with other operations can reduce the overall restore
time.

It has three parts: the first replaces the restoration of buffer objects
in the target process with a parallel restore command sent to the main
CRIU process; the second implements the POST_FORKING hook in the amdgpu
plugin to perform buffer object content restoration in the main CRIU
process; the third stops the parallel thread in the RESUME_DEVICES_LATE
hook.

This optimization only targets the single-process situation (the common
case). In other scenarios, it falls back to the original method, which
is controlled by the new `parallel_disabled` flag.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
bfd9aa269b plugins/amdgpu: Add parallel restore command
Currently, restoring buffer objects consumes a significant amount of
time. However, this part has no logical dependencies on other restore
operations. This patch introduces structures and helper functions that
allow the target process to offload this task to the main CRIU process.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
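A condensed, hedged sketch of how these helpers (declared in `amdgpu_socket_utils.h` further down) are meant to be chained; the gpu id, render minor, size, offset and descriptor values below are placeholders, and the full version is `restore_bo_data()` in the amdgpu_plugin.c diff below:

#include "amdgpu_socket_utils.h"

/* Illustrative only: offload one BO on one GPU to the main CRIU process. */
static int offload_bos_sketch(int dmabuf_fd, int img_id)
{
	parallel_restore_cmd cmd;
	int ret;

	/* Reserve room for 1 BO entry and 1 GPU mapping. */
	ret = init_parallel_restore_cmd(1, img_id, 1, &cmd);
	if (ret)
		return ret;

	/* Tell the server which DRM render minor to open for this user gpu_id. */
	parallel_restore_gpu_id_add(0x1002 /* gpu_id */, 128 /* render minor */, &cmd);

	/* Queue the BO: its dmabuf fd, owning gpu_id, size and offset in the BO contents image. */
	parallel_restore_bo_add(dmabuf_fd, 0x1002, 4096 /* size */, 0 /* offset */, &cmd);

	ret = send_parallel_restore_cmd(&cmd);
	free_parallel_restore_cmd(&cmd);
	return ret;
}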
Yanning Yang
e4c151eab3 plugins/amdgpu: Add socket operations
When parallel restore is enabled, the target process and the main CRIU
process need an IPC interface to communicate and transfer restore
commands. This patch adds a Unix domain stream socket and stores this
socket in `fdstore`.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
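For orientation, a minimal standalone sketch of the abstract-namespace AF_UNIX stream-socket pattern the plugin relies on; the socket name and flow below are illustrative, and the plugin's own naming and fdstore handling appear in the amdgpu_socket_utils.c diff further down:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	socklen_t len;
	int srv, cli;

	/* Build the name first, then make it abstract by zeroing the first byte. */
	snprintf(addr.sun_path, sizeof(addr.sun_path), "x/criu-amdgpu-demo");
	len = SUN_LEN(&addr);
	addr.sun_path[0] = '\0';

	srv = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
	if (srv < 0 || bind(srv, (struct sockaddr *)&addr, len) < 0 || listen(srv, SOMAXCONN) < 0) {
		perror("server setup");
		return 1;
	}

	/* A restoring task connects the same way before sending restore commands. */
	cli = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
	if (cli < 0 || connect(cli, (struct sockaddr *)&addr, len) < 0) {
		perror("client connect");
		return 1;
	}

	close(cli);
	close(srv);
	return 0;
}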
Yanning Yang
0a274b6afa pstree: Add has_children function
Currently, parallel restore only targets the single-process situation.
Therefore, it needs an interface to know whether there is only one
process to restore. This patch adds a `has_children` function in
`pstree.h` and replaces the existing open-coded checks with this
function.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
4ba058060c cr-restore: Move cr_plugin_init after fdstore_init
Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not yet
initialized. However, during the plugin restore procedure a plugin may
need file descriptors that are shared across multiple hooks. This patch
moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init`
to place such descriptors in `fdstore`.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
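A minimal sketch of the fdstore pattern this reordering makes available to `cr_plugin_init`; `fdstore_add`/`fdstore_get` are CRIU's existing interface, while the wrapper names here are illustrative:

#include "fdstore.h"

static int parallel_sock_id;

/* Called from cr_plugin_init(): only valid once fdstore_init() has run. */
static int stash_parallel_socket(int sock_fd)
{
	parallel_sock_id = fdstore_add(sock_fd);
	return parallel_sock_id < 0 ? -1 : 0;
}

/* Called later, possibly from another task: returns a fresh fd referring to the stored socket. */
static int fetch_parallel_socket(void)
{
	return fdstore_get(parallel_sock_id);
}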
Yanning Yang
8902353057 criu: Introduce a new device plugin hook for restore
Currently, in the target process, device-related restore operations and
other restore operations run mostly sequentially. While the target
process executes the corresponding CRIU hook functions, it cannot
perform other restore operations. However, for GPU applications, some
device restore operations have no logical dependencies on the common
restore operations and can run in parallel with them to speed up the
process.

Instead of launching a thread in child processes for parallelization,
this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU
process to handle these restore operations. This is because the
restoration of memory state in the restore blob is one of the most
time-consuming parts of all restore logic. The main CRIU process can
easily parallelize these operations, whereas parallelizing in threads
within child processes is challenging.

- POST_FORKING

*POST_FORKING: Hook allowing the main CRIU process to perform plugin
restore operations.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
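A hedged sketch of how a plugin attaches to the new stage, compiled against CRIU's plugin headers; `my_plugin_post_forking` is a hypothetical name, not part of this series:

#include <errno.h>

#include "criu-plugin.h"
#include "criu-log.h"

/* Runs once in the main CRIU process, after the task tree has been forked.
 * The plugin's usual CR_PLUGIN_REGISTER() boilerplate is omitted here. */
int my_plugin_post_forking(void)
{
	pr_info("POST_FORKING: start background restore work\n");
	/* Return -ENOTSUP instead if the plugin has nothing to do at this stage. */
	return 0;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, my_plugin_post_forking)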
15 changed files with 807 additions and 68 deletions

View File

@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container
Pytorch
Tensorflow
Using CRIU Image Streamer
Parallel Restore
DESCRIPTION
-----------

View File

@ -1396,7 +1396,7 @@ static int dump_zombies(void)
item->sid = pps_buf.sid;
item->pgid = pps_buf.pgid;
BUG_ON(!list_empty(&item->children));
BUG_ON(has_children(item));
if (!item->sid) {
pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n",

View File

@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init)
__restore_switch_stage(CR_STATE_FORKING);
skip_ns_bouncing:
ret = run_plugins(POST_FORKING);
if (ret < 0 && ret != -ENOTSUP)
goto out_kill;
ret = restore_wait_inprogress_tasks();
if (ret < 0)
@ -2363,41 +2366,47 @@ int cr_restore_tasks(void)
return 1;
if (check_img_inventory(/* restore = */ true) < 0)
goto err;
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;
if (init_stats(RESTORE_STATS))
goto err;
return -1;
if (lsm_check_opts())
goto err;
return -1;
timing_start(TIME_RESTORE);
if (cpu_init() < 0)
goto err;
return -1;
if (vdso_init_restore())
goto err;
return -1;
if (tty_init_restore())
goto err;
return -1;
if (opts.cpu_cap & CPU_CAP_IMAGE) {
if (cpu_validate_cpuinfo())
goto err;
return -1;
}
if (prepare_task_entries() < 0)
goto err;
return -1;
if (prepare_pstree() < 0)
goto err;
return -1;
if (fdstore_init())
goto err;
return -1;
/*
* For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store
* its socket file descriptor. This allows the main process and the target process to
* communicate with each other through this file descriptor. Therefore, cr_plugin_init
* must be called after fdstore_init.
*/
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;
if (inherit_fd_move_to_fdstore())
goto err;

View File

@ -60,6 +60,8 @@ enum {
CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,
CR_PLUGIN_HOOK__POST_FORKING = 12,
CR_PLUGIN_HOOK__MAX
};
@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
enum {
CR_PLUGIN_STAGE__DUMP,
@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat);
typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff,
uint64_t *new_pgoff, int *plugin_fd);
typedef int(cr_plugin_resume_devices_late_t)(int pid);
typedef int(cr_plugin_post_forking_t)(void);
#endif /* __CRIU_PLUGIN_H__ */

View File

@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node);
extern struct pid *pstree_pid_by_virt(pid_t pid);
extern struct pstree_item *root_item;
extern bool has_children(struct pstree_item *item);
extern struct pstree_item *pstree_item_next(struct pstree_item *item);
#define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi))

View File

@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
__assign_hook(POST_FORKING, "cr_plugin_post_forking");
#undef __assign_hook

View File

@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item)
struct pstree_item *item = root_item, *parent;
while (item) {
if (!list_empty(&item->children)) {
if (has_children(item)) {
item = list_first_entry(&item->children, struct pstree_item, sibling);
continue;
}
@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret)
return 0;
}
bool has_children(struct pstree_item *item)
{
return !list_empty(&item->children);
}
/* Deep first search on children */
struct pstree_item *pstree_item_next(struct pstree_item *item)
{
if (!list_empty(&item->children))
if (has_children(item))
return list_first_entry(&item->children, struct pstree_item, sibling);
while (item->parent) {

View File

@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item)
if (ret < 0)
goto err_close;
if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) {
if ((item->pid->state == TASK_DEAD) && has_children(item)) {
pr_err("Zombie with children?! O_o Run, run, run!\n");
goto err_close;
}

View File

@ -27,7 +27,7 @@ endif
criu-amdgpu.pb-c.c: criu-amdgpu.proto
protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
amdgpu_plugin_clean:

View File

@ -3,7 +3,8 @@ Supporting ROCm with CRIU
_Felix Kuehling <Felix.Kuehling@amd.com>_<br>
_Rajneesh Bardwaj <Rajneesh.Bhardwaj@amd.com>_<br>
_David Yat Sin <David.YatSin@amd.com>_
_David Yat Sin <David.YatSin@amd.com>_<br>
_Yanning Yang <yangyanning@sjtu.edu.cn>_
# Introduction
@ -224,6 +225,26 @@ to resume execution on the GPUs.
*This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC
patch series.*
## Restoring BO content in parallel
Restoring BO content is an important part of restoring GPU state and usually
takes a significant amount of time. A possible place for this procedure is the
`cr_plugin_restore_file` hook. However, restoring there blocks the target
process from performing other restore operations, which hinders further
optimization of the restore process.
Therefore, a new plugin hook that runs in the master restore process is
introduced, and it interacts with the `cr_plugin_restore_file` hook to complete
the restore of BO content. Specifically, the target process only needs to send
the relevant BOs to the master restore process, and this new hook handles the
actual restore of the buffer objects. This way, the target process can perform
other restore operations while the BO content is being restored, accelerating
the restore procedure. This is an implementation of the gCROP method proposed
in the ACM SoCC'24 paper: [On-demand and Parallel Checkpoint/Restore for GPU
Applications](https://dl.acm.org/doi/10.1145/3698038.3698510).
*This optimization technique is enabled by the `__POST_FORKING` hook.*
## Other CRIU changes
In addition to the new plugins, we need to make some changes to CRIU itself to

View File

@ -28,11 +28,13 @@
#include "xmalloc.h"
#include "criu-log.h"
#include "files.h"
#include "pstree.h"
#include "common/list.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#include "amdgpu_socket_utils.h"
#include "img-streamer.h"
#include "image.h"
@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false;
bool plugin_disabled = false;
/*
* In the case of a single process (the common case), parallel restore can effectively
* reduce the restore latency. In the case of multiple processes, state is already
* restored in parallel within the different processes, so this optimization brings
* no further improvement and is disabled by default in that case. The
* parallel_disabled flag controls whether the optimization is enabled.
*/
bool parallel_disabled = false;
pthread_t parallel_thread = 0;
int parallel_thread_result = 0;
/**************************************************************************************************/
/* Call ioctl, restarting if it is interrupted */
@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage)
maps_init(&restore_maps);
if (stage == CR_PLUGIN_STAGE__RESTORE) {
if (has_children(root_item)) {
pr_info("Parallel restore disabled\n");
parallel_disabled = true;
} else {
if (install_parallel_sock() < 0) {
pr_err("Failed to install parallel socket\n");
return -1;
}
}
/* Default Values */
kfd_fw_version_check = true;
kfd_sdma_fw_version_check = true;
@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
struct thread_data *thread_datas;
struct thread_data *thread_datas = NULL;
int thread_i, ret = 0;
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
int offset = 0;
for (int i = 0; i < e->num_of_bos; i++) {
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
}
}
thread_i = 0;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
uint32_t target_gpu_id;
if (!parallel_disabled) {
parallel_restore_cmd restore_cmd;
pr_info("Begin to send parallel restore cmd\n");
ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd);
if (ret)
goto exit_parallel;
if (!e->device_entries[i]->gpu_id)
continue;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
uint32_t target_gpu_id;
struct tp_node *dev;
/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
if (!e->device_entries[i]->gpu_id)
continue;
/* We need the fd for actual_gpu_id */
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit_parallel;
}
parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd);
for (int j = 0; j < e->num_of_bos; j++) {
if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id)
continue;
if (bo_buckets[j].alloc_flags &
(KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id,
bo_buckets[j].size, offset, &restore_cmd);
offset += bo_buckets[j].size;
}
}
}
ret = send_parallel_restore_cmd(&restore_cmd);
exit_parallel:
free_parallel_restore_cmd(&restore_cmd);
} else {
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
thread_datas[thread_i].id = id;
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
thread_datas[thread_i].pid = e->pid;
thread_datas[thread_i].num_of_bos = e->num_of_bos;
thread_i = 0;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
uint32_t target_gpu_id;
thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[thread_i].drm_fd < 0) {
ret = -thread_datas[thread_i].drm_fd;
goto exit;
if (!e->device_entries[i]->gpu_id)
continue;
/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
/* We need the fd for actual_gpu_id */
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit;
}
thread_datas[thread_i].id = id;
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
thread_datas[thread_i].pid = e->pid;
thread_datas[thread_i].num_of_bos = e->num_of_bos;
thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[thread_i].drm_fd < 0) {
ret = -thread_datas[thread_i].drm_fd;
goto exit;
}
ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
(void *)&thread_datas[thread_i]);
if (ret_thread) {
pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
ret = -ret_thread;
goto exit;
}
thread_i++;
}
ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
(void *)&thread_datas[thread_i]);
if (ret_thread) {
pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
ret = -ret_thread;
goto exit;
}
thread_i++;
}
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
}
}
}
exit:
@ -1546,8 +1609,8 @@ exit:
if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD)
close(bo_buckets[i].dmabuf_fd);
}
xfree(thread_datas);
if (thread_datas)
xfree(thread_datas);
return ret;
}
@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
if (plugin_disabled)
return -ENOTSUP;
if (!parallel_disabled) {
pr_info("Close parallel restore server\n");
if (close_parallel_restore_server()) {
pr_err("Close parallel restore server fail\n");
return -1;
}
exit_code = pthread_join(parallel_thread, NULL);
if (exit_code) {
pr_err("Failed to join parallel thread ret:%d\n", exit_code);
return -1;
}
if (parallel_thread_result) {
pr_err("Parallel restore fail\n");
return parallel_thread_result;
}
}
pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)
int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
{
return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
}
int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
{
int ret = 0;
int drm_fd = -1;
uint32_t major, minor;
struct amdgpu_gpu_info gpu_info = { 0 };
drm_fd = open_drm_render_device(dev_minor);
if (drm_fd < 0) {
return drm_fd;
}
ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev);
if (ret) {
pr_perror("Failed to initialize device");
goto err;
}
ret = amdgpu_query_gpu_info(*h_dev, &gpu_info);
if (ret) {
pr_perror("failed to query gpuinfo via libdrm");
goto err;
}
*max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
SDMA_LINEAR_COPY_MAX_SIZE - 1;
return 0;
err:
amdgpu_device_deinitialize(*h_dev);
return ret;
}
FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size)
{
char img_path[PATH_MAX];
size_t image_size = 0;
FILE *bo_contents_fp = NULL;
snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id);
bo_contents_fp = open_img_file(img_path, false, &image_size);
if (!bo_contents_fp) {
pr_perror("Cannot fopen %s", img_path);
return NULL;
}
if (tot_size != image_size) {
pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size);
fclose(bo_contents_fp);
return NULL;
}
return bo_contents_fp;
}
struct parallel_thread_data {
pthread_t thread;
uint32_t gpu_id;
int minor;
parallel_restore_cmd *restore_cmd;
int ret;
};
void *parallel_restore_bo_contents(void *_thread_data)
{
struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data;
amdgpu_device_handle h_dev;
uint64_t max_copy_size;
size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0;
FILE *bo_contents_fp = NULL;
parallel_restore_entry *entry;
parallel_restore_cmd *restore_cmd = thread_data->restore_cmd;
int ret = 0;
int offset = 0;
void *buffer = NULL;
ret = init_dev(thread_data->minor, &h_dev, &max_copy_size);
if (ret) {
goto err;
}
for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) {
total_bo_size += restore_cmd->entries[i].size;
max_bo_size = max(restore_cmd->entries[i].size, max_bo_size);
}
}
buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size;
bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size);
if (bo_contents_fp == NULL) {
ret = -1;
goto err_sdma;
}
offset = ftell(bo_contents_fp);
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
if (!buffer) {
pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
ret = -ENOMEM;
goto err_sdma;
}
for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id)
continue;
entry = &restore_cmd->entries[i];
fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer,
buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
goto err_sdma;
}
}
err_sdma:
if (bo_contents_fp)
fclose(bo_contents_fp);
if (buffer)
xfree(buffer);
amdgpu_device_deinitialize(h_dev);
err:
thread_data->ret = ret;
return NULL;
}
void *restore_device_parallel_worker(void *arg)
{
while (1) {
parallel_restore_cmd restore_cmd = { 0 };
struct parallel_thread_data *thread_datas = NULL;
int ret;
int error_occurred = 0, join_ret = 0, created_threads = 0;
ret = recv_parallel_restore_cmd(&restore_cmd);
if (ret) {
if (ret == 1) {
*(int *)arg = 0;
goto exit;
}
goto err;
}
thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num);
if (!thread_datas) {
ret = -ENOMEM;
goto err;
}
for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) {
thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id;
thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor;
thread_datas[created_threads].restore_cmd = &restore_cmd;
ret = pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents,
(void *)&thread_datas[created_threads]);
if (ret) {
pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret);
error_occurred = 1;
break;
}
}
for (int i = 0; i < created_threads; i++) {
join_ret = pthread_join(thread_datas[i].thread, NULL);
if (join_ret != 0) {
pr_err("pthread_join failed for Thread[0x%x] ret:%d\n",
thread_datas[i].gpu_id, join_ret);
if (!error_occurred) {
ret = join_ret;
error_occurred = 1;
}
}
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
/* Check thread return value */
if (thread_datas[i].ret && !error_occurred) {
ret = thread_datas[i].ret;
error_occurred = 1;
}
}
if (thread_datas)
xfree(thread_datas);
err:
free_parallel_restore_cmd(&restore_cmd);
if (ret) {
*(int *)arg = ret;
return NULL;
}
}
exit:
return NULL;
}
/*
* While the background thread is running, some processing functions (e.g., stop_cgroupd)
* in the main thread need to block SIGCHLD. To prevent interference from this background
* thread, SIGCHLD is blocked in this thread.
*/
static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg)
{
int ret = 0;
sigset_t blockmask, oldmask;
sigemptyset(&blockmask);
sigaddset(&blockmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
ret = pthread_create(newthread, NULL, f, arg);
if (ret) {
pr_err("Create worker thread fail: %d\n", ret);
return -1;
}
sigprocmask(SIG_SETMASK, &oldmask, NULL);
return 0;
}
int amdgpu_plugin_post_forking(void)
{
if (plugin_disabled)
return -ENOTSUP;
if (parallel_disabled)
return 0;
return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)

View File

@ -45,7 +45,7 @@ bool kfd_capability_check = true;
*/
int fd_next = -1;
static int open_drm_render_device(int minor)
int open_drm_render_device(int minor)
{
char path[128];
int fd, ret_fd;

View File

@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index);
int open_drm_render_device(int minor);
int node_get_drm_render_device(struct tp_node *node);
void sys_close_drm_render_devices(struct tp_system *sys);

View File

@ -0,0 +1,320 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include "amdgpu_socket_utils.h"
#include "criu-log.h"
#include "common/scm.h"
#include "fdstore.h"
#include "util-pie.h"
#include "util.h"
int parallel_socket_addr_len;
struct sockaddr_un parallel_socket_addr;
int parallel_socket_id = 0;
static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len)
{
addr->sun_family = AF_UNIX;
snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id);
*len = SUN_LEN(addr);
*addr->sun_path = '\0';
}
int install_parallel_sock(void)
{
int ret = 0;
int sock_fd;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("socket creation failed");
return -1;
}
amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len);
ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("bind failed");
goto err;
}
ret = listen(sock_fd, SOMAXCONN);
if (ret < 0) {
pr_perror("listen failed");
goto err;
}
parallel_socket_id = fdstore_add(sock_fd);
if (parallel_socket_id < 0) {
ret = -1;
goto err;
}
err:
close(sock_fd);
return ret;
}
void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset,
parallel_restore_cmd *restore_cmd)
{
parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num];
restore_entry->gpu_id = gpu_id;
restore_entry->write_id = restore_cmd->cmd_head.fd_write_num;
restore_entry->write_offset = 0;
restore_entry->read_offset = offset;
restore_entry->size = size;
restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd;
restore_cmd->cmd_head.entry_num += 1;
restore_cmd->cmd_head.fd_write_num += 1;
}
void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd)
{
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor };
restore_cmd->cmd_head.gpu_num += 1;
}
static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Send parallel restore command head fail");
return -1;
}
return 0;
}
static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) {
pr_perror("Send GPU ids of parallel restore command fail");
return -1;
}
return 0;
}
static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) {
pr_perror("Send parallel restore command fail");
return -1;
}
return 0;
}
static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) {
pr_perror("Send dmabuf fds fail");
return -1;
}
return 0;
}
int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
int sock_fd;
int ret = 0;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("Socket creation failed");
return -1;
}
ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("Connect failed");
goto err;
}
ret = send_metadata(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_gpu_ids(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_cmds(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_dmabuf_fds(sock_fd, restore_cmd);
err:
close(sock_fd);
return ret;
}
int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd)
{
restore_cmd->cmd_head.id = id;
restore_cmd->cmd_head.fd_write_num = 0;
restore_cmd->cmd_head.entry_num = 0;
restore_cmd->cmd_head.gpu_num = 0;
restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info));
if (!restore_cmd->gpu_ids)
return -ENOMEM;
restore_cmd->fds_write = xzalloc(num * sizeof(int));
if (!restore_cmd->fds_write)
return -ENOMEM;
restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry));
if (!restore_cmd->entries)
return -ENOMEM;
return 0;
}
void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
if (restore_cmd->gpu_ids)
xfree(restore_cmd->gpu_ids);
if (restore_cmd->fds_write)
xfree(restore_cmd->fds_write);
if (restore_cmd->entries)
xfree(restore_cmd->entries);
}
static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd)
{
restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info));
if (!restore_cmd->gpu_ids)
return -ENOMEM;
restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int));
if (!restore_cmd->fds_write)
return -ENOMEM;
restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry));
if (!restore_cmd->entries)
return -ENOMEM;
return 0;
}
static int check_quit_cmd(parallel_restore_cmd *restore_cmd)
{
return restore_cmd->cmd_head.fd_write_num == 0;
}
static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Recv parallel restore command head fail");
return -1;
}
return 0;
}
static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) {
pr_perror("Recv parallel restore command fail");
return -1;
}
return 0;
}
static int recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) {
pr_perror("Send GPU ids of parallel restore command fail");
return -1;
}
return 0;
}
static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) {
pr_perror("Recv dmabuf fds fail");
return -1;
}
return 0;
}
int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
int sock_fd, client_fd;
int ret = 0;
sock_fd = fdstore_get(parallel_socket_id);
if (sock_fd < 0)
return -1;
client_fd = accept(sock_fd, NULL, NULL);
if (client_fd < 0) {
ret = client_fd;
goto err_accept;
}
ret = recv_metadata(client_fd, restore_cmd);
if (ret) {
goto err;
}
// Return 1 to quit
if (check_quit_cmd(restore_cmd)) {
ret = 1;
goto err;
}
ret = init_parallel_restore_cmd_by_head(restore_cmd);
if (ret) {
goto err;
}
ret = recv_gpu_ids(client_fd, restore_cmd);
if (ret) {
goto err;
}
ret = recv_cmds(client_fd, restore_cmd);
if (ret) {
goto err;
}
ret = recv_dmabuf_fds(client_fd, restore_cmd);
err:
close(client_fd);
err_accept:
close(sock_fd);
return ret;
}
int close_parallel_restore_server(void)
{
int sock_fd;
int ret = 0;
parallel_restore_cmd_head cmd_head;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("Socket creation failed");
return -1;
}
ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("Connect failed");
goto err;
}
memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head));
if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Send parallel restore command head fail");
return -1;
}
err:
close(sock_fd);
return ret;
}

View File

@ -0,0 +1,54 @@
#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__
#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__
typedef struct {
int id;
int fd_write_num; /* The number of buffer objects to be restored. */
int entry_num; /* The number of restore commands.*/
int gpu_num;
} parallel_restore_cmd_head;
typedef struct {
int gpu_id;
int minor;
} parallel_gpu_info;
typedef struct {
int gpu_id;
int write_id;
uint64_t read_offset;
uint64_t write_offset;
uint64_t size;
} parallel_restore_entry;
typedef struct {
parallel_restore_cmd_head cmd_head;
int *fds_write;
parallel_gpu_info *gpu_ids;
parallel_restore_entry *entries;
} parallel_restore_cmd;
/*
* For parallel_restore, a background thread in the main CRIU process is used to restore the GPU
* buffer object. However, initially, the ownership of these buffer objects and the metadata for
* restoration are all with the target process. Therefore, we introduce a series of functions to
* help the target process send these tasks to the main CRIU process.
*/
int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd);
void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
int install_parallel_sock(void);
int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset,
parallel_restore_cmd *restore_cmd);
void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd);
int close_parallel_restore_server(void);
#endif