mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 01:51:51 +00:00

Compare commits


7 Commits

Author SHA1 Message Date
Yanning Yang
7c4bcdb2d4 plugins/amdgpu: Update README.md and criu-amdgpu-plugin.txt
Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
bfb4a3d842 plugins/amdgpu: Implement parallel restore
This patch implements the entire logic to enable the offloading of
buffer object content restoration.

The goal of this patch is to offload the buffer object content
restoration to the main CRIU process so that this restoration can occur
in parallel with other restoration logic (mainly the restoration of
memory state in the restore blob, which is time-consuming) to speed up
the restore phase. The restoration of buffer object content usually
takes a significant amount of time for GPU applications, so
parallelizing it with other operations can reduce the overall restore
time.

It has three parts: the first replaces the restoration of buffer objects
in the target process with a parallel restore command sent to the main
CRIU process; the second implements the POST_FORKING hook in the amdgpu
plugin to perform buffer object content restoration in the main CRIU
process; the third stops the parallel thread in the RESUME_DEVICES_LATE
hook.

This optimization only targets the single-process situation (the common
case). In other scenarios, it falls back to the original method, which
is controlled by the new `parallel_disabled` flag.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
bfd9aa269b plugins/amdgpu: Add parallel restore command
Currently, restoring buffer objects consumes a significant amount of
time. However, this part has no logical dependencies on other restore
operations. This patch introduces structures and helper functions that
allow the target process to offload this task to the main CRIU process.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
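A condensed, hedged sketch of how these helpers (declared in `amdgpu_socket_utils.h` further down) are meant to be chained; the gpu id, render minor, size, offset and descriptor values below are placeholders, and the full version is `restore_bo_data()` in the amdgpu_plugin.c diff below:

#include "amdgpu_socket_utils.h"

/* Illustrative only: offload one BO on one GPU to the main CRIU process. */
static int offload_bos_sketch(int dmabuf_fd, int img_id)
{
	parallel_restore_cmd cmd;
	int ret;

	/* Reserve room for 1 BO entry and 1 GPU mapping. */
	ret = init_parallel_restore_cmd(1, img_id, 1, &cmd);
	if (ret)
		return ret;

	/* Tell the server which DRM render minor to open for this user gpu_id. */
	parallel_restore_gpu_id_add(0x1002 /* gpu_id */, 128 /* render minor */, &cmd);

	/* Queue the BO: its dmabuf fd, owning gpu_id, size and offset in the BO contents image. */
	parallel_restore_bo_add(dmabuf_fd, 0x1002, 4096 /* size */, 0 /* offset */, &cmd);

	ret = send_parallel_restore_cmd(&cmd);
	free_parallel_restore_cmd(&cmd);
	return ret;
}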
Yanning Yang
e4c151eab3 plugins/amdgpu: Add socket operations
When parallel restore is enabled, the target process and the main CRIU
process need an IPC interface to communicate and transfer restore
commands. This patch adds a Unix domain stream socket and stores this
socket in `fdstore`.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
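For orientation, a minimal standalone sketch of the abstract-namespace AF_UNIX stream-socket pattern the plugin relies on; the socket name and flow below are illustrative, and the plugin's own naming and fdstore handling appear in the amdgpu_socket_utils.c diff further down:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_un addr = { .sun_family = AF_UNIX };
	socklen_t len;
	int srv, cli;

	/* Build the name first, then make it abstract by zeroing the first byte. */
	snprintf(addr.sun_path, sizeof(addr.sun_path), "x/criu-amdgpu-demo");
	len = SUN_LEN(&addr);
	addr.sun_path[0] = '\0';

	srv = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
	if (srv < 0 || bind(srv, (struct sockaddr *)&addr, len) < 0 || listen(srv, SOMAXCONN) < 0) {
		perror("server setup");
		return 1;
	}

	/* A restoring task connects the same way before sending restore commands. */
	cli = socket(AF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
	if (cli < 0 || connect(cli, (struct sockaddr *)&addr, len) < 0) {
		perror("client connect");
		return 1;
	}

	close(cli);
	close(srv);
	return 0;
}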
Yanning Yang
0a274b6afa pstree: Add has_children function
Currently, parallel restore only targets the single-process situation.
Therefore, it needs an interface to know whether there is only one
process to restore. This patch adds a `has_children` function in
`pstree.h` and replaces the existing open-coded checks with this
function.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
Yanning Yang
4ba058060c cr-restore: Move cr_plugin_init after fdstore_init
Currently, when CRIU calls `cr_plugin_init`, `fdstore` is not yet
initialized. However, during the plugin restore procedure a plugin may
need file descriptors that are shared across multiple hooks. This patch
moves `cr_plugin_init` after `fdstore_init`, allowing `cr_plugin_init`
to place such descriptors in `fdstore`.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
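A minimal sketch of the fdstore pattern this reordering makes available to `cr_plugin_init`; `fdstore_add`/`fdstore_get` are CRIU's existing interface, while the wrapper names here are illustrative:

#include "fdstore.h"

static int parallel_sock_id;

/* Called from cr_plugin_init(): only valid once fdstore_init() has run. */
static int stash_parallel_socket(int sock_fd)
{
	parallel_sock_id = fdstore_add(sock_fd);
	return parallel_sock_id < 0 ? -1 : 0;
}

/* Called later, possibly from another task: returns a fresh fd referring to the stored socket. */
static int fetch_parallel_socket(void)
{
	return fdstore_get(parallel_sock_id);
}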
Yanning Yang
8902353057 criu: Introduce a new device plugin hook for restore
Currently, in the target process, device-related restore operations and
other restore operations run mostly sequentially. While the target
process executes the corresponding CRIU hook functions, it cannot
perform other restore operations. However, for GPU applications, some
device restore operations have no logical dependencies on the common
restore operations and can run in parallel with them to speed up the
process.

Instead of launching a thread in child processes for parallelization,
this patch chooses to add a new hook, `POST_FORKING`, in the main CRIU
process to handle these restore operations. This is because the
restoration of memory state in the restore blob is one of the most
time-consuming parts of all restore logic. The main CRIU process can
easily parallelize these operations, whereas parallelizing in threads
within child processes is challenging.

- POST_FORKING

*POST_FORKING: Hook allowing the main CRIU process to perform plugin
restore operations.

Signed-off-by: Yanning Yang <yangyanning@sjtu.edu.cn>
2025-05-17 13:36:36 -07:00
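A hedged sketch of how a plugin attaches to the new stage, compiled against CRIU's plugin headers; `my_plugin_post_forking` is a hypothetical name, not part of this series:

#include <errno.h>

#include "criu-plugin.h"
#include "criu-log.h"

/* Runs once in the main CRIU process, after the task tree has been forked.
 * The plugin's usual CR_PLUGIN_REGISTER() boilerplate is omitted here. */
int my_plugin_post_forking(void)
{
	pr_info("POST_FORKING: start background restore work\n");
	/* Return -ENOTSUP instead if the plugin has nothing to do at this stage. */
	return 0;
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, my_plugin_post_forking)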
15 changed files with 807 additions and 68 deletions

View File

@ -15,6 +15,7 @@ Checkpoint / Restore inside a docker container
Pytorch
Tensorflow
Using CRIU Image Streamer
Parallel Restore
DESCRIPTION
-----------

View File

@ -1396,7 +1396,7 @@ static int dump_zombies(void)
item->sid = pps_buf.sid;
item->pgid = pps_buf.pgid;
BUG_ON(!list_empty(&item->children));
BUG_ON(has_children(item));
if (!item->sid) {
pr_err("A session leader of zombie process %d(%d) is outside of its pid namespace\n",

View File

@ -2132,6 +2132,9 @@ static int restore_root_task(struct pstree_item *init)
__restore_switch_stage(CR_STATE_FORKING);
skip_ns_bouncing:
ret = run_plugins(POST_FORKING);
if (ret < 0 && ret != -ENOTSUP)
goto out_kill;
ret = restore_wait_inprogress_tasks();
if (ret < 0)
@ -2363,41 +2366,47 @@ int cr_restore_tasks(void)
return 1;
if (check_img_inventory(/* restore = */ true) < 0)
goto err;
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;
if (init_stats(RESTORE_STATS))
goto err;
return -1;
if (lsm_check_opts())
goto err;
return -1;
timing_start(TIME_RESTORE);
if (cpu_init() < 0)
goto err;
return -1;
if (vdso_init_restore())
goto err;
return -1;
if (tty_init_restore())
goto err;
return -1;
if (opts.cpu_cap & CPU_CAP_IMAGE) {
if (cpu_validate_cpuinfo())
goto err;
return -1;
}
if (prepare_task_entries() < 0)
goto err;
return -1;
if (prepare_pstree() < 0)
goto err;
return -1;
if (fdstore_init())
goto err;
return -1;
/*
* For the AMDGPU plugin, its parallel restore feature needs to use fdstore to store
* its socket file descriptor. This allows the main process and the target process to
* communicate with each other through this file descriptor. Therefore, cr_plugin_init
* must be called after fdstore_init.
*/
if (cr_plugin_init(CR_PLUGIN_STAGE__RESTORE))
return -1;
if (inherit_fd_move_to_fdstore())
goto err;

View File

@ -60,6 +60,8 @@ enum {
CR_PLUGIN_HOOK__CHECKPOINT_DEVICES = 11,
CR_PLUGIN_HOOK__POST_FORKING = 12,
CR_PLUGIN_HOOK__MAX
};
@ -78,6 +80,7 @@ DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__UPDATE_VMA_MAP, const char *path, const
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__PAUSE_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__CHECKPOINT_DEVICES, int pid);
DECLARE_PLUGIN_HOOK_ARGS(CR_PLUGIN_HOOK__POST_FORKING, void);
enum {
CR_PLUGIN_STAGE__DUMP,
@ -152,5 +155,6 @@ typedef int(cr_plugin_handle_device_vma_t)(int fd, const struct stat *stat);
typedef int(cr_plugin_update_vma_map_t)(const char *path, const uint64_t addr, const uint64_t old_pgoff,
uint64_t *new_pgoff, int *plugin_fd);
typedef int(cr_plugin_resume_devices_late_t)(int pid);
typedef int(cr_plugin_post_forking_t)(void);
#endif /* __CRIU_PLUGIN_H__ */

View File

@ -104,6 +104,7 @@ extern void pstree_insert_pid(struct pid *pid_node);
extern struct pid *pstree_pid_by_virt(pid_t pid);
extern struct pstree_item *root_item;
extern bool has_children(struct pstree_item *item);
extern struct pstree_item *pstree_item_next(struct pstree_item *item);
#define for_each_pstree_item(pi) for (pi = root_item; pi != NULL; pi = pstree_item_next(pi))

View File

@ -59,6 +59,7 @@ static cr_plugin_desc_t *cr_gen_plugin_desc(void *h, char *path)
__assign_hook(RESUME_DEVICES_LATE, "cr_plugin_resume_devices_late");
__assign_hook(PAUSE_DEVICES, "cr_plugin_pause_devices");
__assign_hook(CHECKPOINT_DEVICES, "cr_plugin_checkpoint_devices");
__assign_hook(POST_FORKING, "cr_plugin_post_forking");
#undef __assign_hook

View File

@ -182,7 +182,7 @@ void free_pstree(struct pstree_item *root_item)
struct pstree_item *item = root_item, *parent;
while (item) {
if (!list_empty(&item->children)) {
if (has_children(item)) {
item = list_first_entry(&item->children, struct pstree_item, sibling);
continue;
}
@ -244,10 +244,15 @@ int init_pstree_helper(struct pstree_item *ret)
return 0;
}
bool has_children(struct pstree_item *item)
{
return !list_empty(&item->children);
}
/* Deep first search on children */
struct pstree_item *pstree_item_next(struct pstree_item *item)
{
if (!list_empty(&item->children))
if (has_children(item))
return list_first_entry(&item->children, struct pstree_item, sibling);
while (item->parent) {

View File

@ -1008,7 +1008,7 @@ static int collect_task(struct pstree_item *item)
if (ret < 0)
goto err_close;
if ((item->pid->state == TASK_DEAD) && !list_empty(&item->children)) {
if ((item->pid->state == TASK_DEAD) && has_children(item)) {
pr_err("Zombie with children?! O_o Run, run, run!\n");
goto err_close;
}

View File

@ -27,7 +27,7 @@ endif
criu-amdgpu.pb-c.c: criu-amdgpu.proto
protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
amdgpu_plugin_clean:

View File

@ -3,7 +3,8 @@ Supporting ROCm with CRIU
_Felix Kuehling <Felix.Kuehling@amd.com>_<br>
_Rajneesh Bardwaj <Rajneesh.Bhardwaj@amd.com>_<br>
_David Yat Sin <David.YatSin@amd.com>_
_David Yat Sin <David.YatSin@amd.com>_<br>
_Yanning Yang <yangyanning@sjtu.edu.cn>_
# Introduction
@ -224,6 +225,26 @@ to resume execution on the GPUs.
*This new plugin is enabled by the new hook `__RESUME_DEVICES_LATE` in our RFC
patch series.*
## Restoring BO content in parallel
Restoring BO content is an important part of restoring GPU state and usually
takes a significant amount of time. A possible place for this procedure is the
`cr_plugin_restore_file` hook. However, restoring there blocks the target
process from performing other restore operations, which hinders further
optimization of the restore process.
Therefore, a new plugin hook that runs in the master restore process is
introduced, and it interacts with the `cr_plugin_restore_file` hook to complete
the restore of BO content. Specifically, the target process only needs to send
the relevant BOs to the master restore process, and this new hook handles the
actual restore of the buffer objects. This way, the target process can perform
other restore operations while the BO content is being restored, accelerating
the restore procedure. This is an implementation of the gCROP method proposed
in the ACM SoCC'24 paper: [On-demand and Parallel Checkpoint/Restore for GPU
Applications](https://dl.acm.org/doi/10.1145/3698038.3698510).
*This optimization technique is enabled by the `__POST_FORKING` hook.*
## Other CRIU changes
In addition to the new plugins, we need to make some changes to CRIU itself to

View File

@ -28,11 +28,13 @@
#include "xmalloc.h"
#include "criu-log.h"
#include "files.h"
#include "pstree.h"
#include "common/list.h"
#include "amdgpu_plugin_drm.h"
#include "amdgpu_plugin_util.h"
#include "amdgpu_plugin_topology.h"
#include "amdgpu_socket_utils.h"
#include "img-streamer.h"
#include "image.h"
@ -64,6 +66,18 @@ bool plugin_added_to_inventory = false;
bool plugin_disabled = false;
/*
* In the case of a single process (the common case), parallel restore can effectively
* reduce the restore latency. In the case of multiple processes, state is already
* restored in parallel within the different processes, so this optimization brings
* no further improvement and is disabled by default in that case. The
* parallel_disabled flag controls whether the optimization is enabled.
*/
bool parallel_disabled = false;
pthread_t parallel_thread = 0;
int parallel_thread_result = 0;
/**************************************************************************************************/
/* Call ioctl, restarting if it is interrupted */
@ -351,6 +365,15 @@ int amdgpu_plugin_init(int stage)
maps_init(&restore_maps);
if (stage == CR_PLUGIN_STAGE__RESTORE) {
if (has_children(root_item)) {
pr_info("Parallel restore disabled\n");
parallel_disabled = true;
} else {
if (install_parallel_sock() < 0) {
pr_err("Failed to install parallel socket\n");
return -1;
}
}
/* Default Values */
kfd_fw_version_check = true;
kfd_sdma_fw_version_check = true;
@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
struct thread_data *thread_datas;
struct thread_data *thread_datas = NULL;
int thread_i, ret = 0;
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
int offset = 0;
for (int i = 0; i < e->num_of_bos; i++) {
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
}
}
thread_i = 0;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
uint32_t target_gpu_id;
if (!parallel_disabled) {
parallel_restore_cmd restore_cmd;
pr_info("Begin to send parallel restore cmd\n");
ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd);
if (ret)
goto exit_parallel;
if (!e->device_entries[i]->gpu_id)
continue;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
uint32_t target_gpu_id;
struct tp_node *dev;
/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
if (!e->device_entries[i]->gpu_id)
continue;
/* We need the fd for actual_gpu_id */
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit_parallel;
}
parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd);
for (int j = 0; j < e->num_of_bos; j++) {
if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id)
continue;
if (bo_buckets[j].alloc_flags &
(KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id,
bo_buckets[j].size, offset, &restore_cmd);
offset += bo_buckets[j].size;
}
}
}
ret = send_parallel_restore_cmd(&restore_cmd);
exit_parallel:
free_parallel_restore_cmd(&restore_cmd);
} else {
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
thread_datas[thread_i].id = id;
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
thread_datas[thread_i].pid = e->pid;
thread_datas[thread_i].num_of_bos = e->num_of_bos;
thread_i = 0;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
uint32_t target_gpu_id;
thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[thread_i].drm_fd < 0) {
ret = -thread_datas[thread_i].drm_fd;
goto exit;
if (!e->device_entries[i]->gpu_id)
continue;
/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
/* We need the fd for actual_gpu_id */
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit;
}
thread_datas[thread_i].id = id;
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
thread_datas[thread_i].pid = e->pid;
thread_datas[thread_i].num_of_bos = e->num_of_bos;
thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[thread_i].drm_fd < 0) {
ret = -thread_datas[thread_i].drm_fd;
goto exit;
}
ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
(void *)&thread_datas[thread_i]);
if (ret_thread) {
pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
ret = -ret_thread;
goto exit;
}
thread_i++;
}
ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
(void *)&thread_datas[thread_i]);
if (ret_thread) {
pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
ret = -ret_thread;
goto exit;
}
thread_i++;
}
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
}
}
}
exit:
@ -1546,8 +1609,8 @@ exit:
if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD)
close(bo_buckets[i].dmabuf_fd);
}
xfree(thread_datas);
if (thread_datas)
xfree(thread_datas);
return ret;
}
@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
if (plugin_disabled)
return -ENOTSUP;
if (!parallel_disabled) {
pr_info("Close parallel restore server\n");
if (close_parallel_restore_server()) {
pr_err("Close parallel restore server fail\n");
return -1;
}
exit_code = pthread_join(parallel_thread, NULL);
if (exit_code) {
pr_err("Failed to join parallel thread ret:%d\n", exit_code);
return -1;
}
if (parallel_thread_result) {
pr_err("Parallel restore fail\n");
return parallel_thread_result;
}
}
pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@ -1862,3 +1943,244 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)
int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
{
return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
}
int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
{
int ret = 0;
int drm_fd = -1;
uint32_t major, minor;
struct amdgpu_gpu_info gpu_info = { 0 };
drm_fd = open_drm_render_device(dev_minor);
if (drm_fd < 0) {
return drm_fd;
}
ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev);
if (ret) {
pr_perror("Failed to initialize device");
goto err;
}
ret = amdgpu_query_gpu_info(*h_dev, &gpu_info);
if (ret) {
pr_perror("failed to query gpuinfo via libdrm");
goto err;
}
*max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
SDMA_LINEAR_COPY_MAX_SIZE - 1;
return 0;
err:
amdgpu_device_deinitialize(*h_dev);
return ret;
}
FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size)
{
char img_path[PATH_MAX];
size_t image_size = 0;
FILE *bo_contents_fp = NULL;
snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id);
bo_contents_fp = open_img_file(img_path, false, &image_size);
if (!bo_contents_fp) {
pr_perror("Cannot fopen %s", img_path);
return NULL;
}
if (tot_size != image_size) {
pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size);
fclose(bo_contents_fp);
return NULL;
}
return bo_contents_fp;
}
struct parallel_thread_data {
pthread_t thread;
uint32_t gpu_id;
int minor;
parallel_restore_cmd *restore_cmd;
int ret;
};
void *parallel_restore_bo_contents(void *_thread_data)
{
struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data;
amdgpu_device_handle h_dev;
uint64_t max_copy_size;
size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0;
FILE *bo_contents_fp = NULL;
parallel_restore_entry *entry;
parallel_restore_cmd *restore_cmd = thread_data->restore_cmd;
int ret = 0;
int offset = 0;
void *buffer = NULL;
ret = init_dev(thread_data->minor, &h_dev, &max_copy_size);
if (ret) {
goto err;
}
for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) {
total_bo_size += restore_cmd->entries[i].size;
max_bo_size = max(restore_cmd->entries[i].size, max_bo_size);
}
}
buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size;
bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size);
if (bo_contents_fp == NULL) {
ret = -1;
goto err_sdma;
}
offset = ftell(bo_contents_fp);
posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
if (!buffer) {
pr_perror("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.");
ret = -ENOMEM;
goto err_sdma;
}
for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id)
continue;
entry = &restore_cmd->entries[i];
fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer,
buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
if (ret) {
pr_err("Failed to fill the BO using sDMA: bo_buckets[%d]\n", i);
goto err_sdma;
}
}
err_sdma:
if (bo_contents_fp)
fclose(bo_contents_fp);
if (buffer)
xfree(buffer);
amdgpu_device_deinitialize(h_dev);
err:
thread_data->ret = ret;
return NULL;
}
void *restore_device_parallel_worker(void *arg)
{
while (1) {
parallel_restore_cmd restore_cmd = { 0 };
struct parallel_thread_data *thread_datas = NULL;
int ret;
int error_occurred = 0, join_ret = 0, created_threads = 0;
ret = recv_parallel_restore_cmd(&restore_cmd);
if (ret) {
if (ret == 1) {
*(int *)arg = 0;
goto exit;
}
goto err;
}
thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num);
if (!thread_datas) {
ret = -ENOMEM;
goto err;
}
for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) {
thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id;
thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor;
thread_datas[created_threads].restore_cmd = &restore_cmd;
ret = pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents,
(void *)&thread_datas[created_threads]);
if (ret) {
pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret);
error_occurred = 1;
break;
}
}
for (int i = 0; i < created_threads; i++) {
join_ret = pthread_join(thread_datas[i].thread, NULL);
if (join_ret != 0) {
pr_err("pthread_join failed for Thread[0x%x] ret:%d\n",
thread_datas[i].gpu_id, join_ret);
if (!error_occurred) {
ret = join_ret;
error_occurred = 1;
}
}
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
/* Check thread return value */
if (thread_datas[i].ret && !error_occurred) {
ret = thread_datas[i].ret;
error_occurred = 1;
}
}
if (thread_datas)
xfree(thread_datas);
err:
free_parallel_restore_cmd(&restore_cmd);
if (ret) {
*(int *)arg = ret;
return NULL;
}
}
exit:
return NULL;
}
/*
* While the background thread is running, some processing functions (e.g., stop_cgroupd)
* in the main thread need to block SIGCHLD. To prevent interference from this background
* thread, SIGCHLD is blocked in this thread.
*/
static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg)
{
int ret = 0;
sigset_t blockmask, oldmask;
sigemptyset(&blockmask);
sigaddset(&blockmask, SIGCHLD);
sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
ret = pthread_create(newthread, NULL, f, arg);
if (ret) {
pr_err("Create worker thread fail: %d\n", ret);
return -1;
}
sigprocmask(SIG_SETMASK, &oldmask, NULL);
return 0;
}
int amdgpu_plugin_post_forking(void)
{
if (plugin_disabled)
return -ENOTSUP;
if (parallel_disabled)
return 0;
return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)

View File

@ -45,7 +45,7 @@ bool kfd_capability_check = true;
*/
int fd_next = -1;
static int open_drm_render_device(int minor)
int open_drm_render_device(int minor)
{
char path[128];
int fd, ret_fd;

View File

@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index);
int open_drm_render_device(int minor);
int node_get_drm_render_device(struct tp_node *node);
void sys_close_drm_render_devices(struct tp_system *sys);

View File

@ -0,0 +1,320 @@
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include "amdgpu_socket_utils.h"
#include "criu-log.h"
#include "common/scm.h"
#include "fdstore.h"
#include "util-pie.h"
#include "util.h"
int parallel_socket_addr_len;
struct sockaddr_un parallel_socket_addr;
int parallel_socket_id = 0;
static void amdgpu_socket_name_gen(struct sockaddr_un *addr, int *len)
{
addr->sun_family = AF_UNIX;
snprintf(addr->sun_path, UNIX_PATH_MAX, "x/criu-amdgpu-parallel-%s", criu_run_id);
*len = SUN_LEN(addr);
*addr->sun_path = '\0';
}
int install_parallel_sock(void)
{
int ret = 0;
int sock_fd;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("socket creation failed");
return -1;
}
amdgpu_socket_name_gen(&parallel_socket_addr, &parallel_socket_addr_len);
ret = bind(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("bind failed");
goto err;
}
ret = listen(sock_fd, SOMAXCONN);
if (ret < 0) {
pr_perror("listen failed");
goto err;
}
parallel_socket_id = fdstore_add(sock_fd);
if (parallel_socket_id < 0) {
ret = -1;
goto err;
}
err:
close(sock_fd);
return ret;
}
void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset,
parallel_restore_cmd *restore_cmd)
{
parallel_restore_entry *restore_entry = &restore_cmd->entries[restore_cmd->cmd_head.entry_num];
restore_entry->gpu_id = gpu_id;
restore_entry->write_id = restore_cmd->cmd_head.fd_write_num;
restore_entry->write_offset = 0;
restore_entry->read_offset = offset;
restore_entry->size = size;
restore_cmd->fds_write[restore_cmd->cmd_head.fd_write_num] = dmabuf_fd;
restore_cmd->cmd_head.entry_num += 1;
restore_cmd->cmd_head.fd_write_num += 1;
}
void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd)
{
restore_cmd->gpu_ids[restore_cmd->cmd_head.gpu_num] = (parallel_gpu_info){ gpu_id, minor };
restore_cmd->cmd_head.gpu_num += 1;
}
static int send_metadata(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Send parallel restore command head fail");
return -1;
}
return 0;
}
static int send_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) {
pr_perror("Send GPU ids of parallel restore command fail");
return -1;
}
return 0;
}
static int send_cmds(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send(sock_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) {
pr_perror("Send parallel restore command fail");
return -1;
}
return 0;
}
static int send_dmabuf_fds(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (send_fds(sock_fd, NULL, 0, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) {
pr_perror("Send dmabuf fds fail");
return -1;
}
return 0;
}
int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
int sock_fd;
int ret = 0;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("Socket creation failed");
return -1;
}
ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("Connect failed");
goto err;
}
ret = send_metadata(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_gpu_ids(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_cmds(sock_fd, restore_cmd);
if (ret) {
goto err;
}
ret = send_dmabuf_fds(sock_fd, restore_cmd);
err:
close(sock_fd);
return ret;
}
int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd)
{
restore_cmd->cmd_head.id = id;
restore_cmd->cmd_head.fd_write_num = 0;
restore_cmd->cmd_head.entry_num = 0;
restore_cmd->cmd_head.gpu_num = 0;
restore_cmd->gpu_ids = xzalloc(gpu_num * sizeof(parallel_gpu_info));
if (!restore_cmd->gpu_ids)
return -ENOMEM;
restore_cmd->fds_write = xzalloc(num * sizeof(int));
if (!restore_cmd->fds_write)
return -ENOMEM;
restore_cmd->entries = xzalloc(num * sizeof(parallel_restore_entry));
if (!restore_cmd->entries)
return -ENOMEM;
return 0;
}
void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
if (restore_cmd->gpu_ids)
xfree(restore_cmd->gpu_ids);
if (restore_cmd->fds_write)
xfree(restore_cmd->fds_write);
if (restore_cmd->entries)
xfree(restore_cmd->entries);
}
static int init_parallel_restore_cmd_by_head(parallel_restore_cmd *restore_cmd)
{
restore_cmd->gpu_ids = xzalloc(restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info));
if (!restore_cmd->gpu_ids)
return -ENOMEM;
restore_cmd->fds_write = xzalloc(restore_cmd->cmd_head.fd_write_num * sizeof(int));
if (!restore_cmd->fds_write)
return -ENOMEM;
restore_cmd->entries = xzalloc(restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry));
if (!restore_cmd->entries)
return -ENOMEM;
return 0;
}
static int check_quit_cmd(parallel_restore_cmd *restore_cmd)
{
return restore_cmd->cmd_head.fd_write_num == 0;
}
static int recv_metadata(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(client_fd, &restore_cmd->cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Recv parallel restore command head fail");
return -1;
}
return 0;
}
static int recv_cmds(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(client_fd, restore_cmd->entries, restore_cmd->cmd_head.entry_num * sizeof(parallel_restore_entry), 0) < 0) {
pr_perror("Recv parallel restore command fail");
return -1;
}
return 0;
}
static int recv_gpu_ids(int sock_fd, parallel_restore_cmd *restore_cmd)
{
if (recv(sock_fd, restore_cmd->gpu_ids, restore_cmd->cmd_head.gpu_num * sizeof(parallel_gpu_info), 0) < 0) {
pr_perror("Send GPU ids of parallel restore command fail");
return -1;
}
return 0;
}
static int recv_dmabuf_fds(int client_fd, parallel_restore_cmd *restore_cmd)
{
if (recv_fds(client_fd, restore_cmd->fds_write, restore_cmd->cmd_head.fd_write_num, 0, 0) < 0) {
pr_perror("Recv dmabuf fds fail");
return -1;
}
return 0;
}
int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd)
{
int sock_fd, client_fd;
int ret = 0;
sock_fd = fdstore_get(parallel_socket_id);
if (sock_fd < 0)
return -1;
client_fd = accept(sock_fd, NULL, NULL);
if (client_fd < 0) {
ret = client_fd;
goto err_accept;
}
ret = recv_metadata(client_fd, restore_cmd);
if (ret) {
goto err;
}
// Return 1 to quit
if (check_quit_cmd(restore_cmd)) {
ret = 1;
goto err;
}
ret = init_parallel_restore_cmd_by_head(restore_cmd);
if (ret) {
goto err;
}
ret = recv_gpu_ids(client_fd, restore_cmd);
if (ret) {
goto err;
}
ret = recv_cmds(client_fd, restore_cmd);
if (ret) {
goto err;
}
ret = recv_dmabuf_fds(client_fd, restore_cmd);
err:
close(client_fd);
err_accept:
close(sock_fd);
return ret;
}
int close_parallel_restore_server(void)
{
int sock_fd;
int ret = 0;
parallel_restore_cmd_head cmd_head;
sock_fd = socket(PF_UNIX, SOCK_STREAM | SOCK_CLOEXEC, 0);
if (sock_fd < 0) {
pr_perror("Socket creation failed");
return -1;
}
ret = connect(sock_fd, (struct sockaddr *)&parallel_socket_addr, parallel_socket_addr_len);
if (ret < 0) {
pr_perror("Connect failed");
goto err;
}
memset(&cmd_head, 0, sizeof(parallel_restore_cmd_head));
if (send(sock_fd, &cmd_head, sizeof(parallel_restore_cmd_head), 0) < 0) {
pr_perror("Send parallel restore command head fail");
return -1;
}
err:
close(sock_fd);
return ret;
}

View File

@ -0,0 +1,54 @@
#ifndef __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__
#define __KFD_PLUGIN_AMDGPU_SOCKET_UTILS_H__
typedef struct {
int id;
int fd_write_num; /* The number of buffer objects to be restored. */
int entry_num; /* The number of restore commands.*/
int gpu_num;
} parallel_restore_cmd_head;
typedef struct {
int gpu_id;
int minor;
} parallel_gpu_info;
typedef struct {
int gpu_id;
int write_id;
uint64_t read_offset;
uint64_t write_offset;
uint64_t size;
} parallel_restore_entry;
typedef struct {
parallel_restore_cmd_head cmd_head;
int *fds_write;
parallel_gpu_info *gpu_ids;
parallel_restore_entry *entries;
} parallel_restore_cmd;
/*
* For parallel_restore, a background thread in the main CRIU process is used to restore the GPU
* buffer object. However, initially, the ownership of these buffer objects and the metadata for
* restoration are all with the target process. Therefore, we introduce a series of functions to
* help the target process send these tasks to the main CRIU process.
*/
int init_parallel_restore_cmd(int num, int id, int gpu_num, parallel_restore_cmd *restore_cmd);
void free_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
int install_parallel_sock(void);
int send_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
int recv_parallel_restore_cmd(parallel_restore_cmd *restore_cmd);
void parallel_restore_bo_add(int dmabuf_fd, int gpu_id, uint64_t size, uint64_t offset,
parallel_restore_cmd *restore_cmd);
void parallel_restore_gpu_id_add(int gpu_id, int minor, parallel_restore_cmd *restore_cmd);
int close_parallel_restore_server(void);
#endif