From bfb4a3d8422923c90867c8ec4a2e697d0a45049c Mon Sep 17 00:00:00 2001
From: Yanning Yang
Date: Wed, 15 Jan 2025 06:38:27 +0000
Subject: [PATCH] plugins/amdgpu: Implement parallel restore

Offload the restoration of buffer object contents to the main CRIU
process so that it runs in parallel with the rest of the restore logic,
most notably the time-consuming restoration of the memory state in the
restore blob. Restoring buffer object contents usually accounts for a
significant share of the restore time of GPU applications, so
overlapping it with other work reduces the overall restore time.

The change has three parts: the first replaces the restoration of
buffer objects in the target process with sending a parallel restore
command to the main CRIU process; the second implements the
POST_FORKING hook in the amdgpu plugin, where the main CRIU process
performs the buffer object content restoration; the third stops the
parallel worker thread in the RESUME_DEVICES_LATE hook.

This optimization only targets the single-process case (the common
case); when multiple processes are restored, their states are already
restored in parallel, so offloading brings no further improvement. In
all other scenarios the plugin falls back to the original method. This
is controlled by the new `parallel_disabled` flag.

Signed-off-by: Yanning Yang
---
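Note (editor's sketch, not part of the patch): amdgpu_socket_utils.{c,h}
are introduced separately in this series, so the layout of the parallel
restore command is not visible in this diff. The definitions below are a
plausible reconstruction inferred from the call sites in
restore_bo_data() and restore_device_parallel_worker(); the names and
field types are assumptions, not the actual code.

#include <stdint.h>

/* Hypothetical sketch of the command exchanged over the parallel socket. */
typedef struct {
	int id;        /* checkpoint id, used to locate the per-GPU pages image */
	int entry_num; /* number of BO entries in entries[] */
	int gpu_num;   /* number of (gpu_id, render minor) pairs in gpu_ids[] */
} parallel_restore_cmd_head;

typedef struct {
	uint32_t gpu_id;      /* user gpu_id that owns this BO */
	uint64_t size;        /* BO size in bytes */
	uint64_t read_offset; /* offset of the BO contents within the pages image */
	int write_id;         /* index into fds_write[] for the BO's dmabuf fd */
} parallel_restore_entry;

typedef struct {
	parallel_restore_cmd_head cmd_head;
	struct {
		uint32_t gpu_id; /* user gpu_id */
		int minor;       /* destination DRM render minor */
	} *gpu_ids;
	parallel_restore_entry *entries;
	int *fds_write; /* dmabuf fds received together with the command */
} parallel_restore_cmd;

Under these assumptions the flow is: the target process builds the
command in restore_bo_data() and sends it through the socket installed
in amdgpu_plugin_init(); the worker thread created by the POST_FORKING
hook receives it with recv_parallel_restore_cmd() (which, judging by the
worker loop, returns 1 when close_parallel_restore_server() shuts the
socket down), opens each GPU by render minor, and fills the BOs from the
pages image via SDMA.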
 plugins/amdgpu/Makefile                 |   2 +-
 plugins/amdgpu/amdgpu_plugin.c          | 420 +++++++++++++++++++++---
 plugins/amdgpu/amdgpu_plugin_topology.c |   2 +-
 plugins/amdgpu/amdgpu_plugin_topology.h |   1 +
 4 files changed, 374 insertions(+), 51 deletions(-)

diff --git a/plugins/amdgpu/Makefile b/plugins/amdgpu/Makefile
index a20d1d163..4bf5e499f 100644
--- a/plugins/amdgpu/Makefile
+++ b/plugins/amdgpu/Makefile
@@ -27,7 +27,7 @@ endif
 criu-amdgpu.pb-c.c: criu-amdgpu.proto
 	protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
 
-amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c
+amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_drm.c amdgpu_plugin_topology.c amdgpu_plugin_util.c criu-amdgpu.pb-c.c amdgpu_socket_utils.c
 	$(CC) $(PLUGIN_CFLAGS) $(shell $(COMPEL) includes) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS) $(LIBDRM_INC)
 
 amdgpu_plugin_clean:
diff --git a/plugins/amdgpu/amdgpu_plugin.c b/plugins/amdgpu/amdgpu_plugin.c
index 96c086162..69194fbc7 100644
--- a/plugins/amdgpu/amdgpu_plugin.c
+++ b/plugins/amdgpu/amdgpu_plugin.c
@@ -28,11 +28,13 @@
 #include "xmalloc.h"
 #include "criu-log.h"
 #include "files.h"
+#include "pstree.h"
 #include "common/list.h"
 
 #include "amdgpu_plugin_drm.h"
 #include "amdgpu_plugin_util.h"
 #include "amdgpu_plugin_topology.h"
+#include "amdgpu_socket_utils.h"
 
 #include "img-streamer.h"
 #include "image.h"
@@ -64,6 +66,18 @@
 bool plugin_added_to_inventory = false;
 
 bool plugin_disabled = false;
 
+/*
+ * In the case of a single process (the common case), this optimization can
+ * effectively reduce restore latency by restoring buffer object contents in
+ * parallel with the rest of the restore. In the case of multiple processes,
+ * states are already restored in parallel within the different processes, so
+ * this optimization brings no further improvement and is disabled. The
+ * parallel_disabled flag controls whether the optimization is enabled.
+ */
+bool parallel_disabled = false;
+
+pthread_t parallel_thread = 0;
+int parallel_thread_result = 0;
 
 /**************************************************************************************************/
 /* Call ioctl, restarting if it is interrupted */
@@ -351,6 +365,15 @@
 	maps_init(&restore_maps);
 
 	if (stage == CR_PLUGIN_STAGE__RESTORE) {
+		if (has_children(root_item)) {
+			pr_info("Parallel restore disabled\n");
+			parallel_disabled = true;
+		} else {
+			if (install_parallel_sock() < 0) {
+				pr_err("Failed to install parallel socket\n");
+				return -1;
+			}
+		}
 		/* Default Values */
 		kfd_fw_version_check = true;
 		kfd_sdma_fw_version_check = true;
@@ -1439,14 +1462,9 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
 
 static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
 {
-	struct thread_data *thread_datas;
+	struct thread_data *thread_datas = NULL;
 	int thread_i, ret = 0;
-
-	thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
-	if (!thread_datas) {
-		ret = -ENOMEM;
-		goto exit;
-	}
+	uint64_t offset = 0;
 
 	for (int i = 0; i < e->num_of_bos; i++) {
 		struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
@@ -1489,56 +1507,101 @@ static int restore_bo_data(int id, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
 		}
 	}
 
-	thread_i = 0;
-	for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
-		struct tp_node *dev;
-		int ret_thread = 0;
-		uint32_t target_gpu_id;
+	if (!parallel_disabled) {
+		parallel_restore_cmd restore_cmd;
 
+		pr_info("Begin sending parallel restore command\n");
+		ret = init_parallel_restore_cmd(e->num_of_bos, id, e->num_of_gpus, &restore_cmd);
+		if (ret)
+			goto exit_parallel;
 
-		if (!e->device_entries[i]->gpu_id)
-			continue;
+		for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
+			uint32_t target_gpu_id;
+			struct tp_node *dev;
 
-		/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
-		target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
+			if (!e->device_entries[i]->gpu_id)
+				continue;
 
-		/* We need the fd for actual_gpu_id */
-		dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
-		if (!dev) {
-			pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
-			ret = -ENODEV;
+			target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
+			dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
+			if (!dev) {
+				pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
+				ret = -ENODEV;
+				goto exit_parallel;
+			}
+			parallel_restore_gpu_id_add(e->device_entries[i]->gpu_id, dev->drm_render_minor, &restore_cmd);
+
+			for (int j = 0; j < e->num_of_bos; j++) {
+				if (bo_buckets[j].gpu_id != e->device_entries[i]->gpu_id)
+					continue;
+				if (bo_buckets[j].alloc_flags &
+				    (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+					parallel_restore_bo_add(bo_buckets[j].dmabuf_fd, bo_buckets[j].gpu_id,
+								bo_buckets[j].size, offset, &restore_cmd);
+					offset += bo_buckets[j].size;
+				}
+			}
+		}
+		ret = send_parallel_restore_cmd(&restore_cmd);
+exit_parallel:
+		free_parallel_restore_cmd(&restore_cmd);
+	} else {
+		thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
+		if (!thread_datas) {
+			ret = -ENOMEM;
 			goto exit;
 		}
 
-		thread_datas[thread_i].id = id;
-		thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
-		thread_datas[thread_i].bo_buckets = bo_buckets;
-		thread_datas[thread_i].bo_entries = e->bo_entries;
-		thread_datas[thread_i].pid = e->pid;
-		thread_datas[thread_i].num_of_bos = e->num_of_bos;
+		thread_i = 0;
+		for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
+			struct tp_node *dev;
+			int ret_thread = 0;
+			uint32_t target_gpu_id;
 
-	thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
-	if (thread_datas[thread_i].drm_fd < 0) {
-		ret = -thread_datas[thread_i].drm_fd;
-		goto exit;
+			if (!e->device_entries[i]->gpu_id)
+				continue;
+
+			/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
+			target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
+
+			/* We need the fd for actual_gpu_id */
+			dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
+			if (!dev) {
+				pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
+				ret = -ENODEV;
+				goto exit;
+			}
+
+			thread_datas[thread_i].id = id;
+			thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
+			thread_datas[thread_i].bo_buckets = bo_buckets;
+			thread_datas[thread_i].bo_entries = e->bo_entries;
+			thread_datas[thread_i].pid = e->pid;
+			thread_datas[thread_i].num_of_bos = e->num_of_bos;
+
+			thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
+			if (thread_datas[thread_i].drm_fd < 0) {
+				ret = -thread_datas[thread_i].drm_fd;
+				goto exit;
+			}
+
+			ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
+						    (void *)&thread_datas[thread_i]);
+			if (ret_thread) {
+				pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
+				ret = -ret_thread;
+				goto exit;
+			}
+			thread_i++;
 		}
 
-		ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
-					    (void *)&thread_datas[thread_i]);
-		if (ret_thread) {
-			pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
-			ret = -ret_thread;
-			goto exit;
-		}
-		thread_i++;
-	}
+		for (int i = 0; i < e->num_of_gpus; i++) {
+			pthread_join(thread_datas[i].thread, NULL);
+			pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
 
-	for (int i = 0; i < e->num_of_gpus; i++) {
-		pthread_join(thread_datas[i].thread, NULL);
-		pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
-
-		if (thread_datas[i].ret) {
-			ret = thread_datas[i].ret;
-			goto exit;
+			if (thread_datas[i].ret) {
+				ret = thread_datas[i].ret;
+				goto exit;
+			}
 		}
 	}
 exit:
@@ -1546,8 +1609,8 @@
 	for (int i = 0; i < e->num_of_bos; i++) {
 		if (bo_buckets[i].dmabuf_fd != KFD_INVALID_FD)
 			close(bo_buckets[i].dmabuf_fd);
 	}
-
-	xfree(thread_datas);
+	if (thread_datas)
+		xfree(thread_datas);
 	return ret;
 }
@@ -1836,6 +1899,24 @@ int amdgpu_plugin_resume_devices_late(int target_pid)
 	if (plugin_disabled)
 		return -ENOTSUP;
 
+	if (!parallel_disabled) {
+		pr_info("Closing parallel restore server\n");
+		if (close_parallel_restore_server()) {
+			pr_err("Failed to close parallel restore server\n");
+			return -1;
+		}
+
+		exit_code = pthread_join(parallel_thread, NULL);
+		if (exit_code) {
+			pr_err("Failed to join parallel thread ret:%d\n", exit_code);
+			return -1;
+		}
+		if (parallel_thread_result) {
+			pr_err("Parallel restore failed\n");
+			return parallel_thread_result;
+		}
+	}
+
 	pr_info("Inside %s for target pid = %d\n", __func__, target_pid);
 
 	fd = open(AMDGPU_KFD_DEVICE, O_RDWR | O_CLOEXEC);
@@ -1862,3 +1943,244 @@
 }
 
 CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__RESUME_DEVICES_LATE, amdgpu_plugin_resume_devices_late)
+
+int sdma_copy_bo_helper(uint64_t size, int fd, FILE *storage_fp, void *buffer, size_t buffer_size,
+			amdgpu_device_handle h_dev, uint64_t max_copy_size, enum sdma_op_type type)
+{
+	return sdma_copy_bo((struct kfd_criu_bo_bucket){ 0, size, 0, 0, 0, 0, fd, 0 }, storage_fp, buffer,
+			    buffer_size, h_dev, max_copy_size, type);
+}
+
+int init_dev(int dev_minor, amdgpu_device_handle *h_dev, uint64_t *max_copy_size)
+{
+	int ret = 0;
+	int drm_fd = -1;
+	uint32_t major, minor;
+
+	struct amdgpu_gpu_info gpu_info = { 0 };
+
+	drm_fd = open_drm_render_device(dev_minor);
+	if (drm_fd < 0) {
+		return drm_fd;
+	}
+
+	ret = amdgpu_device_initialize(drm_fd, &major, &minor, h_dev);
+	if (ret) {
+		pr_perror("Failed to initialize device");
+		return ret;
+	}
+
+	ret = amdgpu_query_gpu_info(*h_dev, &gpu_info);
+	if (ret) {
+		pr_perror("Failed to query gpu info via libdrm");
+		goto err;
+	}
+	*max_copy_size = (gpu_info.family_id >= AMDGPU_FAMILY_AI) ? SDMA_LINEAR_COPY_MAX_SIZE :
+								    SDMA_LINEAR_COPY_MAX_SIZE - 1;
+	return 0;
+err:
+	amdgpu_device_deinitialize(*h_dev);
+	return ret;
+}
+
+FILE *get_bo_contents_fp(int id, int gpu_id, size_t tot_size)
+{
+	char img_path[PATH_MAX];
+	size_t image_size = 0;
+	FILE *bo_contents_fp = NULL;
+
+	snprintf(img_path, sizeof(img_path), IMG_KFD_PAGES_FILE, id, gpu_id);
+	bo_contents_fp = open_img_file(img_path, false, &image_size);
+	if (!bo_contents_fp) {
+		pr_perror("Cannot fopen %s", img_path);
+		return NULL;
+	}
+
+	if (tot_size != image_size) {
+		pr_err("%s size mismatch (current:%ld:expected:%ld)\n", img_path, image_size, tot_size);
+		fclose(bo_contents_fp);
+		return NULL;
+	}
+	return bo_contents_fp;
+}
+
+struct parallel_thread_data {
+	pthread_t thread;
+	uint32_t gpu_id;
+	int minor;
+	parallel_restore_cmd *restore_cmd;
+	int ret;
+};
+
+void *parallel_restore_bo_contents(void *_thread_data)
+{
+	struct parallel_thread_data *thread_data = (struct parallel_thread_data *)_thread_data;
+	amdgpu_device_handle h_dev;
+	uint64_t max_copy_size;
+	size_t total_bo_size = 0, max_bo_size = 0, buffer_size = 0;
+	FILE *bo_contents_fp = NULL;
+	parallel_restore_entry *entry;
+	parallel_restore_cmd *restore_cmd = thread_data->restore_cmd;
+	int ret = 0;
+	long offset = 0;
+	void *buffer = NULL;
+
+	ret = init_dev(thread_data->minor, &h_dev, &max_copy_size);
+	if (ret) {
+		goto err;
+	}
+
+	for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
+		if (restore_cmd->entries[i].gpu_id == thread_data->gpu_id) {
+			total_bo_size += restore_cmd->entries[i].size;
+			max_bo_size = max(restore_cmd->entries[i].size, max_bo_size);
+		}
+	}
+
+	buffer_size = kfd_max_buffer_size > 0 ? min(kfd_max_buffer_size, max_bo_size) : max_bo_size;
+
+	bo_contents_fp = get_bo_contents_fp(restore_cmd->cmd_head.id, thread_data->gpu_id, total_bo_size);
+	if (bo_contents_fp == NULL) {
+		ret = -1;
+		goto err_sdma;
+	}
+	offset = ftell(bo_contents_fp);
+
+	ret = posix_memalign(&buffer, sysconf(_SC_PAGE_SIZE), buffer_size);
+	if (ret) {
+		pr_err("Failed to alloc aligned memory. Consider setting KFD_MAX_BUFFER_SIZE.\n");
+		ret = -ENOMEM;
+		goto err_sdma;
+	}
+
+	for (int i = 0; i < restore_cmd->cmd_head.entry_num; i++) {
+		if (restore_cmd->entries[i].gpu_id != thread_data->gpu_id)
+			continue;
+
+		entry = &restore_cmd->entries[i];
+		fseek(bo_contents_fp, entry->read_offset + offset, SEEK_SET);
+		ret = sdma_copy_bo_helper(entry->size, restore_cmd->fds_write[entry->write_id], bo_contents_fp, buffer,
+					  buffer_size, h_dev, max_copy_size, SDMA_OP_VRAM_WRITE);
+		if (ret) {
+			pr_err("Failed to fill the BO using sDMA: entries[%d]\n", i);
+			goto err_sdma;
+		}
+	}
+
+err_sdma:
+	if (bo_contents_fp)
+		fclose(bo_contents_fp);
+	if (buffer)
+		xfree(buffer);
+	amdgpu_device_deinitialize(h_dev);
+err:
+	thread_data->ret = ret;
+	return NULL;
+}
+
+void *restore_device_parallel_worker(void *arg)
+{
+	while (1) {
+		parallel_restore_cmd restore_cmd = { 0 };
+		struct parallel_thread_data *thread_datas = NULL;
+		int ret;
+		int error_occurred = 0, join_ret = 0, created_threads = 0;
+
+		ret = recv_parallel_restore_cmd(&restore_cmd);
+		if (ret) {
+			if (ret == 1) {
+				*(int *)arg = 0;
+				goto exit;
+			}
+			goto err;
+		}
+
+		thread_datas = xzalloc(sizeof(*thread_datas) * restore_cmd.cmd_head.gpu_num);
+		if (!thread_datas) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		for (; created_threads < restore_cmd.cmd_head.gpu_num; created_threads++) {
+			thread_datas[created_threads].gpu_id = restore_cmd.gpu_ids[created_threads].gpu_id;
+			thread_datas[created_threads].minor = restore_cmd.gpu_ids[created_threads].minor;
+			thread_datas[created_threads].restore_cmd = &restore_cmd;
+
+			ret = pthread_create(&thread_datas[created_threads].thread, NULL, parallel_restore_bo_contents,
+					     (void *)&thread_datas[created_threads]);
+			if (ret) {
+				pr_err("Failed to create thread[0x%x] ret:%d\n", thread_datas[created_threads].gpu_id, ret);
+				error_occurred = 1;
+				break;
+			}
+		}
+
+		for (int i = 0; i < created_threads; i++) {
+			join_ret = pthread_join(thread_datas[i].thread, NULL);
+			if (join_ret != 0) {
+				pr_err("pthread_join failed for Thread[0x%x] ret:%d\n",
+				       thread_datas[i].gpu_id, join_ret);
+				if (!error_occurred) {
+					ret = join_ret;
+					error_occurred = 1;
+				}
+			}
+
+			pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
+
+			/* Check the thread's own return value */
+			if (thread_datas[i].ret && !error_occurred) {
+				ret = thread_datas[i].ret;
+				error_occurred = 1;
+			}
+		}
+
+		if (thread_datas)
+			xfree(thread_datas);
+err:
+		free_parallel_restore_cmd(&restore_cmd);
+
+		if (ret) {
+			*(int *)arg = ret;
+			return NULL;
+		}
+	}
+exit:
+	return NULL;
+}
+
+/*
+ * While the background thread is running, some functions on the main thread's
+ * restore path (e.g., stop_cgroupd) need to block SIGCHLD. Block SIGCHLD in
+ * this worker thread as well so that it cannot interfere with them.
+ */
+static int back_thread_create(pthread_t *newthread, void *(*f)(void *), void *arg)
+{
+	int ret = 0;
+	sigset_t blockmask, oldmask;
+
+	sigemptyset(&blockmask);
+	sigaddset(&blockmask, SIGCHLD);
+	sigprocmask(SIG_BLOCK, &blockmask, &oldmask);
+
+	ret = pthread_create(newthread, NULL, f, arg);
+	if (ret) {
+		pr_err("Failed to create worker thread: %d\n", ret);
+		ret = -1;
+	}
+
+	sigprocmask(SIG_SETMASK, &oldmask, NULL);
+	return ret;
+}
+
+int amdgpu_plugin_post_forking(void)
+{
+	if (plugin_disabled)
+		return -ENOTSUP;
+
+	if (parallel_disabled)
+		return 0;
+
+	return back_thread_create(&parallel_thread, restore_device_parallel_worker, &parallel_thread_result);
+}
+CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__POST_FORKING, amdgpu_plugin_post_forking)
\ No newline at end of file
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.c b/plugins/amdgpu/amdgpu_plugin_topology.c
index 5b4396a0c..730f2e028 100644
--- a/plugins/amdgpu/amdgpu_plugin_topology.c
+++ b/plugins/amdgpu/amdgpu_plugin_topology.c
@@ -45,7 +45,7 @@ bool kfd_capability_check = true;
 */
 int fd_next = -1;
 
-static int open_drm_render_device(int minor)
+int open_drm_render_device(int minor)
 {
 	char path[128];
 	int fd, ret_fd;
diff --git a/plugins/amdgpu/amdgpu_plugin_topology.h b/plugins/amdgpu/amdgpu_plugin_topology.h
index c890e3dda..e19f8e7ce 100644
--- a/plugins/amdgpu/amdgpu_plugin_topology.h
+++ b/plugins/amdgpu/amdgpu_plugin_topology.h
@@ -118,6 +118,7 @@ struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32
 struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
 struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index);
 
+int open_drm_render_device(int minor);
 int node_get_drm_render_device(struct tp_node *node);
 void sys_close_drm_render_devices(struct tp_system *sys);
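
Note (editor's sketch, not part of the patch): the dmabuf fds collected by
parallel_restore_bo_add() in the target process must reach the main CRIU
process for the worker to write through them, and the standard way to move
open fds across processes over the AF_UNIX socket set up by
install_parallel_sock() is SCM_RIGHTS ancillary data. A minimal sketch of
that mechanism follows; the helper name and the 8-fds-per-message limit are
hypothetical, not what amdgpu_socket_utils.c necessarily does.

#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Hypothetical helper: send nfds open fds over a connected AF_UNIX socket
 * as SCM_RIGHTS ancillary data, alongside one data byte so the message is
 * never empty. The receiver gets duplicated fds referring to the same open
 * file descriptions (here, the dmabufs backing the BOs). */
static int send_fds(int sock, const int *fds, int nfds)
{
	char data = 0;
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(int) * 8)]; /* room for up to 8 fds */
		struct cmsghdr align;                  /* force proper alignment */
	} u;
	struct msghdr msg = { 0 };
	struct cmsghdr *cmsg;

	if (nfds <= 0 || nfds > 8)
		return -1;

	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = u.buf;
	msg.msg_controllen = CMSG_SPACE(sizeof(int) * nfds);

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * nfds);
	memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * nfds);

	return sendmsg(sock, &msg, 0) == 1 ? 0 : -1;
}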