Mirror of https://github.com/checkpoint-restore/criu (synced 2025-08-31 06:15:24 +00:00)

criu/plugin: Read and write BO contents in parallel

Implement multi-threaded code to read and write contents of each GPU
VRAM BOs in parallel in order to speed up dumping process when using
multiple GPUs.

Signed-off-by: David Yat Sin <david.yatsin@amd.com>
Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
This commit was authored by David Yat Sin on 2021-12-22 02:48:45 -05:00 and committed by Andrei Vagin.
parent ba9c62df24
commit a218fe0baa
4 changed files with 303 additions and 135 deletions

View File

@@ -12,6 +12,7 @@ include $(__nmk_dir)msg.mk
CC := gcc
PLUGIN_CFLAGS := -g -Wall -Werror -D _GNU_SOURCE -shared -nostartfiles -fPIC
PLUGIN_LDFLAGS := -lpthread
ifeq ($(CONFIG_AMDGPU),y)
all: $(DEPS_OK)
@@ -23,7 +24,7 @@ criu-amdgpu.pb-c.c: criu-amdgpu.proto
protoc-c --proto_path=. --c_out=. criu-amdgpu.proto
amdgpu_plugin.so: amdgpu_plugin.c amdgpu_plugin_topology.c criu-amdgpu.pb-c.c
$(CC) $(PLUGIN_CFLAGS) $^ -o $@ $(PLUGIN_INCLUDE)
$(CC) $(PLUGIN_CFLAGS) $^ -o $@ $(PLUGIN_INCLUDE) $(PLUGIN_LDFLAGS)
amdgpu_plugin_clean:
$(call msg-clean, $@)

View File

@@ -13,6 +13,7 @@
#include <sys/mman.h>
#include <sys/types.h>
#include <stdint.h>
#include <pthread.h>
#include "criu-plugin.h"
#include "plugin.h"
@@ -443,6 +444,17 @@ void amdgpu_plugin_fini(int stage, int ret)
CR_PLUGIN_REGISTER("amdgpu_plugin", amdgpu_plugin_init, amdgpu_plugin_fini)
/* Per-GPU worker context handed to the dump_bo_contents() /
 * restore_bo_contents() thread entry points. One instance per GPU; the
 * bo_buckets/bo_entries arrays are shared across threads, but each thread
 * only touches buckets whose gpu_id matches its own, so no locking is used. */
struct thread_data {
pthread_t thread;
uint64_t num_of_bos; /* total entries in bo_buckets/bo_entries (all GPUs) */
uint32_t gpu_id; /* this thread processes only buckets with this gpu_id */
pid_t pid; /* target pid; /proc/<pid>/mem is used for indirect BO I/O */
struct kfd_criu_bo_bucket *bo_buckets; /* shared BO bucket array (read-only here) */
BoEntry **bo_entries; /* image entries holding each BO's raw contents */
int drm_fd; /* DRM render-node fd for this GPU, used for mmap of BOs */
int ret; /* thread result: 0 on success, negative errno on failure */
};
int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
{
struct stat st_kfd, st_dri_min;
@@ -480,6 +492,181 @@ int amdgpu_plugin_handle_device_vma(int fd, const struct stat *st_buf)
}
CR_PLUGIN_REGISTER_HOOK(CR_PLUGIN_HOOK__HANDLE_DEVICE_VMA, amdgpu_plugin_handle_device_vma)
/*
 * Thread entry point: dump the contents of every VRAM/GTT BO belonging to
 * thread_data->gpu_id into the corresponding bo_entries rawdata buffers.
 *
 * Host-accessible (large-BAR) BOs are mmap'ed via the GPU's DRM fd and
 * copied directly; all other BOs are read through /proc/<pid>/mem at the
 * BO's process virtual address.
 *
 * The result is reported in thread_data->ret (0 or negative errno); the
 * pthread return value is always NULL.
 */
void *dump_bo_contents(void *_thread_data)
{
	int i, ret = 0;
	int num_bos = 0;
	struct thread_data *thread_data = (struct thread_data *)_thread_data;
	struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets;
	BoEntry **bo_info = thread_data->bo_entries;
	char *fname;
	int mem_fd = -1;

	pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id);

	if (asprintf(&fname, PROCPIDMEM, thread_data->pid) < 0) {
		/* On asprintf() failure the contents of fname are undefined,
		 * so it must not be printed or freed. */
		pr_perror("failed in asprintf");
		ret = -1;
		goto exit;
	}

	mem_fd = open(fname, O_RDONLY);
	if (mem_fd < 0) {
		pr_perror("Can't open %s for pid %d", fname, thread_data->pid);
		free(fname);
		ret = -errno;
		goto exit;
	}
	plugin_log_msg("Opened %s file for pid = %d\n", fname, thread_data->pid);
	free(fname);

	for (i = 0; i < thread_data->num_of_bos; i++) {
		if (bo_buckets[i].gpu_id != thread_data->gpu_id)
			continue;

		num_bos++;

		/* Only VRAM and GTT BOs carry contents that need dumping */
		if (!(bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) &&
		    !(bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT))
			continue;

		if (bo_info[i]->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
			void *addr;

			plugin_log_msg("amdgpu_plugin: large bar read possible\n");

			addr = mmap(NULL, bo_buckets[i].size, PROT_READ, MAP_SHARED, thread_data->drm_fd,
				    bo_buckets[i].offset);
			if (addr == MAP_FAILED) {
				pr_perror("amdgpu_plugin: mmap failed");
				ret = -errno;
				goto exit;
			}

			/* direct memcpy is possible on large bars */
			memcpy(bo_info[i]->rawdata.data, addr, bo_buckets[i].size);
			munmap(addr, bo_buckets[i].size);
		} else {
			ssize_t bo_size;

			plugin_log_msg("Reading BO contents with /proc/pid/mem\n");

			if (lseek(mem_fd, (off_t)bo_buckets[i].addr, SEEK_SET) == -1) {
				pr_perror("Can't lseek for BO offset for pid = %d", thread_data->pid);
				ret = -errno;
				goto exit;
			}

			/* read() returns ssize_t: keep the sign so -1 is not
			 * silently wrapped into a huge size_t value. */
			bo_size = read(mem_fd, bo_info[i]->rawdata.data, bo_info[i]->size);
			if (bo_size < 0 || (size_t)bo_size != bo_info[i]->size) {
				pr_perror("Can't read buffer");
				/* errno is only meaningful on a real error,
				 * not on a short read */
				ret = bo_size < 0 ? -errno : -EFAULT;
				goto exit;
			}
		} /* PROCPIDMEM read done */
	}
exit:
	pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret);

	if (mem_fd >= 0)
		close(mem_fd);
	thread_data->ret = ret;
	return NULL;
}
/*
 * Thread entry point: restore the contents of every VRAM/GTT BO belonging
 * to thread_data->gpu_id from the bo_entries rawdata buffers.
 *
 * Host-accessible (large-BAR) BOs are mmap'ed writable via the GPU's DRM fd
 * and filled with memcpy; all other BOs are mapped PROT_NONE into our
 * address space and written through /proc/<pid>/mem, which lets the kernel
 * perform the actual device access.
 *
 * The result is reported in thread_data->ret (0 or negative errno); the
 * pthread return value is always NULL.
 */
void *restore_bo_contents(void *_thread_data)
{
	int i, ret = 0;
	int num_bos = 0;
	struct thread_data *thread_data = (struct thread_data *)_thread_data;
	struct kfd_criu_bo_bucket *bo_buckets = thread_data->bo_buckets;
	BoEntry **bo_info = thread_data->bo_entries;
	char *fname;
	int mem_fd = -1;

	pr_info("amdgpu_plugin: Thread[0x%x] started\n", thread_data->gpu_id);

	if (asprintf(&fname, PROCPIDMEM, thread_data->pid) < 0) {
		/* On asprintf() failure the contents of fname are undefined,
		 * so it must not be printed or freed. */
		pr_perror("failed in asprintf");
		ret = -1;
		goto exit;
	}

	mem_fd = open(fname, O_RDWR);
	if (mem_fd < 0) {
		pr_perror("Can't open %s for pid %d", fname, thread_data->pid);
		free(fname);
		ret = -errno;
		goto exit;
	}
	plugin_log_msg("Opened %s file for pid = %d\n", fname, thread_data->pid);
	free(fname);

	for (i = 0; i < thread_data->num_of_bos; i++) {
		void *addr;

		if (bo_buckets[i].gpu_id != thread_data->gpu_id)
			continue;

		num_bos++;

		/* Only VRAM and GTT BOs carry contents that need restoring */
		if (!(bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) &&
		    !(bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT))
			continue;

		if (bo_info[i]->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
			plugin_log_msg("amdgpu_plugin: large bar write possible\n");

			addr = mmap(NULL, bo_buckets[i].size, PROT_WRITE, MAP_SHARED, thread_data->drm_fd,
				    bo_buckets[i].restored_offset);
			if (addr == MAP_FAILED) {
				pr_perror("amdgpu_plugin: mmap failed");
				ret = -errno;
				goto exit;
			}

			/* direct memcpy is possible on large bars */
			memcpy(addr, (void *)bo_info[i]->rawdata.data, bo_info[i]->size);
			/* unmap with the same length the mapping was created
			 * with, not the (possibly different) entry size */
			munmap(addr, bo_buckets[i].size);
		} else {
			ssize_t bo_size;

			/* Use indirect host data path via /proc/pid/mem on small pci bar GPUs or
			 * for Buffer Objects that don't have HostAccess permissions.
			 */
			plugin_log_msg("amdgpu_plugin: using PROCPIDMEM to restore BO contents\n");
			addr = mmap(NULL, bo_info[i]->size, PROT_NONE, MAP_SHARED, thread_data->drm_fd,
				    bo_buckets[i].restored_offset);
			if (addr == MAP_FAILED) {
				pr_perror("amdgpu_plugin: mmap failed");
				ret = -errno;
				goto exit;
			}

			if (lseek(mem_fd, (off_t)addr, SEEK_SET) == -1) {
				pr_perror("Can't lseek for BO offset for pid = %d", thread_data->pid);
				ret = -errno;
				munmap(addr, bo_info[i]->size);
				goto exit;
			}

			plugin_log_msg("Attempt writing now\n");
			/* write() returns ssize_t: keep the sign so -1 is not
			 * silently wrapped into a huge size_t value. */
			bo_size = write(mem_fd, bo_info[i]->rawdata.data, bo_info[i]->size);
			if (bo_size < 0 || (size_t)bo_size != bo_info[i]->size) {
				pr_perror("Can't write buffer");
				ret = bo_size < 0 ? -errno : -EFAULT;
				munmap(addr, bo_info[i]->size);
				goto exit;
			}
			munmap(addr, bo_info[i]->size);
		}
	}
exit:
	pr_info("amdgpu_plugin: Thread[0x%x] done num_bos:%d ret:%d\n", thread_data->gpu_id, num_bos, ret);

	if (mem_fd >= 0)
		close(mem_fd);
	thread_data->ret = ret;
	return NULL;
}
static int unpause_process(int fd)
{
int ret = 0;
@@ -539,11 +726,17 @@ exit:
static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
struct thread_data *thread_datas;
int ret = 0, i;
char *fname;
pr_debug("Dumping %d BOs\n", args->num_bos);
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
e->num_of_bos = args->num_bos;
ret = allocate_bo_entries(e, e->num_of_bos, bo_buckets);
if (ret)
@@ -558,66 +751,48 @@ static int save_bos(int fd, struct kfd_ioctl_criu_args *args, struct kfd_criu_bo
boinfo->size = bo_bucket->size;
boinfo->offset = bo_bucket->offset;
boinfo->alloc_flags = bo_bucket->alloc_flags;
}
if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM ||
bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) {
void *addr;
for (int i = 0; i < e->num_of_gpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
pr_info("amdgpu_plugin: large bar read possible\n");
dev = sys_get_node_by_index(&src_topology, i);
if (!dev) {
ret = -ENODEV;
goto exit;
}
addr = mmap(NULL, boinfo->size, PROT_READ, MAP_SHARED, fd, boinfo->offset);
if (addr == MAP_FAILED) {
pr_perror("amdgpu_plugin: mmap failed\n");
ret = -errno;
goto exit;
}
thread_datas[i].gpu_id = dev->gpu_id;
thread_datas[i].bo_buckets = bo_buckets;
thread_datas[i].bo_entries = e->bo_entries;
thread_datas[i].pid = e->pid;
thread_datas[i].num_of_bos = args->num_bos;
thread_datas[i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[i].drm_fd < 0) {
ret = thread_datas[i].drm_fd;
goto exit;
}
/* direct memcpy is possible on large bars */
memcpy(boinfo->rawdata.data, addr, boinfo->size);
munmap(addr, boinfo->size);
} else {
size_t bo_size;
int mem_fd;
ret_thread = pthread_create(&thread_datas[i].thread, NULL, dump_bo_contents, (void *)&thread_datas[i]);
if (ret_thread) {
pr_err("Failed to create thread[%i]\n", i);
ret = -ret_thread;
goto exit;
}
}
pr_info("Now try reading BO contents with /proc/pid/mem\n");
if (asprintf(&fname, PROCPIDMEM, args->pid) < 0) {
pr_perror("failed in asprintf, %s", fname);
ret = -1;
goto exit;
}
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
mem_fd = open(fname, O_RDONLY);
if (mem_fd < 0) {
pr_perror("Can't open %s for pid %d", fname, args->pid);
free(fname);
close(mem_fd);
ret = -1;
goto exit;
}
pr_info("Opened %s file for pid = %d\n", fname, args->pid);
free(fname);
if (lseek(mem_fd, (off_t)bo_bucket->addr, SEEK_SET) == -1) {
pr_perror("Can't lseek for bo_offset for pid = %d", args->pid);
close(mem_fd);
ret = -1;
goto exit;
}
bo_size = read(mem_fd, boinfo->rawdata.data, boinfo->size);
if (bo_size != boinfo->size) {
close(mem_fd);
pr_perror("Can't read buffer");
ret = -1;
goto exit;
}
close(mem_fd);
}
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
}
}
exit:
xfree(thread_datas);
pr_info("Dumped bos %s (ret:%d)\n", ret ? "failed" : "ok", ret);
return ret;
}
@@ -922,15 +1097,20 @@ static int restore_bos(struct kfd_ioctl_criu_args *args, CriuKfd *e)
return 0;
}
static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
static int restore_bo_data(struct kfd_criu_bo_bucket *bo_buckets, CriuKfd *e)
{
int mem_fd = -1, ret = 0;
struct thread_data *thread_datas;
int thread_i, ret = 0;
thread_datas = xzalloc(sizeof(*thread_datas) * e->num_of_gpus);
if (!thread_datas) {
ret = -ENOMEM;
goto exit;
}
for (int i = 0; i < e->num_of_bos; i++) {
void *addr;
struct kfd_criu_bo_bucket *bo_bucket = &bo_buckets[i];
struct tp_node *tp_node;
BoEntry *bo_entry = e->bo_entries[i];
if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT |
KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP | KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)) {
@@ -967,86 +1147,61 @@ static int restore_bo_data(int fd, struct kfd_criu_bo_bucket *bo_buckets, CriuKf
list_add_tail(&vma_md->list, &update_vma_info_list);
}
if (bo_bucket->alloc_flags & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
pr_info("amdgpu_plugin: Trying mmap in stage 2\n");
if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC ||
bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
plugin_log_msg("amdgpu_plugin: large bar write possible\n");
addr = mmap(NULL, bo_bucket->size, PROT_WRITE, MAP_SHARED, fd,
bo_bucket->restored_offset);
if (addr == MAP_FAILED) {
pr_perror("amdgpu_plugin: mmap failed");
fd = -EBADFD;
goto exit;
}
/* direct memcpy is possible on large bars */
memcpy(addr, (void *)bo_entry->rawdata.data, bo_entry->size);
munmap(addr, bo_entry->size);
} else {
size_t bo_size;
char *fname;
/* Use indirect host data path via /proc/pid/mem
* on small pci bar GPUs or for Buffer Objects
* that don't have HostAccess permissions.
*/
plugin_log_msg("amdgpu_plugin: using PROCPIDMEM to restore BO contents\n");
addr = mmap(NULL, bo_bucket->size, PROT_NONE, MAP_SHARED, fd,
bo_bucket->restored_offset);
if (addr == MAP_FAILED) {
pr_perror("amdgpu_plugin: mmap failed");
fd = -EBADFD;
goto exit;
}
if (asprintf(&fname, PROCPIDMEM, e->pid) < 0) {
pr_perror("failed in asprintf, %s", fname);
munmap(addr, bo_bucket->size);
fd = -EBADFD;
goto exit;
}
mem_fd = open(fname, O_RDWR);
if (mem_fd < 0) {
pr_perror("Can't open %s for pid %d", fname, e->pid);
free(fname);
munmap(addr, bo_bucket->size);
fd = -EBADFD;
goto exit;
}
plugin_log_msg("Opened %s file for pid = %d", fname, e->pid);
free(fname);
if (lseek(mem_fd, (off_t)addr, SEEK_SET) == -1) {
pr_perror("Can't lseek for bo_offset for pid = %d", e->pid);
munmap(addr, bo_entry->size);
fd = -EBADFD;
goto exit;
}
plugin_log_msg("Attempt writing now");
bo_size = write(mem_fd, bo_entry->rawdata.data, bo_entry->size);
if (bo_size != bo_entry->size) {
pr_perror("Can't write buffer");
munmap(addr, bo_entry->size);
fd = -EBADFD;
goto exit;
}
munmap(addr, bo_entry->size);
close(mem_fd);
}
} else {
plugin_log_msg("Not a VRAM BO\n");
continue;
}
}
exit:
if (mem_fd > 0)
close(mem_fd);
thread_i = 0;
for (int i = 0; i < e->num_of_gpus + e->num_of_cpus; i++) {
struct tp_node *dev;
int ret_thread = 0;
uint32_t target_gpu_id;
if (!e->device_entries[i]->gpu_id)
continue;
/* e->device_entries[i]->gpu_id is user_gpu_id, target_gpu_id is actual_gpu_id */
target_gpu_id = maps_get_dest_gpu(&restore_maps, e->device_entries[i]->gpu_id);
/* We need the fd for actual_gpu_id */
dev = sys_get_node_by_gpu_id(&dest_topology, target_gpu_id);
if (!dev) {
pr_err("Failed to find node with gpu_id:0x%04x\n", target_gpu_id);
ret = -ENODEV;
goto exit;
}
thread_datas[thread_i].gpu_id = e->device_entries[i]->gpu_id;
thread_datas[thread_i].bo_buckets = bo_buckets;
thread_datas[thread_i].bo_entries = e->bo_entries;
thread_datas[thread_i].pid = e->pid;
thread_datas[thread_i].num_of_bos = e->num_of_bos;
thread_datas[thread_i].drm_fd = node_get_drm_render_device(dev);
if (thread_datas[thread_i].drm_fd < 0) {
ret = -thread_datas[thread_i].drm_fd;
goto exit;
}
ret_thread = pthread_create(&thread_datas[thread_i].thread, NULL, restore_bo_contents,
(void *)&thread_datas[thread_i]);
if (ret_thread) {
pr_err("Failed to create thread[%i] ret:%d\n", thread_i, ret_thread);
ret = -ret_thread;
goto exit;
}
thread_i++;
}
for (int i = 0; i < e->num_of_gpus; i++) {
pthread_join(thread_datas[i].thread, NULL);
pr_info("Thread[0x%x] finished ret:%d\n", thread_datas[i].gpu_id, thread_datas[i].ret);
if (thread_datas[i].ret) {
ret = thread_datas[i].ret;
goto exit;
}
}
exit:
xfree(thread_datas);
return ret;
}
@@ -1197,7 +1352,7 @@ int amdgpu_plugin_restore_file(int id)
goto exit;
}
ret = restore_bo_data(fd, (struct kfd_criu_bo_bucket *)args.bos, e);
ret = restore_bo_data((struct kfd_criu_bo_bucket *)args.bos, e);
if (ret)
goto exit;

View File

@@ -151,6 +151,17 @@ struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const
return NULL;
}
/*
 * Return the index-th GPU node (in system list order, counting only GPU
 * nodes) of @sys, or NULL when fewer than index + 1 GPUs exist.
 */
struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index)
{
	struct tp_node *cur;

	list_for_each_entry(cur, &sys->nodes, listm_system) {
		if (!NODE_IS_GPU(cur))
			continue;
		if (index == 0)
			return cur;
		index--;
	}
	return NULL;
}
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id)
{
struct tp_node *node;

View File

@@ -114,6 +114,7 @@ struct tp_iolink *node_add_iolink(struct tp_node *node, uint32_t type, uint32_t
struct tp_node *sys_get_node_by_gpu_id(const struct tp_system *sys, const uint32_t gpu_id);
struct tp_node *sys_get_node_by_render_minor(const struct tp_system *sys, const int drm_render_minor);
struct tp_node *sys_get_node_by_index(const struct tp_system *sys, uint32_t index);
int node_get_drm_render_device(struct tp_node *node);
void sys_close_drm_render_devices(struct tp_system *sys);