mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-30 05:48:05 +00:00
mem: Delayed vma/pr restore (v2)
Performance experiments show that we spend (relatively) a lot of time mremap-ing areas from the premap area into their proper places. This time depends on the task being restored, but for tasks with many VMAs it can be up to 20%. The thing is that premapping is only needed to restore COW pages, since the kernel has no API to share a page between two or more anonymous VMAs. Non-COW areas can be mmap()-ed directly in place, but in that case the pages' contents must also be restored from the PIE code. Doing the whole page-read code from PIE is way too complex (for now), so the proposal is to optimize the case when we have a single local pagemap layer. This is what the pr.pieok boolean stands for. v2: * Fixed ARM compiling (vma addresses formatting) * Unused tail of premapped area was left in task after restore * Preadv-ing pages in restorer context worked on corrupted iovs due to mistakes in pointer arithmetic * AIO mapping skipped at premap wasn't mapped in PIE * Growsdown VMAs should sometimes (when they are "guarded" by the previous VMA and the guard page's contents cannot be restored in place) be premapped * Always premap for lazy-pages restore Signed-off-by: Pavel Emelyanov <xemul@virtuozzo.com> Signed-off-by: Andrei Vagin <avagin@virtuozzo.com>
This commit is contained in:
parent
074e7b8901
commit
91388fce03
@ -3136,6 +3136,7 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
|
||||
RST_MEM_FIXUP_PPTR(task_args->helpers);
|
||||
RST_MEM_FIXUP_PPTR(task_args->zombies);
|
||||
RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
|
||||
RST_MEM_FIXUP_PPTR(task_args->vma_ios);
|
||||
|
||||
if (core->tc->has_seccomp_mode)
|
||||
task_args->seccomp_mode = core->tc->seccomp_mode;
|
||||
|
@ -52,6 +52,9 @@ struct page_read {
|
||||
int (*sync)(struct page_read *pr);
|
||||
int (*seek_pagemap)(struct page_read *pr, unsigned long vaddr);
|
||||
|
||||
/* Whether or not pages can be read in PIE code */
|
||||
bool pieok;
|
||||
|
||||
/* Private data of reader */
|
||||
struct cr_img *pmi;
|
||||
struct cr_img *pi;
|
||||
@ -95,8 +98,11 @@ extern int open_page_read(int pid, struct page_read *, int pr_flags);
|
||||
extern int open_page_read_at(int dfd, int pid, struct page_read *pr,
|
||||
int pr_flags);
|
||||
|
||||
struct task_restore_args;
|
||||
|
||||
int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
|
||||
unsigned long len, struct list_head *to);
|
||||
int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta);
|
||||
|
||||
extern int dedup_one_iovec(struct page_read *pr, unsigned long base,
|
||||
unsigned long len);
|
||||
|
@ -101,6 +101,14 @@ struct thread_restore_args {
|
||||
|
||||
typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
|
||||
|
||||
/*
 * One batch of page data to be read by the restorer: @nr_iovs
 * destination iovecs that are filled by preadv() from the pages
 * image fd, starting at file offset @off.
 *
 * Records are variable-sized. The consumer steps from one record to
 * the next by adding RIO_SIZE(nr_iovs) to the record's address.
 */
struct restore_vma_io {
	int nr_iovs;		/* number of entries in iovs[] */
	loff_t off;		/* offset in the pages image to read from */
	struct iovec iovs[];	/* C99 flexible array member, not the GNU [0] extension */
};

/* Size in bytes of a restore_vma_io record that carries @niovs iovecs */
#define RIO_SIZE(niovs)	(sizeof(struct restore_vma_io) + (niovs) * sizeof(struct iovec))
|
||||
|
||||
struct task_restore_args {
|
||||
struct thread_restore_args *t; /* thread group leader */
|
||||
|
||||
@ -121,6 +129,10 @@ struct task_restore_args {
|
||||
VmaEntry *vmas;
|
||||
unsigned int vmas_n;
|
||||
|
||||
int vma_ios_fd;
|
||||
struct restore_vma_io *vma_ios;
|
||||
unsigned int vma_ios_n;
|
||||
|
||||
struct restore_posix_timer *posix_timers;
|
||||
unsigned int posix_timers_n;
|
||||
|
||||
|
@ -39,6 +39,8 @@ struct rst_info {
|
||||
|
||||
struct vm_area_list vmas;
|
||||
struct _MmEntry *mm;
|
||||
struct list_head vma_io;
|
||||
unsigned int pages_img_id;
|
||||
|
||||
u32 cg_set;
|
||||
|
||||
|
94
criu/mem.c
94
criu/mem.c
@ -710,8 +710,34 @@ static int premap_private_vma(struct pstree_item *t, struct vma_area *vma, void
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool vma_force_premap(struct vma_area *vma, struct list_head *head)
|
||||
{
|
||||
/*
|
||||
* Growsdown VMAs always have one guard page at the
|
||||
* beginning and sometimes this page contains data.
|
||||
* In case the VMA is premmaped, we premmap one page
|
||||
* larger VMA. In case of in place restore we can only
|
||||
* do this if the VMA in question is not "guarded" by
|
||||
* some other VMA.
|
||||
*/
|
||||
if (vma->e->flags & MAP_GROWSDOWN) {
|
||||
if (vma->list.prev != head) {
|
||||
struct vma_area *prev;
|
||||
|
||||
prev = list_entry(vma->list.prev, struct vma_area, list);
|
||||
if (prev->e->end == vma->e->start) {
|
||||
pr_debug("Force premmap for 0x%"PRIx64":0x%"PRIx64"\n",
|
||||
vma->e->start, vma->e->end);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
|
||||
void *at, struct page_read *pr)
|
||||
void **at, struct page_read *pr)
|
||||
{
|
||||
struct vma_area *vma;
|
||||
unsigned long pstart = 0;
|
||||
@ -729,7 +755,14 @@ static int premap_priv_vmas(struct pstree_item *t, struct vm_area_list *vmas,
|
||||
if (!vma_area_is_private(vma, kdat.task_size))
|
||||
continue;
|
||||
|
||||
ret = premap_private_vma(t, vma, &at);
|
||||
if (vma->pvma == NULL && pr->pieok && !vma_force_premap(vma, &vmas->h))
|
||||
/*
|
||||
* VMA in question is not shared with anyone. We'll
|
||||
* restore it with its contents in restorer.
|
||||
*/
|
||||
continue;
|
||||
|
||||
ret = premap_private_vma(t, vma, at);
|
||||
if (ret < 0)
|
||||
break;
|
||||
}
|
||||
@ -742,6 +775,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
|
||||
struct vma_area *vma;
|
||||
int ret = 0;
|
||||
struct list_head *vmas = &rsti(t)->vmas.h;
|
||||
struct list_head *vma_io = &rsti(t)->vma_io;
|
||||
|
||||
unsigned int nr_restored = 0;
|
||||
unsigned int nr_shared = 0;
|
||||
@ -750,6 +784,7 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
|
||||
unsigned long va;
|
||||
|
||||
vma = list_first_entry(vmas, struct vma_area, list);
|
||||
rsti(t)->pages_img_id = pr->pages_img_id;
|
||||
|
||||
/*
|
||||
* Read page contents.
|
||||
@ -791,6 +826,28 @@ static int restore_priv_vma_content(struct pstree_item *t, struct page_read *pr)
|
||||
goto err_addr;
|
||||
}
|
||||
|
||||
if (!vma_area_is(vma, VMA_PREMMAPED)) {
|
||||
unsigned long len = min_t(unsigned long,
|
||||
(nr_pages - i) * PAGE_SIZE,
|
||||
vma->e->end - va);
|
||||
|
||||
if (pagemap_enqueue_iovec(pr, (void *)va, len, vma_io))
|
||||
return -1;
|
||||
|
||||
pr->skip_pages(pr, len);
|
||||
|
||||
va += len;
|
||||
len >>= PAGE_SHIFT;
|
||||
nr_restored += len;
|
||||
i += len - 1;
|
||||
pr_debug("Enqueue page-read\n");
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* Otherwise to the COW restore
|
||||
*/
|
||||
|
||||
off = (va - vma->e->start) / PAGE_SIZE;
|
||||
p = decode_pointer((off) * PAGE_SIZE +
|
||||
vma->premmaped_addr);
|
||||
@ -925,7 +982,7 @@ int prepare_mappings(struct pstree_item *t)
|
||||
|
||||
pr.advance(&pr); /* shift to the 1st iovec */
|
||||
|
||||
ret = premap_priv_vmas(t, vmas, addr, &pr);
|
||||
ret = premap_priv_vmas(t, vmas, &addr, &pr);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
@ -942,6 +999,23 @@ int prepare_mappings(struct pstree_item *t)
|
||||
old_premmapped_addr, old_premmapped_len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Not all VMAs were premmaped. Find out the unused tail of the
|
||||
* premapped area and unmap it.
|
||||
*/
|
||||
old_premmapped_len = addr - rsti(t)->premmapped_addr;
|
||||
if (old_premmapped_len < rsti(t)->premmapped_len) {
|
||||
unsigned long tail;
|
||||
|
||||
tail = rsti(t)->premmapped_len - old_premmapped_len;
|
||||
ret = munmap(addr, tail);
|
||||
if (ret < 0)
|
||||
pr_perror("Unable to unmap %p(%lx)", addr, tail);
|
||||
rsti(t)->premmapped_len = old_premmapped_len;
|
||||
pr_info("Shrunk premap area to %p(%lx)\n",
|
||||
rsti(t)->premmapped_addr, rsti(t)->premmapped_len);
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
@ -995,6 +1069,18 @@ int open_vmas(struct pstree_item *t)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
|
||||
{
|
||||
struct cr_img *pages;
|
||||
|
||||
pages = open_image(CR_FD_PAGES, O_RSTR, rsti(t)->pages_img_id);
|
||||
if (!pages)
|
||||
return -1;
|
||||
|
||||
ta->vma_ios_fd = img_raw_fd(pages);
|
||||
return pagemap_render_iovec(&rsti(t)->vma_io, ta);
|
||||
}
|
||||
|
||||
int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
|
||||
{
|
||||
struct vma_area *vma;
|
||||
@ -1020,6 +1106,6 @@ int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta)
|
||||
vma_premmaped_start(vme) = vma->premmaped_addr;
|
||||
}
|
||||
|
||||
return 0;
|
||||
return prepare_vma_ios(t, ta);
|
||||
}
|
||||
|
||||
|
@ -10,7 +10,8 @@
|
||||
#include "cr_options.h"
|
||||
#include "servicefd.h"
|
||||
#include "pagemap.h"
|
||||
|
||||
#include "restorer.h"
|
||||
#include "rst-malloc.h"
|
||||
#include "fault-injection.h"
|
||||
#include "xmalloc.h"
|
||||
#include "protobuf.h"
|
||||
@ -309,6 +310,32 @@ static int enqueue_async_iov(struct page_read *pr, void *buf,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int pagemap_render_iovec(struct list_head *from, struct task_restore_args *ta)
|
||||
{
|
||||
struct page_read_iov *piov;
|
||||
|
||||
ta->vma_ios = (struct restore_vma_io *)rst_mem_align_cpos(RM_PRIVATE);
|
||||
ta->vma_ios_n = 0;
|
||||
|
||||
list_for_each_entry(piov, from, l) {
|
||||
struct restore_vma_io *rio;
|
||||
|
||||
pr_info("`- render %d iovs (%p:%zd...)\n", piov->nr,
|
||||
piov->to[0].iov_base, piov->to[0].iov_len);
|
||||
rio = rst_mem_alloc(RIO_SIZE(piov->nr), RM_PRIVATE);
|
||||
if (!rio)
|
||||
return -1;
|
||||
|
||||
rio->nr_iovs = piov->nr;
|
||||
rio->off = piov->from;
|
||||
memcpy(rio->iovs, piov->to, piov->nr * sizeof(struct iovec));
|
||||
|
||||
ta->vma_ios_n++;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int pagemap_enqueue_iovec(struct page_read *pr, void *buf,
|
||||
unsigned long len, struct list_head *to)
|
||||
{
|
||||
@ -641,6 +668,7 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
|
||||
pr->bunch.iov_len = 0;
|
||||
pr->bunch.iov_base = NULL;
|
||||
pr->pmes = NULL;
|
||||
pr->pieok = false;
|
||||
|
||||
pr->pmi = open_image_at(dfd, i_typ, O_RSTR, (long)pid);
|
||||
if (!pr->pmi)
|
||||
@ -673,6 +701,8 @@ int open_page_read_at(int dfd, int pid, struct page_read *pr, int pr_flags)
|
||||
pr->sync = process_async_reads;
|
||||
pr->seek_pagemap = seek_pagemap;
|
||||
pr->id = ids++;
|
||||
if (!pr->parent)
|
||||
pr->pieok = true;
|
||||
|
||||
pr_debug("Opened page read %u (parent %u)\n",
|
||||
pr->id, pr->parent ? pr->parent->id : 0);
|
||||
|
@ -595,6 +595,10 @@ static unsigned long restore_mapping(VmaEntry *vma_entry)
|
||||
if (vma_entry_is(vma_entry, VMA_ANON_SHARED) && (vma_entry->fd != -1UL))
|
||||
flags &= ~MAP_ANONYMOUS;
|
||||
|
||||
/* See comment in premap_private_vma() for this flag change */
|
||||
if (vma_entry_is(vma_entry, VMA_AREA_AIORING))
|
||||
flags |= MAP_ANONYMOUS;
|
||||
|
||||
/* A mapping of file with MAP_SHARED is up to date */
|
||||
if (vma_entry->fd == -1 || !(vma_entry->flags & MAP_SHARED))
|
||||
prot |= PROT_WRITE;
|
||||
@ -1082,7 +1086,7 @@ long __export_restore_task(struct task_restore_args *args)
|
||||
int i;
|
||||
VmaEntry *vma_entry;
|
||||
unsigned long va;
|
||||
|
||||
struct restore_vma_io *rio;
|
||||
struct rt_sigframe *rt_sigframe;
|
||||
struct prctl_mm_map prctl_map;
|
||||
unsigned long new_sp;
|
||||
@ -1179,7 +1183,8 @@ long __export_restore_task(struct task_restore_args *args)
|
||||
for (i = 0; i < args->vmas_n; i++) {
|
||||
vma_entry = args->vmas + i;
|
||||
|
||||
if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR))
|
||||
if (!vma_entry_is(vma_entry, VMA_AREA_REGULAR) &&
|
||||
!vma_entry_is(vma_entry, VMA_AREA_AIORING))
|
||||
continue;
|
||||
|
||||
if (vma_entry_is(vma_entry, VMA_PREMMAPED))
|
||||
@ -1193,6 +1198,49 @@ long __export_restore_task(struct task_restore_args *args)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Now read the contents (if any)
|
||||
*/
|
||||
|
||||
rio = args->vma_ios;
|
||||
for (i = 0; i < args->vma_ios_n; i++) {
|
||||
struct iovec *iovs = rio->iovs;
|
||||
int nr = rio->nr_iovs;
|
||||
ssize_t r;
|
||||
|
||||
while (nr) {
|
||||
pr_debug("Preadv %lx:%d... (%d iovs)\n",
|
||||
(unsigned long)iovs->iov_base,
|
||||
(int)iovs->iov_len, nr);
|
||||
r = sys_preadv(args->vma_ios_fd, iovs, nr, rio->off);
|
||||
if (r < 0) {
|
||||
pr_err("Can't read pages data (%d)\n", (int)r);
|
||||
goto core_restore_end;
|
||||
}
|
||||
|
||||
pr_debug("`- returned %ld\n", (long)r);
|
||||
rio->off += r;
|
||||
/* Advance the iovecs */
|
||||
do {
|
||||
if (iovs->iov_len <= r) {
|
||||
pr_debug(" `- skip pagemap\n");
|
||||
r -= iovs->iov_len;
|
||||
iovs++;
|
||||
nr--;
|
||||
continue;
|
||||
}
|
||||
|
||||
iovs->iov_base += r;
|
||||
iovs->iov_len -= r;
|
||||
break;
|
||||
} while (nr > 0);
|
||||
}
|
||||
|
||||
rio = ((void *)rio) + RIO_SIZE(rio->nr_iovs);
|
||||
}
|
||||
|
||||
sys_close(args->vma_ios_fd);
|
||||
|
||||
#ifdef CONFIG_VDSO
|
||||
/*
|
||||
* Proxify vDSO.
|
||||
|
@ -209,6 +209,7 @@ struct pstree_item *__alloc_pstree_item(bool rst)
|
||||
|
||||
memset(item, 0, sz);
|
||||
vm_area_list_init(&rsti(item)->vmas);
|
||||
INIT_LIST_HEAD(&rsti(item)->vma_io);
|
||||
item->pid = (void *)item + sizeof(*item) + sizeof(struct rst_info);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user