diff --git a/criu/arch/x86/Makefile b/criu/arch/x86/Makefile index 618e85bb3..46f00e9e9 100644 --- a/criu/arch/x86/Makefile +++ b/criu/arch/x86/Makefile @@ -9,6 +9,7 @@ obj-y += cpu.o obj-y += crtools.o obj-y += kerndat.o obj-y += sigframe.o +obj-y += shstk.o ifeq ($(CONFIG_COMPAT),y) obj-y += sigaction_compat.o endif diff --git a/criu/arch/x86/include/asm/restorer.h b/criu/arch/x86/include/asm/restorer.h index f7a6d5058..3a673958d 100644 --- a/criu/arch/x86/include/asm/restorer.h +++ b/criu/arch/x86/include/asm/restorer.h @@ -8,6 +8,7 @@ #include #include #include "asm/compat.h" +#include "asm/shstk.h" #ifdef CONFIG_COMPAT extern void restore_tls(tls_t *ptls); diff --git a/criu/arch/x86/include/asm/shstk.h b/criu/arch/x86/include/asm/shstk.h new file mode 100644 index 000000000..a81062010 --- /dev/null +++ b/criu/arch/x86/include/asm/shstk.h @@ -0,0 +1,69 @@ +#ifndef __CR_ASM_SHSTK_H__ +#define __CR_ASM_SHSTK_H__ + +/* + * Shadow stack constants from Linux + */ +/* arch/x86/include/uapi/asm/mman.h */ +#ifndef SHADOW_STACK_SET_TOKEN +#define SHADOW_STACK_SET_TOKEN 0x1 /* Set up a restore token in the shadow stack */ +#endif + +/* arch/x86/include/uapi/asm/prctl.h */ +#define ARCH_SHSTK_ENABLE 0x5001 +#define ARCH_SHSTK_DISABLE 0x5002 +#define ARCH_SHSTK_LOCK 0x5003 +#define ARCH_SHSTK_UNLOCK 0x5004 +#define ARCH_SHSTK_STATUS 0x5005 + +#define ARCH_SHSTK_SHSTK (1ULL << 0) +#define ARCH_SHSTK_WRSS (1ULL << 1) + +#define ARCH_HAS_SHSTK + +/* from arch/x86/kernel/shstk.c */ +#define SHSTK_DATA_BIT (1UL << 63) /* BIT(63) */ + +/* + * Shadow stack memory cannot be restored with memcpy/pread but only using + * a special instruction that can write to shadow stack. + * That instruction is only available when shadow stack is enabled, + * otherwise it causes #UD. + * + * Also, shadow stack VMAs cannot be mmap()ed or mremap()ed, they must be + * created using map_shadow_stack() system call. 
This pushes creation of + * shadow stack VMAs to the restorer blob after CRIU mappings are freed. + * + * And there is additional juggling with shadow stacks to ensure that we + * don't unmap an active shadow stack + * + * The overall sequence of restoring shadow stack is + * - Enable shadow stack early after clone()ing the task + * - Unlock shadow stack features using ptrace + * - In the restorer blob: + * - switch to a temporary shadow stack to be able to unmap shadow stack + * with the CRIU mappings + * - after memory mappings are restored, recreate shadow stack VMAs, + * populate them using wrss instruction and switch to the task shadow + * stack + * - lock shadow stack features + */ +struct rst_shstk_info { + unsigned long vma_start; /* start of shadow stack VMA */ + unsigned long vma_size; /* size of shadow stack VMA */ + unsigned long premmaped_addr; /* address of shadow stack copy in + the premmaped area */ + unsigned long tmp_shstk; /* address of temporary shadow stack */ + u64 ssp; /* shadow stack pointer */ + u64 cet; /* CET control state */ +}; +#define rst_shstk_info rst_shstk_info + +struct task_restore_args; +struct pstree_item; + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta); +#define arch_shstk_prepare arch_shstk_prepare + +#endif /* __CR_ASM_SHSTK_H__ */ diff --git a/criu/arch/x86/shstk.c b/criu/arch/x86/shstk.c new file mode 100644 index 000000000..f6bc81dc6 --- /dev/null +++ b/criu/arch/x86/shstk.c @@ -0,0 +1,90 @@ +#include + +#include + +#include "pstree.h" +#include "restorer.h" +#include "rst-malloc.h" +#include "vma.h" + +static bool task_needs_shstk(struct pstree_item *item, CoreEntry *core) +{ + UserX86FpregsEntry *fpregs; + + if (!task_alive(item)) + return false; + + fpregs = core->thread_info->fpregs; + if (fpregs->xsave && fpregs->xsave->cet) { + if (!compel_cpu_has_feature(X86_FEATURE_SHSTK)) { + pr_warn_once("Restoring task with shadow stack on non-CET machine\n"); + return 
false; + } + + if (fpregs->xsave->cet->cet & ARCH_SHSTK_SHSTK) + return true; + } + + return false; +} + +static int shstk_prepare_task(struct vm_area_list *vmas, + struct rst_shstk_info *shstk) /* fill *shstk from the VMA_AREA_SHSTK vma that contains shstk->ssp */ +{ + struct vma_area *vma; + + list_for_each_entry(vma, &vmas->h, list) { /* only the first matching vma is used */ + if (vma_area_is(vma, VMA_AREA_SHSTK) && + in_vma_area(vma, shstk->ssp)) { + unsigned long premmaped_addr = vma->premmaped_addr; + unsigned long size = vma_area_len(vma); + + shstk->vma_start = vma->e->start; + shstk->vma_size = size; + shstk->premmaped_addr = premmaped_addr; + shstk->tmp_shstk = premmaped_addr + size; /* first address past the premmaped range */ + + break; + } + } + + return 0; /* always 0: when no vma matches, *shstk is left unmodified and nothing is reported */ +} + +int arch_shstk_prepare(struct pstree_item *item, CoreEntry *core, + struct task_restore_args *ta) /* copy cet/ssp of the task and of each thread into their restore args; returns 0 or -1 */ +{ + struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]); /* thread args are read from the memory right after *ta */ + UserX86FpregsEntry *fpregs = core->thread_info->fpregs; + struct vm_area_list *vmas = &rsti(item)->vmas; + struct rst_shstk_info *shstk = &ta->shstk; + int i; + + if (!task_needs_shstk(item, core)) + return 0; /* shadow stack not needed: ta->shstk is not written */ + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + + if (shstk_prepare_task(vmas, shstk)) { /* NOTE(review): callee only ever returns 0, so this branch cannot be taken as written */ + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + + for (i = 0; i < item->nr_threads; i++) { + struct thread_restore_args *thread_args = &args_array[i]; + + core = item->core[i]; + fpregs = core->thread_info->fpregs; /* NOTE(review): xsave->cet is dereferenced unchecked below; only the core passed in was checked by task_needs_shstk() - confirm every thread carries it */ + shstk = &thread_args->shstk; + + shstk->cet = fpregs->xsave->cet->cet; + shstk->ssp = fpregs->xsave->cet->ssp; + if (shstk_prepare_task(vmas, shstk)) { /* same vma lookup, keyed by this thread's ssp */ + pr_err("Failed to prepare shadow stack memory\n"); + return -1; + } + } + + return 0; +} diff --git a/criu/cr-restore.c b/criu/cr-restore.c index 270049721..e43cc1742 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -975,6 +975,9 @@ static int restore_one_alive_task(int pid, CoreEntry *core) if (setup_uffd(pid, ta)) return -1; + if (arch_shstk_prepare(current, core, ta)) + return -1; + return sigreturn_restore(pid, ta, args_len, core); } 
diff --git a/criu/include/restore.h b/criu/include/restore.h index 8ef0dbddf..7d29496f2 100644 --- a/criu/include/restore.h +++ b/criu/include/restore.h @@ -7,4 +7,17 @@ extern int arch_set_thread_regs_nosigrt(struct pid *pid); +struct task_restore_args; +struct pstree_item; + +#ifndef arch_shstk_prepare /* used only when no arch_shstk_prepare macro has been defined before this point */ +static inline int arch_shstk_prepare(struct pstree_item *item, + CoreEntry *core, + struct task_restore_args *ta) +{ + return 0; /* fallback: ignores its arguments and reports success */ +} +#define arch_shstk_prepare arch_shstk_prepare +#endif + #endif diff --git a/criu/include/restorer.h b/criu/include/restorer.h index f398d8d8f..73565d1de 100644 --- a/criu/include/restorer.h +++ b/criu/include/restorer.h @@ -56,6 +56,10 @@ struct restore_posix_timer { int overrun; }; +#ifndef rst_shstk_info +struct rst_shstk_info {}; /* empty placeholder used when the rst_shstk_info macro is not defined */ +#endif + /* * We should be able to construct fpu sigframe in sigreturn_prep_fpu_frame, * so the mem_zone.rt_sigframe should be 64-bytes aligned. To make things @@ -119,6 +123,8 @@ struct thread_restore_args { unsigned int seccomp_filters_n; bool seccomp_force_tsync; + struct rst_shstk_info shstk; + char comm[TASK_COMM_LEN]; int cg_set; int cgroupd_sk; @@ -240,6 +246,8 @@ struct task_restore_args { uid_t uid; u32 cap_eff[CR_CAP_SIZE]; + + struct rst_shstk_info shstk; } __aligned(64); /*