diff --git a/criu/cr-dump.c b/criu/cr-dump.c
index 208bd5060..370b1e03d 100644
--- a/criu/cr-dump.c
+++ b/criu/cr-dump.c
@@ -763,12 +763,12 @@ static int dump_task_core_all(struct parasite_ctl *ctl,
 	creds = dmpi(item)->pi_creds;
 	if (creds->s.seccomp_mode != SECCOMP_MODE_DISABLED) {
 		pr_info("got seccomp mode %d for %d\n", creds->s.seccomp_mode, vpid(item));
-		core->tc->has_seccomp_mode = true;
-		core->tc->seccomp_mode = creds->s.seccomp_mode;
+		core->tc->has_old_seccomp_mode = true;
+		core->tc->old_seccomp_mode = creds->s.seccomp_mode;
 
 		if (creds->s.seccomp_mode == SECCOMP_MODE_FILTER) {
-			core->tc->has_seccomp_filter = true;
-			core->tc->seccomp_filter = creds->last_filter;
+			core->tc->has_old_seccomp_filter = true;
+			core->tc->old_seccomp_filter = creds->last_filter;
 		}
 	}
 
diff --git a/criu/cr-restore.c b/criu/cr-restore.c
index 8a436cc98..9e4a4e01d 100644
--- a/criu/cr-restore.c
+++ b/criu/cr-restore.c
@@ -331,7 +331,7 @@ static int root_prepare_shared(void)
 	if (prepare_remaps())
 		return -1;
 
-	if (prepare_seccomp_filters())
+	if (seccomp_read_image())
 		return -1;
 
 	if (collect_images(cinfos, ARRAY_SIZE(cinfos)))
@@ -1031,7 +1031,7 @@ static int restore_one_alive_task(int pid, CoreEntry *core)
 	if (prepare_timerfds(ta))
 		return -1;
 
-	if (seccomp_filters_get_rst_pos(core, ta) < 0)
+	if (seccomp_prepare_threads(current, ta) < 0)
 		return -1;
 
 	if (prepare_itimers(pid, ta, core) < 0)
@@ -1236,6 +1236,21 @@ static int check_core(CoreEntry *core, struct pstree_item *me)
 			pr_err("Core info data missed for non-zombie\n");
 			goto out;
 		}
+
+		/*
+		 * Seccomp state is now kept per-thread, so for
+		 * old images move the per-task data into this
+		 * core's thread entry.
+		 */
+		if (core->tc->has_old_seccomp_mode) {
+			core->thread_core->has_seccomp_mode = core->tc->has_old_seccomp_mode;
+			core->thread_core->seccomp_mode = core->tc->old_seccomp_mode;
+		}
+		if (core->tc->has_old_seccomp_filter) {
+			core->thread_core->has_seccomp_filter = core->tc->has_old_seccomp_filter;
+			core->thread_core->seccomp_filter = core->tc->old_seccomp_filter;
+			rsti(me)->has_old_seccomp_filter = true;
+		}
 	}
 
 	ret = 0;
@@ -1511,13 +1526,16 @@ static inline int fork_with_pid(struct pstree_item *item)
 		item->pid->state = ca.core->tc->task_state;
 		rsti(item)->cg_set = ca.core->tc->cg_set;
 
-		rsti(item)->has_seccomp = ca.core->tc->seccomp_mode != SECCOMP_MODE_DISABLED;
-
 		if (item->pid->state != TASK_DEAD && !task_alive(item)) {
 			pr_err("Unknown task state %d\n", item->pid->state);
 			return -1;
 		}
 
+		if (item->pid->state != TASK_DEAD)
+			rsti(item)->has_seccomp = ca.core->thread_core->seccomp_mode != SECCOMP_MODE_DISABLED;
+		else
+			rsti(item)->has_seccomp = false;
+
 		if (unlikely(item == root_item))
 			maybe_clone_parent(item, &ca);
 	} else {
@@ -3663,12 +3681,8 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 	RST_MEM_FIXUP_PPTR(task_args->rlims);
 	RST_MEM_FIXUP_PPTR(task_args->helpers);
 	RST_MEM_FIXUP_PPTR(task_args->zombies);
-	RST_MEM_FIXUP_PPTR(task_args->seccomp_filters);
 	RST_MEM_FIXUP_PPTR(task_args->vma_ios);
 
-	if (core->tc->has_seccomp_mode)
-		task_args->seccomp_mode = core->tc->seccomp_mode;
-
 	task_args->compatible_mode = core_is_compat(core);
 
 	if (opts.check_only)
@@ -3758,6 +3772,9 @@ static int sigreturn_restore(pid_t pid, struct task_restore_args *task_args, uns
 		if (ret)
 			goto err;
 
+		seccomp_rst_reloc(&thread_args[i]);
+		thread_args[i].seccomp_force_tsync = rsti(current)->has_old_seccomp_filter;
+
 		thread_args[i].mz = mz + i;
 		sigframe = (struct rt_sigframe *)&mz[i].rt_sigframe;
 
diff --git a/criu/include/restorer.h b/criu/include/restorer.h
index 70223be55..9e6912bbd 100644
--- a/criu/include/restorer.h
+++ b/criu/include/restorer.h
@@ -4,6 +4,7 @@
 #include <signal.h>
 #include <limits.h>
 #include <sys/resource.h>
+#include <linux/filter.h>
 
 #include "common/config.h"
 #include "types.h"
@@ -76,6 +77,12 @@ struct thread_creds_args {
 	unsigned long mem_pos_next;
 };
 
+/* One seccomp filter to install: the BPF program and the flags passed to seccomp() */
+struct thread_seccomp_filter {
+	struct sock_fprog sock_fprog;
+	unsigned int flags;
+};
+
 struct thread_restore_args {
 	struct restore_mem_zone *mz;
 
@@ -100,6 +107,18 @@ struct thread_restore_args {
 	bool check_only;
 
 	struct thread_creds_args *creds_args;
+
+	/*
+	 * Per-thread seccomp state filled by seccomp_prepare_threads();
+	 * seccomp_filters and seccomp_filters_data are re-derived from
+	 * seccomp_filters_pos by seccomp_rst_reloc().
+	 */
+	int seccomp_mode;
+	unsigned long seccomp_filters_pos;
+	struct thread_seccomp_filter *seccomp_filters;
+	void *seccomp_filters_data;
+	unsigned int seccomp_filters_n;
+	bool seccomp_force_tsync;
 } __aligned(64);
 
 typedef long (*thread_restore_fcall_t) (struct thread_restore_args *args);
@@ -163,9 +182,6 @@ struct task_restore_args {
 	pid_t *zombies;
 	unsigned int zombies_n;
 
-	struct sock_fprog *seccomp_filters;
-	unsigned int seccomp_filters_n;
-
 	/* * * * * * * * * * * * * * * * * * * * */
 
 	unsigned long task_size;
diff --git a/criu/include/rst_info.h b/criu/include/rst_info.h
index f9840d168..07c634f4a 100644
--- a/criu/include/rst_info.h
+++ b/criu/include/rst_info.h
@@ -61,6 +61,11 @@ struct rst_info {
 	 * restorer blob.
 	 */
 	bool has_seccomp;
+	/*
+	 * Set for old images, where filters are bound to the
+	 * group leader and must be applied with the tsync flag.
+	 */
+	bool has_old_seccomp_filter;
 
 	bool has_thp_enabled;
 
diff --git a/criu/include/seccomp.h b/criu/include/seccomp.h
index b50ea34e2..0791597fe 100644
--- a/criu/include/seccomp.h
+++ b/criu/include/seccomp.h
@@ -27,6 +27,9 @@
 #define SECCOMP_FILTER_FLAG_TSYNC 1
 #endif
 
+struct thread_restore_args;
+struct task_restore_args;
+
 struct seccomp_info {
 	struct seccomp_info *prev;
 	int id;
@@ -35,6 +38,9 @@ struct seccomp_info {
 
 extern int collect_seccomp_filters(void);
 extern int prepare_seccomp_filters(void);
-struct task_restore_args;
-extern int seccomp_filters_get_rst_pos(CoreEntry *item, struct task_restore_args *);
+
+extern int seccomp_read_image(void);
+extern int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta);
+extern void seccomp_rst_reloc(struct thread_restore_args *thread_arg);
+
 #endif
diff --git a/criu/pie/restorer.c b/criu/pie/restorer.c
index da20f3298..151f2c651 100644
--- a/criu/pie/restorer.c
+++ b/criu/pie/restorer.c
@@ -408,54 +408,98 @@ static int restore_signals(siginfo_t *ptr, int nr, bool group)
 	return 0;
 }
 
-static int restore_seccomp(struct task_restore_args *args)
+/*
+ * Install every filter of the thread's chain in array order, OR-ing
+ * SECCOMP_FILTER_FLAG_TSYNC into the flags when seccomp_force_tsync
+ * is set. On -ENOSYS from seccomp() fall back to prctl(), whose call
+ * here carries no flags. Returns 0 on success, -1 on first failure.
+ */
+static int restore_seccomp_filter(pid_t tid, struct thread_restore_args *args)
 {
+	unsigned int flags = args->seccomp_force_tsync ? SECCOMP_FILTER_FLAG_TSYNC : 0;
+	size_t i;
 	int ret;
 
-	switch (args->seccomp_mode) {
-	case SECCOMP_MODE_DISABLED:
-		return 0;
-	case SECCOMP_MODE_STRICT:
-		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+	for (i = 0; i < args->seccomp_filters_n; i++) {
+		struct thread_seccomp_filter *filter = &args->seccomp_filters[i];
+
+		pr_debug("seccomp: Restoring mode %d flags %x on tid %d filter %d\n",
+			 SECCOMP_SET_MODE_FILTER, (filter->flags | flags), tid, (int)i);
+
+		ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, filter->flags | flags, (void *)&filter->sock_fprog);
 		if (ret < 0) {
-			pr_err("prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT) returned %d\n", ret);
-			goto die;
-		}
-		return 0;
-	case SECCOMP_MODE_FILTER: {
-		int i;
-		void *filter_data;
-
-		filter_data = &args->seccomp_filters[args->seccomp_filters_n];
-
-		for (i = 0; i < args->seccomp_filters_n; i++) {
-			struct sock_fprog *fprog = &args->seccomp_filters[i];
-
-			fprog->filter = filter_data;
-
-			/* We always TSYNC here, since we require that the
-			 * creds for all threads be the same; this means we
-			 * don't have to restore_seccomp() in threads, and that
-			 * future TSYNC behavior will be correct.
-			 */
-			ret = sys_seccomp(SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, (char *) fprog);
-			if (ret < 0) {
-				pr_err("sys_seccomp() returned %d\n", ret);
-				goto die;
+			if (ret == -ENOSYS) {
+				pr_debug("seccomp: sys_seccomp is not supported in kernel, "
+					 "switching to prctl interface\n");
+				ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER,
+						(long)(void *)&filter->sock_fprog, 0, 0);
+				if (ret) {
+					pr_err("seccomp: PR_SET_SECCOMP returned %d on tid %d\n",
+					       ret, tid);
+					return -1;
+				}
+			} else {
+				pr_err("seccomp: SECCOMP_SET_MODE_FILTER returned %d on tid %d\n",
+				       ret, tid);
+				return -1;
 			}
-
-			filter_data += fprog->len * sizeof(struct sock_filter);
 		}
-
-		return 0;
-	}
-	default:
-		goto die;
 	}
 
 	return 0;
-die:
-	return -1;
+}
+
+/*
+ * Restore the seccomp mode of the calling thread. The tid checked
+ * against sys_gettid() is the last entry of args->pid[] before the
+ * first zero slot. Returns 0 on success, non-zero on failure.
+ */
+static int restore_seccomp(struct thread_restore_args *args)
+{
+	pid_t tid = 0;
+	int ret, i;
+
+	/*
+	 * Walk forward while entries are non-zero: this never reads
+	 * pid[-1] when the first slot is empty and still yields a tid
+	 * when all MAX_NS_NESTING slots are in use.
+	 */
+	for (i = 0; i < MAX_NS_NESTING && args->pid[i] != 0; i++)
+		tid = args->pid[i];
+
+	if (tid != sys_gettid()) {
+		pr_err("seccomp: Unexpected tid %d != %d\n",
+		       tid, (pid_t)sys_gettid());
+		return -1;
+	}
+
+	switch (args->seccomp_mode) {
+	case SECCOMP_MODE_DISABLED:
+		pr_debug("seccomp: mode %d on tid %d\n", SECCOMP_MODE_DISABLED, tid);
+		return 0;
+	case SECCOMP_MODE_STRICT:
+		ret = sys_prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT, 0, 0, 0);
+		if (ret < 0) {
+			pr_err("seccomp: SECCOMP_MODE_STRICT returned %d on tid %d\n",
+			       ret, tid);
+		}
+		break;
+	case SECCOMP_MODE_FILTER:
+		ret = restore_seccomp_filter(tid, args);
+		break;
+	default:
+		pr_err("seccomp: Unknown seccomp mode %d on tid %d\n",
+		       args->seccomp_mode, tid);
+		ret = -1;
+		break;
+	}
+
+	if (!ret) {
+		pr_debug("seccomp: Restored mode %d on tid %d\n",
+			 args->seccomp_mode, tid);
+	}
+
+	return ret;
 }
 
 static int restore_robust_futex(struct thread_restore_args *args)
@@ -554,6 +598,13 @@ long __export_restore_thread(struct thread_restore_args *args)
 		sys_close(fd);
 	}
 
+	/*
+	 * Must run before restore_creds(): setting seccomp is a privileged
+	 * operation bound to uid 0 in the current user ns.
+	 */
+	if (restore_seccomp(args))
+		goto core_restore_end;
+
 	ret = restore_creds(args->creds_args, args->ta->proc_fd);
 	if (ret)
 		goto core_restore_end;
@@ -572,9 +623,6 @@ long __export_restore_thread(struct thread_restore_args *args)
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_SIGCHLD);
 	restore_pdeath_sig(args);
 
-	if (args->ta->seccomp_mode != SECCOMP_MODE_DISABLED)
-		pr_info("Restoring seccomp mode %d for %ld\n", args->ta->seccomp_mode, sys_getpid());
-
 	restore_finish_stage(task_entries_local, CR_STATE_RESTORE_CREDS);
 	futex_dec_and_wake(&thread_inprogress);
 
@@ -1693,11 +1741,11 @@ long __export_restore_task(struct task_restore_args *args)
 		sys_close(fd);
 	}
 
-	/* The kernel restricts setting seccomp to uid 0 in the current user
-	 * ns, so we must do this before restore_creds.
+	/*
+	 * Must run before restore_creds(): setting seccomp is a privileged
+	 * operation bound to uid 0 in the current user ns.
 	 */
-	pr_info("restoring seccomp mode %d for %ld\n", args->seccomp_mode, sys_getpid());
-	if (restore_seccomp(args))
+	if (restore_seccomp(args->t))
 		goto core_restore_end;
 
 	/*
diff --git a/criu/seccomp.c b/criu/seccomp.c
index c8cd35f9a..8da5a2932 100644
--- a/criu/seccomp.c
+++ b/criu/seccomp.c
@@ -21,6 +21,8 @@
 #undef LOG_PREFIX
 #define LOG_PREFIX "seccomp: "
 
+static SeccompEntry *seccomp_img_entry;
+
 /* populated on dump during collect_seccomp_filters() */
 static int next_filter_id = 0;
 static struct seccomp_info **filters = NULL;
@@ -233,10 +235,8 @@ int collect_seccomp_filters(void)
 	return 0;
 }
 
-/* Populated on restore by prepare_seccomp_filters */
-static SeccompEntry *se;
-
-int prepare_seccomp_filters(void)
+/* The seccomp_img_entry will be shared between all children */
+int seccomp_read_image(void)
 {
 	struct cr_img *img;
 	int ret;
@@ -245,66 +245,150 @@ int prepare_seccomp_filters(void)
 	if (!img)
 		return -1;
 
-	ret = pb_read_one_eof(img, &se, PB_SECCOMP);
+	ret = pb_read_one_eof(img, &seccomp_img_entry, PB_SECCOMP);
 	close_image(img);
 	if (ret <= 0)
 		return 0; /* there were no filters */
 
-	BUG_ON(!se);
+	BUG_ON(!seccomp_img_entry);
 
 	return 0;
 }
 
-int seccomp_filters_get_rst_pos(CoreEntry *core, struct task_restore_args *ta)
+/* seccomp_img_entry is freed by each child after forking */
+static void free_seccomp_filters(void)
 {
-	SeccompFilter *sf = NULL;
-	struct sock_fprog *arr = NULL;
-	void *filter_data = NULL;
-	int ret = -1, i, n_filters;
-	size_t filter_size = 0;
-
-	ta->seccomp_filters_n = 0;
-
-	if (!core->tc->has_seccomp_filter)
-		return 0;
-
-	ta->seccomp_filters = (struct sock_fprog *)rst_mem_align_cpos(RM_PRIVATE);
-
-	BUG_ON(core->tc->seccomp_filter > se->n_seccomp_filters);
-	sf = se->seccomp_filters[core->tc->seccomp_filter];
-
-	while (1) {
-		ta->seccomp_filters_n++;
-		filter_size += sf->filter.len;
-
-		if (!sf->has_prev)
-			break;
-
-		sf = se->seccomp_filters[sf->prev];
+	if (seccomp_img_entry) {
+		seccomp_entry__free_unpacked(seccomp_img_entry, NULL);
+		seccomp_img_entry = NULL;
 	}
-
-	n_filters = ta->seccomp_filters_n;
-	arr = rst_mem_alloc(sizeof(struct sock_fprog) * n_filters + filter_size, RM_PRIVATE);
-	if (!arr)
-		goto out;
-
-	filter_data = &arr[n_filters];
-	sf = se->seccomp_filters[core->tc->seccomp_filter];
-	for (i = 0; i < n_filters; i++) {
-		struct sock_fprog *fprog = &arr[i];
-
-		BUG_ON(sf->filter.len % sizeof(struct sock_filter));
-		fprog->len = sf->filter.len / sizeof(struct sock_filter);
-
-		memcpy(filter_data, sf->filter.data, sf->filter.len);
-
-		filter_data += sf->filter.len;
-		sf = se->seccomp_filters[sf->prev];
-	}
-
-	ret = 0;
-
-out:
-	seccomp_entry__free_unpacked(se, NULL);
-	return ret;
+}
+
+/*
+ * Re-derive the thread's filter pointers from seccomp_filters_pos via
+ * rst_mem_remap_ptr() and point each sock_fprog.filter at its slice of
+ * the instruction data that follows the descriptor array.
+ */
+void seccomp_rst_reloc(struct thread_restore_args *args)
+{
+	size_t j, off;
+
+	if (!args->seccomp_filters_n)
+		return;
+
+	args->seccomp_filters = rst_mem_remap_ptr(args->seccomp_filters_pos, RM_PRIVATE);
+	args->seccomp_filters_data = (void *)args->seccomp_filters +
+			args->seccomp_filters_n * sizeof(struct thread_seccomp_filter);
+
+	for (j = off = 0; j < args->seccomp_filters_n; j++) {
+		struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+		f->sock_fprog.filter = args->seccomp_filters_data + off;
+		off += f->sock_fprog.len * sizeof(struct sock_filter);
+	}
+}
+
+/*
+ * For every thread of @item fill the seccomp part of its
+ * thread_restore_args (laid out right after @ta). In filter mode the
+ * chain is walked through ->prev links, validated, and copied into
+ * RM_PRIVATE restorer memory as an array of thread_seccomp_filter
+ * followed by the raw BPF instructions. Returns 0 on success and a
+ * negative value on error.
+ */
+int seccomp_prepare_threads(struct pstree_item *item, struct task_restore_args *ta)
+{
+	struct thread_restore_args *args_array = (struct thread_restore_args *)(&ta[1]);
+	size_t i, j, nr_filters, filters_size, rst_size, off;
+
+	for (i = 0; i < item->nr_threads; i++) {
+		ThreadCoreEntry *thread_core = item->core[i]->thread_core;
+		struct thread_restore_args *args = &args_array[i];
+		SeccompFilter *sf;
+
+		args->seccomp_mode = SECCOMP_MODE_DISABLED;
+		args->seccomp_filters_pos = 0;
+		args->seccomp_filters_n = 0;
+		args->seccomp_filters = NULL;
+		args->seccomp_filters_data = NULL;
+
+		if (thread_core->has_seccomp_mode)
+			args->seccomp_mode = thread_core->seccomp_mode;
+
+		if (args->seccomp_mode != SECCOMP_MODE_FILTER)
+			continue;
+
+		if (!seccomp_img_entry) {
+			pr_err("No seccomp image data for filter mode on tid %d\n",
+			       item->threads[i]->ns[0].virt);
+			return -1;
+		}
+
+		if (thread_core->seccomp_filter >= seccomp_img_entry->n_seccomp_filters) {
+			pr_err("Corrupted filter index on tid %d (%u >= %zu)\n",
+			       item->threads[i]->ns[0].virt, thread_core->seccomp_filter,
+			       seccomp_img_entry->n_seccomp_filters);
+			return -1;
+		}
+
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		if (sf->filter.len % (sizeof(struct sock_filter))) {
+			pr_err("Corrupted filter len on tid %d (index %u)\n",
+			       item->threads[i]->ns[0].virt,
+			       thread_core->seccomp_filter);
+			return -1;
+		}
+		filters_size = sf->filter.len;
+		nr_filters = 1;
+
+		while (sf->has_prev) {
+			if (sf->prev >= seccomp_img_entry->n_seccomp_filters) {
+				pr_err("Corrupted filter index on tid %d (%u >= %zu)\n",
+				       item->threads[i]->ns[0].virt, sf->prev,
+				       seccomp_img_entry->n_seccomp_filters);
+				return -1;
+			}
+
+			sf = seccomp_img_entry->seccomp_filters[sf->prev];
+			if (sf->filter.len % (sizeof(struct sock_filter))) {
+				pr_err("Corrupted filter len on tid %d (index %u)\n",
+				       item->threads[i]->ns[0].virt, sf->prev);
+				return -1;
+			}
+			filters_size += sf->filter.len;
+			nr_filters++;
+		}
+
+		args->seccomp_filters_n = nr_filters;
+
+		rst_size = filters_size + nr_filters * sizeof(struct thread_seccomp_filter);
+		args->seccomp_filters_pos = rst_mem_align_cpos(RM_PRIVATE);
+		args->seccomp_filters = rst_mem_alloc(rst_size, RM_PRIVATE);
+		if (!args->seccomp_filters) {
+			pr_err("Can't allocate %zu bytes for filters on tid %d\n",
+			       rst_size, item->threads[i]->ns[0].virt);
+			return -ENOMEM;
+		}
+		args->seccomp_filters_data = (void *)args->seccomp_filters +
+			nr_filters * sizeof(struct thread_seccomp_filter);
+
+		sf = seccomp_img_entry->seccomp_filters[thread_core->seccomp_filter];
+		for (j = off = 0; j < nr_filters; j++) {
+			struct thread_seccomp_filter *f = &args->seccomp_filters[j];
+
+			f->sock_fprog.len = sf->filter.len / sizeof(struct sock_filter);
+			f->sock_fprog.filter = args->seccomp_filters_data + off;
+			f->flags = sf->flags;
+
+			memcpy(f->sock_fprog.filter, sf->filter.data, sf->filter.len);
+
+			off += sf->filter.len;
+			/* The last filter in the chain has no ->prev to follow */
+			if (sf->has_prev)
+				sf = seccomp_img_entry->seccomp_filters[sf->prev];
+		}
+	}
+
+	free_seccomp_filters();
+	return 0;
 }
diff --git a/images/core.proto b/images/core.proto
index 0291fae68..726803646 100644
--- a/images/core.proto
+++ b/images/core.proto
@@ -40,8 +40,9 @@ message task_core_entry {
 
 	optional signal_queue_entry signals_s = 10;
 
-	optional seccomp_mode seccomp_mode = 11;
-	optional uint32 seccomp_filter = 12;
+	/* These two are deprecated, should be per-thread */
+	optional seccomp_mode old_seccomp_mode = 11;
+	optional uint32 old_seccomp_filter = 12;
 
 	optional uint32 loginuid = 13;
 
@@ -87,6 +88,9 @@ message thread_core_entry {
 	optional signal_queue_entry signals_p = 9;
 
 	optional creds_entry creds = 10;
+
+	optional seccomp_mode seccomp_mode = 11;
+	optional uint32 seccomp_filter = 12;
 }
 
 message task_rlimits_entry {