2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-21 17:37:39 +00:00

rseq: handle rseq/rseq_cs flags properly

Userspace may configure rseq cs abort policy by
setting RSEQ_CS_FLAG_NO_RESTART_ON_* flags.

In ("cr-dump: fixup thread IP when inside rseq cs") we added support for
the case when a process is caught by CRIU while executing an rseq cs, by
fixing up the IP to abort_ip. That's the common case, but there is a special
flag called RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL; in that case we have to leave
the process IP as it was before CRIU seized it. Unfortunately, that's not
all we need here. We must also preserve the (struct rseq)->rseq_cs field.

You may ask: "why do we need to preserve it by hand? CRIU dumps all
process memory and restores it". That's true, but it's not that easy. The
problem here is that the kernel clears this field when it notices that
the process has left the rseq cs. But during the dump/restore procedures we
execute the parasite/restorer from the process context. This means that the
process will leave the rseq cs in any case and (struct rseq)->rseq_cs will be
cleared by the kernel. So we need to restore this field by hand at the *last*
stage of restore, just before releasing the processes.

Signed-off-by: Alexander Mikhalitsyn <alexander.mikhalitsyn@virtuozzo.com>
This commit is contained in:
Alexander Mikhalitsyn 2022-02-22 18:22:45 +03:00 committed by Andrei Vagin
parent 13338dee5c
commit 1e0bed3d69
5 changed files with 164 additions and 34 deletions

View File

@ -1034,13 +1034,13 @@ static int dump_task_signals(pid_t pid, struct pstree_item *item)
return 0;
}
static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, struct rseq_cs *rseq_cs)
static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseqc, struct rseq_cs *rseq_cs,
struct criu_rseq *rseq)
{
int ret;
uint64_t addr;
/* rseq is not registered */
if (!rseq->rseq_abi_pointer)
if (!rseqc->rseq_abi_pointer)
return 0;
/*
@ -1055,23 +1055,20 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
* then rseq_ip_fixup() -> clear_rseq_cs() and user space memory with struct rseq
* will be cleared. So, let's use ptrace(PTRACE_PEEKDATA).
*/
ret = ptrace_peek_area(tid, &addr, decode_pointer(rseq->rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs)),
sizeof(uint64_t));
ret = ptrace_peek_area(tid, rseq, decode_pointer(rseqc->rseq_abi_pointer), sizeof(struct criu_rseq));
if (ret) {
pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs addr\n", tid, (unsigned long)&addr,
(unsigned long)(rseq->rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs)),
(unsigned long)sizeof(uint64_t));
pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq struct\n", tid, (unsigned long)rseq,
(unsigned long)(rseqc->rseq_abi_pointer), (unsigned long)sizeof(uint64_t));
return -1;
}
/* (struct rseq)->rseq_cs is NULL */
if (!addr)
if (!rseq->rseq_cs)
return 0;
ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(addr), sizeof(struct rseq_cs));
ret = ptrace_peek_area(tid, rseq_cs, decode_pointer(rseq->rseq_cs), sizeof(struct rseq_cs));
if (ret) {
pr_err("ptrace_peek_area(%d, %lx, %lx, %lx): fail to read rseq_cs struct\n", tid,
(unsigned long)rseq_cs, (unsigned long)addr, (unsigned long)sizeof(struct rseq_cs));
(unsigned long)rseq_cs, (unsigned long)rseq->rseq_cs, (unsigned long)sizeof(struct rseq_cs));
return -1;
}
@ -1080,11 +1077,12 @@ static int read_rseq_cs(pid_t tid, struct __ptrace_rseq_configuration *rseq, str
static int dump_thread_rseq(struct pstree_item *item, int i)
{
struct __ptrace_rseq_configuration rseq;
struct __ptrace_rseq_configuration rseqc;
RseqEntry *rseqe = NULL;
int ret;
CoreEntry *core = item->core[i];
RseqEntry **rseqep = &core->thread_core->rseq_entry;
struct criu_rseq rseq = {};
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
pid_t tid = item->threads[i].real;
@ -1099,20 +1097,20 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
if (!kdat.has_ptrace_get_rseq_conf)
return 0;
ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseq), &rseq);
if (ret != sizeof(rseq)) {
ret = ptrace(PTRACE_GET_RSEQ_CONFIGURATION, tid, sizeof(rseqc), &rseqc);
if (ret != sizeof(rseqc)) {
pr_perror("ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) = %d", tid, ret);
return -1;
}
if (rseq.flags != 0) {
if (rseqc.flags != 0) {
pr_err("something wrong with ptrace(PTRACE_GET_RSEQ_CONFIGURATION, %d) flags = 0x%x\n", tid,
rseq.flags);
rseqc.flags);
return -1;
}
pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseq.rseq_abi_pointer,
rseq.signature);
pr_info("Dump rseq of %d: ptr = 0x%lx sign = 0x%x\n", tid, (unsigned long)rseqc.rseq_abi_pointer,
rseqc.signature);
rseqe = xmalloc(sizeof(*rseqe));
if (!rseqe)
@ -1120,13 +1118,24 @@ static int dump_thread_rseq(struct pstree_item *item, int i)
rseq_entry__init(rseqe);
rseqe->rseq_abi_pointer = rseq.rseq_abi_pointer;
rseqe->rseq_abi_size = rseq.rseq_abi_size;
rseqe->signature = rseq.signature;
rseqe->rseq_abi_pointer = rseqc.rseq_abi_pointer;
rseqe->rseq_abi_size = rseqc.rseq_abi_size;
rseqe->signature = rseqc.signature;
if (read_rseq_cs(tid, &rseq, rseq_cs))
if (read_rseq_cs(tid, &rseqc, rseq_cs, &rseq))
goto err;
/* we won't save rseq_cs to the image (only pointer),
* so let's combine flags from both struct rseq and struct rseq_cs
* (kernel does the same when interpreting RSEQ_CS_FLAG_*)
*/
rseq_cs->flags |= rseq.flags;
if (rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL) {
rseqe->has_rseq_cs_pointer = true;
rseqe->rseq_cs_pointer = rseq.rseq_cs;
}
/* save rseq entry to the image */
*rseqep = rseqe;
@ -1176,11 +1185,12 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
struct rseq_cs *rseq_cs = &dmpi(item)->thread_rseq_cs[i];
pid_t tid = item->threads[i].real;
/* (struct rseq)->rseq_cs is NULL */
/* equivalent to (struct rseq)->rseq_cs is NULL */
if (!rseq_cs->start_ip)
return 0;
pr_info("fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
pr_debug(
"fixup_thread_rseq for %d: rseq_cs start_ip = %llx abort_ip = %llx post_commit_offset = %llx flags = %x version = %x; IP = %lx\n",
tid, rseq_cs->start_ip, rseq_cs->abort_ip, rseq_cs->post_commit_offset, rseq_cs->flags,
rseq_cs->version, (unsigned long)TI_IP(core));
@ -1192,25 +1202,35 @@ static int fixup_thread_rseq(struct pstree_item *item, int i)
if (task_in_rseq(rseq_cs, TI_IP(core))) {
struct pid *tid = &item->threads[i];
pr_info("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
tid->real);
/*
* We need to fixup task instruction pointer from
* the original one (which lays inside rseq critical section)
* to rseq abort handler address.
* to rseq abort handler address. But we need to look on rseq_cs->flags
* (please refer to struct rseq -> flags field description).
* Naive idea of flags support may be like... let's change instruction pointer (IP)
* to rseq_cs->abort_ip if !(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL).
* But unfortunately, it doesn't work properly, because the kernel does
* clean up of rseq_cs field in the struct rseq (modifies userspace memory).
* So, we need to preserve original value of (struct rseq)->rseq_cs field in the
* image and restore it's value before releasing threads (see restore_rseq_cs()).
*
* It's worth to mention that we need to fixup IP in CoreEntry
* (used when full dump/restore is performed) and also in
* the parasite regs storage (used if --leave-running option is used,
* or if dump error occurred and process execution is resumed).
*/
TI_IP(core) = rseq_cs->abort_ip;
if (item->pid->real == tid->real) {
compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
} else {
compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
if (!(rseq_cs->flags & RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL)) {
pr_warn("The %d task is in rseq critical section. IP will be set to rseq abort handler addr\n",
tid->real);
TI_IP(core) = rseq_cs->abort_ip;
if (item->pid->real == tid->real) {
compel_set_leader_ip(dmpi(item)->parasite_ctl, rseq_cs->abort_ip);
} else {
compel_set_thread_ip(dmpi(item)->thread_ctls[i], rseq_cs->abort_ip);
}
}
}

View File

@ -812,6 +812,23 @@ static int open_cores(int pid, CoreEntry *leader_core)
}
}
for (i = 0; i < current->nr_threads; i++) {
ThreadCoreEntry *tc = cores[i]->thread_core;
struct rst_rseq *rseqs = rsti(current)->rseqe;
RseqEntry *rseqe = tc->rseq_entry;
/* compatibility with older CRIU versions */
if (!rseqe)
continue;
/* rseq cs had no RSEQ_CS_FLAG_NO_RESTART_ON_SIGNAL */
if (!rseqe->has_rseq_cs_pointer)
continue;
rseqs[i].rseq_abi_pointer = rseqe->rseq_abi_pointer;
rseqs[i].rseq_cs_pointer = rseqe->rseq_cs_pointer;
}
return 0;
err:
xfree(cores);
@ -1963,6 +1980,50 @@ static int attach_to_tasks(bool root_seized)
return 0;
}
/*
 * Write the saved rseq_cs pointers back into the restored threads.
 *
 * Walks every alive pstree item. For a single-threaded item the thread id
 * is taken from item->pid->real; otherwise the thread list is re-read with
 * parse_threads(). Then, for each thread whose rst_rseq slot has both
 * rseq_abi_pointer and rseq_cs_pointer set, the saved rseq_cs_pointer is
 * poked into the thread at
 * rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs).
 *
 * Returns 0 on success and -1 if parse_threads() fails, the per-item
 * rst_rseq array is missing, or the poke fails.
 */
static int restore_rseq_cs(void)
{
	struct pstree_item *item;

	for_each_pstree_item(item) {
		int t;

		if (!task_alive(item))
			continue;

		if (item->nr_threads != 1) {
			if (parse_threads(item->pid->real, &item->threads, &item->nr_threads)) {
				pr_err("restore_rseq_cs: parse_threads failed\n");
				return -1;
			}
		} else {
			item->threads[0].real = item->pid->real;
		}

		for (t = 0; t < item->nr_threads; t++) {
			struct rst_rseq *saved = rsti(item)->rseqe;
			pid_t pid = item->threads[t].real;
			uint64_t cs_field_addr;

			if (!saved) {
				pr_err("restore_rseq_cs: rsti(item)->rseqe is NULL\n");
				return -1;
			}

			/* Slots left zeroed have nothing to restore for this thread. */
			if (!(saved[t].rseq_cs_pointer && saved[t].rseq_abi_pointer))
				continue;

			/* Address of the rseq_cs member inside the thread's rseq area. */
			cs_field_addr = saved[t].rseq_abi_pointer + offsetof(struct criu_rseq, rseq_cs);

			if (ptrace_poke_area(pid, &saved[t].rseq_cs_pointer, decode_pointer(cs_field_addr),
					     sizeof(uint64_t))) {
				pr_err("Can't restore rseq_cs pointer (pid: %d)\n", pid);
				return -1;
			}
		}
	}

	return 0;
}
static int catch_tasks(bool root_seized, enum trace_flags *flag)
{
struct pstree_item *item;
@ -2420,6 +2481,10 @@ skip_ns_bouncing:
if (restore_freezer_state())
pr_err("Unable to restore freezer state\n");
/* just before releasing threads we have to restore rseq_cs */
if (restore_rseq_cs())
pr_err("Unable to restore rseq_cs state\n");
/* Detaches from processes and they continue run through sigreturn. */
if (finalize_restore_detach())
goto out_kill_network_unlocked;

View File

@ -6,6 +6,7 @@
#include "vma.h"
#include "kerndat.h"
#include "images/mm.pb-c.h"
#include "images/core.pb-c.h"
struct task_entries {
int nr_threads, nr_tasks, nr_helpers;
@ -26,6 +27,11 @@ struct fdt {
futex_t fdt_lock;
};
/*
 * Per-thread rseq state kept for the final stage of restore.
 * open_cores() fills one entry per thread from the image's RseqEntry when
 * that entry has rseq_cs_pointer set; restore_rseq_cs() consumes it.
 */
struct rst_rseq {
/* Copied from RseqEntry.rseq_abi_pointer; base address used for the poke. */
uint64_t rseq_abi_pointer;
/* Copied from RseqEntry.rseq_cs_pointer; value written to the rseq_cs member. */
uint64_t rseq_cs_pointer;
};
struct rst_info {
struct list_head fds;
@ -69,6 +75,8 @@ struct rst_info {
bool has_thp_enabled;
struct rst_rseq *rseqe;
void *breakpoint;
};

View File

@ -954,6 +954,31 @@ static int prepare_pstree_kobj_ids(void)
return 0;
}
/*
 * Give every alive pstree item a zero-filled array of struct rst_rseq,
 * one element per thread (item->nr_threads), obtained from shmalloc()
 * and stored in rsti(item)->rseqe.
 *
 * Returns 0 on success, -1 if an allocation fails.
 */
static int prepare_pstree_rseqs(void)
{
	struct pstree_item *item;

	for_each_pstree_item(item) {
		struct rst_rseq *slots;
		size_t bytes;

		if (!task_alive(item))
			continue;

		bytes = sizeof(*slots) * item->nr_threads;
		slots = shmalloc(bytes);
		if (!slots) {
			pr_err("prepare_pstree_rseqs shmalloc(%lu) failed\n", (unsigned long)bytes);
			return -1;
		}

		/* All-zero slots mean "nothing to restore" until they are filled in. */
		memset(slots, 0, bytes);
		rsti(item)->rseqe = slots;
	}

	return 0;
}
int prepare_pstree(void)
{
int ret;
@ -1011,6 +1036,17 @@ int prepare_pstree(void)
* pstree with properly injected helper tasks.
*/
ret = prepare_pstree_ids(pid);
if (!ret)
/*
* We need to alloc shared buffers for RseqEntry'es
* arrays (one RseqEntry per pstree item thread).
*
* We need shared memory because we perform
* open_core() on the late stage inside
* restore_one_alive_task(), so that's the only
* way to transfer that data to the main CRIU process.
*/
ret = prepare_pstree_rseqs();
return ret;
}

View File

@ -6,4 +6,5 @@ message rseq_entry {
required uint64 rseq_abi_pointer = 1;
required uint32 rseq_abi_size = 2;
required uint32 signature = 3;
optional uint64 rseq_cs_pointer = 4;
}