2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00

Initial commit

Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
This commit is contained in:
Cyrill Gorcunov 2011-09-23 12:00:45 +04:00
commit 523de23624
53 changed files with 12644 additions and 0 deletions

9
.gitignore vendored Normal file
View File

@ -0,0 +1,9 @@
*.o
*.d
*.img
*.bin
*.elf
*.out
cscope*
tags
TAGS

171
Makefile Normal file
View File

@ -0,0 +1,171 @@
# Build verbosity control.
# With V unset or empty, Q silences the recipe commands and E echoes a short
# progress line. With V set, E expands to "@#", which turns the progress line
# into a silent shell comment, and Q is empty so full commands are shown.
ifeq ($(strip $(V)),)
E = @echo
Q = @
else
E = @\#
Q =
endif
export E Q
# External tools used by the rules below.
FIND := find
CSCOPE := cscope
TAGS := ctags
RM := rm
LD := ld
HEXDUMP := hexdump
CC := gcc
ECHO := echo
NM := nm
AWK := awk
SH := sh
# Project headers live in ./include; the build is unoptimized with full
# gdb debug info.
CFLAGS += -I./include
CFLAGS += -O0 -ggdb3
LIBS += -lrt
# Additional ARCH settings for x86
# ARCH is only assigned here when not already set. "?=" creates a
# recursively expanded variable, so $(uname_M), defined just below, is
# resolved when ARCH is used rather than at this line.
ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
-e s/arm.*/arm/ -e s/sa110/arm/ \
-e s/s390x/s390/ -e s/parisc64/parisc/ \
-e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
-e s/sh[234].*/sh/ )
uname_M := $(shell uname -m | sed -e s/i.86/i386/)
# On x86 hosts ARCH is forced to x86 and a CONFIG_X86_{32,64} define selects
# the word size for the sources.
ifeq ($(uname_M),i386)
ARCH := x86
DEFINES += -DCONFIG_X86_32
endif
ifeq ($(uname_M),x86_64)
ARCH := x86
DEFINES += -DCONFIG_X86_64
endif
DEFINES += -D_FILE_OFFSET_BITS=64
DEFINES += -D_GNU_SOURCE
# -Werror is enabled unless the build is invoked with WERROR=0.
ifneq ($(WERROR),0)
WARNINGS += -Werror
endif
WARNINGS += -Wall -Wno-unused
CFLAGS += $(WARNINGS) $(DEFINES)
# Final binaries: the tool itself plus three test programs.
PROGRAM := crtools
TESTEE := testee
TESTEE-TH := testee-threads
TESTEE-STATIC := testee-static
all: $(PROGRAM) $(TESTEE) $(TESTEE-TH) $(TESTEE-STATIC)
# Objects linked into $(PROGRAM).
OBJS += crtools.o
OBJS += parasite-syscall.o
OBJS += cr-dump.o
OBJS += cr-restore.o
OBJS += cr-show.o
OBJS += util.o
OBJS += rbtree.o
OBJS += elf.o
OBJS-TESTEE += testee.o
OBJS-TESTEE-TH += testee-threads.o
# The parasite object is built separately (with -fpic, see its rule) and
# turned into a binary blob plus generated headers.
OBJS-BLOB += parasite.o
# Dependency files (.d), one per object.
DEPS := $(patsubst %.o,%.d,$(OBJS))
DEPS-TESTEE := $(patsubst %.o,%.d,$(OBJS-TESTEE))
DEPS-TESTEE-TH := $(patsubst %.o,%.d,$(OBJS-TESTEE-TH))
DEPS-BLOB := $(patsubst %.o,%.d,$(OBJS-BLOB))
# File names derived from the blob object name: source, header, generated
# "-blob.h" header, raw binary, linker script and include-guard identifier.
SRCS-BLOB += $(patsubst %.o,%.c,$(OBJS-BLOB))
HEAD-BLOB := $(patsubst %.o,%.h,$(OBJS-BLOB))
HEAD-BLOB-GEN := $(patsubst %.o,%-blob.h,$(OBJS-BLOB))
HEAD-BIN := $(patsubst %.o,%.bin,$(OBJS-BLOB))
HEAD-LDS := $(patsubst %.o,%.lds.S,$(OBJS-BLOB))
HEAD-IDS := $(patsubst %.h,%_h__,$(subst -,_,$(HEAD-BLOB)))
# Blob object: compiled like the other objects but with -fpic added.
$(OBJS-BLOB): $(SRCS-BLOB) $(DEPS-BLOB)
$(E) " CC " $@
$(Q) $(CC) -c $(CFLAGS) -fpic $< -o $@
# Blob binary: the object is linked twice, once with <name>.lds.S to produce
# <name>.bin and once with <name>-elf.lds.S to produce <name>.bin.o.
$(HEAD-BIN): $(OBJS-BLOB) $(HEAD-LDS)
%.bin: %.o
$(E) " GEN " $@
$(Q) $(LD) -T $(patsubst %.bin,%.lds.S,$@) $< -o $@
$(Q) $(LD) -T $(patsubst %.bin,%-elf.lds.S,$@) $< -o $@.o
# Generated blob header: gen-offsets.sh is run with identifiers derived from
# the header name; its output is redirected into <name>-blob.h even though
# the rule target is <name>.h.
$(HEAD-BLOB): $(DEPS-BLOB) $(HEAD-BIN)
%-blob.h: %.bin
%.h: %.bin
$(E) " GEN " $@
$(Q) $(SH) gen-offsets.sh \
$(subst -,_,$(patsubst %.h,%,$@))_h__ \
$(subst -,_,$(patsubst %.h,%,$@))_blob_offset__ \
$(subst -,_,$(patsubst %.h,%,$@))_blob \
$(patsubst %.h,%.o,$@) \
$(patsubst %.h,%.bin,$@) > $(patsubst %.h,%-blob.h,$@)
# Regular objects depend on the blob header and on their .d files.
$(OBJS): $(HEAD-BLOB) $(DEPS)
$(OBJS-TESTEE): $(DEPS-TESTEE)
$(OBJS-TESTEE-TH): $(DEPS-TESTEE-TH)
%.o: %.c
$(E) " CC " $@
$(Q) $(CC) -c $(CFLAGS) $< -o $@
$(PROGRAM): $(OBJS)
$(E) " LINK " $@
$(Q) $(CC) $(OBJS) $(LIBS) -o $@
$(TESTEE): $(OBJS-TESTEE)
$(E) " LINK " $@
$(Q) $(CC) $(OBJS-TESTEE) -o $@
$(TESTEE-TH): $(OBJS-TESTEE-TH)
$(E) " LINK " $@
$(Q) $(CC) $(OBJS-TESTEE-TH) -lpthread -o $@
# NOTE(review): the static testee rules call gcc directly instead of $(CC),
# bypass $(CFLAGS), and use -I./.include where the rest of the file uses
# -I./include. This looks like a typo; confirm before changing.
$(TESTEE-STATIC).o: testee-static.c
$(Q) gcc -c -static -I./.include -o testee-static.o testee-static.c
$(TESTEE-STATIC): $(TESTEE-STATIC).o
$(Q) gcc -o testee-static -static testee-static.o
# Dependency files are generated with "cc -M", naming the matching .o as the
# rule target.
$(DEPS):
$(DEPS-TESTEE):
$(DEPS-TESTEE-TH):
$(DEPS-BLOB):
%.d: %.c
$(Q) $(CC) -M -MT $(patsubst %.d,%.o,$@) $(CFLAGS) $< -o $@
# Remove build products, images, generated headers and index files.
clean:
$(E) " CLEAN"
$(Q) rm -f ./*.o
$(Q) rm -f ./*.d
$(Q) rm -f ./*.img
$(Q) rm -f ./*.elf
$(Q) rm -f ./*.out
$(Q) rm -f ./*.bin
$(Q) rm -f ./tags
$(Q) rm -f ./cscope*
$(Q) rm -f ./$(PROGRAM)
$(Q) rm -f ./$(TESTEE)
$(Q) rm -f ./$(TESTEE-STATIC)
$(Q) rm -f ./$(TESTEE-TH)
$(Q) rm -f ./$(HEAD-BLOB)
$(Q) rm -f ./$(HEAD-BLOB-GEN)
.PHONY: clean
# Rebuild the ctags index from all .h/.c/.S files.
# NOTE(review): the recipe calls ctags directly although $(TAGS) is defined.
tags:
$(E) " GEN" $@
$(Q) $(RM) -f tags
$(Q) $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
.PHONY: tags
# Rebuild the cscope database; the file list is written to cscope.files.
cscope:
$(E) " GEN" $@
$(Q) $(FIND) . -name '*.[hcS]' -print > cscope.files
$(Q) $(CSCOPE) -bkqu
.PHONY: cscope

15
README Normal file
View File

@ -0,0 +1,15 @@
crtools
=======
A utility to checkpoint/restore tasks.
Some code is borrowed from
- Linux kernel (http://kernel.org/)
- git (http://git-scm.com/)
- kvm-tools (https://github.com/penberg/linux-kvm.git)
- ptrace-parasite (https://code.google.com/p/ptrace-parasite/)
Many thanks to these projects.
Licensed under GPLv2

977
cr-dump.c Normal file
View File

@ -0,0 +1,977 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/sendfile.h>
#include "types.h"
#include "list.h"
#include "compiler.h"
#include "crtools.h"
#include "syscall.h"
#include "util.h"
#include "image.h"
#include "parasite.h"
#include "parasite-syscall.h"
#include "parasite-blob.h"
#ifndef CONFIG_X86_64
# error No x86-32 support yet
#endif
static LIST_HEAD(vma_area_list);
static LIST_HEAD(pstree_list);
static char big_buffer[PATH_MAX];
static struct parasite_ctl *parasite_ctl;
static char loc_buf[PAGE_SIZE];
/*
 * Release every entry of the file-scope process tree list together with its
 * children array, then reset the list head to the empty state.
 */
static void free_pstree(void)
{
	struct pstree_item *cur, *next;

	list_for_each_entry_safe(cur, next, &pstree_list, list) {
		xfree(cur->children);
		xfree(cur);
	}

	INIT_LIST_HEAD(&pstree_list);
}
/*
 * Drop all collected VMA areas: close the backing file descriptor of each
 * area when it holds a positive value, free the area, and reinitialize the
 * list head.
 */
static void free_mappings(void)
{
	struct vma_area *area, *next;

	list_for_each_entry_safe(area, next, &vma_area_list, list) {
		const int file_fd = area->vm_file_fd;

		if (file_fd > 0)
			close(file_fd);
		free(area);
	}

	INIT_LIST_HEAD(&vma_area_list);
}
static int collect_mappings(pid_t pid)
{
struct vma_area *vma_area;
int ret = -1;
pr_info("\n");
pr_info("Collecting mappings (pid: %d)\n", pid);
pr_info("----------------------------------------\n");
ret = parse_maps(pid, &vma_area_list);
if (ret)
goto err;
pr_info_vma_list(&vma_area_list);
pr_info("----------------------------------------\n");
err:
return ret;
err_bogus_mapping:
pr_error("Bogus mapping %lx-%lx\n",
vma_area->vma.start,
vma_area->vma.end);
goto err;
}
/*
 * Write one fdinfo record for a regular file (or file-backed mapping).
 *
 * @type:     record type stored in the entry (FDINFO_FD, FDINFO_MAP, ...)
 * @fd_name:  identifier stored in the entry's addr field (fd number or
 *            mapping start address)
 * @lfd:      local descriptor whose path is resolved via /proc/self/fd
 * @do_close: when true, @lfd is closed by this function on success and on
 *            failure
 * @pos:      file position to record
 * @flags:    open flags to record
 * @cr_fdset: image descriptor set; the record goes to CR_FD_FDINFO
 *
 * The record is the fdinfo_entry followed by e.len bytes of path.
 * Returns 0 on success, -1 on failure.
 */
static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
			     bool do_close, unsigned long pos, unsigned int flags,
			     struct cr_fdset *cr_fdset)
{
	struct fdinfo_entry e;
	char fd_str[128];
	int len;
	int ret = -1;

	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd);
	len = readlink(fd_str, big_buffer, sizeof(big_buffer) - 1);
	if (len < 0) {
		/* Report first: close() below may overwrite errno. */
		pr_perror("Can't readlink %s\n", fd_str);
		if (do_close)
			close(lfd);
		goto err;
	}

	/* readlink() does not NUL-terminate its result. */
	big_buffer[len] = '\0';
	pr_info("Dumping path for %lx fd via self %d [%s]\n",
		fd_name, lfd, big_buffer);

	if (do_close)
		close(lfd);

	/*
	 * The whole structure is written to the image below, so clear it
	 * first to keep any padding or unset members deterministic.
	 */
	memset(&e, 0, sizeof(e));
	e.type = type;
	e.len = len;
	e.flags = flags;
	e.pos = pos;
	e.addr = fd_name;

	pr_info("fdinfo: type: %2x len: %2x flags: %4x pos: %8lx addr: %16lx\n",
		type, len, flags, pos, fd_name);

	write_ptr_safe(cr_fdset->desc[CR_FD_FDINFO].fd, &e, err);
	write_safe(cr_fdset->desc[CR_FD_FDINFO].fd, big_buffer, e.len, err);

	ret = 0;
err:
	return ret;
}
/*
 * Write a pipe entry followed by the data currently queued in the pipe.
 *
 * The pipe contents are duplicated with tee() into a temporary pipe, so the
 * original pipe keeps its data, and then spliced into the CR_FD_PIPES image
 * right after the entry. e->bytes is set to the number of bytes captured.
 *
 * Returns 0 on success, a negative value on failure.
 */
static int dump_pipe_and_data(int lfd, struct pipe_entry *e,
			      struct cr_fdset *cr_fdset)
{
	int fd_pipes;
	int steal_pipe[2];
	int pipe_size;
	int has_bytes;
	int left;
	int ret = -1;

	fd_pipes = cr_fdset->desc[CR_FD_PIPES].fd;

	pr_info("Dumping data from pipe %x\n", e->pipeid);
	if (pipe(steal_pipe) < 0) {
		pr_perror("Can't create pipe for stealing data\n");
		goto err;
	}

	pipe_size = fcntl(lfd, F_GETPIPE_SZ);
	if (pipe_size < 0) {
		pr_error("Can't obtain piped data size\n");
		/* steal_pipe is already open here and must be released. */
		goto err_close;
	}

	has_bytes = tee(lfd, steal_pipe[1], pipe_size, SPLICE_F_NONBLOCK);
	if (has_bytes < 0) {
		if (errno != EAGAIN) {
			pr_perror("Can't pick pipe data\n");
			goto err_close;
		} else
			has_bytes = 0;	/* empty pipe: nothing to copy */
	}

	e->bytes = has_bytes;
	write_ptr_safe(fd_pipes, e, err_close);

	/*
	 * splice() may move fewer bytes than requested; keep going until
	 * everything announced in e->bytes has reached the image.
	 */
	for (left = has_bytes; left > 0; left -= ret) {
		ret = splice(steal_pipe[0], NULL, fd_pipes,
			     NULL, left, 0);
		if (ret < 0) {
			pr_perror("Can't push pipe data\n");
			goto err_close;
		}
		if (ret == 0) {
			pr_error("Unexpected end of pipe data\n");
			ret = -1;
			goto err_close;
		}
	}

	ret = 0;

err_close:
	close(steal_pipe[0]);
	close(steal_pipe[1]);

err:
	return ret;
}
/*
 * Dump one pipe end.
 *
 * @fd is the descriptor number in the dumped task, @lfd the local
 * descriptor referring to the same pipe, @id the pipe identifier stored in
 * the entry. A write-only end is recorded with zero bytes of data; any
 * other end also has its queued data saved by dump_pipe_and_data().
 *
 * Returns 0 on success, non-zero on failure.
 */
static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags,
			 struct cr_fdset *cr_fdset)
{
	struct pipe_entry e;
	int ret = -1;

	pr_info("Dumping pipe %d/%x flags %x\n", fd, id, flags);

	e.fd = fd;
	e.pipeid = id;
	e.flags = flags;

	if (!(flags & O_WRONLY)) {
		ret = dump_pipe_and_data(lfd, &e, cr_fdset);
	} else {
		e.bytes = 0;
		write_ptr_safe(cr_fdset->desc[CR_FD_PIPES].fd, &e, err);
		ret = 0;
	}

err:
	if (ret)
		pr_error("Dumping pipe %d/%x flags %x\n", fd, id, flags);
	else
		pr_info("Dumped pipe: fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n",
			e.fd, e.pipeid, e.flags, e.bytes);

	return ret;
}
/*
 * Dump a single open descriptor of the target task.
 *
 * @pid_fd_dir: textual /proc/<pid>/fd path, used for diagnostics only
 * @dir:        descriptor of that directory, used with openat()
 * @fd_name:    directory entry name, i.e. the descriptor number as text
 * @pos/@flags: position and flags previously read from fdinfo
 *
 * Regular files and pipes on pipefs are dumped; descriptors "0" to "3" of
 * any other type are skipped; everything else is an error.
 *
 * The descriptor opened here is owned by this function and is closed on
 * every return path. For that reason dump_one_reg_file() is called with
 * do_close == 0.
 *
 * Returns 0 on success or skip, non-zero on failure.
 */
static int dump_one_fd(char *pid_fd_dir, int dir, char *fd_name, unsigned long pos,
		       unsigned int flags, struct cr_fdset *cr_fdset)
{
	struct statfs stfs_buf;
	struct stat st_buf;
	int ret = -1;
	int fd;

	fd = openat(dir, fd_name, O_RDONLY);
	if (fd < 0) {
		pr_perror("Failed to openat %s/%d %s\n", pid_fd_dir, dir, fd_name);
		return -1;
	}

	if (fstat(fd, &st_buf) < 0) {
		pr_perror("Can't get stat on %s\n", fd_name);
		goto out;
	}

	if (S_ISREG(st_buf.st_mode)) {
		ret = dump_one_reg_file(FDINFO_FD, atol(fd_name),
					fd, 0, pos, flags, cr_fdset);
		goto out;
	}

	if (S_ISFIFO(st_buf.st_mode)) {
		if (fstatfs(fd, &stfs_buf) < 0) {
			pr_perror("Can't fstatfs on %s\n", fd_name);
			goto out;
		}

		if (stfs_buf.f_type == PIPEFS_MAGIC) {
			ret = dump_one_pipe(atol(fd_name), fd,
					    st_buf.st_ino, flags, cr_fdset);
			goto out;
		}
	}

	/* Standard streams and the tty are not dumped at this stage. */
	ret = 0;
	if (!strcmp(fd_name, "0")) {
		pr_info("... Skipping stdin ...\n");
		goto out;
	}

	if (!strcmp(fd_name, "1")) {
		pr_info("... Skipping stdout ...\n");
		goto out;
	}

	if (!strcmp(fd_name, "2")) {
		pr_info("... Skipping stderr ...\n");
		goto out;
	}

	if (!strcmp(fd_name, "3")) {
		pr_info("... Skipping tty ...\n");
		goto out;
	}

	pr_error("Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode);
	ret = 1;

out:
	close(fd);
	return ret;
}
/*
 * Read the file position and open flags of descriptor @fd (given as text)
 * of task @pid from /proc/<pid>/fdinfo/<fd>.
 *
 * On success *pos and *flags are filled in and 0 is returned. -1 is
 * returned when the file cannot be opened, read or parsed; the outputs are
 * then not meaningful.
 */
static int read_fd_params(pid_t pid, char *fd, unsigned long *pos, unsigned int *flags)
{
	char fd_str[128];
	ssize_t len;
	int ifd;

	snprintf(fd_str, sizeof(fd_str), "/proc/%d/fdinfo/%s", pid, fd);

	ifd = open(fd_str, O_RDONLY);
	if (ifd < 0) {
		pr_perror("Can't open %s\n", fd_str);
		return -1;
	}

	/* Leave room for the terminator that sscanf() relies on. */
	len = read(ifd, big_buffer, sizeof(big_buffer) - 1);
	if (len < 0) {
		pr_perror("Can't read %s\n", fd_str);
		close(ifd);
		return -1;
	}
	close(ifd);
	big_buffer[len] = '\0';

	if (sscanf(big_buffer, "pos:\t%lu\nflags:\t%o\n", pos, flags) != 2) {
		pr_error("Can't parse %s\n", fd_str);
		return -1;
	}

	pr_info("%s: pos: %16lx flags: %16x\n", fd_str, *pos, *flags);

	return 0;
}
/*
 * Walk /proc/<pid>/fd and dump every descriptor found there.
 *
 * For each entry the position and flags are read from fdinfo first, then
 * the descriptor itself is dumped. Entries whose name starts with '.' are
 * ignored.
 *
 * Returns 0 on success, -1 on the first failure. The directory stream is
 * closed on all paths.
 */
static int dump_task_files(pid_t pid, struct cr_fdset *cr_fdset)
{
	char pid_fd_dir[64];
	struct dirent *de;
	unsigned long pos;
	unsigned int flags;
	DIR *fd_dir;
	int ret = -1;

	pr_info("\n");
	pr_info("Dumping opened files (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	snprintf(pid_fd_dir, sizeof(pid_fd_dir), "/proc/%d/fd", pid);
	fd_dir = opendir(pid_fd_dir);
	if (!fd_dir) {
		pr_perror("Can't open %s\n", pid_fd_dir);
		return -1;
	}

	while ((de = readdir(fd_dir))) {
		if (de->d_name[0] == '.')
			continue;
		if (read_fd_params(pid, de->d_name, &pos, &flags))
			goto out;
		if (dump_one_fd(pid_fd_dir, dirfd(fd_dir), de->d_name, pos, flags, cr_fdset))
			goto out;
	}

	pr_info("----------------------------------------\n");
	ret = 0;

out:
	closedir(fd_dir);
	return ret;
}
/*
 * Record the regular mappings of task @pid.
 *
 * Anonymous shared areas get a shmem entry in CR_FD_SHMEM; file-backed
 * areas (private or shared) get an fdinfo record of type FDINFO_MAP via
 * dump_one_reg_file(). Areas without VMA_AREA_REGULAR status are skipped.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int dump_task_mappings(pid_t pid, struct cr_fdset *cr_fdset)
{
	struct vma_area *vma_area;
	int ret = -1;

	pr_info("\n");
	pr_info("Dumping mappings (pid: %d)\n", pid);
	pr_info("----------------------------------------\n");

	list_for_each_entry(vma_area, &vma_area_list, list) {

		struct vma_entry *vma = &vma_area->vma;

		if (!(vma->status & VMA_AREA_REGULAR))
			continue;

		pr_info_vma(vma_area);

		switch (vma->flags) {
		case MAP_SHARED:
		case MAP_PRIVATE:

			if ((vma->status & VMA_ANON_SHARED)) {
				struct shmem_entry e;

				e.start = vma->start;
				e.end = vma->end;
				e.shmid = vma_area->shmid;

				pr_info("shmem: s: %16lx e: %16lx shmid: %16lx\n",
					e.start, e.end, e.shmid);

				write_ptr_safe(cr_fdset->desc[CR_FD_SHMEM].fd, &e, err);
			} else if ((vma->status & VMA_FILE_PRIVATE) ||
				   (vma->status & VMA_FILE_SHARED)) {

				unsigned int flags;

				/* Only writable shared file mappings need O_RDWR. */
				if (vma->prot & PROT_WRITE && (vma->status & VMA_FILE_SHARED))
					flags = O_RDWR;
				else
					flags = O_RDONLY;

				ret = dump_one_reg_file(FDINFO_MAP,
							vma->start,
							vma_area->vm_file_fd,
							0, 0, flags,
							cr_fdset);
				if (ret)
					goto err;

				/*
				 * Re-arm the failure value: later jumps to
				 * "err" (failed write, unknown VMA) must not
				 * report the success stored just above.
				 */
				ret = -1;
			}
			break;
		default:
			pr_panic("Unknown VMA (pid: %d)\n", pid);
			goto err;
			break;
		}
	}

	ret = 0;

	pr_info("----------------------------------------\n");

err:
	return ret;
}
/*
 * Member-copy helpers used when filling the core entry.
 * assign_reg copies member e from src to dst, casting the value to the type
 * of the destination member. assign_array memcpy()s sizeof(dst.e) bytes of
 * member e. The arguments are pasted without parentheses, so only simple
 * lvalue expressions should be passed.
 */
#define assign_reg(dst, src, e) dst.e = (__typeof__(dst.e))src.e
#define assign_array(dst, src, e) memcpy(&dst.e, &src.e, sizeof(dst.e))
/*
 * Read the execution domain of task @pid from /proc/<pid>/personality.
 *
 * The kernel prints this value in hexadecimal without a "0x" prefix, so it
 * is parsed with base 16. Parsing it as decimal would silently store a
 * wrong value.
 *
 * Returns 0 and fills *personality on success, -1 on failure.
 */
static int get_task_personality(pid_t pid, u32 *personality)
{
	FILE *file = NULL;
	unsigned long val;
	char *end;
	int ret = -1;

	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/personality", pid);
	file = fopen(loc_buf, "r");
	if (!file) {
		perror("Can't open task personality");
		goto err;
	}

	if (!fgets(loc_buf, sizeof(loc_buf), file)) {
		perror("Can't read task personality");
		goto err;
	}

	errno = 0;
	val = strtoul(loc_buf, &end, 16);
	if (end == loc_buf || errno == ERANGE) {
		pr_error("Can't parse task personality: %s\n", loc_buf);
		goto err;
	}

	*personality = (u32)val;
	ret = 0;

err:
	if (file)
		fclose(file);
	return ret;
}
/*
 * Read the TLS descriptors of task @pid from /proc/<pid>/tls into
 * @tls_array.
 *
 * @size must equal GDT_ENTRY_TLS_ENTRIES, and the file must contain exactly
 * that many lines, each holding two hexadecimal words.
 *
 * Returns 0 on success, -1 on failure.
 */
static int dump_task_tls(pid_t pid, struct desc_struct *tls_array, int size)
{
	FILE *file = NULL;
	int nr = 0;		/* entries parsed so far */
	int ret = -1;

	if (size != GDT_ENTRY_TLS_ENTRIES) {
		pr_error("Wrong TLS storage size: %d\n", size);
		goto err;
	}

	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/tls", pid);
	file = fopen(loc_buf, "r");
	if (!file) {
		perror("Can't open task tls");
		goto err;
	}

	while (fgets(loc_buf, sizeof(loc_buf), file)) {
		u32 a, b;

		if (sscanf(loc_buf, "%x %x", &a, &b) != 2) {
			/* The "%s" previously had no matching argument. */
			pr_error("Can't parse tls entry: %s\n", loc_buf);
			goto err;
		}

		if (nr >= GDT_ENTRY_TLS_ENTRIES) {
			pr_error("Too many entries in tls\n");
			goto err;
		}

		tls_array[nr].a = a;
		tls_array[nr].b = b;
		nr++;
	}

	if (nr != GDT_ENTRY_TLS_ENTRIES) {
		pr_error("tls returened %i entries instead of %i\n",
			 nr, GDT_ENTRY_TLS_ENTRIES);
		goto err;
	}

	ret = 0;

err:
	if (file)
		fclose(file);
	return ret;
}
/*
 * Fill a core_entry for the already-seized task pid and write it to the
 * CR_FD_CORE image.
 *
 * The entry gets the general purpose and FPU registers obtained through
 * ptrace, the TLS descriptors, the personality and a header describing the
 * format version and architecture.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int dump_task_core_seized(pid_t pid, struct cr_fdset *cr_fdset)
{
struct core_entry *core = xzalloc(sizeof(*core));
user_fpregs_struct_t fpregs = {-1};
user_regs_struct_t regs = {-1};
int fd_core = cr_fdset->desc[CR_FD_CORE].fd;
int ret = -1;
pr_info("\n");
pr_info("Dumping core (pid: %d)\n", pid);
pr_info("----------------------------------------\n");
if (!core)
goto err;
/* Position the image at MAGIC_OFFSET before the entry is written. */
lseek(fd_core, MAGIC_OFFSET, SEEK_SET);
/* NOTE(review): jerr() presumably jumps to the label on failure - confirm. */
jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_free);
jerr(ptrace(PTRACE_GETFPREGS, pid, NULL, &fpregs), err_free);
pr_info("Dumping GP/FPU registers ... ");
/* General purpose registers, copied member by member. */
assign_reg(core->gpregs, regs, r15);
assign_reg(core->gpregs, regs, r14);
assign_reg(core->gpregs, regs, r13);
assign_reg(core->gpregs, regs, r12);
assign_reg(core->gpregs, regs, bp);
assign_reg(core->gpregs, regs, bx);
assign_reg(core->gpregs, regs, r11);
assign_reg(core->gpregs, regs, r10);
assign_reg(core->gpregs, regs, r9);
assign_reg(core->gpregs, regs, r8);
assign_reg(core->gpregs, regs, ax);
assign_reg(core->gpregs, regs, cx);
assign_reg(core->gpregs, regs, dx);
assign_reg(core->gpregs, regs, si);
assign_reg(core->gpregs, regs, di);
assign_reg(core->gpregs, regs, orig_ax);
assign_reg(core->gpregs, regs, ip);
assign_reg(core->gpregs, regs, cs);
assign_reg(core->gpregs, regs, flags);
assign_reg(core->gpregs, regs, sp);
assign_reg(core->gpregs, regs, ss);
assign_reg(core->gpregs, regs, fs_base);
assign_reg(core->gpregs, regs, gs_base);
assign_reg(core->gpregs, regs, ds);
assign_reg(core->gpregs, regs, es);
assign_reg(core->gpregs, regs, fs);
assign_reg(core->gpregs, regs, gs);
/* FPU state: scalar members first, then the register arrays. */
assign_reg(core->fpregs, fpregs, cwd);
assign_reg(core->fpregs, fpregs, swd);
assign_reg(core->fpregs, fpregs, twd);
assign_reg(core->fpregs, fpregs, fop);
assign_reg(core->fpregs, fpregs, rip);
assign_reg(core->fpregs, fpregs, rdp);
assign_reg(core->fpregs, fpregs, mxcsr);
assign_reg(core->fpregs, fpregs, mxcsr_mask);
assign_array(core->fpregs, fpregs, st_space);
assign_array(core->fpregs, fpregs, xmm_space);
assign_array(core->fpregs, fpregs, padding);
pr_info("OK\n");
pr_info("Obtainting TLS ... ");
ret = dump_task_tls(pid, core->tls_array, ARRAY_SIZE(core->tls_array));
if (ret)
goto err_free;
pr_info("OK\n");
pr_info("Obtainting personality ... ");
ret = get_task_personality(pid, &core->personality);
if (ret)
goto err_free;
pr_info("OK\n");
pr_info("Dumping header ... ");
core->hdr.version = HEADER_VERSION;
core->hdr.arch = HEADER_ARCH_X86_64;
core->hdr.flags = 0;
/* ret is 0 here; NOTE(review): a failed write would jump with ret == 0. */
write_ptr_safe(fd_core, core, err_free);
pr_info("OK\n");
ret = 0;
err_free:
free(core);
err:
pr_info("----------------------------------------\n");
return ret;
}
/*
 * Build a pstree_item for task @pid by parsing the "Children:" line of
 * /proc/<pid>/status.
 *
 * Returns a newly allocated item (pid, nr_children, children array) or NULL
 * on failure. The caller owns the item and its children array. The array is
 * NULL when the task has no children.
 */
static struct pstree_item *find_children(pid_t pid)
{
	struct pstree_item *item = NULL;
	u32 *children = NULL;
	u32 nr_allocated = 0;
	u32 nr_children = 0;
	bool found = false;
	FILE *file;
	char *tok;

	pr_debug("pid: %d\n", pid);

	snprintf(loc_buf, sizeof(loc_buf), "/proc/%d/status", pid);
	file = fopen(loc_buf, "r");
	if (!file) {
		perror("Can't open task status");
		goto err;
	}

	while ((fgets(loc_buf, sizeof(loc_buf), file))) {
		if (strncmp(loc_buf, "Children:", 9)) {
			continue;
		} else {
			found = true;
			break;
		}
	}

	fclose(file), file = NULL;

	if (!found) {
		pr_error("Children marker is not found\n");
		goto err;
	}

	item = xzalloc(sizeof(*item));
	if (!item)
		goto err;

	/*
	 * Start right after the 9-character marker and let strtok() skip the
	 * separator. Starting one byte further could step over the string
	 * terminator when nothing follows the marker.
	 */
	tok = strtok(&loc_buf[9], " \t\n");
	while (tok) {
		u32 child_pid = atoi(tok);

		pr_debug("child_pid: %d\n", child_pid);

		if (nr_allocated <= nr_children) {
			nr_allocated += 64;
			/* The size is in bytes, not in elements. */
			if (xrealloc_safe((void **)&children,
					  nr_allocated * sizeof(*children))) {
				xfree(children);
				xfree(item);
				item = NULL;
				goto err;
			}
		}

		children[nr_children++] = child_pid;
		tok = strtok(NULL, " \t\n");
	}

	item->pid = pid;
	item->nr_children = nr_children;
	item->children = children;

err:
	return item;
}
/*
 * Recursively collect the process tree rooted at @pid into pstree_list.
 * Each task is appended before its children are visited.
 *
 * Returns 0 on success, non-zero as soon as any task cannot be collected.
 */
static int collect_pstree(pid_t pid)
{
	struct pstree_item *node = find_children(pid);
	unsigned long idx;

	if (!node)
		return -1;

	list_add_tail(&node->list, &pstree_list);

	for (idx = 0; idx < node->nr_children; idx++) {
		int err = collect_pstree(node->children[idx]);

		if (err)
			return err;
	}

	return 0;
}
static int dump_pstree(pid_t pid, struct cr_fdset *cr_fdset)
{
struct pstree_item *item;
struct pstree_entry e;
unsigned long i;
int ret = -1;
pr_info("\n");
pr_info("Dumping pstree (pid: %d)\n", pid);
pr_info("----------------------------------------\n");
list_for_each_entry(item, &pstree_list, list) {
pr_info("Process: %d (%d children)\n",
item->pid, item->nr_children);
e.pid = item->pid;
e.nr_children = item->nr_children;
write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd, &e, err);
pr_info("Children:");
for (i = 0; i < item->nr_children; i++) {
pr_info(" %d", item->children[i]);
write_ptr_safe(cr_fdset->desc[CR_FD_PSTREE].fd,
&item->children[i], err);
}
pr_info("\n");
}
ret = 0;
err:
pr_info("----------------------------------------\n");
return ret;
}
/*
 * Look up the collected VMA area containing @addr.
 * Returns the first matching area of vma_area_list, or NULL if none matches.
 */
static struct vma_area *find_vma_by_addr(unsigned long addr)
{
	struct vma_area *cur;
	struct vma_area *hit = NULL;

	list_for_each_entry(cur, &vma_area_list, list) {
		if (in_vma_area(cur, addr)) {
			hit = cur;
			break;
		}
	}

	return hit;
}
/*
 * kernel expects a special format in core file
 *
 * Append to the core image, after the core_entry, all collected VMA entries
 * terminated by a zeroed entry, followed by the dumped pages. Pages are read
 * back from the CR_FD_PAGES image as (address, PAGE_SIZE bytes) records and
 * routed by the status of the VMA that contains the address: private pages
 * go to the core image, anonymous shared pages go to CR_FD_PAGES_SHMEM, and
 * anything else is skipped.
 *
 * Returns 0 on success, non-zero on failure.
 */
static int finalize_core(pid_t pid, struct cr_fdset *cr_fdset)
{
int fd_pages, fd_pages_shmem, fd_core;
unsigned long num, num_anon;
struct vma_area *vma_area;
struct vma_entry ve;
int ret = -1;
u64 va;
pr_info("\n");
pr_info("Finalizing core (pid: %d)\n", pid);
pr_info("----------------------------------------\n");
fd_core = cr_fdset->desc[CR_FD_CORE].fd;
fd_pages = cr_fdset->desc[CR_FD_PAGES].fd;
fd_pages_shmem = cr_fdset->desc[CR_FD_PAGES_SHMEM].fd;
pr_debug("dsc: fd_core %d fd_pages %d fd_pages_shmem %d\n",
fd_core, fd_pages, fd_pages_shmem);
/* Core: continue right after the core_entry; page images: start at MAGIC_OFFSET. */
lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
lseek(fd_pages, MAGIC_OFFSET, SEEK_SET);
lseek(fd_pages_shmem, MAGIC_OFFSET, SEEK_SET);
num = 0;
pr_info("Appending VMAs ... ");
/* All VMAs first */
list_for_each_entry(vma_area, &vma_area_list, list) {
ret = write(fd_core, &vma_area->vma, sizeof(vma_area->vma));
if (ret != sizeof(vma_area->vma)) {
pr_perror("\nUnable to write vma entry (%li written)\n", num);
goto err;
}
num++;
}
/* Ending marker */
memset(&ve, 0, sizeof(ve));
write_ptr_safe(fd_core, &ve, err);
pr_info("OK (%li written)\n", num);
num = 0;
num_anon = 0;
pr_info("Appending pages ... ");
while (1) {
ret = read(fd_pages, &va, sizeof(va));
/* End of file ends the loop without an explicit marker being seen. */
if (!ret)
break;
if (ret != sizeof(va)) {
pr_perror("\nUnable to read VA of page (%li written)\n", num);
goto err;
}
/* Ending marker */
if (va == 0) {
write_ptr_safe(fd_core, &zero_page_entry, err);
write_ptr_safe(fd_pages_shmem, &zero_page_entry, err);
break;
}
vma_area = find_vma_by_addr((unsigned long)va);
if (!vma_area) {
pr_panic("\nA page with address %lx is unknown\n", va);
goto err;
}
/*
 * Just in case if someone broke parasite page
 * dumper code.
 */
if (!vma_area_has(vma_area, VMA_AREA_REGULAR)) {
pr_panic("\nA page with address %lx has a wrong status\n", va);
goto err;
}
if (vma_area_has(vma_area, VMA_ANON_PRIVATE) ||
vma_area_has(vma_area, VMA_FILE_PRIVATE)) {
/* Address goes out with write(); the page body follows via sendfile(). */
ret = write(fd_core, &va, sizeof(va));
ret += sendfile(fd_core, fd_pages, NULL, PAGE_SIZE);
if (ret != sizeof(va) + PAGE_SIZE) {
pr_perror("\nUnable to write VMA_FILE_PRIVATE|VMA_ANON_PRIVATE "
"page (%li, %li written)\n",
num, num_anon);
goto err;
}
num++;
} else if (vma_area_has(vma_area, VMA_ANON_SHARED)) {
ret = write(fd_pages_shmem, &va, sizeof(va));
ret += sendfile(fd_pages_shmem, fd_pages, NULL, PAGE_SIZE);
if (ret != sizeof(va) + PAGE_SIZE) {
pr_perror("\nUnable to write VMA_ANON_SHARED "
"page (%li, %li written)\n",
num, num_anon);
goto err;
}
num_anon++;
} else {
/* skip the page */
lseek(fd_pages, PAGE_SIZE, SEEK_CUR);
}
}
ret = 0;
pr_info("OK (%li written)\n", num + num_anon);
err:
pr_info("----------------------------------------\n");
return ret;
/* NOTE(review): no goto in this function targets err_strno; it is unreachable. */
err_strno:
pr_perror("Error catched\n");
goto err;
}
/*
 * Dump a single task: collect its mappings, seize it, dump the core, inject
 * the parasite to dump memory pages, cure and release the task, then dump
 * open files and mappings and finalize the core image.
 *
 * The collected mappings are released before returning.
 * Returns 0 on success, non-zero on failure.
 *
 * NOTE(review): when a step fails after seize_task() succeeded, the task is
 * not unseized here - confirm whether callers are expected to handle that.
 */
static int dump_one_task(pid_t pid, struct cr_fdset *cr_fdset)
{
	int ret = 0;

	pr_info("========================================\n");
	pr_info("Dumping task (pid: %d)\n", pid);
	pr_info("========================================\n");

	ret = collect_mappings(pid);
	if (ret) {
		pr_error("Collect mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = seize_task(pid);
	if (ret) {
		pr_error("Failed to seize task (pid: %d) with %d\n",
			 pid, ret);
		goto err;
	}

	ret = dump_task_core_seized(pid, cr_fdset);
	if (ret) {
		pr_error("Dump core (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	parasite_ctl = parasite_infect_seized(pid, NULL, &vma_area_list);
	if (!parasite_ctl) {
		pr_error("Can't infect (pid: %d) with parasite\n", pid);
		/* ret still holds 0 from the previous step; report failure. */
		ret = -1;
		goto err;
	}

	ret = parasite_dump_pages_seized(parasite_ctl, &vma_area_list,
					 cr_fdset, CR_FD_PAGES);
	if (ret) {
		pr_error("Can't dump pages (pid: %d) with parasite\n", pid);
		goto err;
	}

	ret = parasite_cure_seized(&parasite_ctl, &vma_area_list);
	if (ret) {
		pr_error("Can't cure (pid: %d) from parasite\n", pid);
		goto err;
	}

	ret = unseize_task(pid);
	if (ret) {
		pr_error("Can't unsieze (pid: %d) task\n", pid);
		goto err;
	}

	ret = dump_task_files(pid, cr_fdset);
	if (ret) {
		pr_error("Dump files (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = dump_task_mappings(pid, cr_fdset);
	if (ret) {
		pr_error("Dump mappings (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

	ret = finalize_core(pid, cr_fdset);
	if (ret) {
		pr_error("Finalizing core (pid: %d) failed with %d\n", pid, ret);
		goto err;
	}

err:
	free_mappings();
	return ret;
}
/*
 * Entry point of the dump action.
 *
 * Collects the process tree rooted at pid, stops the tasks, writes the
 * pstree image and then dumps each task into its own set of image files.
 * With leader_only set, only the first task of the tree is stopped, dumped
 * and resumed. Unless leave_stopped is set, the tasks are resumed before
 * returning, on both success and failure.
 *
 * Returns 0 on success, -1 on failure.
 */
int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped)
{
struct cr_fdset *cr_fdset = NULL;
struct pstree_item *item;
int ret = -1;
if (!leader_only) {
pr_info("========================================\n");
pr_info("Dumping process group (pid: %d)\n", pid);
pr_info("========================================\n");
}
if (collect_pstree(pid))
goto err;
/* The result of stop_task() is not checked here. */
list_for_each_entry(item, &pstree_list, list) {
stop_task(item->pid);
if (leader_only)
break;
}
/* Dump the process tree first */
cr_fdset = alloc_cr_fdset(pid);
if (!cr_fdset)
goto err;
if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_USE(CR_FD_PSTREE)))
goto err;
if (dump_pstree(pid, cr_fdset))
goto err;
close_cr_fdset(cr_fdset);
free_cr_fdset(&cr_fdset);
/* Now all other data */
list_for_each_entry(item, &pstree_list, list) {
/* Each task gets a fresh descriptor set named after its own pid. */
cr_fdset = alloc_cr_fdset(item->pid);
if (!cr_fdset)
goto err;
if (prep_cr_fdset_for_dump(cr_fdset, CR_FD_DESC_NOPSTREE))
goto err;
if (dump_one_task(item->pid, cr_fdset))
goto err;
close_cr_fdset(cr_fdset);
free_cr_fdset(&cr_fdset);
if (leader_only)
break;
}
ret = 0;
err:
if (!leave_stopped) {
list_for_each_entry(item, &pstree_list, list) {
continue_task(item->pid);
if (leader_only)
break;
}
}
free_pstree();
/*
 * NOTE(review): on the success path cr_fdset was already released above;
 * this relies on close_cr_fdset()/free_cr_fdset() tolerating that - confirm.
 */
close_cr_fdset(cr_fdset);
free_cr_fdset(&cr_fdset);
return ret;
}

1144
cr-restore.c Normal file

File diff suppressed because it is too large Load Diff

389
cr-show.c Normal file
View File

@ -0,0 +1,389 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include "types.h"
#include "list.h"
#include "compiler.h"
#include "crtools.h"
#include "syscall.h"
#include "util.h"
#include "image.h"
#ifndef CONFIG_X86_64
# error No x86-32 support yet
#endif
/*
 * Print four (pr_regs4) or three (pr_regs3) members of structure s on one
 * line. Each member is shown as its stringified name followed by its value
 * formatted with %16lx.
 */
#define pr_regs4(s, n1, n2, n3, n4) \
pr_info("%8s: %16lx " \
"%8s: %16lx " \
"%8s: %16lx " \
"%8s: %16lx\n", \
#n1, s.n1, \
#n2, s.n2, \
#n3, s.n3, \
#n4, s.n4)
#define pr_regs3(s, n1, n2, n3) \
pr_info("%8s: %16lx " \
"%8s: %16lx " \
"%8s: %16lx\n", \
#n1, s.n1, \
#n2, s.n2, \
#n3, s.n3)
static char local_buf[PAGE_SIZE];
static LIST_HEAD(pstree_list);
/*
 * FIXME: same as dump -- unify
 *
 * Free every process tree entry and its children array, then reset the list
 * head to the empty state.
 */
static void free_pstree(void)
{
	struct pstree_item *cur, *next;

	list_for_each_entry_safe(cur, next, &pstree_list, list) {
		xfree(cur->children);
		xfree(cur);
	}

	INIT_LIST_HEAD(&pstree_list);
}
static void show_regs(struct cr_fdset *cr_fdset)
{
struct user_regs_entry regs;
struct desc_struct tls;
int fd_core, i;
fd_core = cr_fdset->desc[CR_FD_CORE].fd;
if (fd_core < 0)
goto err;
pr_info("\n\t---[GP registers set]---\n");
lseek(fd_core, GET_FILE_OFF(struct core_entry, gpregs), SEEK_SET);
read_ptr_safe(fd_core, &regs, err);
pr_regs4(regs, cs, ip, ds, es);
pr_regs4(regs, ss, sp, fs, gs);
pr_regs4(regs, di, si, dx, cx);
pr_regs4(regs, ax, r8, r9, r10);
pr_regs4(regs, r11, r12, r13, r14);
pr_regs3(regs, r15, bp, bx);
pr_regs4(regs, orig_ax, flags, fs_base, gs_base);
pr_info("\n\t---[TLS area]---\n");
lseek(fd_core, GET_FILE_OFF(struct core_entry, tls_array), SEEK_SET);
for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
read_ptr_safe(fd_core, &tls, err);
pr_info("tls[%2i] = %x %x\n", i, tls.a, tls.b);
}
err:
return;
}
/*
 * Print every record of the fdinfo image: the fixed-size entry and, when
 * e.len is non-zero, the path that follows it.
 *
 * The path length comes from the image file, so it is validated against the
 * size of local_buf before anything is read into it. A corrupted or hostile
 * image could otherwise overflow the buffer.
 */
static void show_files(struct cr_fdset *cr_fdset)
{
	struct fdinfo_entry e;
	int fd_files, ret;

	pr_info("\n");
	pr_info("CR_FD_FDINFO: %s\n", cr_fdset->desc[CR_FD_FDINFO].name);
	pr_info("----------------------------------------\n");

	fd_files = cr_fdset->desc[CR_FD_FDINFO].fd;
	lseek(fd_files, MAGIC_OFFSET, SEEK_SET);

	while (1) {
		ret = read(fd_files, &e, sizeof(e));
		if (!ret)
			goto err;
		if (ret != sizeof(e)) {
			pr_perror("Can't read fdinfo entry");
			goto err;
		}

		if (e.len) {
			/* One byte must stay free for the terminator. */
			if (e.len >= sizeof(local_buf)) {
				pr_error("Path length %d in fdinfo entry is too big\n",
					 e.len);
				goto err;
			}

			ret = read(fd_files, local_buf, e.len);
			if (ret != e.len) {
				pr_perror("Can't read %d bytes\n", e.len);
				goto err;
			}
			local_buf[e.len] = 0;
			pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx --> %s\n",
				e.type, e.len, e.flags, e.pos, e.addr, local_buf);
		} else
			pr_info("type: %02x len: %02x flags: %4x pos: %8x addr: %16lx\n",
				e.type, e.len, e.flags, e.pos, e.addr);
	}

err:
	pr_info("----------------------------------------\n");
}
/*
 * Print every entry of the pipes image. The pipe data stored after an entry
 * is not shown; it is skipped using the entry's byte count.
 */
static void show_pipes(struct cr_fdset *cr_fdset)
{
	struct pipe_entry e;
	int img, nread;

	pr_info("\n");
	pr_info("CR_FD_PIPES: %s\n", cr_fdset->desc[CR_FD_PIPES].name);
	pr_info("----------------------------------------\n");

	img = cr_fdset->desc[CR_FD_PIPES].fd;
	lseek(img, MAGIC_OFFSET, SEEK_SET);

	for (;;) {
		nread = read(img, &e, sizeof(e));
		if (nread == 0)
			break;
		if (nread != sizeof(e)) {
			pr_perror("Can't read pipe entry\n");
			break;
		}

		pr_info("fd: %8lx pipeid: %8lx flags: %8lx bytes: %8lx\n",
			e.fd, e.pipeid, e.flags, e.bytes);

		if (e.bytes)
			lseek(img, e.bytes, SEEK_CUR);
	}

	pr_info("----------------------------------------\n");
}
/*
 * Print the contents of the core image: registers and TLS first, then the
 * VMA entries up to the ending marker, then the address of every stored
 * page. Page bodies are skipped, not printed.
 */
static void show_core(struct cr_fdset *cr_fdset)
{
	struct vma_area vma_area = {};
	struct vma_entry ve;
	int fd_core, ret;
	u64 va;

	pr_info("\n");
	pr_info("CR_FD_CORE: %s\n", cr_fdset->desc[CR_FD_CORE].name);
	pr_info("----------------------------------------\n");

	fd_core = cr_fdset->desc[CR_FD_CORE].fd;
	if (fd_core < 0)
		goto out;

	show_regs(cr_fdset);

	lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);

	/*
	 * Start with VMA, then pages.
	 */
	pr_info("\n\t---[VMA areas]---\n");
	for (;;) {
		ret = read(fd_core, &ve, sizeof(ve));
		if (ret == 0)
			break;
		if (ret != sizeof(ve)) {
			pr_perror("Unable to read VMA\n");
			goto out;
		}

		if (!is_ending_vma(&ve)) {
			/* Simply in a sake of fancy printing */
			vma_area.vma = ve;
			pr_info_vma(&vma_area);
			continue;
		}

		/* Ending marker reached: the rest of the file holds pages. */
		pr_info("\n\t---[Pages]---\n");
		for (;;) {
			ret = read(fd_core, &va, sizeof(va));
			if (ret == 0)
				goto out;
			if (ret != sizeof(va)) {
				pr_perror("Unable to read VA\n");
				goto out;
			}
			if (va == 0)
				goto out;
			pr_info("page va: %16lx\n", va);
			lseek(fd_core, PAGE_SIZE, SEEK_CUR);
		}
	}

out:
	pr_info("----------------------------------------\n");
}
/*
 * Print the process tree directly from an open pstree image @fd. @name is
 * only used in the header line.
 *
 * Each record is a pstree_entry followed by nr_children child pids. Printing
 * stops at end of file or at the first short read, so a truncated image is
 * reported instead of printing uninitialized data.
 */
static void show_pstree_from_file(int fd, char *name)
{
	int ret;

	pr_info("\n");
	pr_info("CR_FD_PSTREE: %s\n", name);
	pr_info("----------------------------------------\n");

	while (1) {
		struct pstree_entry e;
		unsigned long i;
		u32 child_pid;

		ret = read(fd, &e, sizeof(e));
		if (!ret)
			break;
		if (ret != sizeof(e)) {
			pr_perror("Bad pstree entry");
			break;
		}

		pr_info("Process %d number of children: %d\n",
			e.pid, e.nr_children);

		for (i = 0; i < e.nr_children; i++) {
			ret = read(fd, &child_pid,
				   sizeof(child_pid));
			if (ret != sizeof(child_pid)) {
				pr_perror("Bad pstree child entry");
				goto out;
			}
			pr_info(" %d", child_pid);
		}
		if (e.nr_children)
			pr_info("\n");
	}

out:
	pr_info("----------------------------------------\n");
}
/*
 * Print an already collected process tree list: one line per task with its
 * child count, followed by the child pids when there are any. @name is only
 * used in the header line.
 */
static void show_pstree(struct list_head *head, char *name)
{
	struct pstree_item *node;
	int n;

	pr_info("\n");
	pr_info("CR_FD_PSTREE: %s\n", name);
	pr_info("----------------------------------------\n");

	list_for_each_entry(node, head, list) {
		pr_info("Process %d number of children: %d\n",
			node->pid, node->nr_children);

		n = 0;
		while (n < node->nr_children) {
			pr_info(" %d", node->children[n]);
			n++;
		}

		if (node->nr_children)
			pr_info("\n");
	}

	pr_info("----------------------------------------\n");
}
/*
 * Read the whole pstree image into the file-scope pstree_list.
 *
 * Each record is a pstree_entry followed by nr_children child pids. @pid is
 * not used here; the image is taken from @cr_fdset.
 *
 * Returns 0 on success, -1 on a read or allocation failure. Items that were
 * already linked into pstree_list stay there and are released by
 * free_pstree(); only an item that is not yet linked is freed here.
 */
static int collect_pstree(pid_t pid, struct cr_fdset *cr_fdset)
{
	int fd = cr_fdset->desc[CR_FD_PSTREE].fd;
	struct pstree_item *item = NULL;
	struct pstree_entry e;
	int ret;

	for (;;) {
		size_t size;

		ret = read(fd, &e, sizeof(e));
		if (!ret)
			break;
		if (ret != sizeof(e)) {
			pr_perror("Wrong pstree entry\n");
			goto err;
		}

		item = xmalloc(sizeof(*item));
		if (!item)
			goto err;

		size = sizeof(u32) * e.nr_children;

		item->pid = e.pid;
		item->nr_children = e.nr_children;
		item->children = NULL;

		/* A zero-sized allocation may legitimately return NULL. */
		if (size) {
			item->children = xmalloc(size);
			if (!item->children) {
				pr_error("No memory for children pids\n");
				goto err;
			}

			ret = read(fd, item->children, size);
			if (ret < 0 || (size_t)ret != size) {
				pr_error("An error in reading children pids\n");
				goto err;
			}
		}

		list_add_tail(&item->list, &pstree_list);

		/*
		 * The list owns the item now; forget it so that a failure in
		 * a later iteration cannot free a linked entry.
		 */
		item = NULL;
	}

	return 0;

err:
	if (item) {
		xfree(item->children);
		xfree(item);
	}
	return -1;
}
/*
 * Entry point of the show action: print the images previously dumped for
 * task @pid.
 *
 * The pstree image is read and printed first. Then the core, pipes and
 * fdinfo images of every task in the tree are printed, or only those of the
 * first task when @leader_only is set.
 *
 * Returns 0 on success, non-zero on failure.
 */
int cr_show(unsigned long pid, bool leader_only)
{
	struct cr_fdset *cr_fdset;
	struct pstree_item *item;
	int ret = -1;

	cr_fdset = alloc_cr_fdset(pid);
	if (!cr_fdset)
		goto out;

	ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_ALL);
	if (ret)
		goto out;

	ret = collect_pstree(pid, cr_fdset);
	if (ret)
		goto out;

	show_pstree(&pstree_list, cr_fdset->desc[CR_FD_PSTREE].name);

	close_cr_fdset(cr_fdset);
	free_cr_fdset(&cr_fdset);

	list_for_each_entry(item, &pstree_list, list) {

		cr_fdset = alloc_cr_fdset(item->pid);
		if (!cr_fdset) {
			/* ret is 0 from the previous step; report the failure. */
			ret = -1;
			goto out;
		}

		ret = prep_cr_fdset_for_restore(cr_fdset, CR_FD_DESC_NOPSTREE);
		if (ret)
			goto out;

		show_core(cr_fdset);
		show_pipes(cr_fdset);
		show_files(cr_fdset);

		/*
		 * Release this task's images before moving on, the same way
		 * cr_dump_tasks() does. Otherwise every iteration but the
		 * last leaks its descriptor set.
		 */
		close_cr_fdset(cr_fdset);
		free_cr_fdset(&cr_fdset);

		if (leader_only)
			break;
	}

out:
	free_pstree();
	close_cr_fdset(cr_fdset);
	free_cr_fdset(&cr_fdset);
	return ret;
}

280
crtools.c Normal file
View File

@ -0,0 +1,280 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/sendfile.h>
#include "types.h"
#include "list.h"
#include "compiler.h"
#include "crtools.h"
#include "util.h"
/* Declared extern in crtools.h; main() clears it with memset() on startup. */
struct page_entry zero_page_entry;
/*
* Per image type description, indexed by the CR_FD_* enum.
*
* .fmt is the file name format; alloc_cr_fdset() feeds it a single
* long argument (the pid). .magic is the u32 written at the start of
* the file on dump and checked on restore. Note that CR_FD_PAGES and
* CR_FD_PAGES_SHMEM share PAGES_MAGIC.
*/
static struct cr_fd_desc_tmpl template[CR_FD_MAX] = {
[CR_FD_FDINFO] = {
.fmt = "fdinfo-%li.img",
.magic = FDINFO_MAGIC,
},
[CR_FD_PAGES] = {
.fmt = "pages-%li.img",
.magic = PAGES_MAGIC,
},
[CR_FD_PAGES_SHMEM] = {
.fmt = "pages-shmem-%li.img",
.magic = PAGES_MAGIC,
},
[CR_FD_CORE] = {
.fmt = "core-%li.img",
.magic = CORE_MAGIC,
},
[CR_FD_PIPES] = {
.fmt = "pipes-%li.img",
.magic = PIPES_MAGIC,
},
[CR_FD_PSTREE] = {
.fmt = "pstree-%li.img",
.magic = PSTREE_MAGIC,
},
[CR_FD_SHMEM] = {
.fmt = "shmem-%li.img",
.magic = SHMEM_MAGIC,
},
};
struct cr_fdset *alloc_cr_fdset(pid_t pid)
{
struct cr_fdset *cr_fdset;
unsigned int i;
cr_fdset = xzalloc(sizeof(*cr_fdset));
if (!cr_fdset)
goto err;
for (i = 0; i < CR_FD_MAX; i++) {
cr_fdset->desc[i].tmpl = &template[i];
snprintf(cr_fdset->desc[i].name,
sizeof(cr_fdset->desc[i].name),
cr_fdset->desc[i].tmpl->fmt,
(long)pid);
cr_fdset->desc[i].fd = -1;
}
err:
return cr_fdset;
}
int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset,
unsigned long use_mask)
{
unsigned int i;
u32 magic;
int ret = -1;
if (!cr_fdset)
goto err;
cr_fdset->use_mask = use_mask;
for (i = 0; i < CR_FD_MAX; i++) {
if (!(use_mask & CR_FD_DESC_USE(i)))
continue;
ret = unlink(cr_fdset->desc[i].name);
if (ret && errno != ENOENT) {
pr_perror("Unable to unlink %s (%s)\n",
cr_fdset->desc[i].name,
strerror(errno));
goto err;
} else
ret = -1;
cr_fdset->desc[i].fd = open(cr_fdset->desc[i].name,
O_RDWR | O_CREAT | O_EXCL,
CR_FD_PERM);
if (cr_fdset->desc[i].fd < 0) {
pr_perror("Unable to open %s (%s)\n",
cr_fdset->desc[i].name,
strerror(errno));
goto err;
}
pr_debug("Opened %s with %d\n",
cr_fdset->desc[i].name,
cr_fdset->desc[i].fd);
magic = cr_fdset->desc[i].tmpl->magic;
write_ptr_safe(cr_fdset->desc[i].fd, &magic, err);
/*
* Make sure it's on disk since we might
* need to re-open files in parasite.
*/
fsync(cr_fdset->desc[i].fd);
}
ret = 0;
err:
return ret;
}
int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset,
unsigned long use_mask)
{
unsigned int i;
int ret = -1;
u32 magic;
if (!cr_fdset)
goto err;
cr_fdset->use_mask = use_mask;
for (i = 0; i < CR_FD_MAX; i++) {
if (!(use_mask & CR_FD_DESC_USE(i)))
continue;
cr_fdset->desc[i].fd = open(cr_fdset->desc[i].name,
O_RDWR, CR_FD_PERM);
if (cr_fdset->desc[i].fd < 0) {
pr_perror("Unable to open %s (%s)\n",
cr_fdset->desc[i].name,
strerror(errno));
goto err;
}
pr_debug("Opened %s with %d\n",
cr_fdset->desc[i].name,
cr_fdset->desc[i].fd);
read_ptr_safe(cr_fdset->desc[i].fd, &magic, err);
if (magic != cr_fdset->desc[i].tmpl->magic) {
pr_error("Magic doesn't match for %s\n",
cr_fdset->desc[i].name);
goto err;
}
}
ret = 0;
err:
return ret;
}
void close_cr_fdset(struct cr_fdset *cr_fdset)
{
unsigned int i;
if (!cr_fdset)
return;
for (i = 0; i < CR_FD_MAX; i++) {
if (!(cr_fdset->use_mask & CR_FD_DESC_USE(i)))
continue;
if (cr_fdset->desc[i].fd >= 0) {
pr_debug("Closed %s with %d\n",
cr_fdset->desc[i].name,
cr_fdset->desc[i].fd);
close(cr_fdset->desc[i].fd);
cr_fdset->desc[i].fd = -1;
}
}
}
/*
 * Free the set *@cr_fdset points to and reset the caller's pointer to
 * NULL. Does nothing when @cr_fdset or *@cr_fdset is NULL. Descriptors
 * are not closed here.
 */
void free_cr_fdset(struct cr_fdset **cr_fdset)
{
	if (!cr_fdset || !*cr_fdset)
		return;

	free(*cr_fdset);
	*cr_fdset = NULL;
}
/*
 * Parse the "(-p|-t) pid" tail that every command shares.
 *
 * -p selects the leader only, -t the whole tree. Returns 0 and fills
 * @pid / @leader_only on success, -1 when the arguments are missing or
 * malformed.
 */
static int parse_task_args(int argc, char *argv[], pid_t *pid, bool *leader_only)
{
	/* argv[3] is dereferenced below, so four arguments are required */
	if (argc < 4)
		return -1;

	/* Do not look at argv[2][1] unless argv[2][0] is the option dash */
	if (argv[2][0] != '-')
		return -1;

	switch (argv[2][1]) {
	case 'p':
		*leader_only = true;
		break;
	case 't':
		*leader_only = false;
		break;
	default:
		return -1;
	}

	*pid = atol(argv[3]);
	return 0;
}

/*
 * Entry point: crtools (dump|show|restore) (-p|-t) pid
 *
 * Returns the result of the selected command, or -1 after printing the
 * usage text when the command line is not understood.
 */
int main(int argc, char *argv[])
{
	pid_t pid;
	bool leader_only;
	int ret = -1;

	BUILD_BUG_ON(PAGE_SIZE != PAGE_IMAGE_SIZE);

	if (parse_task_args(argc, argv, &pid, &leader_only))
		goto usage;

	memset(&zero_page_entry, 0, sizeof(zero_page_entry));

	if (!strcmp(argv[1], "dump"))
		ret = cr_dump_tasks(pid, leader_only, 1);
	else if (!strcmp(argv[1], "restore"))
		ret = cr_restore_tasks(pid, leader_only, 1);
	else if (!strcmp(argv[1], "show"))
		ret = cr_show(pid, leader_only);
	else
		goto usage;

	return ret;

usage:
	printk("\nUsage:\n");
	printk("\tcrtools (dump|show|restore) (-p|-t) pid\n\n");
	return -1;
}

213
elf.c Normal file
View File

@ -0,0 +1,213 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <sys/sendfile.h>
#include "types.h"
#include "list.h"
#include "compiler.h"
#include "crtools.h"
#include "syscall.h"
#include "util.h"
#include "image.h"
#include "elf.h"
/*
* Limits checked by convert_to_elf(): the number of VMA program headers
* that fit in 64K (minus one), and the number of page entries.
*/
#define ELF_MAX_PHDR ((65536U / sizeof(Elf64_Phdr)) - 1)
#define ELF_MAX_PAGES (1 << 10)
/*
* Convert the c/r core file into elf
* executable, the kernel will handle it.
*/
int convert_to_elf(char *elf_path, int fd_core)
{
Elf64_Ehdr elf_ehdr;
Elf64_Phdr elf_phdr;
Elf64_Half e_phnum = 0;
Elf64_Addr e_entry = 0;
struct page_entry page_entry;
unsigned long nrpages = 0;
struct core_entry core;
struct vma_area area;
struct vma_entry vma;
u64 va;
unsigned long phoff = 0;
unsigned long phoff_regs, phoff_pages;
int fd_elf;
int ret = -1;
fd_elf = open(elf_path, O_RDWR | O_CREAT | O_EXCL, 0700);
if (fd_elf < 0) {
pr_perror("Can't open %s\n", elf_path);
goto err;
}
memset(&elf_ehdr, 0, sizeof(elf_ehdr));
memset(&area, 0, sizeof(area));
memcpy(elf_ehdr.e_ident, ELFMAG, SELFMAG);
elf_ehdr.e_ident[EI_CLASS] = ELFCLASS64;
elf_ehdr.e_ident[EI_DATA] = ELFDATA2LSB;
elf_ehdr.e_ident[EI_VERSION] = EV_CURRENT;
elf_ehdr.e_type = ET_CKPT;
elf_ehdr.e_machine = EM_X86_64;
elf_ehdr.e_version = EV_CURRENT;
elf_ehdr.e_phoff = sizeof(elf_ehdr);
elf_ehdr.e_ehsize = sizeof(elf_ehdr);
elf_ehdr.e_phentsize = sizeof(Elf64_Phdr);
/* Get EP */
lseek(fd_core, MAGIC_OFFSET, SEEK_SET);
read_ptr_safe(fd_core, &core, err_close);
/*
* Count the numbers of segments. Each segment
* is the VMA record with appropriate permissions.
* Then we need one big segment which would hold
* all the pages dumped.
*/
lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
while(1) {
read_ptr_safe(fd_core, &vma, err_close);
if (vma.start == 0 && vma.end == 0)
break;
e_phnum++;
}
while (1) {
read_ptr_safe(fd_core, &va, err_close);
nrpages++;
if (va == 0)
break;
lseek(fd_core, PAGE_SIZE, SEEK_CUR);
}
/* Figure out if we're overflowed */
if (e_phnum > ELF_MAX_PHDR) {
pr_error("Too many VMA areas (%li of %li allowed)\n",
e_phnum, ELF_MAX_PHDR);
goto err_close;
} else if (nrpages > ELF_MAX_PAGES) {
pr_error("Too many pages to restore (%li of %li allowed)\n",
nrpages, ELF_MAX_PAGES);
goto err_close;
}
/*
* We can write elf header now.
*/
lseek(fd_elf, 0, SEEK_SET);
elf_ehdr.e_phnum = e_phnum + 2;
elf_ehdr.e_entry = core.gpregs.ip;
write_ptr_safe(fd_elf, &elf_ehdr, err_close);
/* Offset in file (after all headers) */
phoff = elf_ehdr.e_phnum * sizeof(elf_phdr) + sizeof(elf_ehdr);
/* VMAs to headers */
e_phnum = 0;
lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
while(1) {
read_ptr_safe(fd_core, &vma, err_close);
if (vma.start == 0 && vma.end == 0)
break;
memset(&elf_phdr, 0, sizeof(elf_phdr));
elf_phdr.p_type = PT_CKPT_VMA;
elf_phdr.p_offset = phoff;
elf_phdr.p_vaddr = vma.start;
elf_phdr.p_paddr = vma.start;
elf_phdr.p_filesz = sizeof(vma);
elf_phdr.p_memsz = vma.end - vma.start;
elf_phdr.p_align = 0x1000;
if (vma.prot & PROT_READ)
elf_phdr.p_flags |= PF_R;
if (vma.prot & PROT_WRITE)
elf_phdr.p_flags |= PF_W;
if (vma.prot & PROT_EXEC)
elf_phdr.p_flags |= PF_X;
write_ptr_safe(fd_elf, &elf_phdr, err_close);
phoff += sizeof(vma);
}
/* The binfmt header */
memset(&elf_phdr, 0, sizeof(elf_phdr));
elf_phdr.p_type = PT_CKPT_CORE;
elf_phdr.p_flags = PF_R;
elf_phdr.p_offset = phoff;
elf_phdr.p_vaddr = 0;
elf_phdr.p_filesz = sizeof(core);
elf_phdr.p_memsz = sizeof(core);
elf_phdr.p_align = 0x1000;
write_ptr_safe(fd_elf, &elf_phdr, err_close);
phoff += sizeof(core);
/* The pages and binfmt header */
memset(&elf_phdr, 0, sizeof(elf_phdr));
elf_phdr.p_type = PT_CKPT_PAGES;
elf_phdr.p_flags = PF_R;
elf_phdr.p_offset = phoff;
elf_phdr.p_vaddr = 0;
elf_phdr.p_filesz = nrpages * (sizeof(page_entry));
elf_phdr.p_memsz = nrpages * (sizeof(page_entry));
elf_phdr.p_align = 0x1000;
write_ptr_safe(fd_elf, &elf_phdr, err_close);
/* Now write real contents for program segments */
lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);
while(1) {
read_ptr_safe(fd_core, &vma, err_close);
if (vma.start == 0 && vma.end == 0)
break;
area.vma = vma, pr_info_vma(&area);
write_ptr_safe(fd_elf, &vma, err_close);
}
write_ptr_safe(fd_elf, &core, err_close);
if (sendfile(fd_elf, fd_core, NULL, nrpages * (sizeof(page_entry))) !=
nrpages * (sizeof(page_entry))) {
pr_perror("Can't send %li bytes to elf\n",
(long)(nrpages * (sizeof(page_entry))));
goto err;
}
ret = 0;
err_close:
close(fd_elf);
err:
return ret;
}

22
gen-offsets.sh Normal file
View File

@ -0,0 +1,22 @@
#!/bin/sh
#
# Print a C header describing a binary blob on stdout.
#
# Usage: gen-offsets.sh <guard> <offset-prefix> <array-name> <object> <binary>
#
#   guard          name of the #ifndef include guard
#   offset-prefix  prefix of the generated "#define <prefix><symbol> 0x<addr>"
#                  lines, one per text symbol (nm type T or t) of <object>
#   array-name     name of the "static char" array holding the bytes
#   object         file handed to nm
#   binary         file handed to hexdump

name_ifndef=$1
name_prefix_offset=$2
name_blob=$3
name_objname=$4
name_bin=$5

# nm prints "<address> <type> <name>": $1 is the address, $3 the name
awk_cmd="{ print \"#define $name_prefix_offset\" \$3 \" 0x\" \$1; }"

echo "/* Autogenerated file, don't edit */"
echo "#ifndef $name_ifndef"
echo "#define $name_ifndef"
echo ""
# File names are quoted so that paths with blanks or glob characters work
nm "$name_objname" | grep ' [Tt] ' | awk "$awk_cmd"
echo ""
echo "static char $name_blob[] = {"
hexdump -v -e '"\t"' -e '8/1 "0x%02x, "' -e '"\n"' "$name_bin"
echo "};"
echo ""
echo "#endif /* $name_ifndef */"

54
include/bitops.h Normal file
View File

@ -0,0 +1,54 @@
#ifndef CR_BITOPS_H_
#define CR_BITOPS_H_
#ifdef CONFIG_X86_64
/* n / d rounded up (meant for non-negative n and positive d) */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/* Number of longs needed to hold nr bits */
#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, 8 * sizeof(long))
/* Declare an unsigned long array big enough for a bitmap of `bits` bits */
#define DECLARE_BITMAP(name, bits) \
unsigned long name[BITS_TO_LONGS(bits)]
/*
* Memory operand for the bit instructions below: "+m" (read-write) on
* gcc 4.1 and later, "=m" (write-only) on older compilers.
*/
#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
/* Technically wrong, but this avoids compilation errors on some gcc
versions. */
#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
#else
#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
#endif
/* Shorthand for functions whose pointer parameter is named `addr` */
#define ADDR BITOP_ADDR(addr)
/*
* Set bit @nr in the bitmap at @addr with the x86 `bts` instruction.
* The instruction is issued without a lock prefix; the "memory" clobber
* keeps the compiler from caching memory contents across the call.
*/
static void set_bit(int nr, volatile unsigned long *addr)
{
asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
}
/*
* Toggle bit @nr in the bitmap at @addr with the x86 `btc` instruction
* (no lock prefix, no "memory" clobber).
*/
static void change_bit(int nr, volatile unsigned long *addr)
{
asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
}
/*
* Test bit @nr in the bitmap at @addr.
*
* `bt` copies the bit into the carry flag and `sbb %0,%0` turns the
* carry into the result, so the return value is 0 when the bit is
* clear and -1 (all ones) when it is set - test it for non-zero, not
* for equality with 1.
*/
static int test_bit(int nr, volatile const unsigned long *addr)
{
int oldbit;
asm volatile("bt %2,%1\n\t"
"sbb %0,%0"
: "=r" (oldbit)
: "m" (*(unsigned long *)addr), "Ir" (nr));
return oldbit;
}
/*
* Clear bit @nr in the bitmap at @addr with the x86 `btr` instruction
* (no lock prefix, no "memory" clobber).
*/
static void clear_bit(int nr, volatile unsigned long *addr)
{
asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
}
#else /* CONFIG_X86_64 */
# error x86-32 is not implemented yet
#endif /* CONFIG_X86_64 */
#endif /* CR_BITOPS_H_ */

57
include/compiler.h Normal file
View File

@ -0,0 +1,57 @@
#ifndef CR_COMPILER_H_
#define CR_COMPILER_H_
/*
* Various definitions needed for a successful build,
* picked from various places, mostly from
* the Linux kernel.
*/
/* Number of elements of a real array (not of a pointer) */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
/*
* Break the build when @condition is true: the array size becomes
* negative (1 - 2*1), which the compiler rejects. Nothing is evaluated
* at run time.
*/
#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
/*
* Turn the arguments into a string literal. The two-level form lets
* macro arguments be expanded before they are stringified.
*/
#define __stringify_1(x...) #x
#define __stringify(x...) __stringify_1(x)
/* Short names for GCC attributes */
#define NORETURN __attribute__((__noreturn__))
#define __packed __attribute__((__packed__))
#define __used __attribute__((__used__))
/* S is stringified, so pass the section name without quotes */
#define __section(S) __attribute__ ((__section__(#S)))
/* Defined only if the toolchain headers did not provide them already */
#ifndef __always_inline
# define __always_inline inline __attribute__((always_inline))
#endif
#ifndef always_inline
# define always_inline __always_inline
#endif
/* Byte offset of MEMBER inside TYPE (fallback if <stddef.h> is not in use) */
#ifndef offsetof
# define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
/*
* Get the address of the structure of type @type that embeds @member,
* given @ptr, a pointer to that member. The __mptr temporary makes the
* compiler check that @ptr has the member's type.
*/
#define container_of(ptr, type, member) ({ \
const typeof( ((type *)0)->member ) *__mptr = (ptr); \
(type *)( (char *)__mptr - offsetof(type,member) );})
/*
* Rounding helpers based on bit masks: the mask is y - 1 in the type of
* x, so y has to be a power of two for the results to be meaningful.
*/
#define __round_mask(x, y) ((__typeof__(x))((y) - 1))
/* Round x up to the next multiple of y */
#define round_up(x, y) ((((x) - 1) | __round_mask(x, y)) + 1)
/* Round x down to a multiple of y */
#define round_down(x, y) ((x) & ~__round_mask(x, y))
/* n / d rounded up; works for any positive d (same text as in bitops.h) */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
/*
* Type-checked min()/max(). Each argument is evaluated exactly once.
* The otherwise useless pointer comparison makes the compiler complain
* when x and y have different types.
*/
#define min(x, y) ({ \
typeof(x) _min1 = (x); \
typeof(y) _min2 = (y); \
(void) (&_min1 == &_min2); \
_min1 < _min2 ? _min1 : _min2; })
#define max(x, y) ({ \
typeof(x) _max1 = (x); \
typeof(y) _max2 = (y); \
(void) (&_max1 == &_max2); \
_max1 > _max2 ? _max1 : _max2; })
/*
* True when v has at most one bit set. Note that this is also true for
* v == 0, and that v is evaluated twice.
*/
#define is_log2(v) (((v) & ((v) - 1)) == 0)
#endif /* CR_COMPILER_H_ */

105
include/crtools.h Normal file
View File

@ -0,0 +1,105 @@
#ifndef CRTOOLS_H_
#define CRTOOLS_H_
#include <sys/types.h>
#include "types.h"
#include "list.h"
#include "image.h"
/* Defined in crtools.c and cleared by main() */
extern struct page_entry zero_page_entry;
/* Top level commands; main() passes 1 as leave_stopped */
int cr_dump_tasks(pid_t pid, bool leader_only, int leave_stopped);
int cr_restore_tasks(pid_t pid, bool leader_only, int leave_stopped);
/* Print the images of a dump, returns 0 on success */
int cr_show(unsigned long pid, bool leader_only);
/* Build an ELF file at elf_path from an opened core image, 0 on success */
int convert_to_elf(char *elf_path, int fd_core);
/* Mode passed to open() for the image files */
#define CR_FD_PERM 0600
/*
* Image file types. The values index cr_fdset.desc[] and the template
* table in crtools.c, and are the bit numbers used in cr_fdset.use_mask.
*/
enum {
CR_FD_FDINFO,
CR_FD_PAGES,
CR_FD_PAGES_SHMEM,
CR_FD_CORE,
CR_FD_PIPES,
CR_FD_PSTREE,
CR_FD_SHMEM,
CR_FD_MAX
};
/* file descriptors template: static, per image type data */
struct cr_fd_desc_tmpl {
const char *fmt; /* format for the name, takes one long (the pid) */
u32 magic; /* magic in the header */
};
/* file descriptors: one image file of one task */
struct cr_fd_desc {
struct cr_fd_desc_tmpl *tmpl; /* template we refer to */
char name[64]; /* the name, based on pid */
int fd; /* descriptor for open/close, -1 when not open */
};
/* All image descriptors of one task, indexed by CR_FD_* */
struct cr_fdset {
struct cr_fd_desc desc[CR_FD_MAX];
u32 use_mask; /*
* bit CR_FD_DESC_USE(i) is set when
* desc[i] was requested; stored by the
* prep_cr_fdset_for_*() helpers
*/
};
/* use_mask bit of a single image type */
#define CR_FD_DESC_USE(type) ((1 << (type)))
/* use_mask selecting every image type */
#define CR_FD_DESC_ALL ((1 << CR_FD_MAX) - 1)
/* Everything but the pstree image */
#define CR_FD_DESC_NOPSTREE (CR_FD_DESC_ALL & ~(CR_FD_DESC_USE(CR_FD_PSTREE)))
#define CR_FD_DESC_NONE (0)
/* Allocate a set with names generated for pid; nothing is opened. NULL on failure */
struct cr_fdset *alloc_cr_fdset(pid_t pid);
/* Create the files selected by use_mask and write their magic; 0 on success, -1 on error */
int prep_cr_fdset_for_dump(struct cr_fdset *cr_fdset,
unsigned long use_mask);
/* Open the files selected by use_mask and verify their magic; 0 on success, -1 on error */
int prep_cr_fdset_for_restore(struct cr_fdset *cr_fdset,
unsigned long use_mask);
/* Close the opened descriptors of the set; NULL is accepted */
void close_cr_fdset(struct cr_fdset *cr_fdset);
/* Free the set and reset the caller's pointer to NULL; descriptors are not closed */
void free_cr_fdset(struct cr_fdset **cr_fdset);
/* In-memory wrapper around an on-disk vma_entry */
struct vma_area {
struct list_head list; /* list linkage */
struct vma_entry vma; /* the image record itself */
unsigned long shmid;
int vm_file_fd;
};
/*
 * True when every status bit of @s is set in the vma_area's entry.
 * The argument is parenthesized so that expressions such as "p + 1"
 * can be passed.
 */
#define vma_area_has(vma_area, s) vma_entry_has(&(vma_area)->vma, s)
/* Length in bytes of the range described by a vma_entry pointer */
#define vma_entry_len(vma) ((vma)->end - (vma)->start)
/* One record of the pstree image, as built by collect_pstree() */
struct pstree_item {
struct list_head list; /* linkage into pstree_list */
pid_t pid; /* leader pid */
u32 nr_children; /* number of children */
u32 *children; /* array of children, separately allocated */
};
/* Same fields as struct pstree_item plus a launched flag */
struct pstree_item_info {
struct list_head list;
pid_t pid; /* leader pid */
u32 nr_children; /* number of children */
u32 *children; /* array of children */
bool launched; /* set if launched */
};
static inline unsigned long vma_area_size(struct vma_area *vma)
{
return vma->vma.end - vma->vma.start;
}
/* Non-zero when @addr lies inside [start, end) of @vma, 0 otherwise */
static inline int in_vma_area(struct vma_area *vma, unsigned long addr)
{
	unsigned long first = (unsigned long)vma->vma.start;
	unsigned long limit = (unsigned long)vma->vma.end;

	if (addr < first)
		return 0;

	return addr < limit;
}
#endif /* CRTOOLS_H_ */

507
include/elf.h Normal file
View File

@ -0,0 +1,507 @@
#ifndef CR_ELF_H
#define CR_ELF_H
#include "types.h"
/* Segment types (p_type of a program header) */
#define PT_NULL 0
#define PT_LOAD 1
#define PT_DYNAMIC 2
#define PT_INTERP 3
#define PT_NOTE 4
#define PT_SHLIB 5
#define PT_PHDR 6
#define PT_TLS 7
#define PT_LOOS 0x60000000
#define PT_HIOS 0x6fffffff
#define PT_LOPROC 0x70000000
#define PT_HIPROC 0x7fffffff
#define PT_GNU_EH_FRAME 0x6474e550
/*
* Checkpoint segments emitted by convert_to_elf(). The values are
* PT_LOOS plus a fixed offset, which keeps them inside the
* PT_LOOS..PT_HIOS range.
*/
#define PT_CKPT_OFFSET 0x01010101
#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
/* ELF file types (e_type); ET_CKPT is what convert_to_elf() writes */
#define ET_NONE 0
#define ET_REL 1
#define ET_EXEC 2
#define ET_DYN 3
#define ET_CORE 4
#define ET_CKPT 5
#define ET_LOPROC 0xff00
#define ET_HIPROC 0xffff
/* ELF machine types (e_machine) */
#define EM_NONE 0
#define EM_M32 1
#define EM_SPARC 2
#define EM_386 3
#define EM_68K 4
#define EM_88K 5
#define EM_486 6 /* Not used in Linux at least */
#define EM_860 7
#define EM_MIPS 8 /* R3k, bigendian(?) */
#define EM_MIPS_RS4_BE 10 /* R4k BE */
#define EM_PARISC 15
#define EM_SPARC32PLUS 18
#define EM_PPC 20
#define EM_PPC64 21
#define EM_S390 22
#define EM_SH 42
#define EM_SPARCV9 43 /* v9 = SPARC64 */
#define EM_H8_300H 47
#define EM_H8S 48
#define EM_IA_64 50
#define EM_X86_64 62
#define EM_CRIS 76
#define EM_V850 87
#define EM_ALPHA 0x9026 /* Interim Alpha that stuck around */
#define EM_CYGNUS_V850 0x9080 /* Old v850 ID used by Cygnus */
#define EM_S390_OLD 0xA390 /* Obsolete interim value for S/390 */
/* Dynamic type values (d_tag of a dynamic entry) */
#define DT_NULL 0
#define DT_NEEDED 1
#define DT_PLTRELSZ 2
#define DT_PLTGOT 3
#define DT_HASH 4
#define DT_STRTAB 5
#define DT_SYMTAB 6
#define DT_RELA 7
#define DT_RELASZ 8
#define DT_RELAENT 9
#define DT_STRSZ 10
#define DT_SYMENT 11
#define DT_INIT 12
#define DT_FINI 13
#define DT_SONAME 14
#define DT_RPATH 15
#define DT_SYMBOLIC 16
#define DT_REL 17
#define DT_RELSZ 18
#define DT_RELENT 19
#define DT_PLTREL 20
#define DT_DEBUG 21
#define DT_TEXTREL 22
#define DT_JMPREL 23
#define DT_LOPROC 0x70000000
#define DT_HIPROC 0x7fffffff
/* Auxiliary table entries */
#define AT_NULL 0 /* end of vector */
#define AT_IGNORE 1 /* entry should be ignored */
#define AT_EXECFD 2 /* file descriptor of program */
#define AT_PHDR 3 /* program headers for program */
#define AT_PHENT 4 /* size of program header entry */
#define AT_PHNUM 5 /* number of program headers */
#define AT_PAGESZ 6 /* system page size */
#define AT_BASE 7 /* base address of interpreter */
#define AT_FLAGS 8 /* flags */
#define AT_ENTRY 9 /* entry point of program */
#define AT_NOTELF 10 /* program is not ELF */
#define AT_UID 11 /* real uid */
#define AT_EUID 12 /* effective uid */
#define AT_GID 13 /* real gid */
#define AT_EGID 14 /* effective gid */
#define AT_PLATFORM 15 /* string identifying CPU for optimizations */
#define AT_HWCAP 16 /* arch dependent hints at CPU capabilities */
#define AT_CLKTCK 17 /* frequency at which times() increments */
/* 18..22 = ? */
#define AT_SECURE 23 /* secure mode boolean */
/* Program header permission flags (p_flags), OR-ed together */
#define PF_X 0x1
#define PF_W 0x2
#define PF_R 0x4
/* Section header types (sh_type) */
#define SHT_NULL 0
#define SHT_PROGBITS 1
#define SHT_SYMTAB 2
#define SHT_STRTAB 3
#define SHT_RELA 4
#define SHT_HASH 5
#define SHT_DYNAMIC 6
#define SHT_NOTE 7
#define SHT_NOBITS 8
#define SHT_REL 9
#define SHT_SHLIB 10
#define SHT_DYNSYM 11
#define SHT_NUM 12
#define SHT_LOPROC 0x70000000
#define SHT_HIPROC 0x7fffffff
#define SHT_LOUSER 0x80000000
#define SHT_HIUSER 0xffffffff
/* Section header flags (sh_flags) */
#define SHF_WRITE (1 << 0) /* Writable */
#define SHF_ALLOC (1 << 1) /* Occupies memory during execution */
#define SHF_EXECINSTR (1 << 2) /* Executable */
#define SHF_MERGE (1 << 4) /* Might be merged */
#define SHF_STRINGS (1 << 5) /* Contains nul-terminated strings */
#define SHF_INFO_LINK (1 << 6) /* `sh_info' contains SHT index */
#define SHF_LINK_ORDER (1 << 7) /* Preserve order after combining */
#define SHF_OS_NONCONFORMING (1 << 8) /* Non-standard OS specific handling required */
#define SHF_GROUP (1 << 9) /* Section is member of a group. */
#define SHF_TLS (1 << 10) /* Section hold thread-local data. */
/* Special section numbers */
#define SHN_UNDEF 0
#define SHN_LORESERVE 0xff00
#define SHN_LOPROC 0xff00
#define SHN_HIPROC 0xff1f
#define SHN_ABS 0xfff1
#define SHN_COMMON 0xfff2
#define SHN_HIRESERVE 0xffff
/* Section align flag */
#define SHA_ANY 1 /* No alignment constraint */
/* Length of the e_ident[] array at the start of a file */
#define EI_NIDENT 16
/* Magic number constants... */
#define EI_MAG0 0 /* e_ident[] indexes */
#define EI_MAG1 1
#define EI_MAG2 2
#define EI_MAG3 3
#define EI_CLASS 4
#define EI_DATA 5
#define EI_VERSION 6
#define EI_OSABI 7
#define EI_PAD 8
#define ELFMAG0 0x7f /* EI_MAG */
#define ELFMAG1 'E'
#define ELFMAG2 'L'
#define ELFMAG3 'F'
/* The four magic bytes as a string, and their count */
#define ELFMAG "\177ELF"
#define SELFMAG 4
#define ELFCLASSNONE 0 /* EI_CLASS */
#define ELFCLASS32 1
#define ELFCLASS64 2
#define ELFCLASSNUM 3
#define ELFDATANONE 0 /* e_ident[EI_DATA] */
#define ELFDATA2LSB 1
#define ELFDATA2MSB 2
#define EV_NONE 0 /* e_version, EI_VERSION */
#define EV_CURRENT 1
#define EV_NUM 2
#define ELFOSABI_NONE 0
#define ELFOSABI_LINUX 3
/* Legal values for ST_BIND subfield of st_info (symbol binding). */
#define STB_LOCAL 0 /* Local symbol */
#define STB_GLOBAL 1 /* Global symbol */
#define STB_WEAK 2 /* Weak symbol */
#define STB_NUM 3 /* Number of defined types. */
#define STB_LOOS 10 /* Start of OS-specific */
#define STB_HIOS 12 /* End of OS-specific */
#define STB_LOPROC 13 /* Start of processor-specific */
#define STB_HIPROC 15 /* End of processor-specific */
/* Symbol types */
#define STT_NOTYPE 0 /* Symbol type is unspecified */
#define STT_OBJECT 1 /* Symbol is a data object */
#define STT_FUNC 2 /* Symbol is a code object */
#define STT_SECTION 3 /* Symbol associated with a section */
#define STT_FILE 4 /* Symbol's name is file name */
#define STT_COMMON 5 /* Symbol is a common data object */
#define STT_TLS 6 /* Symbol is thread-local data object */
#define STT_NUM 7 /* Number of defined types. */
/* Symbol visibilities */
#define STV_DEFAULT 0 /* Default symbol visibility rules */
#define STV_INTERNAL 1 /* Processor specific hidden class */
#define STV_HIDDEN 2 /* Sym unavailable in other modules */
#define STV_PROTECTED 3 /* Not preemptible, not exported */
/*
 * Both Elf32_Sym and Elf64_Sym use the same one-byte st_info field:
 * the binding lives in the high nibble, the type in the low nibble.
 */
#define ELF32_ST_BIND(i) ((i) >> 4)
#define ELF32_ST_MKBIND(i) ((i) << 4) /* just a helper */
#define ELF32_ST_TYPE(i) ((i) & 0xf)
/* Compose st_info from binding b and type i */
#define ELF32_ST_INFO(b, i) (ELF32_ST_MKBIND(b) + ELF32_ST_TYPE(i))
#define ELF64_ST_BIND(i) ELF32_ST_BIND(i)
#define ELF64_ST_MKBIND(i) ELF32_ST_MKBIND(i)
#define ELF64_ST_TYPE(i) ELF32_ST_TYPE(i)
#define ELF64_ST_INFO(b, i) ELF32_ST_INFO(b, i)
/*
* ELF standard typedefs (yet more proof that <stdint.h> was way overdue)
*
* Built on the fixed-width u16/s16/u32/s32/u64/s64 types. The 32 and 64
* bit flavours differ only in Off and Addr (u32 vs u64).
*/
typedef u16 Elf32_Half;
typedef s16 Elf32_SHalf;
typedef u32 Elf32_Word;
typedef s32 Elf32_Sword;
typedef u64 Elf32_Xword;
typedef s64 Elf32_Sxword;
typedef u32 Elf32_Off;
typedef u32 Elf32_Addr;
typedef u16 Elf32_Section;
typedef u16 Elf64_Half;
typedef s16 Elf64_SHalf;
typedef u32 Elf64_Word;
typedef s32 Elf64_Sword;
typedef u64 Elf64_Xword;
typedef s64 Elf64_Sxword;
typedef u64 Elf64_Off;
typedef u64 Elf64_Addr;
typedef u16 Elf64_Section;
/*
* Dynamic header: a tag (one of the DT_* values) and a value that is
* read either as an integer or as an address.
*/
typedef struct elf32_dyn {
Elf32_Sword d_tag;
union {
Elf32_Sword d_val;
Elf32_Addr d_ptr;
} d_un;
} Elf32_Dyn;
typedef struct elf64_dyn {
Elf64_Sxword d_tag;
union {
Elf64_Xword d_val;
Elf64_Addr d_ptr;
} d_un;
} Elf64_Dyn;
/*
* Relocations
*
* For 32 bit files r_info packs the symbol index into the upper 24 bits
* and the relocation type into the low 8 bits.
*/
#define ELF32_R_SYM(x) ((x) >> 8)
#define ELF32_R_TYPE(x) ((x) & 0xff)
typedef struct elf32_rel {
Elf32_Addr r_offset;
Elf32_Word r_info;
} Elf32_Rel;
typedef struct elf32_rela {
Elf32_Addr r_offset;
Elf32_Word r_info;
Elf32_Sword r_addend;
} Elf32_Rela;
/* i386 relocation types */
enum reloc32_type {
R_386_32 = 1, /* ordinary absolute relocation */
R_386_PC32 = 2, /* PC-relative relocation */
R_386_GOT32 = 3, /* an offset into GOT */
R_386_PLT32 = 4, /* a PC-relative offset into PLT */
R_386_COPY = 5, /* ??? */
R_386_GLOB_DAT = 6, /* ??? */
R_386_JUMP_SLOT = 7, /* ??? */
R_386_RELATIVE = 8, /* ??? */
R_386_GOTOFF = 9, /* an offset from GOT base */
R_386_GOTPC = 10, /* a PC-relative offset _to_ GOT */
R_386_TLS_TPOFF = 14, /* Offset in static TLS block */
R_386_TLS_IE = 15, /* Address of GOT entry for static TLS block offset */
/* These are GNU extensions, but useful */
R_386_16 = 20, /* A 16-bit absolute relocation */
R_386_PC16 = 21, /* A 16-bit PC-relative relocation */
R_386_8 = 22, /* An 8-bit absolute relocation */
R_386_PC8 = 23 /* An 8-bit PC-relative relocation */
};
/*
* For 64 bit files r_info packs the symbol index into the upper 32 bits
* and the relocation type into the low 32 bits.
*/
#define ELF64_R_SYM(x) ((x) >> 32)
#define ELF64_R_TYPE(x) ((x) & 0xffffffff)
typedef struct elf64_rel {
Elf64_Addr r_offset;
Elf64_Xword r_info;
} Elf64_Rel;
typedef struct elf64_rela {
Elf64_Addr r_offset;
Elf64_Xword r_info;
Elf64_Sxword r_addend;
} Elf64_Rela;
/* x86-64 relocation types */
enum reloc64_type {
R_X86_64_NONE = 0, /* No reloc */
R_X86_64_64 = 1, /* Direct 64 bit */
R_X86_64_PC32 = 2, /* PC relative 32 bit signed */
R_X86_64_GOT32 = 3, /* 32 bit GOT entry */
R_X86_64_PLT32 = 4, /* 32 bit PLT address */
R_X86_64_COPY = 5, /* Copy symbol at runtime */
R_X86_64_GLOB_DAT = 6, /* Create GOT entry */
R_X86_64_JUMP_SLOT = 7, /* Create PLT entry */
R_X86_64_RELATIVE = 8, /* Adjust by program base */
R_X86_64_GOTPCREL = 9, /* 32 bit signed PC relative offset to GOT */
R_X86_64_32 = 10, /* Direct 32 bit zero extended */
R_X86_64_32S = 11, /* Direct 32 bit sign extended */
R_X86_64_16 = 12, /* Direct 16 bit zero extended */
R_X86_64_PC16 = 13, /* 16 bit sign extended pc relative */
R_X86_64_8 = 14, /* Direct 8 bit sign extended */
R_X86_64_PC8 = 15, /* 8 bit sign extended pc relative */
R_X86_64_DTPMOD64 = 16, /* ID of module containing symbol */
R_X86_64_DTPOFF64 = 17, /* Offset in module's TLS block */
R_X86_64_TPOFF64 = 18, /* Offset in initial TLS block */
R_X86_64_TLSGD = 19, /* 32 bit signed PC relative offset to two GOT entries for GD symbol */
R_X86_64_TLSLD = 20, /* 32 bit signed PC relative offset to two GOT entries for LD symbol */
R_X86_64_DTPOFF32 = 21, /* Offset in TLS block */
R_X86_64_GOTTPOFF = 22, /* 32 bit signed PC relative offset to GOT entry for IE symbol */
R_X86_64_TPOFF32 = 23, /* Offset in initial TLS block */
R_X86_64_PC64 = 24, /* word64 S + A - P */
R_X86_64_GOTOFF64 = 25, /* word64 S + A - GOT */
R_X86_64_GOTPC32 = 26, /* word32 GOT + A - P */
R_X86_64_GOT64 = 27, /* word64 G + A */
R_X86_64_GOTPCREL64 = 28,/* word64 G + GOT - P + A */
R_X86_64_GOTPC64 = 29, /* word64 GOT - P + A */
R_X86_64_GOTPLT64 = 30, /* word64 G + A */
R_X86_64_PLTOFF64 = 31, /* word64 L - GOT + A */
R_X86_64_SIZE32 = 32, /* word32 Z + A */
R_X86_64_SIZE64 = 33, /* word64 Z + A */
R_X86_64_GOTPC32_TLSDESC = 34, /* word32 */
R_X86_64_TLSDESC_CALL = 35, /* none */
R_X86_64_TLSDESC = 36 /* word64 x 2 */
};
/*
* Symbol
*
* Note that the member order differs between the 32 and the 64 bit
* layout: in Elf64_Sym st_info/st_other/st_shndx come right after
* st_name.
*/
typedef struct elf32_sym {
Elf32_Word st_name;
Elf32_Addr st_value;
Elf32_Word st_size;
unsigned char st_info; /* see ELF32_ST_BIND() / ELF32_ST_TYPE() */
unsigned char st_other;
Elf32_Half st_shndx;
} Elf32_Sym;
typedef struct elf64_sym {
Elf64_Word st_name;
unsigned char st_info; /* see ELF64_ST_BIND() / ELF64_ST_TYPE() */
unsigned char st_other;
Elf64_Half st_shndx;
Elf64_Addr st_value;
Elf64_Xword st_size;
} Elf64_Sym;
/*
* Main file header
*
* convert_to_elf() fills the 64 bit variant: e_ident, e_type,
* e_machine, e_version, e_entry, e_phoff, e_ehsize, e_phentsize and
* e_phnum; everything else stays zero.
*/
typedef struct elf32_hdr {
unsigned char e_ident[EI_NIDENT];
Elf32_Half e_type;
Elf32_Half e_machine;
Elf32_Word e_version;
Elf32_Addr e_entry;
Elf32_Off e_phoff;
Elf32_Off e_shoff;
Elf32_Word e_flags;
Elf32_Half e_ehsize;
Elf32_Half e_phentsize;
Elf32_Half e_phnum;
Elf32_Half e_shentsize;
Elf32_Half e_shnum;
Elf32_Half e_shstrndx;
} Elf32_Ehdr;
typedef struct elf64_hdr {
unsigned char e_ident[EI_NIDENT]; /* magic, class, data encoding, version */
Elf64_Half e_type; /* ET_* */
Elf64_Half e_machine; /* EM_* */
Elf64_Word e_version;
Elf64_Addr e_entry;
Elf64_Off e_phoff; /* file offset of the program headers */
Elf64_Off e_shoff;
Elf64_Word e_flags;
Elf64_Half e_ehsize; /* size of this header */
Elf64_Half e_phentsize; /* size of one program header */
Elf64_Half e_phnum; /* number of program headers */
Elf64_Half e_shentsize;
Elf64_Half e_shnum;
Elf64_Half e_shstrndx;
} Elf64_Ehdr;
/*
* Program header
*
* Note that p_flags sits at a different position in the 32 and the 64
* bit layout.
*/
typedef struct elf32_phdr {
Elf32_Word p_type;
Elf32_Off p_offset;
Elf32_Addr p_vaddr;
Elf32_Addr p_paddr;
Elf32_Word p_filesz;
Elf32_Word p_memsz;
Elf32_Word p_flags;
Elf32_Word p_align;
} Elf32_Phdr;
typedef struct elf64_phdr {
Elf64_Word p_type; /* PT_* */
Elf64_Word p_flags; /* PF_* bits */
Elf64_Off p_offset; /* file offset of the segment contents */
Elf64_Addr p_vaddr;
Elf64_Addr p_paddr;
Elf64_Xword p_filesz; /* bytes of the segment stored in the file */
Elf64_Xword p_memsz;
Elf64_Xword p_align;
} Elf64_Phdr;
/*
* Section headers.
*
* sh_type holds one of the SHT_* values, sh_flags a combination of the
* SHF_* bits.
*/
typedef struct elf32_shdr {
Elf32_Word sh_name;
Elf32_Word sh_type;
Elf32_Word sh_flags;
Elf32_Addr sh_addr;
Elf32_Off sh_offset;
Elf32_Word sh_size;
Elf32_Word sh_link;
Elf32_Word sh_info;
Elf32_Word sh_addralign;
Elf32_Word sh_entsize;
} Elf32_Shdr;
typedef struct elf64_shdr {
Elf64_Word sh_name;
Elf64_Word sh_type;
Elf64_Xword sh_flags;
Elf64_Addr sh_addr;
Elf64_Off sh_offset;
Elf64_Xword sh_size;
Elf64_Word sh_link;
Elf64_Word sh_info;
Elf64_Xword sh_addralign;
Elf64_Xword sh_entsize;
} Elf64_Shdr;
/*
* Note header (identical layout for 32 and 64 bit files)
*/
typedef struct elf32_note {
Elf32_Word n_namesz; /* Name size */
Elf32_Word n_descsz; /* Content size */
Elf32_Word n_type; /* Content type */
} Elf32_Nhdr;
typedef struct elf64_note {
Elf64_Word n_namesz; /* Name size */
Elf64_Word n_descsz; /* Content size */
Elf64_Word n_type; /* Content type */
} Elf64_Nhdr;
#endif /* CR_ELF_H */

191
include/image.h Normal file
View File

@ -0,0 +1,191 @@
#ifndef CR_IMAGE_H
#define CR_IMAGE_H
#include "types.h"
#include "compiler.h"
/*
* Magic values stored as a u32 at the start of every image file (see
* the template table in crtools.c). PIPEFS_MAGIC is not referenced by
* that table; its bytes spell "PIPE" in ASCII.
*/
#define FDINFO_MAGIC 0x01010101
#define PAGES_MAGIC 0x20202020
#define CORE_MAGIC 0xa75b8d43
#define SHMEM_MAGIC 0x03300330
#define PIPEFS_MAGIC 0x50495045
#define PSTREE_MAGIC 0x40044004
#define PIPES_MAGIC 0x05055050
/* presumably the values of fdinfo_entry.type - TODO confirm against the dumper */
#define FDINFO_FD 1
#define FDINFO_MAP 2
/* Size of the data part of a page_entry; main() asserts it equals PAGE_SIZE */
#define PAGE_IMAGE_SIZE 4096
#define PAGE_RSS 1
/*
* Record of the fdinfo image: a fixed, packed header followed by
* variable-length data in name[].
*/
struct fdinfo_entry {
u8 type;
u8 len; /* presumably the length of name[] - TODO confirm */
u16 flags;
u32 pos;
u64 addr;
u8 name[0]; /* trailing data, not counted in sizeof() */
} __packed;
/* Record of the shmem image: an address range and a shmid */
struct shmem_entry {
u64 start;
u64 end;
u64 shmid;
} __packed;
/*
* Record of the pstree image. collect_pstree() reads sizeof() of this
* structure (pid and nr_children only) and then nr_children u32 values
* that follow it in the file.
*/
struct pstree_entry {
u32 pid;
u32 nr_children;
u32 children[0]; /* trailing array, not counted in sizeof() */
} __packed;
/* Record of the pipes image: a fixed header followed by data[] */
struct pipe_entry {
u32 fd;
u32 pipeid;
u32 flags;
u32 bytes; /* presumably the length of data[] - TODO confirm */
u8 data[0]; /* trailing data, not counted in sizeof() */
} __packed;
/* Bits of vma_entry.status */
#define VMA_AREA_REGULAR (1 << 0)
#define VMA_AREA_STACK (1 << 1)
#define VMA_AREA_VSYSCALL (1 << 2)
#define VMA_AREA_VDSO (1 << 3)
#define VMA_FORCE_READ (1 << 4)
#define VMA_AREA_HEAP (1 << 5)
#define VMA_FILE_PRIVATE (1 << 6)
#define VMA_FILE_SHARED (1 << 7)
#define VMA_ANON_SHARED (1 << 8)
#define VMA_ANON_PRIVATE (1 << 9)
#define VMA_FORCE_WRITE (1 << 10)
#define VMA_DUMP_ALL (1 << 11)
/* True when all bits of s are set in the status of the entry vma points to */
#define vma_entry_has(vma, s) (((vma)->status & (s)) == (s))
/*
* One memory area as stored in the core image. convert_to_elf() treats
* an entry with start == 0 and end == 0 as the end of the list.
*/
struct vma_entry {
u64 start; /* first address of the area */
u64 end; /* end address; the length is end - start */
u64 pgoff;
u32 prot; /* tested against PROT_READ/WRITE/EXEC */
u32 flags;
u32 status; /* VMA_* bits, see vma_entry_has() */
u32 pid;
s64 fd;
u64 ino;
u32 dev_maj;
u32 dev_min;
} __packed;
/*
* One dumped page: its virtual address followed by the contents.
* convert_to_elf() stops at the first entry whose va is 0.
*/
struct page_entry {
u64 va;
u8 data[PAGE_IMAGE_SIZE];
} __packed;
/* Values for image_header.version and image_header.arch */
#define HEADER_VERSION 1
#define HEADER_ARCH_X86_64 1
/* First member of struct core_entry */
struct image_header {
u16 version;
u16 arch;
u32 flags;
} __packed;
/*
* PTRACE_GETREGS
* PTRACE_GETFPREGS
* PTRACE_GETFPXREGS dep CONFIG_X86_32
* PTRACE_GET_THREAD_AREA dep CONFIG_X86_32 || CONFIG_IA32_EMULATION
* PTRACE_GETFDPIC dep CONFIG_BINFMT_ELF_FDPIC
*
* PTRACE_ARCH_PRCTL dep CONFIG_X86_64
* ARCH_SET_GS/ARCH_GET_FS
* ARCH_SET_FS/ARCH_GET_GS
*/
#ifdef CONFIG_X86_64
/*
* General purpose registers as stored in the core image; ip is what
* convert_to_elf() uses as the ELF entry point.
* NOTE(review): the member order presumably mirrors what PTRACE_GETREGS
* returns on x86-64 - confirm against the dumper.
*/
struct user_regs_entry {
u64 r15;
u64 r14;
u64 r13;
u64 r12;
u64 bp;
u64 bx;
u64 r11;
u64 r10;
u64 r9;
u64 r8;
u64 ax;
u64 cx;
u64 dx;
u64 si;
u64 di;
u64 orig_ax;
u64 ip;
u64 cs;
u64 flags;
u64 sp;
u64 ss;
u64 fs_base;
u64 gs_base;
u64 ds;
u64 es;
u64 fs;
u64 gs;
} __packed;
/*
* One 8-byte descriptor (core_entry.tls_array holds three of them),
* accessible either as two raw 32 bit words or through the bit fields.
*/
struct desc_struct {
union {
struct {
u32 a;
u32 b;
};
struct {
u16 limit0;
u16 base0;
unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
};
};
} __packed;
/*
* FPU/SSE state as stored in the core image.
* NOTE(review): presumably the layout PTRACE_GETFPREGS returns on
* x86-64 - confirm against the dumper.
*/
struct user_fpregs_entry {
u16 cwd;
u16 swd;
u16 twd; /* Note this is not the same as
the 32bit/x87/FSAVE twd */
u16 fop;
u64 rip;
u64 rdp;
u32 mxcsr;
u32 mxcsr_mask;
u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
u32 padding[24];
} __packed;
/* Number of descriptors kept in core_entry.tls_array */
#define GDT_ENTRY_TLS_ENTRIES 3
/*
* The record that follows the magic in the core image. convert_to_elf()
* expects the vma_entry list right behind it, i.e. at
* GET_FILE_OFF_AFTER(struct core_entry).
*/
struct core_entry {
struct image_header hdr;
struct user_regs_entry gpregs;
struct user_fpregs_entry fpregs;
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
u32 personality;
} __packed;
#endif /* CONFIG_X86_64 */
/*
* Fallback only: compiler.h, included at the top of this file, already
* provides offsetof (with a size_t result) when it is missing.
*/
#ifndef offsetof
# define offsetof(TYPE, MEMBER) ((long) &((TYPE *)0)->MEMBER)
#endif
/*
* There are always 4 magic bytes at the
* beginning of every file.
*/
#define MAGIC_OFFSET (sizeof(u32))
/* File offset of member m of the structure s stored right after the magic */
#define GET_FILE_OFF(s, m) (offsetof(s,m) + MAGIC_OFFSET)
/* File offset of the first byte behind a structure s stored right after the magic */
#define GET_FILE_OFF_AFTER(s) (sizeof(s) + MAGIC_OFFSET)
#endif /* CR_IMAGE_H */

286
include/list.h Normal file
View File

@ -0,0 +1,286 @@
#ifndef CR_LIST_H_
#define CR_LIST_H_
/*
* Double linked lists.
*/
#include "compiler.h"
/*
* Values list_del() stores into the next/prev pointers of a removed
* entry. Note that they are computed with arithmetic on void *.
*/
#define POISON_POINTER_DELTA 0
#define LIST_POISON1 ((void *) 0x00100100 + POISON_POINTER_DELTA)
#define LIST_POISON2 ((void *) 0x00200200 + POISON_POINTER_DELTA)
/*
* List node, meant to be embedded into the structure that is kept on
* the list. An empty list is a head whose pointers refer to itself.
*/
struct list_head {
struct list_head *prev, *next;
};
/* Static initializer for an empty list head called `name` */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
/* Define and initialize an empty list head called `name` */
#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)
static inline void INIT_LIST_HEAD(struct list_head *list)
{
list->next = list;
list->prev = list;
}
static inline void __list_add(struct list_head *new,
struct list_head *prev,
struct list_head *next)
{
next->prev = new;
new->next = next;
new->prev = prev;
prev->next = new;
}
static inline void list_add(struct list_head *new, struct list_head *head)
{
__list_add(new, head, head->next);
}
static inline void list_add_tail(struct list_head *new, struct list_head *head)
{
__list_add(new, head->prev, head);
}
static inline void __list_del(struct list_head * prev, struct list_head * next)
{
next->prev = prev;
prev->next = next;
}
static inline void __list_del_entry(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
}
static inline void list_del(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
entry->next = LIST_POISON1;
entry->prev = LIST_POISON2;
}
static inline void list_replace(struct list_head *old,
struct list_head *new)
{
new->next = old->next;
new->next->prev = new;
new->prev = old->prev;
new->prev->next = new;
}
static inline void list_replace_init(struct list_head *old,
struct list_head *new)
{
list_replace(old, new);
INIT_LIST_HEAD(old);
}
static inline void list_del_init(struct list_head *entry)
{
__list_del_entry(entry);
INIT_LIST_HEAD(entry);
}
static inline void list_move(struct list_head *list, struct list_head *head)
{
__list_del_entry(list);
list_add(list, head);
}
static inline void list_move_tail(struct list_head *list,
struct list_head *head)
{
__list_del_entry(list);
list_add_tail(list, head);
}
static inline int list_is_last(const struct list_head *list,
const struct list_head *head)
{
return list->next == head;
}
static inline int list_is_first(const struct list_head *list,
const struct list_head *head)
{
return list->prev == head;
}
static inline int list_empty(const struct list_head *head)
{
return head->next == head;
}
static inline int list_empty_careful(const struct list_head *head)
{
struct list_head *next = head->next;
return (next == head) && (next == head->prev);
}
static inline void list_rotate_left(struct list_head *head)
{
struct list_head *first;
if (!list_empty(head)) {
first = head->next;
list_move_tail(first, head);
}
}
static inline int list_is_singular(const struct list_head *head)
{
return !list_empty(head) && (head->next == head->prev);
}
static inline void __list_cut_position(struct list_head *list,
struct list_head *head, struct list_head *entry)
{
struct list_head *new_first = entry->next;
list->next = head->next;
list->next->prev = list;
list->prev = entry;
entry->next = list;
head->next = new_first;
new_first->prev = head;
}
static inline void list_cut_position(struct list_head *list,
struct list_head *head, struct list_head *entry)
{
if (list_empty(head))
return;
if (list_is_singular(head) &&
(head->next != entry && head != entry))
return;
if (entry == head)
INIT_LIST_HEAD(list);
else
__list_cut_position(list, head, entry);
}
static inline void __list_splice(const struct list_head *list,
struct list_head *prev,
struct list_head *next)
{
struct list_head *first = list->next;
struct list_head *last = list->prev;
first->prev = prev;
prev->next = first;
last->next = next;
next->prev = last;
}
static inline void list_splice(const struct list_head *list,
struct list_head *head)
{
if (!list_empty(list))
__list_splice(list, head, head->next);
}
static inline void list_splice_tail(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list))
__list_splice(list, head->prev, head);
}
static inline void list_splice_init(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list)) {
__list_splice(list, head, head->next);
INIT_LIST_HEAD(list);
}
}
static inline void list_splice_tail_init(struct list_head *list,
struct list_head *head)
{
if (!list_empty(list)) {
__list_splice(list, head->prev, head);
INIT_LIST_HEAD(list);
}
}
/* Get the structure of 'type' that embeds the list_head 'ptr' as 'member'. */
#define list_entry(ptr, type, member) \
container_of(ptr, type, member)
/* Structure embedding the first node after the head 'ptr'; the list must not be empty. */
#define list_first_entry(ptr, type, member) \
list_entry((ptr)->next, type, member)
/* Walk the raw nodes forward; 'pos' is a struct list_head cursor. */
#define list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)
/* Same expansion as list_for_each(). */
#define __list_for_each(pos, head) \
for (pos = (head)->next; pos != (head); pos = pos->next)
/* Walk the raw nodes backward. */
#define list_for_each_prev(pos, head) \
for (pos = (head)->prev; pos != (head); pos = pos->prev)
/* Forward walk that reads the next node into 'n' before the body runs, so 'pos' may be unlinked. */
#define list_for_each_safe(pos, n, head) \
for (pos = (head)->next, n = pos->next; pos != (head); \
pos = n, n = pos->next)
/* Backward variant of list_for_each_safe(). */
#define list_for_each_prev_safe(pos, n, head) \
for (pos = (head)->prev, n = pos->prev; \
pos != (head); \
pos = n, n = pos->prev)
/* Walk the embedding structures forward; 'pos' is a pointer to the embedding type. */
#define list_for_each_entry(pos, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))
/* Walk the embedding structures backward. */
#define list_for_each_entry_reverse(pos, head, member) \
for (pos = list_entry((head)->prev, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry(pos->member.prev, typeof(*pos), member))
/* Yield 'pos' if it is non-NULL, otherwise a pseudo entry computed from the head itself. */
#define list_prepare_entry(pos, head, member) \
((pos) ? : list_entry(head, typeof(*pos), member))
/* Continue a forward walk with the entry after the current 'pos'. */
#define list_for_each_entry_continue(pos, head, member) \
for (pos = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))
/* Continue a backward walk with the entry before the current 'pos'. */
#define list_for_each_entry_continue_reverse(pos, head, member) \
for (pos = list_entry(pos->member.prev, typeof(*pos), member); \
&pos->member != (head); \
pos = list_entry(pos->member.prev, typeof(*pos), member))
/* Forward walk starting at the current 'pos' itself. */
#define list_for_each_entry_from(pos, head, member) \
for (; &pos->member != (head); \
pos = list_entry(pos->member.next, typeof(*pos), member))
/* Entry walk that keeps the following entry in 'n', so 'pos' may be unlinked in the body. */
#define list_for_each_entry_safe(pos, n, head, member) \
for (pos = list_entry((head)->next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))
/* Removal-safe walk that starts with the entry after the current 'pos'. */
#define list_for_each_entry_safe_continue(pos, n, head, member) \
for (pos = list_entry(pos->member.next, typeof(*pos), member), \
n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))
/* Removal-safe walk that starts at the current 'pos' itself. */
#define list_for_each_entry_safe_from(pos, n, head, member) \
for (n = list_entry(pos->member.next, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.next, typeof(*n), member))
/* Removal-safe backward walk over the entries. */
#define list_for_each_entry_safe_reverse(pos, n, head, member) \
for (pos = list_entry((head)->prev, typeof(*pos), member), \
n = list_entry(pos->member.prev, typeof(*pos), member); \
&pos->member != (head); \
pos = n, n = list_entry(n->member.prev, typeof(*n), member))
/* Re-read 'n' as the entry following 'pos' (for use inside a safe walk). */
#define list_safe_reset_next(pos, n, member) \
n = list_entry(pos->member.next, typeof(*pos), member)
#endif /* CR_LIST_H_ */

View File

@ -0,0 +1,46 @@
#ifndef PARASITE_SYSCALL_H_
#define PARASITE_SYSCALL_H_
#include <sys/types.h>
#include <sys/mman.h>
#include "compiler.h"
#include "types.h"
#include "list.h"
#include "crtools.h"
/*
 * NOTE(review): not used anywhere in this header; presumably the number
 * of bytes reserved for an injected syscall instruction -- confirm
 * against the implementation.
 */
#define BUILTIN_SYSCALL_SIZE 8
/* parasite control block */
struct parasite_ctl {
pid_t pid; /* process where we live */
struct vma_area *vma_area; /* our space */
unsigned long parasite_ip; /* service routine start ip */
unsigned long parasite_complete_ip; /* where we end execution */
unsigned long addr_cmd; /* addr for command */
unsigned long addr_args; /* address for arguments */
};
/*
 * Entry points operating on a task identified by 'pid'.
 * NOTE(review): only the declarations are visible here; the per-function
 * notes below are inferred from names and signatures and need to be
 * confirmed against the definitions.
 */
/* Presumably tests 'ip' against the [start, end) range -- confirm. */
int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end);
/* mmap()-shaped call taking the target pid and a register set. */
void *mmap_seized(pid_t pid, user_regs_struct_t *regs,
void *addr, size_t length, int prot,
int flags, int fd, off_t offset);
/* munmap()-shaped counterpart of mmap_seized(). */
int munmap_seized(pid_t pid, user_regs_struct_t *regs,
void *addr, size_t length);
int kill_seized(pid_t pid, user_regs_struct_t *where);
/* Generic form: three register sets named where / params / result. */
int syscall_seized(pid_t pid,
user_regs_struct_t *where,
user_regs_struct_t *params,
user_regs_struct_t *result);
/* Page dump driven through a parasite control block and a VMA list. */
int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list,
struct cr_fdset *cr_fdset, int fd_type);
/* Returns a parasite control block for 'pid'. */
struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list);
/* Takes the control block by address (pointer to pointer). */
int parasite_cure_seized(struct parasite_ctl **p_ctrl, struct list_head *vma_area_list);
#endif /* PARASITE_SYSCALL_H_ */

68
include/parasite.h Normal file
View File

@ -0,0 +1,68 @@
#ifndef CR_PARASITE_H_
#define CR_PARASITE_H_
#include "compiler.h"
#include "syscall.h"
#include "image.h"
/*
 * Attributes that place an object into one of the dedicated parasite
 * sections and mark it as used.
 */
#define __parasite_head __used __section(.parasite.head.text)
#define __parasite_text __used __section(.parasite.text)
#define __parasite_stack __used __section(.parasite.stack)
/*
 * Size constants for the parasite blob.
 * NOTE(review): the units are presumably bytes; the consumers are not
 * visible in this header.
 */
#define PARASITE_STACK_SIZE 2048
#define PARASITE_ARG_SIZE 256
#define PARASITE_BRK_SIZE 32768
#define PARASITE_MAX_SIZE (64 << 10)
/*
 * We need our own error codes for diagnostics.
 *
 * The values are parenthesized so that each macro expands to a single
 * primary expression regardless of the context it is pasted into, in
 * line with PARASITE_MAX_SIZE.
 */
#define PARASITE_ERR_FAIL	(-1024)
#define PARASITE_ERR_OPEN	(-1025)
#define PARASITE_ERR_MMAP	(-1026)
#define PARASITE_ERR_MINCORE	(-1027)
#define PARASITE_ERR_MUNMAP	(-1028)
#define PARASITE_ERR_CLOSE	(-1029)
#define PARASITE_ERR_WRITE	(-1030)
#define PARASITE_ERR_MPROTECT	(-1031)
#define PARASITE_ERR_CORE_VMA	(-1032)
#define PARASITE_ERR_CORE_PAGE	(-1033)
/*
 * Command codes, numbered consecutively from 0.
 * PARASITE_CMD_MAX is not a command: it is one past the last valid code.
 */
enum {
PARASITE_CMD_NONE,
PARASITE_CMD_KILLME,
PARASITE_CMD_PINGME,
PARASITE_CMD_DUMPPAGES,
PARASITE_CMD_RESTORECORE,
PARASITE_CMD_MAX,
};
/*
 * Generic command descriptor: a command code plus an argument buffer
 * and its size.
 * NOTE(review): 'command' presumably takes the PARASITE_CMD_* values --
 * confirm at the users.
 */
typedef struct {
unsigned long command;
unsigned long args_size;
void *args;
} parasite_args_t;
/*
 * Argument block of the page-dump command: the VMA to work on, a
 * result counter, and the parameters of the file the pages go to.
 */
typedef struct {
struct vma_entry vma_entry;
unsigned long nrpages_dumped; /* how many pages are dumped */
unsigned long fd;
unsigned long open_mode;
unsigned long open_flags;
char open_path[64]; /* fixed 64-byte buffer for the path */
} parasite_args_cmd_dumppages_t;
/*
 * Some useful offsets
 *
 * Each macro adds a parasite_blob_offset__* constant to 'start'.
 * NOTE(review): those constants are not defined in this header;
 * presumably they come from the generated blob header -- confirm.
 */
#define PARASITE_ARGS_ADDR(start) \
((start) + parasite_blob_offset__parasite_args)
#define PARASITE_CMD_ADDR(start) \
((start) + parasite_blob_offset__parasite_cmd)
#define PARASITE_HEAD_ADDR(start) \
((start) + parasite_blob_offset__parasite_head_start)
#define PARASITE_COMPLETE_ADDR(start) \
((start) + parasite_blob_offset__parasite_service_complete)
#endif /* CR_PARASITE_H_ */

79
include/rbtree.h Normal file
View File

@ -0,0 +1,79 @@
/*
* RBtree implementation adopted from the Linux
* kernel sources.
*/
#ifndef _LINUX_RBTREE_H
#define _LINUX_RBTREE_H
#include <stddef.h>
/* Node colours, kept in bit 0 of rb_parent_color. */
#define RB_RED		0
#define RB_BLACK	1
/* Low bits of rb_parent_color that do not belong to the parent pointer. */
#define RB_COLOR_MASK	3

/*
 * Tree node. rb_parent_color packs the parent pointer together with the
 * colour bit; the alignment attribute keeps the low pointer bits free.
 */
struct rb_node {
	unsigned long rb_parent_color;
	struct rb_node *rb_right;
	struct rb_node *rb_left;
} __attribute__((aligned(sizeof(long))));

/* Tree root: a pointer to the top node, NULL while the tree is empty. */
struct rb_root {
	struct rb_node *rb_node;
};

/* Parent pointer of r with the colour bits masked off. */
#define rb_parent(r)	((struct rb_node *)((r)->rb_parent_color & ~RB_COLOR_MASK))
/* Colour bit of r: RB_RED (0) or RB_BLACK (1). */
#define rb_color(r)	((r)->rb_parent_color & RB_BLACK)
#define rb_is_red(r)	(!rb_color(r))
#define rb_is_black(r)	rb_color(r)
#define rb_set_red(r)	do { (r)->rb_parent_color &= ~RB_BLACK; } while (0)
#define rb_set_black(r)	do { (r)->rb_parent_color |= RB_BLACK; } while (0)

/* Replace the parent pointer of rb, keeping its colour bits. */
static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
{
	rb->rb_parent_color = (rb->rb_parent_color & RB_COLOR_MASK) | (unsigned long)p;
}

/* Replace the colour bit of rb, keeping everything else. */
static inline void rb_set_color(struct rb_node *rb, int color)
{
	rb->rb_parent_color = (rb->rb_parent_color & ~RB_BLACK) | color;
}

#define RB_ROOT	(struct rb_root) { NULL, }
/* container_of() has to be provided by the including file. */
#define rb_entry(ptr, type, member) \
	container_of(ptr, type, member)
#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
/*
 * A node that is its own parent counts as "not in any tree".
 * The argument is parenthesized on every use so that expressions such
 * as "cond ? a : b" are compared as a whole.
 */
#define RB_EMPTY_NODE(node)	(rb_parent(node) == (node))
#define RB_CLEAR_NODE(node)	(rb_set_parent((node), (node)))

/* Reset rb to the cleared state: red, no children, parent = itself. */
static inline void rb_init_node(struct rb_node *rb)
{
	rb->rb_parent_color = 0;
	rb->rb_right = NULL;
	rb->rb_left = NULL;
	RB_CLEAR_NODE(rb);
}
/*
 * Out-of-line tree operations; only the declarations live in this
 * header.
 * NOTE(review): per the file comment these are adopted from the Linux
 * kernel rbtree, so the kernel semantics presumably apply -- confirm
 * against the accompanying rbtree.c.
 */
void rb_insert_color(struct rb_node *, struct rb_root *);
void rb_erase(struct rb_node *, struct rb_root *);
/* Neighbour lookups starting from a node. */
struct rb_node *rb_next(const struct rb_node *node);
struct rb_node *rb_prev(const struct rb_node *node);
/* End-point lookups starting from a root. */
struct rb_node *rb_first(const struct rb_root *node);
struct rb_node *rb_last(const struct rb_root *node);
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
struct rb_root *root);
/*
 * Attach 'node' as a leaf: record 'parent' (colour bits zero, i.e.
 * RB_RED), clear both child links and store the node into the slot
 * that '*rb_link' designates.
 */
static inline void rb_link_node(struct rb_node *node, struct rb_node *parent,
				struct rb_node **rb_link)
{
	node->rb_parent_color = (unsigned long)parent;
	node->rb_right = NULL;
	node->rb_left = NULL;

	*rb_link = node;
}
#endif /* _LINUX_RBTREE_H */

181
include/syscall.h Normal file
View File

@ -0,0 +1,181 @@
#ifndef CR_SYSCALL_H_
#define CR_SYSCALL_H_
#include <sys/types.h>
#include "compiler.h"
#ifdef CONFIG_X86_64
/*
 * Issue system call 'nr' with no arguments and return the raw kernel
 * result (a negative errno value on failure).
 *
 * The SYSCALL instruction overwrites rcx and r11, so both are listed
 * as clobbers; otherwise the compiler may keep live values in them
 * across the instruction.
 */
static long syscall0(int nr)
{
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with one argument (passed in rdi) and return
 * the raw kernel result (a negative errno value on failure).
 *
 * rcx and r11 are overwritten by the SYSCALL instruction and are
 * therefore declared as clobbered.
 */
static long syscall1(int nr, unsigned long arg0)
{
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with two arguments (rdi, rsi) and return the
 * raw kernel result (a negative errno value on failure).
 *
 * rcx and r11 are overwritten by the SYSCALL instruction and are
 * therefore declared as clobbered.
 */
static long syscall2(int nr, unsigned long arg0, unsigned long arg1)
{
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0), "S" (arg1)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with three arguments (rdi, rsi, rdx) and
 * return the raw kernel result (a negative errno value on failure).
 *
 * rcx and r11 are overwritten by the SYSCALL instruction and are
 * therefore declared as clobbered.
 */
static long syscall3(int nr, unsigned long arg0, unsigned long arg1,
		     unsigned long arg2)
{
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with four arguments (rdi, rsi, rdx, r10) and
 * return the raw kernel result (a negative errno value on failure).
 *
 * The fourth argument has no single-letter constraint, so it is bound
 * to r10 through a local register variable. That variable must be an
 * operand of the asm statement, otherwise the compiler is free to drop
 * the store or to reuse r10. rcx and r11 are overwritten by the
 * SYSCALL instruction and are therefore declared as clobbered.
 */
static long syscall4(int nr, unsigned long arg0, unsigned long arg1,
		     unsigned long arg2, unsigned long arg3)
{
	register unsigned long r10 __asm__("r10") = arg3;
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
			       "r" (r10)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with five arguments (rdi, rsi, rdx, r10, r8)
 * and return the raw kernel result (a negative errno value on
 * failure).
 *
 * Arguments four and five are bound to r10 and r8 through local
 * register variables, which are passed to the asm statement as inputs
 * so that the compiler really loads them. rcx and r11 are overwritten
 * by the SYSCALL instruction and are therefore declared as clobbered.
 */
static long syscall5(int nr, unsigned long arg0, unsigned long arg1,
		     unsigned long arg2, unsigned long arg3,
		     unsigned long arg4)
{
	register unsigned long r10 __asm__("r10") = arg3;
	register unsigned long r8 __asm__("r8") = arg4;
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
			       "r" (r10), "r" (r8)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * Issue system call 'nr' with six arguments (rdi, rsi, rdx, r10, r8,
 * r9) and return the raw kernel result (a negative errno value on
 * failure).
 *
 * Arguments four to six are bound to r10, r8 and r9 through local
 * register variables, which are passed to the asm statement as inputs
 * so that the compiler really loads them. rcx and r11 are overwritten
 * by the SYSCALL instruction and are therefore declared as clobbered.
 */
static long syscall6(int nr, unsigned long arg0, unsigned long arg1,
		     unsigned long arg2, unsigned long arg3,
		     unsigned long arg4, unsigned long arg5)
{
	register unsigned long r10 __asm__("r10") = arg3;
	register unsigned long r8 __asm__("r8") = arg4;
	register unsigned long r9 __asm__("r9") = arg5;
	long ret;

	__asm__ __volatile__("syscall"
			     : "=a" (ret)
			     : "a" ((long)nr), "D" (arg0), "S" (arg1), "d" (arg2),
			       "r" (r10), "r" (r8), "r" (r9)
			     : "rcx", "r11", "memory");
	return ret;
}
/*
 * syscall codes
 *
 * These sit inside the CONFIG_X86_64 branch of this header: the
 * numbers are the x86-64 ones and are passed as 'nr' to the syscallN()
 * helpers.
 */
#define __NR_read 0
#define __NR_write 1
#define __NR_open 2
#define __NR_close 3
#define __NR_lseek 8
#define __NR_mmap 9
#define __NR_mprotect 10
#define __NR_munmap 11
#define __NR_mincore 27
#define __NR_dup 32
#define __NR_dup2 33
#define __NR_pause 34
#define __NR_nanosleep 35
#define __NR_getpid 39
#define __NR_exit 60
/*
 * Typed front ends for the raw syscallN() helpers. Each one converts
 * its arguments to unsigned long and hands back the kernel's return
 * value unchanged; no errno translation takes place.
 */

/* pause: no arguments. */
static unsigned long sys_pause(void)
{
	long rc = syscall0(__NR_pause);

	return rc;
}

/* mmap: all six arguments are forwarded in order. */
static unsigned long sys_mmap(void *addr, unsigned long len, unsigned long prot,
			      unsigned long flags, unsigned long fd, unsigned long offset)
{
	unsigned long where = (unsigned long)addr;
	long rc = syscall6(__NR_mmap, where, len, prot, flags, fd, offset);

	return rc;
}

/* munmap: address and length. */
static unsigned long sys_munmap(void *addr,unsigned long len)
{
	unsigned long where = (unsigned long)addr;
	long rc = syscall2(__NR_munmap, where, len);

	return rc;
}

/* open: path, flags and mode. */
static long sys_open(const char *filename, unsigned long flags, unsigned long mode)
{
	unsigned long path = (unsigned long)filename;

	return syscall3(__NR_open, path, flags, mode);
}

/* close: the int descriptor is widened to unsigned long. */
static long sys_close(int fd)
{
	unsigned long descriptor = fd;

	return syscall1(__NR_close, descriptor);
}

/* write: descriptor, buffer and byte count. */
static long sys_write(unsigned long fd, const void *buf, unsigned long count)
{
	unsigned long data = (unsigned long)buf;

	return syscall3(__NR_write, fd, data, count);
}

/* mincore: address, size and the result vector. */
static long sys_mincore(unsigned long addr, unsigned long size, void *vec)
{
	unsigned long result_vec = (unsigned long)vec;

	return syscall3(__NR_mincore, addr, size, result_vec);
}

/* lseek: descriptor, offset and origin. */
static long sys_lseek(unsigned long fd, unsigned long offset, unsigned long origin)
{
	long rc = syscall3(__NR_lseek, fd, offset, origin);

	return rc;
}

/* mprotect: start address, length and protection bits. */
static long sys_mprotect(unsigned long start, unsigned long len, unsigned long prot)
{
	long rc = syscall3(__NR_mprotect, start, len, prot);

	return rc;
}

/* nanosleep: requested and remaining time. */
static long sys_nanosleep(struct timespec *req, struct timespec *rem)
{
	unsigned long requested = (unsigned long)req;
	unsigned long remaining = (unsigned long)rem;

	return syscall2(__NR_nanosleep, requested, remaining);
}

/* read: descriptor, buffer and byte count. */
static long sys_read(unsigned long fd, void *buf, unsigned long count)
{
	unsigned long data = (unsigned long)buf;

	return syscall3(__NR_read, fd, data, count);
}
#else /* CONFIG_X86_64 */
# error x86-32 bit mode not yet implemented
#endif /* CONFIG_X86_64 */
#endif /* CR_SYSCALL_H_ */

132
include/types.h Normal file
View File

@ -0,0 +1,132 @@
#ifndef CR_TYPES_H_
#define CR_TYPES_H_
#include <stdint.h>
#include <stdbool.h>
#include "bitops.h"
/*
 * some constants for ptrace
 *
 * Defined unconditionally here (no #ifndef guards), so they will clash
 * with any system header that provides the same names.
 */
/* Request codes. */
#define PTRACE_SEIZE 0x4206
#define PTRACE_INTERRUPT 0x4207
#define PTRACE_LISTEN 0x4208
#define PTRACE_SEIZE_DEVEL 0x80000000
/* Event codes. */
#define PTRACE_EVENT_FORK 1
#define PTRACE_EVENT_VFORK 2
#define PTRACE_EVENT_CLONE 3
#define PTRACE_EVENT_EXEC 4
#define PTRACE_EVENT_VFORK_DONE 5
#define PTRACE_EVENT_EXIT 6
#define PTRACE_EVENT_STOP 7
/* Option bits; each value is a distinct single bit. */
#define PTRACE_O_TRACESYSGOOD 0x00000001
#define PTRACE_O_TRACEFORK 0x00000002
#define PTRACE_O_TRACEVFORK 0x00000004
#define PTRACE_O_TRACECLONE 0x00000008
#define PTRACE_O_TRACEEXEC 0x00000010
#define PTRACE_O_TRACEVFORKDONE 0x00000020
#define PTRACE_O_TRACEEXIT 0x00000040
/* fcntl */
/* Each fcntl name below is only defined when nothing has defined it yet. */
#ifndef F_LINUX_SPECIFIC_BASE
#define F_LINUX_SPECIFIC_BASE 1024
#endif
#ifndef F_SETPIPE_SZ
# define F_SETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 7)
#endif
#ifndef F_GETPIPE_SZ
# define F_GETPIPE_SZ (F_LINUX_SPECIFIC_BASE + 8)
#endif
/* clone flag bits; unlike the fcntl names these are not guarded. */
#define CLONE_CHILD_USEPID 0x02000000
#define CLONE_VFORK 0x00004000
/*
 * Short integer type names. The 64-bit ones come from <stdint.h>; the
 * narrower ones are spelled with built-in types and therefore rely on
 * int being 32 bits and short being 16 bits on the target.
 */
typedef uint64_t u64;
typedef int64_t s64;
typedef unsigned int u32;
typedef signed int s32;
typedef unsigned short u16;
typedef signed short s16;
typedef unsigned char u8;
typedef signed char s8;
/* Everything above the low 8 bits of 'dev'. */
#define MAJOR(dev) ((dev)>>8)
#ifdef CONFIG_X86_64
/*
 * General-purpose register block, x86-64 variant.
 * NOTE(review): the layout presumably has to match what ptrace returns
 * for the register set (struct user_regs_struct) -- confirm before
 * changing the member order.
 */
typedef struct {
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
unsigned long r11;
unsigned long r10;
unsigned long r9;
unsigned long r8;
unsigned long ax;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
unsigned long fs_base;
unsigned long gs_base;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
} user_regs_struct_t;
/*
 * FPU/SSE register block, x86-64 variant; the members add up to 512
 * bytes. Only defined when CONFIG_X86_64 is set -- the other branch of
 * this header has no counterpart.
 */
typedef struct {
unsigned short cwd;
unsigned short swd;
unsigned short twd; /* Note this is not the same as
the 32bit/x87/FSAVE twd */
unsigned short fop;
u64 rip;
u64 rdp;
u32 mxcsr;
u32 mxcsr_mask;
u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
u32 padding[24];
} user_fpregs_struct_t;
#else /* CONFIG_X86_64 */
/*
 * General-purpose register block used when CONFIG_X86_64 is not set
 * (17 registers).
 * NOTE(review): presumably mirrors the 32-bit ptrace register layout --
 * confirm before changing the member order.
 */
typedef struct {
unsigned long bx;
unsigned long cx;
unsigned long dx;
unsigned long si;
unsigned long di;
unsigned long bp;
unsigned long ax;
unsigned long ds;
unsigned long es;
unsigned long fs;
unsigned long gs;
unsigned long orig_ax;
unsigned long ip;
unsigned long cs;
unsigned long flags;
unsigned long sp;
unsigned long ss;
} user_regs_struct_t;
#endif /* CONFIG_X86_64 */
/* Fallback page size, used only when nothing else has defined PAGE_SIZE. */
#ifndef PAGE_SIZE
# define PAGE_SIZE 4096
#endif
#endif /* CR_TYPES_H_ */

178
include/util.h Normal file
View File

@ -0,0 +1,178 @@
#ifndef UTIL_H_
#define UTIL_H_
/*
* Some bits are stolen from perf and kvm tools
*/
#include <string.h>
#include <stdlib.h>
#include <signal.h>
#include <sys/types.h>
#include "compiler.h"
#include "types.h"
extern void printk(const char *format, ...);
#define pr_info(fmt, ...) printk(fmt, ##__VA_ARGS__)
#define pr_error(fmt, ...) printk("Error (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
#define pr_panic(fmt, ...) printk("PANIC (%s:%d): " fmt, __FILE__, __LINE__, ##__VA_ARGS__)
#define pr_warning(fmt, ...) printk("Warning: " fmt, ##__VA_ARGS__)
/* Print the current source location, then jump to 'label'. */
#define pr_error_jmp(label) \
do { \
printk("EJMP: %s:%d\n", __FILE__, __LINE__); \
goto label; \
} while (0)
/* Jump to 'label' (reporting the location) when 'code' is non-zero. */
#define jerr(code, label) \
do { \
if ((code)) \
pr_error_jmp(label); \
} while (0)
/*
 * Like jerr(), but the test is "(code) cond", where 'cond' is pasted in
 * textually, e.g. jerr_cond(fd, < 0, err).
 */
#define jerr_cond(code, cond, label) \
do { \
if ((code) cond) \
pr_error_jmp(label); \
} while (0)
/* Store 'code' into 'rc' first, then jump to 'label' when it is non-zero. */
#define jerr_rc(code, rc, label) \
do { \
rc = (code); \
if (rc) \
pr_error_jmp(label); \
} while (0)
/*
 * Debug output is compiled out: the "#if 0" branch holds the real
 * implementation (message prefixed with function, file and line) and
 * the active branch expands to nothing, so the arguments of a
 * pr_debug() call are never evaluated.
 */
#if 0
#define pr_debug(fmt, ...) \
do { \
printk("%s (%s:%d): " fmt, \
__func__, __FILE__, __LINE__, \
##__VA_ARGS__); \
} while (0)
#else
#define pr_debug(fmt, ...)
#endif
/* Print the message with its source location and terminate with exit status 1. */
#define die(fmt, ...) \
do { \
printk("die (%s:%d): " fmt, __FILE__, \
__LINE__, ##__VA_ARGS__); \
exit(1); \
} while (0)
/*
 * pr_error() with the text for the current errno value put in front of
 * the message. The expansion uses 'errno', which none of the system
 * headers included directly by this file declares, so <errno.h> has to
 * be in scope at the point of use.
 */
#define pr_perror(fmt, ...) \
do { \
pr_error("%s: " fmt, strerror(errno), \
##__VA_ARGS__); \
} while (0)
/* Send SIGSTOP / SIGCONT to 'pid'; the value is whatever kill() returns. */
#define stop_task(pid) kill(pid, SIGSTOP)
#define continue_task(pid) kill(pid, SIGCONT)
/*
 * Thin wrappers around read()/write(). The "_safe" forms treat
 * anything other than a complete transfer as an error and jump to the
 * 'err' label (through jerr(), which also prints the location).
 */
/* Write the object 'ptr' points at; evaluates to write()'s result. */
#define write_ptr(fd, ptr) \
write(fd, (ptr), sizeof(*(ptr)))
/* write_ptr() that jumps to 'err' unless the whole object was written. */
#define write_ptr_safe(fd, ptr, err) \
jerr(write_ptr(fd, ptr) != sizeof(*(ptr)), err)
/* Write 'size' bytes from 'ptr'; jump to 'err' on a short or failed write. */
#define write_safe(fd, ptr, size, err) \
jerr(write(fd, (ptr), (size)) != (size), err)
/* Write the value 'imm' itself, via a temporary of the same type. */
#define write_safe_imm(fd, imm, err) \
do { \
typeof(imm) x__ = imm; \
write_ptr_safe(fd, &x__, err); \
} while (0)
/* Read exactly 'size' bytes into 'ptr'; jump to 'err' otherwise. */
#define read_safe(fd, ptr, size, err) \
jerr(read(fd, ptr, (size)) != (size), err)
/* Read exactly one object into '*ptr'; jump to 'err' otherwise. */
#define read_ptr_safe(fd, ptr, err) \
jerr(read(fd, ptr, sizeof(*(ptr))) != sizeof(*(ptr)), err)
/*
 * Read 'size' bytes and keep read()'s result in 'rc': a result of 0
 * jumps to 'eof', any other result different from 'size' jumps to
 * 'err'. Unlike the macros above, nothing is printed on the way.
 */
#define read_safe_eof(fd, ptr, size, rc, err, eof) \
do { \
rc = read(fd, ptr, (size)); \
if (!rc) \
goto eof; \
if (rc != (size)) \
goto err; \
} while (0)
/* read_safe_eof() sized by the object 'ptr' points at. */
#define read_ptr_safe_eof(fd, ptr, rc, err, eof) \
read_safe_eof(fd, ptr, sizeof(*(ptr)), rc, err, eof)
/*
 * Helpers defined outside this header.
 * NOTE(review): the notes below are inferred from the names and
 * signatures only -- confirm against the definitions.
 */
/* Presumably copy 'bytes' bytes between 'pid' at 'addr' and a local buffer. */
int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes);
int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes);
int ptrace_show_area(pid_t pid, void *addr, long bytes);
int ptrace_show_area_r(pid_t pid, void *addr, long bytes);
int seize_task(pid_t pid);
int unseize_task(pid_t pid);
/* Printers for the corresponding structures. */
void printk_registers(user_regs_struct_t *regs);
void printk_siginfo(siginfo_t *siginfo);
/* Forward declarations: the prototypes here only need pointers to these. */
struct vma_area;
struct list_head;
void printk_vma(struct vma_area *vma_area);
/* A special marker: an entry whose start and end are both zero. */
#define is_ending_vma(vma) ((vma)->start == 0 && (vma)->end == 0)
/*
 * Print every vma_area on the list 'head' (linked through its 'list'
 * member). The expansion declares a local called 'vma', so the 'head'
 * expression must not refer to a caller variable of that name.
 */
#define pr_info_vma_list(head) \
do { \
struct vma_area *vma; \
list_for_each_entry(vma, head, list) \
pr_info_vma(vma); \
} while (0)
/*
 * Allocate a zero-filled struct vma_area and set shmid, vm_file_fd and
 * vma.fd to -1. Evaluates to the new object, or to NULL when the
 * allocation failed (xzalloc() has already printed an error then).
 */
#define alloc_vma_area() \
({ \
struct vma_area *p__ = xzalloc(sizeof(*p__)); \
if (p__) { \
p__->shmid = -1; \
p__->vm_file_fd = -1; \
p__->vma.fd = -1; \
} \
p__; \
})
/* pr_info-style names for the printk_*() printers. */
#define pr_info_vma(vma_area) printk_vma(vma_area)
#define pr_info_registers(regs) printk_registers(regs)
#define pr_info_siginfo(siginfo) printk_siginfo(siginfo)
/* NOTE(review): defined elsewhere; semantics not visible in this header. */
int reopen_fd_as(int new_fd, int old_fd);
/* NOTE(review): presumably fills 'vma_list' with the mappings of 'pid' -- confirm. */
int parse_maps(pid_t pid, struct list_head *vma_list);
/*
 * Common body of the x*alloc() wrappers: call allocator 'op' with the
 * remaining arguments and print an error naming the calling function
 * and 'size' when it returns NULL. Evaluates to the allocator's
 * result, so callers still have to check for NULL.
 */
#define __xalloc(op, size, ...) \
({ \
void *___p = op( __VA_ARGS__ ); \
if (!___p) \
pr_error("%s: Can't allocate %li bytes\n", \
__func__, (long)(size)); \
___p; \
})
/* malloc() with the error report. */
#define xmalloc(size) __xalloc(malloc, size, size)
/* calloc(1, size) with the error report: zero-filled memory. */
#define xzalloc(size) __xalloc(calloc, size, 1, size)
/* realloc() with the error report; 'p' stays valid on failure. */
#define xrealloc(p, size) __xalloc(realloc, size, p, size)
/*
 * Release memory obtained from the x*alloc() helpers.
 *
 * free(NULL) is defined to do nothing, so no NULL test is needed.
 * Expanding to a single call also makes the macro behave like one
 * statement: the former bare "if (p) free(p)" captured a following
 * "else" when used inside an if/else.
 */
#define xfree(p) free(p)
/*
 * Resize the buffer that '*pptr' refers to. On success the new address
 * is stored back through 'pptr' and the macro evaluates to 0; on
 * failure '*pptr' is left untouched (still valid) and the macro
 * evaluates to -1.
 *
 * 'pptr' is evaluated exactly once and through a local copy, so
 * arguments such as "base + i" or ones with side effects work. The
 * locals carry a "__" suffix, like the other statement-expression
 * macros in this file, to stay clear of caller names.
 */
#define xrealloc_safe(pptr, size) \
	({ \
		int ret__ = -1; \
		typeof(pptr) pptr__ = (pptr); \
		void *new__ = xrealloc(*pptr__, size); \
		if (new__) { \
			*pptr__ = new__; \
			ret__ = 0; \
		} \
		ret__; \
	})
#endif /* UTIL_H_ */

636
kernel/binfmt-elf-for-cr-4 Normal file
View File

@ -0,0 +1,636 @@
elf: Add support for loading files
This patch adds the ability to run checkpoint files by extending
the ELF file format.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
arch/x86/include/asm/elf.h | 3
arch/x86/vdso/vma.c | 22 ++
fs/binfmt_elf.c | 404 ++++++++++++++++++++++++++++++++++++++++++++-
include/linux/elf_ckpt.h | 135 +++++++++++++++
4 files changed, 562 insertions(+), 2 deletions(-)
Index: linux-2.6.git/arch/x86/include/asm/elf.h
===================================================================
--- linux-2.6.git.orig/arch/x86/include/asm/elf.h
+++ linux-2.6.git/arch/x86/include/asm/elf.h
@@ -314,7 +314,8 @@ struct linux_binprm;
#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
extern int arch_setup_additional_pages(struct linux_binprm *bprm,
int uses_interp);
-
+extern int arch_setup_additional_pages_at(struct linux_binprm *bprm,
+ void *addr, int uses_interp);
extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
#define compat_arch_setup_additional_pages syscall32_setup_pages
Index: linux-2.6.git/arch/x86/vdso/vma.c
===================================================================
--- linux-2.6.git.orig/arch/x86/vdso/vma.c
+++ linux-2.6.git/arch/x86/vdso/vma.c
@@ -137,6 +137,28 @@ up_fail:
return ret;
}
+int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp)
+{
+ struct mm_struct *mm = current->mm;
+ int ret;
+ /* Install the vDSO pages at the caller-chosen 'addr'; a disabled vDSO is not an error. */
+ if (!vdso_enabled)
+ return 0;
+ /* The base is recorded before the mapping is made and reset to NULL if that fails. */
+ down_write(&mm->mmap_sem);
+ current->mm->context.vdso = addr;
+ ret = install_special_mapping(mm, (unsigned long)addr, vdso_size,
+ VM_READ | VM_EXEC |
+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC |
+ VM_ALWAYSDUMP,
+ vdso_pages);
+ if (ret)
+ current->mm->context.vdso = NULL;
+
+ up_write(&mm->mmap_sem);
+ return ret;
+}
+
static __init int vdso_setup(char *s)
{
vdso_enabled = simple_strtoul(s, NULL, 0);
Index: linux-2.6.git/fs/binfmt_elf.c
===================================================================
--- linux-2.6.git.orig/fs/binfmt_elf.c
+++ linux-2.6.git/fs/binfmt_elf.c
@@ -36,6 +36,11 @@
#include <asm/param.h>
#include <asm/page.h>
+#include <linux/elf_ckpt.h>
+#include <linux/flex_array.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
static int load_elf_library(struct file *);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
@@ -556,6 +561,395 @@ static unsigned long randomize_stack_top
#endif
}
+#ifdef CONFIG_X86_64
+
+/*
+ * Load an ET_CKPT image: map the PT_CKPT_VMA areas, fill them from
+ * PT_CKPT_PAGES and install registers and TLS from PT_CKPT_CORE.
+ * Returns 0 on success or a negative errno.
+ */
+static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+ struct thread_struct *thread = &current->thread;
+ struct elf_phdr *elf_phdr_pages;
+ struct elf_phdr *elf_phdr_core;
+ struct flex_array *fa = NULL;
+ struct vma_entry *vma_entry_ptr;
+ int nr_vma_found, nr_vma_mapped;
+ struct vma_entry vma_entry;
+ struct file *file = NULL;
+ unsigned long elf_entry;
+ unsigned long map_addr;
+
+ unsigned long start_code, end_code, start_data, end_data;
+ unsigned long start_brk, brk, start_stack;
+ unsigned long elf_bss, elf_brk;
+ unsigned long vdso;
+
+ struct core_entry core_entry;
+ int i, ret = -ENOEXEC;
+ loff_t off;
+
+ int cpu, seg;
+
+ BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES);
+ BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE);
+
+ elf_phdr_core = NULL;
+ elf_phdr_pages = NULL;
+ nr_vma_found = 0;
+ nr_vma_mapped = 0;
+
+ elf_bss = 0;
+ elf_brk = 0;
+
+ start_code = -1UL;
+ end_code = 0;
+
+ start_data = -1UL;
+ end_data = 0;
+
+ start_stack = -1UL;
+ start_brk = -1UL;
+ brk = -1UL;
+
+ vdso = -1UL;
+
+ fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) {
+ ret = -ENOMEM; /* 'out' frees fa when it was allocated */
+ goto out;
+ }
+
+ /* Flush all traces of the currently running executable */
+ ret = flush_old_exec(bprm);
+ if (ret)
+ goto out;
+
+ /* No return point */
+ current->flags &= ~PF_FORKNOEXEC;
+ current->mm->def_flags = 0;
+
+ /*
+ * We don't care about parameters passed (such as argc, argv, env)
+ * when execute checkpoint file because we're to substitute
+ * all the things anyway -- so drop any previous memory mappings.
+ */
+ do_munmap(current->mm, 0, TASK_SIZE);
+
+ SET_PERSONALITY(*elf_ex);
+
+ for (i = 0; i < elf_ex->e_phnum; i++) {
+
+ switch (elf_phdr[i].p_type) {
+ case PT_CKPT_VMA:
+ ret = kernel_read(bprm->file, elf_phdr[i].p_offset,
+ (char *)&vma_entry, sizeof(vma_entry));
+ if (ret != sizeof(vma_entry)) {
+ pr_err("elf-ckpt: Can't read vma_entry\n");
+ ret = -EIO;
+ goto out;
+ }
+ if (flex_array_put(fa, nr_vma_found, &vma_entry, GFP_KERNEL)) /* dense: read back as 0..nr_vma_found-1 */
+ BUG();
+
+ /* We need to know if there is executable stack */
+ if (vma_entry.status & VMA_AREA_STACK) {
+ if (vma_entry.prot & PROT_EXEC)
+ current->personality |= READ_IMPLIES_EXEC;
+ }
+
+ nr_vma_found++;
+ continue;
+ case PT_CKPT_CORE:
+ elf_phdr_core = &elf_phdr[i];
+ continue;
+ case PT_CKPT_PAGES:
+ elf_phdr_pages = &elf_phdr[i];
+ continue;
+ default:
+ continue;
+ }
+ }
+
+ /* Be sure it has the file structure we expect to see. */
+ if (!elf_phdr_pages || !elf_phdr_core || !nr_vma_found) {
+ send_sig(SIGKILL, current, 0);
+ ret = -ENOEXEC;
+ goto out;
+ }
+
+ /*
+ * VMA randomization still needs to be set (just in case if
+ * the program we restore will exec something else later).
+ */
+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+ current->flags |= PF_RANDOMIZE;
+
+ setup_new_exec(bprm);
+
+ current->mm->free_area_cache = current->mm->mmap_base;
+ current->mm->cached_hole_size = 0;
+
+ for (i = 0; i < nr_vma_found; i++) {
+ vma_entry_ptr = flex_array_get(fa, i);
+
+ if (vma_entry_ptr->status & VMA_AREA_HEAP) {
+ start_brk = vma_entry_ptr->start;
+ brk = vma_entry_ptr->end; /* otherwise mm->brk would stay at -1UL */
+ }
+
+ if (vma_entry_ptr->status & VMA_AREA_VDSO) {
+ vdso = vma_entry_ptr->start;
+ }
+
+ if (!(vma_entry_ptr->status & VMA_AREA_REGULAR))
+ continue;
+
+ if (vma_entry_ptr->fd != -1) {
+ file = fget((unsigned int)vma_entry_ptr->fd); /* ->fd is kept as is: do_close() below needs the number */
+ if (!file) {
+ send_sig(SIGKILL, current, 0);
+ ret = -EBADF;
+ goto out_unmap;
+ }
+ } else
+ file = NULL;
+
+ down_write(&current->mm->mmap_sem);
+ map_addr = do_mmap(file,
+ vma_entry_ptr->start,
+ vma_entry_ptr->end - vma_entry_ptr->start,
+ vma_entry_ptr->prot,
+ vma_entry_ptr->flags | MAP_FIXED,
+ vma_entry_ptr->pgoff);
+ up_write(&current->mm->mmap_sem);
+
+ if (file) {
+ fput(file);
+ do_close((unsigned int)vma_entry_ptr->fd);
+ }
+
+ if (BAD_ADDR(map_addr)) {
+ send_sig(SIGKILL, current, 0);
+ ret = IS_ERR((void *)map_addr) ? PTR_ERR((void*)map_addr) : -EINVAL;
+ goto out_unmap;
+ }
+
+ /*
+ * FIXME
+ * Some heuristics to guess previously loaded real
+ * elf file structure. Probably this things should
+ * be exported via /proc somewhere instead.
+ */
+
+ if (vma_entry_ptr->status & VMA_AREA_STACK) {
+ /* Note if stack is VM_GROWSUP -- it should be reversed */
+ start_stack = vma_entry_ptr->start;
+ }
+
+ if (vma_entry_ptr->prot & PROT_EXEC) {
+ if (start_code > vma_entry_ptr->start)
+ start_code = vma_entry_ptr->start;
+ if (end_code < vma_entry_ptr->end)
+ end_code = vma_entry_ptr->end;
+ } else {
+ /*
+ * Neither .bss nor .data was being file mapped.
+ * FIXME: .rodata are loaded by interp.
+ */
+ if (!file) {
+ if (vma_entry_ptr->prot & (PROT_WRITE)) {
+ if (start_data > vma_entry_ptr->start)
+ start_data = vma_entry_ptr->start;
+ if (end_data < vma_entry_ptr->end)
+ end_data = vma_entry_ptr->end;
+ }
+ }
+ }
+
+ nr_vma_mapped++;
+ }
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ if (vdso == -1UL) {
+ pr_err("elf-ckpt: Can't find VDSO address\n");
+ ret = -ENOEXEC;
+ goto out_unmap;
+ }
+#endif
+
+ /* Restore core data */
+ ret = kernel_read(bprm->file, elf_phdr_core->p_offset,
+ (char *)&core_entry, sizeof(core_entry));
+ if (ret != sizeof(core_entry)) {
+ pr_err("elf-ckpt: Can't read core_entry\n");
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+ elf_entry = core_entry.gpregs.ip;
+ bprm->p = start_stack;
+
+ current->mm->start_code = start_code;
+ current->mm->end_code = end_code;
+ current->mm->start_data = start_data;
+ current->mm->end_data = end_data;
+ current->mm->start_stack = start_stack;
+ current->mm->start_brk = start_brk;
+ current->mm->brk = brk;
+
+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
+ ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0);
+ if (ret) {
+ pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n",
+ vdso, ret);
+ goto out_unmap;
+ }
+#endif
+
+ /*
+ * Restore pages
+ */
+ off = elf_phdr_pages->p_offset;
+ while (1) {
+ struct vm_area_struct *vma;
+ struct page *page;
+ void *page_data;
+ __u64 va;
+
+ ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va));
+ if (ret != sizeof(va)) {
+ pr_err("elf-ckpt: Can't read page virtual address: "
+ "ret = %d off = %lx\n", ret, (unsigned long)off);
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+ /* End of pages reached */
+ if (!va)
+ break;
+
+ vma = find_vma(current->mm, (unsigned long)va);
+ if (!vma) {
+ pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va);
+ ret = -ESRCH;
+ goto out_unmap;
+ }
+
+ ret = get_user_pages(current, current->mm, (unsigned long)va,
+ 1, 1, 1, &page, NULL);
+ if (ret != 1) {
+ pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va);
+ ret = -EFAULT;
+ goto out_unmap;
+ }
+
+ page_data = kmap(page);
+ ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE);
+ kunmap(page);
+ put_page(page);
+
+ if (ret != PAGE_SIZE) {
+ pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va);
+ ret = -EFAULT;
+ goto out_unmap;
+ }
+
+ off += sizeof(va) + PAGE_SIZE;
+ }
+
+ set_binfmt(&elf_format);
+
+ /*
+ * Registers setup.
+ *
+ * Since we might be modifying MSRs we're
+ * to be sure the task wont be preempted
+ * until modification is complete.
+ */
+ cpu = get_cpu();
+
+ regs->ip = core_entry.gpregs.ip;
+ regs->sp = core_entry.gpregs.sp;
+ regs->cs = core_entry.gpregs.cs;
+ regs->ss = core_entry.gpregs.ss;
+ regs->flags = core_entry.gpregs.flags;
+ regs->r15 = core_entry.gpregs.r15;
+ regs->r14 = core_entry.gpregs.r14;
+ regs->r13 = core_entry.gpregs.r13;
+ regs->r12 = core_entry.gpregs.r12;
+ regs->bp = core_entry.gpregs.bp;
+ regs->bx = core_entry.gpregs.bx;
+ regs->r11 = core_entry.gpregs.r11;
+ regs->r10 = core_entry.gpregs.r10;
+ regs->r9 = core_entry.gpregs.r9;
+ regs->r8 = core_entry.gpregs.r8;
+ regs->ax = core_entry.gpregs.ax;
+ regs->cx = core_entry.gpregs.cx;
+ regs->dx = core_entry.gpregs.dx;
+ regs->si = core_entry.gpregs.si;
+ regs->di = core_entry.gpregs.di;
+ regs->orig_ax = core_entry.gpregs.orig_ax;
+
+ thread->usersp = core_entry.gpregs.sp;
+ thread->ds = core_entry.gpregs.ds;
+ thread->es = core_entry.gpregs.es;
+ thread->fs = core_entry.gpregs.fs;
+ thread->gs = core_entry.gpregs.gs;
+
+ thread->fsindex = thread->fs;
+ thread->gsindex = thread->gs;
+
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) {
+ thread->tls_array[i].a = core_entry.tls_array[i].a;
+ thread->tls_array[i].b = core_entry.tls_array[i].b;
+ }
+
+ load_TLS(thread, cpu);
+
+ seg = thread->fsindex;
+ loadsegment(fs, seg);
+ savesegment(fs, seg);
+
+ if (seg != thread->fsindex) {
+ pr_err("Fixup on FS loading exception: %i %i\n",
+ thread->fsindex, seg);
+ }
+
+ if (core_entry.gpregs.fs_base)
+ wrmsrl(MSR_FS_BASE, core_entry.gpregs.fs_base);
+
+ if (core_entry.gpregs.gs_base)
+ wrmsrl(MSR_GS_BASE, core_entry.gpregs.gs_base);
+
+ put_cpu();
+
+ ret = 0;
+out:
+ if (fa)
+ flex_array_free(fa);
+ return ret;
+
+out_unmap:
+ for (i = 0; i < nr_vma_mapped; i++) {
+ vma_entry_ptr = flex_array_get(fa, i);
+ down_write(&current->mm->mmap_sem);
+ do_munmap(current->mm, vma_entry_ptr->start,
+ vma_entry_ptr->end - vma_entry_ptr->start);
+ up_write(&current->mm->mmap_sem);
+ }
+ goto out;
+}
+#else
+static int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs,
+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr)
+{
+ return -ENOEXEC; /* stub for builds without CONFIG_X86_64: checkpoint images are always rejected */
+}
+#endif
+
static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
{
struct file *interpreter = NULL; /* to shut gcc up */
@@ -592,7 +986,9 @@ static int load_elf_binary(struct linux_
if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0)
goto out;
- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN)
+ if (loc->elf_ex.e_type != ET_EXEC &&
+ loc->elf_ex.e_type != ET_DYN &&
+ loc->elf_ex.e_type != ET_CKPT)
goto out;
if (!elf_check_arch(&loc->elf_ex))
goto out;
@@ -619,6 +1015,12 @@ static int load_elf_binary(struct linux_
goto out_free_ph;
}
+ if (loc->elf_ex.e_type == ET_CKPT) {
+ retval = load_elf_ckpt(bprm, regs, &loc->elf_ex,
+ (struct elf_phdr *)elf_phdata);
+ goto out_free_ph;
+ }
+
elf_ppnt = elf_phdata;
elf_bss = 0;
elf_brk = 0;
Index: linux-2.6.git/include/linux/elf_ckpt.h
===================================================================
--- /dev/null
+++ linux-2.6.git/include/linux/elf_ckpt.h
@@ -0,0 +1,135 @@
+#ifndef _LINUX_ELF_CHECKPOINT_H
+#define _LINUX_ELF_CHECKPOINT_H
+
+#include <linux/types.h>
+#include <linux/elf-em.h>
+
+#ifdef __KERNEL__
+
+#include <asm/elf.h>
+
+/*
+ * Elf extension includes new Elf file type
+ * and program header types as well.
+ */
+#define ET_CKPT 5
+
+#define PT_CKPT_OFFSET 0x01010101
+
+#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1)
+#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2)
+#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3)
+
+#define CKPT_PAGE_SIZE 4096
+#define CKPT_GDT_ENTRY_TLS_ENTRIES 3
+
+#define HEADER_VERSION 1
+#define HEADER_ARCH_X86_64 1
+
+#define VMA_AREA_REGULAR (1 << 0)
+#define VMA_AREA_STACK (1 << 1)
+#define VMA_AREA_VSYSCALL (1 << 2)
+#define VMA_AREA_VDSO (1 << 3)
+#define VMA_FORCE_READ (1 << 4)
+#define VMA_AREA_HEAP (1 << 5)
+#define VMA_FILE_PRIVATE (1 << 6)
+#define VMA_FILE_SHARED (1 << 7)
+#define VMA_ANON_SHARED (1 << 8)
+#define VMA_ANON_PRIVATE (1 << 9)
+#define VMA_FORCE_WRITE (1 << 10)
+
+/*
+ * Packed, fixed-width record describing one memory area.
+ * NOTE(review): presumably the payload of a PT_CKPT_VMA program header and
+ * the element type load_elf_ckpt() keeps in its flex_array -- confirm.
+ */
+struct vma_entry {
+ __u64 start; /* first address of the area */
+ __u64 end; /* load_elf_ckpt() unmaps (end - start) bytes at start */
+ __u64 pgoff;
+ __u32 prot;
+ __u32 flags;
+ __u32 status; /* presumably a mask of the VMA_* bits above -- confirm */
+ __u32 pid;
+ __s64 fd; /* signed, unlike the other fields */
+ __u64 ino;
+ __u32 dev_maj;
+ __u32 dev_min;
+} __packed;
+
+/*
+ * Packed record pairing an address with CKPT_PAGE_SIZE bytes of data.
+ * NOTE(review): presumably "va" is the virtual address the data is restored
+ * to -- confirm against the loader.
+ */
+struct page_entry {
+ __u64 va;
+ __u8 data[CKPT_PAGE_SIZE];
+} __packed;
+
+/*
+ * Packed 8-byte header placed at the start of struct core_entry.
+ * NOTE(review): version/arch are presumably checked against HEADER_VERSION
+ * and HEADER_ARCH_X86_64 -- confirm in the loader.
+ */
+struct image_header {
+ __u16 version;
+ __u16 arch;
+ __u32 flags;
+} __packed;
+
+/*
+ * Saved general-purpose register state, every value widened to 64 bits.
+ * load_elf_ckpt() copies these into pt_regs (r15..orig_ax, cs, flags, sp,
+ * ss), into the thread's segment fields (ds, es, fs, gs) and, when non-zero,
+ * into MSR_FS_BASE / MSR_GS_BASE (fs_base, gs_base).
+ */
+struct user_regs_entry {
+ __u64 r15;
+ __u64 r14;
+ __u64 r13;
+ __u64 r12;
+ __u64 bp;
+ __u64 bx;
+ __u64 r11;
+ __u64 r10;
+ __u64 r9;
+ __u64 r8;
+ __u64 ax;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 orig_ax;
+ __u64 ip;
+ __u64 cs;
+ __u64 flags;
+ __u64 sp; /* also stored as thread->usersp by load_elf_ckpt() */
+ __u64 ss;
+ __u64 fs_base; /* written to MSR_FS_BASE only when non-zero */
+ __u64 gs_base; /* written to MSR_GS_BASE only when non-zero */
+ __u64 ds;
+ __u64 es;
+ __u64 fs;
+ __u64 gs;
+} __packed;
+
+/*
+ * One 8-byte descriptor, viewable either as two raw 32-bit words (a, b) or
+ * as individual bit-fields overlaying the same storage.
+ */
+struct desc_struct_entry {
+ union {
+ struct {
+ __u32 a;
+ __u32 b;
+ };
+ struct {
+ __u16 limit0;
+ __u16 base0;
+ unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
+ unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
+ };
+ };
+} __packed;
+
+/*
+ * Saved floating-point / SSE register state (512 bytes in total).
+ * NOTE(review): the field names and sizes look like the FXSAVE area layout
+ * -- confirm before relying on it.
+ */
+struct user_fpregs_entry {
+ __u16 cwd;
+ __u16 swd;
+ __u16 twd;
+ __u16 fop;
+ __u64 rip;
+ __u64 rdp;
+ __u32 mxcsr;
+ __u32 mxcsr_mask;
+ __u32 st_space[32];
+ __u32 xmm_space[64];
+ __u32 padding[24];
+} __packed;
+
+/*
+ * Per-task core record of a checkpoint image: header, general-purpose and
+ * FPU register state, the TLS descriptors and the personality value.
+ *
+ * The TLS slots use the image-local struct desc_struct_entry rather than
+ * the architecture's struct desc_struct, so the on-disk layout is defined
+ * entirely by this header. Both expose the raw words as .a and .b, which
+ * is all load_elf_ckpt() reads.
+ */
+struct core_entry {
+	struct image_header		header;
+	struct user_regs_entry		gpregs;
+	struct user_fpregs_entry	fpregs;
+	struct desc_struct_entry	tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES];
+	__u32				personality;
+} __packed;
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_ELF_CHECKPOINT_H */

View File

@ -0,0 +1,172 @@
Allow processes to be created with specified pid
We will need it to restore processes so they would not
even notice that they were being checkpointed.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
include/linux/pid.h | 2 -
include/linux/sched.h | 1
kernel/fork.c | 10 ++++++-
kernel/pid.c | 70 ++++++++++++++++++++++++++++++++++++--------------
4 files changed, 62 insertions(+), 21 deletions(-)
Index: linux-2.6.git/include/linux/pid.h
===================================================================
--- linux-2.6.git.orig/include/linux/pid.h
+++ linux-2.6.git/include/linux/pid.h
@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
int next_pidmap(struct pid_namespace *pid_ns, unsigned int last);
-extern struct pid *alloc_pid(struct pid_namespace *ns);
+extern struct pid *alloc_pid(struct pid_namespace *ns, int pid);
extern void free_pid(struct pid *pid);
/*
Index: linux-2.6.git/include/linux/sched.h
===================================================================
--- linux-2.6.git.orig/include/linux/sched.h
+++ linux-2.6.git/include/linux/sched.h
@@ -23,6 +23,7 @@
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
and is now available for re-use. */
+#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
Index: linux-2.6.git/kernel/fork.c
===================================================================
--- linux-2.6.git.orig/kernel/fork.c
+++ linux-2.6.git/kernel/fork.c
@@ -1239,8 +1239,16 @@ static struct task_struct *copy_process(
goto bad_fork_cleanup_io;
if (pid != &init_struct_pid) {
+ int want_pid = 0;
+
+ if (clone_flags & CLONE_CHILD_USEPID) {
+ retval = get_user(want_pid, child_tidptr);
+ if (retval)
+ goto bad_fork_cleanup_io;
+ }
+
retval = -ENOMEM;
- pid = alloc_pid(p->nsproxy->pid_ns);
+ pid = alloc_pid(p->nsproxy->pid_ns, want_pid);
if (!pid)
goto bad_fork_cleanup_io;
}
Index: linux-2.6.git/kernel/pid.c
===================================================================
--- linux-2.6.git.orig/kernel/pid.c
+++ linux-2.6.git/kernel/pid.c
@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_name
} while ((prev != last_write) && (pid_before(base, last_write, pid)));
}
-static int alloc_pidmap(struct pid_namespace *pid_ns)
+/*
+ * Make sure @map has a bitmap page, allocating a zeroed one on first use.
+ *
+ * Returns 0 if map->page is set on return, -ENOMEM otherwise.
+ */
+static int alloc_pidmap_page(struct pidmap *map)
+{
+	void *new_page;
+
+	if (likely(map->page))
+		return 0;
+
+	new_page = kzalloc(PAGE_SIZE, GFP_KERNEL);
+
+	/* Install our page unless somebody raced with us and won. */
+	spin_lock_irq(&pidmap_lock);
+	if (!map->page) {
+		map->page = new_page;
+		new_page = NULL;
+	}
+	spin_unlock_irq(&pidmap_lock);
+
+	/* Still non-NULL only when the page was not installed. */
+	kfree(new_page);
+
+	if (unlikely(!map->page))
+		return -ENOMEM;
+
+	return 0;
+}
+
+/*
+ * Reserve one specific pid number in @pid_ns.
+ *
+ * Returns @pid on success, -EINVAL if @pid is outside 1..pid_max-1,
+ * -ENOMEM if the bitmap page cannot be allocated, or -EBUSY if the number
+ * is already taken.
+ */
+static int set_pidmap(struct pid_namespace *pid_ns, int pid)
+{
+	int offset;
+	struct pidmap *map;
+
+	/*
+	 * The requested pid originates from userspace (copy_process() reads
+	 * it with get_user()), so it must be range-checked before it is used
+	 * to index pid_ns->pidmap[] and the bitmap page.
+	 */
+	if (pid <= 0 || pid >= pid_max)
+		return -EINVAL;
+
+	offset = pid & BITS_PER_PAGE_MASK;
+	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
+
+	if (alloc_pidmap_page(map) < 0)
+		return -ENOMEM;
+
+	if (!test_and_set_bit(offset, map->page)) {
+		atomic_dec(&map->nr_free);
+		return pid;
+	}
+
+	return -EBUSY;
+}
+
+static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid)
{
int i, offset, max_scan, pid, last = pid_ns->last_pid;
struct pidmap *map;
+ if (desired_pid)
+ return set_pidmap(pid_ns, desired_pid);
+
pid = last + 1;
if (pid >= pid_max)
pid = RESERVED_PIDS;
@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_names
*/
max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
for (i = 0; i <= max_scan; ++i) {
- if (unlikely(!map->page)) {
- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
- /*
- * Free the page if someone raced with us
- * installing it:
- */
- spin_lock_irq(&pidmap_lock);
- if (!map->page) {
- map->page = page;
- page = NULL;
- }
- spin_unlock_irq(&pidmap_lock);
- kfree(page);
- if (unlikely(!map->page))
- break;
- }
+ if (alloc_pidmap_page(map) < 0)
+ break;
+
if (likely(atomic_read(&map->nr_free))) {
do {
if (!test_and_set_bit(offset, map->page)) {
@@ -277,7 +308,7 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
-struct pid *alloc_pid(struct pid_namespace *ns)
+struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid)
{
struct pid *pid;
enum pid_type type;
@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespa
tmp = ns;
for (i = ns->level; i >= 0; i--) {
- nr = alloc_pidmap(tmp);
+ nr = alloc_pidmap(tmp, this_ns_pid);
if (nr < 0)
goto out_free;
pid->numbers[i].nr = nr;
pid->numbers[i].ns = tmp;
tmp = tmp->parent;
+ this_ns_pid = 0;
}
get_pid_ns(ns);

View File

@ -0,0 +1,46 @@
proc: Introduce the Children: line in /proc/<pid>/status
From: Pavel Emelyanov <xemul@parallels.com>
Although the pids of a task's children can be obtained by other means, it is
just more convenient to have them listed this way.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/proc/array.c | 14 ++++++++++++++
1 file changed, 14 insertions(+)
Index: linux-2.6.git/fs/proc/array.c
===================================================================
--- linux-2.6.git.orig/fs/proc/array.c
+++ linux-2.6.git/fs/proc/array.c
@@ -158,6 +158,18 @@ static inline const char *get_task_state
return *p;
}
+/*
+ * Emit the "Children:" line: the pid, as seen from @ns, of every task on
+ * @p's children list, separated by spaces and terminated by a newline.
+ * The list is walked under tasklist_lock held for reading.
+ */
+static void task_children(struct seq_file *m, struct task_struct *p, struct pid_namespace *ns)
+{
+	struct task_struct *child;
+
+	seq_puts(m, "Children:");
+
+	read_lock(&tasklist_lock);
+	list_for_each_entry(child, &p->children, sibling) {
+		pid_t nr = pid_nr_ns(task_pid(child), ns);
+
+		seq_printf(m, " %d", nr);
+	}
+	read_unlock(&tasklist_lock);
+
+	seq_putc(m, '\n');
+}
+
static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *p)
{
@@ -192,6 +204,8 @@ static inline void task_state(struct seq
cred->uid, cred->euid, cred->suid, cred->fsuid,
cred->gid, cred->egid, cred->sgid, cred->fsgid);
+ task_children(m, p, ns);
+
task_lock(p);
if (p->files)
fdt = files_fdtable(p->files);

522
kernel/cr-proc-map-files-21 Normal file
View File

@ -0,0 +1,522 @@
fs, proc: Introduce the /proc/<pid>/map_files/ directory v14
From: Pavel Emelyanov <xemul@parallels.com>
This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks
one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end",
the target is the file. Opening a symlink results in a file that points to exactly
the same inode as the vma's one.
For example the ls -l of some arbitrary /proc/<pid>/map_files/
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so
| lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so
This *helps* checkpointing process in three ways:
1. When dumping a task mappings we do know exact file that is mapped by particular
region. We do this by opening /proc/$pid/map_files/$address symlink the way we do
with file descriptors.
2. This also helps in determining which anonymous shared mappings are shared with
each other by comparing the inodes of them.
3. When restoring a set of processes, in case two of them share a mapping, we map
the memory by the 1st one and then open its /proc/$pid/map_files/$address file and
map it by the 2nd task.
Using /proc/$pid/maps for this is quite inconvenient since it brings repeatable
re-reading and reparsing for this text file which slows down restore procedure
significantly. Also, as pointed out in (3), it is way easier to use the top level
shared mapping in children as /proc/$pid/map_files/$address when needed.
v2: (spotted by Tejun Heo)
- /proc/<pid>/mfd changed to /proc/<pid>/map_files
- find_vma helper is used instead of linear search
- routines are re-grouped
- d_revalidate is set now
v3:
- d_revalidate reworked, now it should drop dentries that are no longer valid (Tejun Heo)
- ptrace_may_access added into proc_map_files_lookup (Vasiliy Kulikov)
- because of filldir (which eventually might need to lock mmap_sem)
the proc_map_files_readdir() was reworked to call proc_fill_cache()
with unlocked mmap_sem
v4: (feedback by Tejun Heo and Vasiliy Kulikov)
- instead of saving data in proc_inode we rather make a dentry name
to keep both vm_start and vm_end accordingly
- d_revalidate now honors task credentials
v5: (feedback by Kirill A. Shutemov)
- don't forget to release mmap_sem on error path
v6:
- sizeof is used in map_files_info, which shrinks the member a bit on
x86-32 (by Kirill A. Shutemov)
- map_name_to_addr returns -EINVAL instead of -1
which is more appropriate (by Tejun Heo)
v7:
- add [get/set]attr handlers for
proc_map_files_inode_operations (by Vasiliy Kulikov)
v8:
- Kirill A. Shutemov spotted a parasite semicolon
which ruined the ptrace_check call, fixed.
v9: (feedback by Andrew Morton)
- find_exact_vma moved into include/linux/mm.h as an inline helper
- proc_map_files_setattr uses either kmalloc or vmalloc depending
on how many objects are to be allocated
- no more map_name_to_addr but dname_to_vma_addr introduced instead
and it uses sscanf because in one case the find_exact_vma() is used
only to confirm existence of vma area the boolean flag is used
- fancy justification dropped
- proc_map_files_get/setattr are still left untouched
until the additional fd/ patches are applied first.
v10: (feedback by Andrew Morton)
- flex_arrays are used instead of kmalloc/vmalloc calls
- map_files_d_revalidate use ptrace_may_access for
security reason (by Vasiliy Kulikov)
v11:
- should use fput and drop !ret test from a loop code
(feedback by Andrew Morton)
- no need for 'used' variable, use existing
nr_files with file->pos predicate
- if preallocation fails no need to go further,
simply release mmap semaphore and jump out
v12:
- rework map_files_d_revalidate to make sure
the task get released on return (by Vasiliy Kulikov)
v13:
- proc_map_files_inode_operations are set to be the same
as proc_fd_inode_operations, ie to include .permission
pointing to proc_fd_permission
v14: (by Vasiliy Kulikov)
- for security reason map_files/ entries are allowed for
readers with CAP_SYS_ADMIN credentials granted only
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Reviewed-by: Vasiliy Kulikov <segoon@openwall.com>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
CC: Pavel Machek <pavel@ucw.cz>
---
fs/proc/base.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++
include/linux/mm.h | 12 +
2 files changed, 357 insertions(+)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -83,6 +83,7 @@
#include <linux/pid_namespace.h>
#include <linux/fs_struct.h>
#include <linux/slab.h>
+#include <linux/flex_array.h>
#ifdef CONFIG_HARDWALL
#include <asm/hardwall.h>
#endif
@@ -133,6 +134,8 @@ struct pid_entry {
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
+static int proc_fd_permission(struct inode *inode, int mask);
+
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
* and .. links.
@@ -2201,6 +2204,347 @@ static const struct file_operations proc
};
/*
+ * dname_to_vma_addr - maps a dentry name into two unsigned longs
+ * which represent vma start and end addresses.
+ */
+static int dname_to_vma_addr(struct dentry *dentry,
+			     unsigned long *start, unsigned long *end)
+{
+	char tail;
+
+	/*
+	 * Returns 0 with *start and *end filled in, or -EINVAL.
+	 *
+	 * The trailing %c converts only when something follows the second
+	 * address, so any result other than 2 means the name is either
+	 * incomplete or carries trailing garbage (e.g. "1000-2000junk").
+	 * Rejecting the latter keeps arbitrary names from aliasing a vma.
+	 */
+	if (sscanf(dentry->d_name.name, "%lx-%lx%c", start, end, &tail) != 2)
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * ->d_revalidate for map_files entries.
+ *
+ * Returns 1 when the owning task still has a vma whose bounds exactly match
+ * the addresses encoded in the dentry name; the inode's uid/gid are
+ * refreshed on that path. Returns -ECHILD for LOOKUP_RCU walks. In every
+ * other case the dentry is dropped and 0 is returned, or -EACCES when the
+ * caller lacks CAP_SYS_ADMIN.
+ */
+static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ bool exact_vma_exists = false;
+ struct mm_struct *mm = NULL;
+ struct task_struct *task;
+ const struct cred *cred;
+ struct inode *inode;
+ int status = 0;
+
+ if (nd && nd->flags & LOOKUP_RCU)
+ return -ECHILD;
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ status = -EACCES;
+ goto out_notask;
+ }
+
+ inode = dentry->d_inode;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out_notask;
+
+ if (!ptrace_may_access(task, PTRACE_MODE_READ))
+ goto out;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out;
+
+ /* An unparsable name simply leaves exact_vma_exists false. */
+ if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) {
+ down_read(&mm->mmap_sem);
+ exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end);
+ up_read(&mm->mmap_sem);
+ }
+
+ mmput(mm);
+
+ if (exact_vma_exists) {
+ /* Dumpable task: inode takes the task's euid/egid, otherwise 0/0. */
+ if (task_dumpable(task)) {
+ rcu_read_lock();
+ cred = __task_cred(task);
+ inode->i_uid = cred->euid;
+ inode->i_gid = cred->egid;
+ rcu_read_unlock();
+ } else {
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
+ security_task_to_inode(task, inode);
+ status = 1;
+ }
+
+out:
+ put_task_struct(task);
+
+out_notask:
+ /* Anything but a positive answer invalidates the dentry. */
+ if (status <= 0)
+ d_drop(dentry);
+
+ return status;
+}
+
+/*
+ * Dentry operations installed on each map_files symlink by
+ * proc_map_files_instantiate().
+ */
+static const struct dentry_operations tid_map_files_dentry_operations = {
+ .d_revalidate = map_files_d_revalidate,
+ .d_delete = pid_delete_dentry,
+};
+
+/*
+ * Resolve a map_files symlink: parse the vma bounds from the dentry name
+ * and, if the task still has a file-backed vma with exactly those bounds,
+ * store a referenced copy of that file's path in *path.
+ *
+ * Returns 0 with *path filled in, -EINVAL if the name cannot be parsed, or
+ * -ENOENT if the task or its mm is gone or no matching file-backed vma
+ * exists. *path is written only on success.
+ */
+static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+{
+	unsigned long vm_start, vm_end;
+	struct vm_area_struct *vma;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	int rc;
+
+	rc = -ENOENT;
+	task = get_proc_task(dentry->d_inode);
+	if (!task)
+		goto out;
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm)
+		goto out;
+
+	rc = dname_to_vma_addr(dentry, &vm_start, &vm_end);
+	if (rc)
+		goto out_mmput;
+
+	/*
+	 * rc is 0 here after the successful parse. Reset it so that a missing
+	 * or anonymous vma is reported as an error instead of returning
+	 * success with *path left untouched.
+	 */
+	rc = -ENOENT;
+
+	down_read(&mm->mmap_sem);
+	vma = find_exact_vma(mm, vm_start, vm_end);
+	if (vma && vma->vm_file) {
+		*path = vma->vm_file->f_path;
+		path_get(path);
+		rc = 0;
+	}
+	up_read(&mm->mmap_sem);
+
+out_mmput:
+	mmput(mm);
+out:
+	return rc;
+}
+
+/*
+ * Snapshot of one file-backed vma, collected by proc_map_files_readdir()
+ * while mmap_sem is held and consumed after it is released.
+ */
+struct map_files_info {
+ struct file *file; /* referenced with get_file(), released with fput() */
+ unsigned long len; /* snprintf() result for name */
+ unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */
+};
+
+/*
+ * Create the symlink inode for one map_files entry and attach it to
+ * @dentry. @ptr is the struct file backing the mapping; the link gets
+ * owner read/write permission bits mirroring that file's open mode.
+ *
+ * Returns NULL on success, ERR_PTR(-ENOENT) when there is no file or the
+ * inode cannot be created.
+ */
+static struct dentry *
+proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
+			   struct task_struct *task, const void *ptr)
+{
+	const struct file *mapped = ptr;
+	struct inode *link;
+
+	if (!mapped)
+		return ERR_PTR(-ENOENT);
+
+	link = proc_pid_make_inode(dir->i_sb, task);
+	if (!link)
+		return ERR_PTR(-ENOENT);
+
+	PROC_I(link)->op.proc_get_link = proc_map_files_get_link;
+
+	link->i_op = &proc_pid_link_inode_operations;
+	link->i_size = 64;
+	link->i_mode = S_IFLNK |
+		       ((mapped->f_mode & FMODE_READ) ? S_IRUSR : 0) |
+		       ((mapped->f_mode & FMODE_WRITE) ? S_IWUSR : 0);
+
+	d_set_d_op(dentry, &tid_map_files_dentry_operations);
+	d_add(dentry, link);
+
+	return NULL;
+}
+
+/*
+ * ->lookup for the map_files directory.
+ *
+ * Returns the result of proc_map_files_instantiate() when the task has a
+ * vma whose bounds exactly match the name being looked up. Otherwise
+ * returns ERR_PTR(-EACCES) if the caller lacks CAP_SYS_ADMIN or
+ * lock_trace() fails, or ERR_PTR(-ENOENT) if the task or its mm is gone,
+ * the name cannot be parsed, or no such vma exists.
+ */
+static struct dentry *proc_map_files_lookup(struct inode *dir,
+ struct dentry *dentry, struct nameidata *nd)
+{
+ unsigned long vm_start, vm_end;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct dentry *result;
+ struct mm_struct *mm;
+
+ /* result is preset before each check so a bare goto reports it. */
+ result = ERR_PTR(-EACCES);
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ result = ERR_PTR(-ENOENT);
+ task = get_proc_task(dir);
+ if (!task)
+ goto out;
+
+ result = ERR_PTR(-EACCES);
+ if (lock_trace(task))
+ goto out_put_task;
+
+ result = ERR_PTR(-ENOENT);
+ if (dname_to_vma_addr(dentry, &vm_start, &vm_end))
+ goto out_unlock;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+
+ down_read(&mm->mmap_sem);
+ vma = find_exact_vma(mm, vm_start, vm_end);
+ if (!vma)
+ goto out_no_vma;
+
+ /* A vma without a file makes instantiate return -ENOENT. */
+ result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file);
+
+out_no_vma:
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return result;
+}
+
+/* Inode operations of the map_files directory (see tgid_base_stuff[]). */
+static const struct inode_operations proc_map_files_inode_operations = {
+ .lookup = proc_map_files_lookup,
+ .permission = proc_fd_permission,
+ .setattr = proc_setattr,
+};
+
+/*
+ * ->readdir for the map_files directory: emits ".", ".." and then one
+ * "<vm_start>-<vm_end>" entry per file-backed vma of the task, resuming
+ * from filp->f_pos.
+ *
+ * Returns 0 on success (also when the task has no mm), -EACCES if the
+ * caller lacks CAP_SYS_ADMIN or lock_trace() fails, -ENOENT if the task is
+ * gone, -ENOMEM if the temporary array cannot be allocated, or the first
+ * non-zero value returned by proc_fill_cache().
+ */
+static int proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct dentry *dentry = filp->f_path.dentry;
+ struct inode *inode = dentry->d_inode;
+ struct vm_area_struct *vma;
+ struct task_struct *task;
+ struct mm_struct *mm;
+ ino_t ino;
+ int ret;
+
+ ret = -EACCES;
+ if (!capable(CAP_SYS_ADMIN))
+ goto out;
+
+ ret = -ENOENT;
+ task = get_proc_task(inode);
+ if (!task)
+ goto out;
+
+ ret = -EACCES;
+ if (lock_trace(task))
+ goto out_put_task;
+
+ ret = 0;
+ switch (filp->f_pos) {
+ case 0:
+ ino = inode->i_ino;
+ if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++;
+ /* fall through */
+ case 1:
+ ino = parent_ino(dentry);
+ if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
+ goto out_unlock;
+ filp->f_pos++;
+ /* fall through */
+ default:
+ {
+ unsigned long nr_files, pos, i;
+ struct flex_array *fa = NULL;
+ struct map_files_info info;
+ struct map_files_info *p;
+
+ mm = get_task_mm(task);
+ if (!mm)
+ goto out_unlock;
+ down_read(&mm->mmap_sem);
+
+ nr_files = 0;
+
+ /*
+ * We need two passes here:
+ *
+ * 1) Collect vmas of mapped files with mmap_sem taken
+ * 2) Release mmap_sem and instantiate entries
+ *
+ * otherwise we get lockdep complained, since filldir()
+ * routine might require mmap_sem taken in might_fault().
+ */
+
+ /*
+ * Count the file-backed vmas not yet reported; positions
+ * 0 and 1 belong to "." and "..", so vmas start at pos 2.
+ */
+ for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (vma->vm_file && ++pos > filp->f_pos)
+ nr_files++;
+ }
+
+ if (nr_files) {
+ fa = flex_array_alloc(sizeof(info), nr_files, GFP_KERNEL);
+ if (!fa || flex_array_prealloc(fa, 0, nr_files, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ if (fa)
+ flex_array_free(fa);
+ up_read(&mm->mmap_sem);
+ mmput(mm);
+ goto out_unlock;
+ }
+ /* Same walk as above, now taking a file reference per entry. */
+ for (i = 0, vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) {
+ if (!vma->vm_file)
+ continue;
+ if (++pos <= filp->f_pos)
+ continue;
+
+ get_file(vma->vm_file);
+ info.file = vma->vm_file;
+ info.len = snprintf(info.name, sizeof(info.name),
+ "%lx-%lx", vma->vm_start,
+ vma->vm_end);
+ if (flex_array_put(fa, i++, &info, GFP_KERNEL))
+ BUG();
+ }
+ }
+ up_read(&mm->mmap_sem);
+
+ /* Second pass: mmap_sem is no longer held while entries are filled. */
+ for (i = 0; i < nr_files; i++) {
+ p = flex_array_get(fa, i);
+ ret = proc_fill_cache(filp, dirent, filldir,
+ p->name, p->len,
+ proc_map_files_instantiate,
+ task, p->file);
+ if (ret)
+ break;
+ filp->f_pos++;
+ fput(p->file);
+ }
+ for (; i < nr_files; i++) {
+ /*
+ * In case of error don't forget
+ * to put rest of file refs.
+ */
+ p = flex_array_get(fa, i);
+ fput(p->file);
+ }
+ if (fa)
+ flex_array_free(fa);
+ mmput(mm);
+ }
+ }
+
+out_unlock:
+ unlock_trace(task);
+out_put_task:
+ put_task_struct(task);
+out:
+ return ret;
+}
+
+/* File operations of the map_files directory (see tgid_base_stuff[]). */
+static const struct file_operations proc_map_files_operations = {
+ .read = generic_read_dir,
+ .readdir = proc_map_files_readdir,
+ .llseek = default_llseek,
+};
+
+/*
* /proc/pid/fd needs a special permission handler so that a process can still
* access /proc/self/fd after it has executed a setuid().
*/
@@ -2815,6 +3159,7 @@ static const struct inode_operations pro
static const struct pid_entry tgid_base_stuff[] = {
DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations),
DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations),
+ DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations),
DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations),
DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations),
#ifdef CONFIG_NET
Index: linux-2.6.git/include/linux/mm.h
===================================================================
--- linux-2.6.git.orig/include/linux/mm.h
+++ linux-2.6.git/include/linux/mm.h
@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st
return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
}
+/*
+ * Return the VMA found by find_vma(mm, vm_start) if it spans exactly
+ * vm_start ... vm_end, or NULL if there is none or its bounds differ.
+ */
+static inline struct vm_area_struct *
+find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end)
+{
+	struct vm_area_struct *candidate = find_vma(mm, vm_start);
+
+	if (!candidate)
+		return NULL;
+
+	if (candidate->vm_start == vm_start && candidate->vm_end == vm_end)
+		return candidate;
+
+	return NULL;
+}
+
#ifdef CONFIG_MMU
pgprot_t vm_get_page_prot(unsigned long vm_flags);
#else

View File

@ -0,0 +1,27 @@
vfs: Add ->statfs callback for pipefs
From: Pavel Emelyanov <xemul@parallels.com>
This is done to make it possible to distinguish pipes
from fifos when opening one via /proc/<pid>/fd/ link.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Reviewed-by: Tejun Heo <tj@kernel.org>
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/pipe.c | 1 +
1 file changed, 1 insertion(+)
Index: linux-2.6.git/fs/pipe.c
===================================================================
--- linux-2.6.git.orig/fs/pipe.c
+++ linux-2.6.git/fs/pipe.c
@@ -1254,6 +1254,7 @@ out:
static const struct super_operations pipefs_ops = {
.destroy_inode = free_inode_nonrcu,
+ .statfs = simple_statfs,
};
/*

86
kernel/fs-add-do-close Normal file
View File

@ -0,0 +1,86 @@
fs: Add do_close helper
To be able to close file descriptors right from inside
kernel space do_close() helper is added. We need it at
checkpoint restore time.
Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/open.c | 32 ++++++++++++++++++++------------
include/linux/fs.h | 1 +
2 files changed, 21 insertions(+), 12 deletions(-)
Index: linux-2.6.git/fs/open.c
===================================================================
--- linux-2.6.git.orig/fs/open.c
+++ linux-2.6.git/fs/open.c
@@ -1056,17 +1056,11 @@ int filp_close(struct file *filp, fl_own
EXPORT_SYMBOL(filp_close);
-/*
- * Careful here! We test whether the file pointer is NULL before
- * releasing the fd. This ensures that one clone task can't release
- * an fd while another clone is opening it.
- */
-SYSCALL_DEFINE1(close, unsigned int, fd)
+int do_close(unsigned int fd)
{
struct file * filp;
struct files_struct *files = current->files;
struct fdtable *fdt;
- int retval;
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
@@ -1079,7 +1073,25 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
FD_CLR(fd, fdt->close_on_exec);
__put_unused_fd(files, fd);
spin_unlock(&files->file_lock);
- retval = filp_close(filp, files);
+
+ return filp_close(filp, files);
+
+out_unlock:
+ spin_unlock(&files->file_lock);
+ return -EBADF;
+}
+EXPORT_SYMBOL_GPL(do_close);
+
+/*
+ * Careful here! We test whether the file pointer is NULL before
+ * releasing the fd. This ensures that one clone task can't release
+ * an fd while another clone is opening it.
+ */
+SYSCALL_DEFINE1(close, unsigned int, fd)
+{
+ int retval;
+
+ retval = do_close(fd);
/* can't restart close syscall because file table entry was cleared */
if (unlikely(retval == -ERESTARTSYS ||
@@ -1089,10 +1101,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
retval = -EINTR;
return retval;
-
-out_unlock:
- spin_unlock(&files->file_lock);
- return -EBADF;
}
EXPORT_SYMBOL(sys_close);
Index: linux-2.6.git/include/linux/fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/fs.h
+++ linux-2.6.git/include/linux/fs.h
@@ -2027,6 +2027,7 @@ extern struct file *file_open_root(struc
extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
const struct cred *);
extern int filp_close(struct file *, fl_owner_t id);
+extern int do_close(unsigned int fd);
extern char * getname(const char __user *);
/* fs/ioctl.c */

45
kernel/fs-proc-add-tls Normal file
View File

@ -0,0 +1,45 @@
fs, proc: Add /proc/$pid/tls entry
To be able to restart checkpointed tasks we need
to know TLS status at dumping time. Export this
information by /proc/$pid/tls entry.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
---
fs/proc/base.c | 16 ++++++++++++++++
1 file changed, 16 insertions(+)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -3150,6 +3150,21 @@ static int proc_pid_personality(struct s
return err;
}
+/*
+ * Show handler for /proc/<pid>/tls: prints the two raw words (a, b) of
+ * each of the task's GDT_ENTRY_TLS_ENTRIES TLS descriptors in hex, one
+ * descriptor per line. Returns 0, or the lock_trace() error, in which case
+ * nothing is printed.
+ */
+static int proc_pid_tls(struct seq_file *m, struct pid_namespace *ns,
+			struct pid *pid, struct task_struct *task)
+{
+	int slot;
+	int err;
+
+	err = lock_trace(task);
+	if (err)
+		return err;
+
+	for (slot = 0; slot < GDT_ENTRY_TLS_ENTRIES; slot++) {
+		seq_printf(m, "%x %x\n",
+			   task->thread.tls_array[slot].a,
+			   task->thread.tls_array[slot].b);
+	}
+
+	unlock_trace(task);
+	return err;
+}
+
/*
* Thread groups
*/
@@ -3169,6 +3184,7 @@ static const struct pid_entry tgid_base_
INF("auxv", S_IRUSR, proc_pid_auxv),
ONE("status", S_IRUGO, proc_pid_status),
ONE("personality", S_IRUGO, proc_pid_personality),
+ ONE("tls", S_IRUGO, proc_pid_tls),
INF("limits", S_IRUGO, proc_pid_limits),
#ifdef CONFIG_SCHED_DEBUG
REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations),

View File

@ -0,0 +1,108 @@
fs, proc: Make proc_get_link to use dentry instead of inode
This patch prepares the ground for the next "map_files"
patch which needs a name of a link file to analyse.
So instead of squashing this change into one big
patch the separate one is done.
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Pavel Emelyanov <xemul@parallels.com>
CC: Tejun Heo <tj@kernel.org>
CC: Vasiliy Kulikov <segoon@openwall.com>
CC: "Kirill A. Shutemov" <kirill@shutemov.name>
CC: Alexey Dobriyan <adobriyan@gmail.com>
CC: Al Viro <viro@ZenIV.linux.org.uk>
CC: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 20 ++++++++++----------
include/linux/proc_fs.h | 2 +-
2 files changed, 11 insertions(+), 11 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -165,9 +165,9 @@ static int get_task_root(struct task_str
return result;
}
-static int proc_cwd_link(struct inode *inode, struct path *path)
+static int proc_cwd_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i
return result;
}
-static int proc_root_link(struct inode *inode, struct path *path)
+static int proc_root_link(struct dentry *dentry, struct path *path)
{
- struct task_struct *task = get_proc_task(inode);
+ struct task_struct *task = get_proc_task(dentry->d_inode);
int result = -ENOENT;
if (task) {
@@ -1580,13 +1580,13 @@ static const struct file_operations proc
.release = single_release,
};
-static int proc_exe_link(struct inode *inode, struct path *exe_path)
+static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct mm_struct *mm;
struct file *exe_file;
- task = get_proc_task(inode);
+ task = get_proc_task(dentry->d_inode);
if (!task)
return -ENOENT;
mm = get_task_mm(task);
@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &nd->path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path);
out:
return ERR_PTR(error);
}
@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent
if (!proc_fd_access_allowed(inode))
goto out;
- error = PROC_I(inode)->op.proc_get_link(inode, &path);
+ error = PROC_I(inode)->op.proc_get_link(dentry, &path);
if (error)
goto out;
@@ -1959,9 +1959,9 @@ out_task:
return rc;
}
-static int proc_fd_link(struct inode *inode, struct path *path)
+static int proc_fd_link(struct dentry *dentry, struct path *path)
{
- return proc_fd_info(inode, path, NULL);
+ return proc_fd_info(dentry->d_inode, path, NULL);
}
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u
extern const struct proc_ns_operations ipcns_operations;
union proc_op {
- int (*proc_get_link)(struct inode *, struct path *);
+ int (*proc_get_link)(struct dentry *, struct path *);
int (*proc_read)(struct task_struct *task, char *page);
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,

View File

@ -0,0 +1,28 @@
From: Vasiliy Kulikov <segooon@gmail.com>
In the patch "proc: fix races against execve() of /proc/PID/fd**"
proc_pid_fd_link_getattr() leaked task_struct if ptrace check fails.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Reported-by: Cyrill Gorcunov <gorcunov@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix fs/proc/base.c
--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix
+++ a/fs/proc/base.c
@@ -1681,9 +1681,9 @@ static int proc_pid_fd_link_getattr(stru
generic_fillattr(inode, stat);
unlock_trace(task);
- put_task_struct(task);
rc = 0;
out_task:
+ put_task_struct(task);
return rc;
}
_

View File

@ -0,0 +1,255 @@
From: Vasiliy Kulikov <segoon@openwall.com>
fd* files are restricted to the task's owner, and other users may not get
direct access to them. But one may open any of these files and run any
setuid program, keeping opened file descriptors. As there are permission
checks on open(), but not on readdir() and read(), operations on the kept
file descriptors will not be checked. It makes it possible to violate
procfs permission model.
Reading fdinfo/* may disclose the current fds' positions and flags; reading
the directory contents of fdinfo/ and fd/ may disclose the number of files
opened by the target task. This information is not sensitive per se, but it
can reveal some private information (like the length of a password stored in a
file) under certain conditions.
Used existing (un)lock_trace functions to check for ptrace_may_access(),
but instead of using EPERM return code from it use EACCES to be consistent
with existing proc_pid_follow_link()/proc_pid_readlink() return code. If
they differ, attacker can guess what fds exist by analyzing stat() return
code. Patched handlers: stat() for fd/*, stat() and read() for fdinfo/*,
readdir() and lookup() for fd/ and fdinfo/.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 146 +++++++++++++++++++++++++++++++++--------------
1 file changed, 103 insertions(+), 43 deletions(-)
diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd fs/proc/base.c
--- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd
+++ a/fs/proc/base.c
@@ -1652,12 +1652,46 @@ out:
return error;
}
+static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
+ int rc;
+
+ if (task == NULL)
+ return -ESRCH;
+
+ rc = -EACCES;
+ if (lock_trace(task))
+ goto out_task;
+
+ generic_fillattr(inode, stat);
+ unlock_trace(task);
+ put_task_struct(task);
+ rc = 0;
+out_task:
+ return rc;
+}
+
static const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
};
+static const struct inode_operations proc_fdinfo_link_inode_operations = {
+ .setattr = proc_setattr,
+ .getattr = proc_pid_fd_link_getattr,
+};
+
+static const struct inode_operations proc_fd_link_inode_operations = {
+ .readlink = proc_pid_readlink,
+ .follow_link = proc_pid_follow_link,
+ .setattr = proc_setattr,
+ .getattr = proc_pid_fd_link_getattr,
+};
+
/* building an inode */
@@ -1889,49 +1923,61 @@ out:
static int proc_fd_info(struct inode *inode, struct path *path, char *info)
{
- struct task_struct *task = get_proc_task(inode);
- struct files_struct *files = NULL;
+ struct task_struct *task;
+ struct files_struct *files;
struct file *file;
int fd = proc_fd(inode);
+ int rc;
- if (task) {
- files = get_files_struct(task);
- put_task_struct(task);
- }
- if (files) {
- /*
- * We are not taking a ref to the file structure, so we must
- * hold ->file_lock.
- */
- spin_lock(&files->file_lock);
- file = fcheck_files(files, fd);
- if (file) {
- unsigned int f_flags;
- struct fdtable *fdt;
-
- fdt = files_fdtable(files);
- f_flags = file->f_flags & ~O_CLOEXEC;
- if (FD_ISSET(fd, fdt->close_on_exec))
- f_flags |= O_CLOEXEC;
-
- if (path) {
- *path = file->f_path;
- path_get(&file->f_path);
- }
- if (info)
- snprintf(info, PROC_FDINFO_MAX,
- "pos:\t%lli\n"
- "flags:\t0%o\n",
- (long long) file->f_pos,
- f_flags);
- spin_unlock(&files->file_lock);
- put_files_struct(files);
- return 0;
+ task = get_proc_task(inode);
+ if (!task)
+ return -ENOENT;
+
+ rc = -EACCES;
+ if (lock_trace(task))
+ goto out_task;
+
+ rc = -ENOENT;
+ files = get_files_struct(task);
+ if (files == NULL)
+ goto out_unlock;
+
+ /*
+ * We are not taking a ref to the file structure, so we must
+ * hold ->file_lock.
+ */
+ spin_lock(&files->file_lock);
+ file = fcheck_files(files, fd);
+ if (file) {
+ unsigned int f_flags;
+ struct fdtable *fdt;
+
+ fdt = files_fdtable(files);
+ f_flags = file->f_flags & ~O_CLOEXEC;
+ if (FD_ISSET(fd, fdt->close_on_exec))
+ f_flags |= O_CLOEXEC;
+
+ if (path) {
+ *path = file->f_path;
+ path_get(&file->f_path);
}
- spin_unlock(&files->file_lock);
- put_files_struct(files);
- }
- return -ENOENT;
+ if (info)
+ snprintf(info, PROC_FDINFO_MAX,
+ "pos:\t%lli\n"
+ "flags:\t0%o\n",
+ (long long) file->f_pos,
+ f_flags);
+ rc = 0;
+ } else
+ rc = -ENOENT;
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+out_unlock:
+ unlock_trace(task);
+out_task:
+ put_task_struct(task);
+ return rc;
}
static int proc_fd_link(struct inode *inode, struct path *path)
@@ -2026,7 +2072,7 @@ static struct dentry *proc_fd_instantiat
spin_unlock(&files->file_lock);
put_files_struct(files);
- inode->i_op = &proc_pid_link_inode_operations;
+ inode->i_op = &proc_fd_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2058,7 +2104,12 @@ static struct dentry *proc_lookupfd_comm
if (fd == ~0U)
goto out;
+ result = ERR_PTR(-EACCES);
+ if (lock_trace(task))
+ goto out;
+
result = instantiate(dir, dentry, task, &fd);
+ unlock_trace(task);
out:
put_task_struct(task);
out_no_task:
@@ -2078,23 +2129,28 @@ static int proc_readfd_common(struct fil
retval = -ENOENT;
if (!p)
goto out_no_task;
+
+ retval = -EACCES;
+ if (lock_trace(p))
+ goto out;
+
retval = 0;
fd = filp->f_pos;
switch (fd) {
case 0:
if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0)
- goto out;
+ goto out_unlock;
filp->f_pos++;
case 1:
ino = parent_ino(dentry);
if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
- goto out;
+ goto out_unlock;
filp->f_pos++;
default:
files = get_files_struct(p);
if (!files)
- goto out;
+ goto out_unlock;
rcu_read_lock();
for (fd = filp->f_pos-2;
fd < files_fdtable(files)->max_fds;
@@ -2118,6 +2174,9 @@ static int proc_readfd_common(struct fil
rcu_read_unlock();
put_files_struct(files);
}
+
+out_unlock:
+ unlock_trace(p);
out:
put_task_struct(p);
out_no_task:
@@ -2195,6 +2254,7 @@ static struct dentry *proc_fdinfo_instan
ei->fd = fd;
inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
+ inode->i_op = &proc_fdinfo_link_inode_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */
_

View File

@ -0,0 +1,118 @@
From: Vasiliy Kulikov <segoon@openwall.com>
The patch "proc: fix races against execve() of /proc/PID/fd**" is still a
partial fix for a setxid problem. link(2) is yet another way to
identify whether a specific fd is opened by a privileged process. By
calling link(2) against /proc/PID/fd/* an attacker may identify whether
the fd number is valid for PID by analysing link(2) return code.
Both getattr() and link() can be used by the attacker iff the dentry is
present in the dcache. In this case ->lookup() is not called and the only
way to check ptrace permissions is either operation handler or
->revalidate(). The easiest solution to prevent any unauthorized access
to /proc/PID/fd*/ files is to force the dentry drop on each unauthorized
access attempt.
If an attacker keeps opened fd of /proc/PID/fd/ and dcache contains a
specific dentry for some /proc/PID/fd/XXX, any future attempt to use the
dentry by the attacker would lead to the dentry drop as a result of a
failed ptrace check in ->revalidate(). Then the attacker cannot spawn a
dentry for the specific fd number because of ptrace check in ->lookup().
The dentry drop can be still observed by an attacker by analysing
information from /proc/slabinfo, which is addressed in the successive
patch.
Signed-off-by: Vasiliy Kulikov <segoon@openwall.com>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/base.c | 42 ++++++------------------------------------
1 file changed, 6 insertions(+), 36 deletions(-)
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -1665,46 +1665,12 @@ out:
return error;
}
-static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry,
- struct kstat *stat)
-{
- struct inode *inode = dentry->d_inode;
- struct task_struct *task = get_proc_task(inode);
- int rc;
-
- if (task == NULL)
- return -ESRCH;
-
- rc = -EACCES;
- if (lock_trace(task))
- goto out_task;
-
- generic_fillattr(inode, stat);
- unlock_trace(task);
- rc = 0;
-out_task:
- put_task_struct(task);
- return rc;
-}
-
static const struct inode_operations proc_pid_link_inode_operations = {
.readlink = proc_pid_readlink,
.follow_link = proc_pid_follow_link,
.setattr = proc_setattr,
};
-static const struct inode_operations proc_fdinfo_link_inode_operations = {
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
-static const struct inode_operations proc_fd_link_inode_operations = {
- .readlink = proc_pid_readlink,
- .follow_link = proc_pid_follow_link,
- .setattr = proc_setattr,
- .getattr = proc_pid_fd_link_getattr,
-};
-
/* building an inode */
@@ -2013,6 +1979,11 @@ static int tid_fd_revalidate(struct dent
task = get_proc_task(inode);
fd = proc_fd(inode);
+ if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
if (task) {
files = get_files_struct(task);
if (files) {
@@ -2085,7 +2056,7 @@ static struct dentry *proc_fd_instantiat
spin_unlock(&files->file_lock);
put_files_struct(files);
- inode->i_op = &proc_fd_link_inode_operations;
+ inode->i_op = &proc_pid_link_inode_operations;
inode->i_size = 64;
ei->op.proc_get_link = proc_fd_link;
d_set_d_op(dentry, &tid_fd_dentry_operations);
@@ -2267,7 +2238,6 @@ static struct dentry *proc_fdinfo_instan
ei->fd = fd;
inode->i_mode = S_IFREG | S_IRUSR;
inode->i_fop = &proc_fdinfo_file_operations;
- inode->i_op = &proc_fdinfo_link_inode_operations;
d_set_d_op(dentry, &tid_fd_dentry_operations);
d_add(dentry, inode);
/* Close the race of the process dying before we return the dentry */

View File

@ -0,0 +1,26 @@
From: Pavel Emelyanov <xemul@openvz.org>
On reading sysctl dirs we should return -EISDIR instead of -EINVAL.
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
fs/proc/proc_sysctl.c | 1 +
1 file changed, 1 insertion(+)
diff -puN fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc fs/proc/proc_sysctl.c
--- a/fs/proc/proc_sysctl.c~procfs-report-eisdir-when-reading-sysctl-dirs-in-proc
+++ a/fs/proc/proc_sysctl.c
@@ -360,6 +360,7 @@ static const struct file_operations proc
};
static const struct file_operations proc_sys_dir_file_operations = {
+ .read = generic_read_dir,
.readdir = proc_sys_readdir,
.llseek = generic_file_llseek,
};
_

5
kernel/readme Normal file
View File

@ -0,0 +1,5 @@
The kernel patch series. See the "series" file for the order in which
the patches must be applied. Not all patches address C/R directly,
but some of them are needed due to dependencies.
Tested on Linux 3.1-rc3.

12
kernel/series Normal file
View File

@ -0,0 +1,12 @@
cr-proc-add-children
procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch
proc-fix-races-against-execve-of-proc-pid-fd.patch
proc-fix-races-against-execve-of-proc-pid-fd-fix.patch
proc-force-dcache-drop-on-unauthorized-access.patch
cr-statfs-callback-for-pipefs
cr-clone-with-pid-support
fs-proc-switch-to-dentry
cr-proc-map-files-21
fs-proc-add-tls
fs-add-do-close
binfmt-elf-for-cr-4

19
parasite-elf.lds.S Normal file
View File

@ -0,0 +1,19 @@
/*
 * Linker script for the parasite blob: an x86-64 ELF image linked at
 * address 0 with one .text and one .data output section.
 */
OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
OUTPUT_ARCH(i386:x86-64)
SECTIONS
{
	/* Start laying sections out from address 0 */
	. = 0;
	.text : {
		/* .parasite.head.text is placed ahead of the ordinary .text input */
		*(.parasite.head.text)
		*(.text)
		. = ALIGN(8);
	}
	.data : {
		/* .data, .rodata, .bss and .parasite.stack all land in this one output section */
		*(.data)
		*(.rodata)
		*(.bss)
		*(.parasite.stack)
		. = ALIGN(8);
	}
}

514
parasite-syscall.c Normal file
View File

@ -0,0 +1,514 @@
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/user.h>
#include <sys/wait.h>
#include "compiler.h"
#include "syscall.h"
#include "types.h"
#include "util.h"
#include "parasite-syscall.h"
#include "parasite-blob.h"
#include "parasite.h"
#ifdef CONFIG_X86_64
/*
 * Stub poked into the victim by syscall_seized(): 0x0f 0x05 is the
 * x86-64 `syscall` instruction, the 0xcc bytes that follow are int3
 * breakpoints which stop the task right after the syscall returns.
 */
static const char code_syscall[] = {0x0f, 0x05, 0xcc, 0xcc,
0xcc, 0xcc, 0xcc, 0xcc};
/*
 * Sizes rounded up to a multiple of sizeof(long)
 * (presumably because the ptrace peek/poke helpers work in whole
 * words -- TODO confirm against ptrace_peek_area/ptrace_poke_area).
 */
#define code_syscall_size (round_up(sizeof(code_syscall), sizeof(long)))
#define parasite_size (round_up(sizeof(parasite_blob), sizeof(long)))
/*
 * Check whether the syscall stub can be run from the very first
 * address of @vma_area, i.e. can_run_syscall() with ip == area start.
 */
static int syscall_fits_vma_area(struct vma_area *vma_area)
{
	unsigned long area_start = (unsigned long)vma_area->vma.start;
	unsigned long area_end = (unsigned long)vma_area->vma.end;

	return can_run_syscall(area_start, area_start, area_end);
}
/*
 * Return non-zero when @ip lies inside [@start, @end) with at least
 * code_syscall_size bytes left before @end, zero otherwise.
 */
int can_run_syscall(unsigned long ip, unsigned long start, unsigned long end)
{
	if (ip < start)
		return 0;

	return ip < (end - code_syscall_size);
}
/*
 * Run mmap(2) inside the seized task @pid.
 *
 * @regs supplies the register template; regs->ip is where
 * syscall_seized() places the syscall stub. The remaining arguments
 * are the usual mmap() ones.
 *
 * Returns the address reported by the kernel, or NULL when either the
 * injection failed or the kernel returned a negative error code.
 */
void *mmap_seized(pid_t pid, user_regs_struct_t *regs,
		  void *addr, size_t length, int prot,
		  int flags, int fd, off_t offset)
{
	user_regs_struct_t call = *regs;
	void *mapped;

	/* x86-64 syscall ABI: number in ax, arguments in di, si, dx, r10, r8, r9 */
	call.ax  = (unsigned long)__NR_mmap;
	call.di  = (unsigned long)addr;
	call.si  = (unsigned long)length;
	call.dx  = (unsigned long)prot;
	call.r10 = (unsigned long)flags;
	call.r8  = (unsigned long)fd;
	call.r9  = (unsigned long)offset;

	if (syscall_seized(pid, regs, &call, &call))
		return NULL;

	mapped = (void *)call.ax;

	/* A negative value is an error code from the kernel space */
	return ((long)mapped < 0) ? NULL : mapped;
}
/*
 * Run munmap(2) inside the seized task @pid for [@addr, @addr + @length).
 *
 * Returns the non-zero result of syscall_seized() when the injection
 * itself failed, otherwise the syscall's own return value truncated
 * to int.
 */
int munmap_seized(pid_t pid, user_regs_struct_t *regs,
		  void *addr, size_t length)
{
	user_regs_struct_t call = *regs;
	int rc;

	call.ax = (unsigned long)__NR_munmap;	/* munmap */
	call.di = (unsigned long)addr;		/* @addr */
	call.si = (unsigned long)length;	/* @length */

	rc = syscall_seized(pid, regs, &call, &call);
	if (rc)
		return rc;

	return (int)call.ax;
}
/*
 * Make the seized task @pid call exit(-1) from the location described
 * by @where. Returns whatever syscall_seized() returns.
 */
int kill_seized(pid_t pid, user_regs_struct_t *where)
{
	user_regs_struct_t call = *where;

	call.ax = (unsigned long)__NR_exit;	/* exit */
	call.di = (unsigned long)-1;		/* @error-code */

	return syscall_seized(pid, where, &call, &call);
}
/*
 * Execute one syscall inside the seized task @pid.
 *
 * @where:  where->ip is the address in the victim at which the
 *          code_syscall stub is temporarily written.
 * @params: syscall number (ax) and arguments (di, si, dx, r10, r8, r9).
 * @result: receives the victim's registers after the syscall; may
 *          alias @params (callers in this file pass the same pointer).
 *
 * The bytes overwritten at where->ip and the victim's registers are
 * put back before returning, on success and on failure alike.
 *
 * Returns 0 only when the whole sequence completed and *result was
 * written, -1 otherwise. (This assumes jerr() only jumps to the label
 * and leaves @ret alone -- its definition is not visible here.)
 */
int syscall_seized(pid_t pid,
		   user_regs_struct_t *where,
		   user_regs_struct_t *params,
		   user_regs_struct_t *result)
{
	user_regs_struct_t regs_orig, regs;
	unsigned long start_ip;
	char saved[sizeof(code_syscall)];
	siginfo_t siginfo;
	int status;
	int ret = -1;

	BUILD_BUG_ON(sizeof(code_syscall) != BUILTIN_SYSCALL_SIZE);
	BUILD_BUG_ON(!is_log2(sizeof(code_syscall)));

	start_ip = (unsigned long)where->ip;

	/* Save the victim's code and replace it with the syscall stub */
	jerr(ptrace_peek_area(pid, (void *)saved, (void *)start_ip, code_syscall_size), err);
	jerr(ptrace_poke_area(pid, (void *)code_syscall, (void *)start_ip, code_syscall_size), err);

again:
	/*
	 * The stub is already in place here, so a failure must go
	 * through err_restore to put the original bytes back.
	 */
	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore);
	regs_orig = regs;

	regs.ip  = start_ip;
	regs.ax  = params->ax;
	regs.di  = params->di;
	regs.si  = params->si;
	regs.dx  = params->dx;
	regs.r10 = params->r10;
	regs.r8  = params->r8;
	regs.r9  = params->r9;
	regs.orig_ax = -1; /* avoid end-of-syscall processing */

	jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs), err_restore);

	/*
	 * Most ideas are taken from Tejun Heo's parasite thread
	 * https://code.google.com/p/ptrace-parasite/
	 */

	/*
	 * Run the parasite code, at the completion it'll trigger
	 * int3 and inform us that all is done.
	 */
	jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full);
	jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
	jerr(!WIFSTOPPED(status), err_restore_full);
	jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo),err_restore_full);
	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full);

	if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) {
		/*
		 * The stop was not caused by our int3: roll the registers
		 * back, let the task take the pending signal and start over.
		 */
retry_signal:
		/* pr_debug("** delivering signal %d si_code=%d\n",
		   siginfo.si_signo, siginfo.si_code); */
		/* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */
		jerr(ptrace(PTRACE_SETREGS, pid, NULL, (void *)&regs_orig), err_restore_full);
		jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full);
		jerr(ptrace(PTRACE_CONT, pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore_full);
		jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
		jerr(!WIFSTOPPED(status), err_restore_full);
		jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full);
		if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP)
			goto retry_signal;
		goto again;
	}

	/*
	 * Our code is done. Note that @ret stays -1 until the very end:
	 * any failure below must not be reported as success, since
	 * *result has not been written yet.
	 */
	jerr(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), err_restore_full);
	jerr(ptrace(PTRACE_CONT, pid, NULL, NULL), err_restore_full);
	jerr(wait4(pid, &status, __WALL, NULL) != pid, err_restore_full);
	jerr(!WIFSTOPPED(status), err_restore_full);
	jerr(ptrace(PTRACE_GETSIGINFO, pid, NULL, &siginfo), err_restore_full);
	jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore_full);
	jerr(ptrace(PTRACE_GETREGS, pid, NULL, &regs), err_restore_full);

	ret = 0;
	*result = regs;

err_restore_full:
	if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig))
		pr_panic("Can't restore registers (pid: %d)\n", pid);
err_restore:
	if (ptrace_poke_area(pid, (void *)saved, (void *)start_ip, code_syscall_size))
		pr_panic("Crap... Can't restore data (pid: %d)\n", pid);
err:
	return ret;
}
/*
 * Walk @vma_area_list and return the first area that contains @ip,
 * is mapped PROT_EXEC and passes syscall_fits_vma_area().
 * Returns NULL when no area qualifies.
 */
static struct vma_area *get_vma_by_ip(struct list_head *vma_area_list, unsigned long ip)
{
	struct vma_area *pos;

	list_for_each_entry(pos, vma_area_list, list) {
		if (!in_vma_area(pos, ip))
			continue;
		if (!(pos->vma.prot & PROT_EXEC))
			continue;
		if (syscall_fits_vma_area(pos))
			return pos;
	}

	return NULL;
}
/*
 * Drive the injected parasite to dump the pages of every eligible VMA.
 *
 * @ctl:           parasite control block (pid, entry point, command and
 *                 argument addresses inside the victim).
 * @vma_area_list: VMAs of the victim; areas without VMA_AREA_REGULAR and
 *                 areas with VMA_FILE_SHARED are skipped.
 * @cr_fdset:      image descriptors; desc[fd_type].name is the path the
 *                 parasite opens inside the victim.
 * @fd_type:       index of the pages image in @cr_fdset.
 *
 * For each VMA the arguments are poked into the victim, the victim is
 * resumed at ctl->parasite_ip and we wait for it to stop. Afterwards the
 * descriptor the parasite opened is closed with an injected close(2)
 * and a terminating zero page entry is appended to the image.
 *
 * Returns 0 on success; -1, or the non-zero result of the injected
 * close(), on failure. The victim's original registers are restored on
 * every path that passed the initial PTRACE_GETREGS.
 *
 * NOTE(review): the path comes from desc[fd_type] while the final
 * lseek/write use desc[CR_FD_PAGES]; these agree only if callers pass
 * CR_FD_PAGES -- confirm against the callers.
 */
int parasite_dump_pages_seized(struct parasite_ctl *ctl, struct list_head *vma_area_list,
			       struct cr_fdset *cr_fdset, int fd_type)
{
	parasite_args_cmd_dumppages_t parasite_dumppages = { };
	parasite_args_t parasite_arg = { };
	user_regs_struct_t regs, regs_orig;
	unsigned long nrpages_dumped = 0;
	struct vma_area *vma_area;
	siginfo_t siginfo;
	int status, path_len, ret = -1;
	int close_ret = 0;

	pr_info("\n");
	pr_info("Dumping pages (type: %d pid: %d)\n", fd_type, ctl->pid);
	pr_info("----------------------------------------\n");

	/* The path, including its NUL terminator, must fit into open_path */
	path_len = strlen(cr_fdset->desc[fd_type].name) + 1;
	if (path_len > sizeof(parasite_dumppages.open_path)) {
		pr_panic("Dumping pages path is too long (%d while %zu allowed)\n",
			 path_len, sizeof(parasite_dumppages.open_path));
		goto err;
	}

	jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs_orig), err);

	parasite_arg.command	= PARASITE_CMD_DUMPPAGES;
	parasite_arg.args_size	= sizeof(parasite_dumppages);
	parasite_arg.args	= &parasite_dumppages;

	/* Termination is guaranteed by the length check above */
	strncpy(parasite_dumppages.open_path, cr_fdset->desc[fd_type].name,
		sizeof(parasite_dumppages.open_path));

	parasite_dumppages.open_flags	= O_WRONLY;
	parasite_dumppages.open_mode	= CR_FD_PERM;
	parasite_dumppages.fd		= -1UL;

	/*
	 * Pass the command first, it's immutable.
	 */
	jerr(ptrace_poke_area((long)ctl->pid, (void *)&parasite_arg.command,
			      (void *)ctl->addr_cmd, sizeof(parasite_arg.command)),
	     err_restore);

	list_for_each_entry(vma_area, vma_area_list, list) {

		/*
		 * The special areas are not dumped.
		 */
		if (!(vma_area->vma.status & VMA_AREA_REGULAR))
			continue;

		/* No dumps for file-shared mappings */
		if (vma_area->vma.status & VMA_FILE_SHARED)
			continue;

		pr_info_vma(vma_area);

again:
		/* Point the victim at the parasite entry */
		jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err_restore);
		regs.ip = ctl->parasite_ip;
		jerr(ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs), err_restore);

		parasite_dumppages.vma_entry = vma_area->vma;

		if (ptrace_poke_area((long)ctl->pid, (void *)parasite_arg.args,
				     (void *)ctl->addr_args, parasite_arg.args_size)) {
			pr_error("Can't setup parasite arguments (pid: %d)\n", ctl->pid);
			goto err_restore;
		}

		jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore);
		jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
		jerr(!WIFSTOPPED(status), err_restore);
		jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);

		if (WSTOPSIG(status) != SIGTRAP || siginfo.si_code != SI_KERNEL) {
			/*
			 * Not our int3: roll registers back, let the task
			 * take the signal and run this VMA again.
			 */
retry_signal:
			/* pr_debug("** delivering signal %d si_code=%d\n",
			   siginfo.si_signo, siginfo.si_code); */
			/* FIXME: jerr(siginfo.si_code > 0, err_restore_full); */
			jerr(ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, (void *)&regs_orig), err_restore);
			jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore);
			jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, (void *)(unsigned long)siginfo.si_signo), err_restore);
			jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
			jerr(!WIFSTOPPED(status), err_restore);
			jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);
			if (siginfo.si_code >> 8 != PTRACE_EVENT_STOP)
				goto retry_signal;
			goto again;
		}

		/*
		 * It's a bit tricky, the file get opened inside
		 * parasite but close via explicit syscall. Better would
		 * be to add some 'status' and close inside parasite on
		 * last call.
		 */
		if (parasite_dumppages.fd == -1UL) {
			if (ptrace_peek_area((long)ctl->pid,
					     (void *)&parasite_dumppages.fd,
					     (void *)(ctl->addr_args +
						      offsetof(parasite_args_cmd_dumppages_t, fd)),
					     sizeof(parasite_dumppages.fd))) {
				pr_error("Can't get file descriptor back (pid: %d)\n", ctl->pid);
				goto err_restore;
			}
		}

		/*
		 * Get some statistics. The size is that of the field
		 * actually being read back.
		 */
		if (ptrace_peek_area((long)ctl->pid,
				     (void *)&parasite_dumppages.nrpages_dumped,
				     (void *)(ctl->addr_args +
					      offsetof(parasite_args_cmd_dumppages_t, nrpages_dumped)),
				     sizeof(parasite_dumppages.nrpages_dumped))) {
			pr_error("Can't get statistics (pid: %d)\n", ctl->pid);
			goto err_restore;
		}

		pr_info(" (dumped: %16li pages)\n", parasite_dumppages.nrpages_dumped);
		nrpages_dumped += parasite_dumppages.nrpages_dumped;
	}

	/*
	 * Our code is done.
	 */
	jerr(ptrace(PTRACE_INTERRUPT, (long)ctl->pid, NULL, NULL), err_restore);
	jerr(ptrace(PTRACE_CONT, (long)ctl->pid, NULL, NULL), err_restore);
	jerr(wait4((long)ctl->pid, &status, __WALL, NULL) != (long)ctl->pid, err_restore);
	jerr(!WIFSTOPPED(status), err_restore);
	jerr(ptrace(PTRACE_GETSIGINFO, (long)ctl->pid, NULL, &siginfo), err_restore);
	jerr((siginfo.si_code >> 8 != PTRACE_EVENT_STOP), err_restore);
	jerr(ptrace(PTRACE_GETREGS, (long)ctl->pid, NULL, &regs), err_restore);

	/* Finally close the descriptor the parasite has opened */
	if (parasite_dumppages.fd != -1UL) {
		regs = regs_orig;
		regs.ax = __NR_close;			/* close */
		regs.di = parasite_dumppages.fd;	/* @fd */
		close_ret = syscall_seized(ctl->pid, &regs_orig, &regs, &regs);
	}

	/*
	 * We don't know the position in file since it's updated
	 * outside of our process.
	 */
	if (lseek(cr_fdset->desc[CR_FD_PAGES].fd, 0, SEEK_END) < 0) {
		pr_error("Can't seek to the end of pages image (pid: %d)\n", ctl->pid);
		goto err_restore;
	}

	/*
	 * Ending page. @ret is still -1 at this point, so a failed write
	 * is reported as an error (assuming write_ptr_safe() only jumps
	 * to the label -- its definition is not visible here).
	 */
	write_ptr_safe(cr_fdset->desc[CR_FD_PAGES].fd, &zero_page_entry, err_restore);

	pr_info("\n");
	pr_info("Summary: %16li pages dumped\n", nrpages_dumped);

	/* Everything went through; report the result of the injected close() */
	ret = close_ret;

err_restore:
	if (ptrace(PTRACE_SETREGS, (long)ctl->pid, NULL, &regs_orig))
		pr_panic("Can't restore registers (pid: %d)\n", ctl->pid);
err:
	pr_info("----------------------------------------\n");
	return ret;
}
/*
 * Remove the parasite from the victim: unmap the parasite area with an
 * injected munmap(2), restore the victim's registers and release the
 * control block (*p_ctl is set to NULL afterwards).
 *
 * Returns 0 when there is nothing to cure or everything succeeded;
 * non-zero otherwise. When the registers cannot be read or no suitable
 * executable VMA is found, the control block is left untouched.
 */
int parasite_cure_seized(struct parasite_ctl **p_ctl,
			 struct list_head *vma_area_list)
{
	user_regs_struct_t regs, regs_orig;
	struct parasite_ctl *ctl;
	struct vma_area *exec_vma;
	int ret = -1;

	if (!p_ctl)
		return 0;

	ctl = *p_ctl;
	if (!ctl)
		return 0;

	jerr(ptrace(PTRACE_GETREGS, ctl->pid, NULL, &regs), err);
	regs_orig = regs;

	/* An executable area is needed to host the syscall stub */
	exec_vma = get_vma_by_ip(vma_area_list, regs.ip);
	if (!exec_vma) {
		pr_error("No suitable VMA found to run cure (pid: %d)\n", ctl->pid);
		goto err;
	}

	regs.ip = exec_vma->vma.start;

	ret = munmap_seized(ctl->pid, &regs,
			    (void *)ctl->vma_area->vma.start,
			    (size_t)vma_entry_len(&ctl->vma_area->vma));
	if (ret)
		pr_error("munmap_seized failed (pid: %d)\n", ctl->pid);

	if (ptrace(PTRACE_SETREGS, ctl->pid, NULL, &regs_orig)) {
		ret = -1;
		pr_panic("PTRACE_SETREGS failed (pid: %d)\n", ctl->pid);
	}

	free(*p_ctl);
	*p_ctl = NULL;
err:
	return ret;
}
/*
 * Inject the parasite blob into the seized task @pid.
 *
 * @addr_hint:     address hint passed to the injected mmap(2).
 * @vma_area_list: VMAs of the victim, used to find an executable area
 *                 from which the bootstrap syscalls can be run.
 *
 * An anonymous RWX private mapping of parasite_size bytes is created in
 * the victim, the blob is copied there and the victim's registers are
 * restored.
 *
 * Returns a freshly allocated control block (released by
 * parasite_cure_seized()), or NULL on failure.
 */
struct parasite_ctl *parasite_infect_seized(pid_t pid, void *addr_hint, struct list_head *vma_area_list)
{
	user_regs_struct_t regs, regs_orig;
	struct parasite_ctl *ctl = NULL;
	struct vma_area *vma_area;
	void *mmaped;

	/* One allocation: the control block followed by its vma_area */
	ctl = xzalloc(sizeof(*ctl) + sizeof(*vma_area));
	if (!ctl) {
		pr_error("Parasite control block allocation failed (pid: %d)\n", pid);
		goto err;
	}

	/* Setup control block */
	ctl->pid = pid;
	/*
	 * The vma_area lives right behind the control block. Indexing
	 * with ctl[sizeof(*ctl)] would scale the offset by the structure
	 * size once more and point far outside the allocation.
	 */
	ctl->vma_area = (struct vma_area *)(ctl + 1);

	if (ptrace(PTRACE_GETREGS, pid, NULL, &regs))
		pr_error_jmp(err_free);

	vma_area = get_vma_by_ip(vma_area_list, regs.ip);
	if (!vma_area) {
		pr_error("No suitable VMA found to run parasite "
			 "bootstrap code (pid: %d)\n", pid);
		goto err_free;
	}

	regs_orig = regs;

	/*
	 * Prepare for in-process syscall.
	 */
	ctl->vma_area->vma.prot		= PROT_READ | PROT_WRITE | PROT_EXEC;
	ctl->vma_area->vma.flags	= MAP_PRIVATE | MAP_ANONYMOUS;

	regs.ip = vma_area->vma.start;

	mmaped = mmap_seized(pid, &regs, addr_hint, (size_t)parasite_size,
			     (int)ctl->vma_area->vma.prot,
			     (int)ctl->vma_area->vma.flags,
			     (int)-1, (off_t)0);

	if (!mmaped || (long)mmaped < 0) {
		pr_error("Can't allocate memory for parasite blob (pid: %d)\n", pid);
		goto err_restore_regs;
	}

	/* Addresses of the blob's entry points and data inside the victim */
	ctl->parasite_ip		= PARASITE_HEAD_ADDR((unsigned long)mmaped);
	ctl->parasite_complete_ip	= PARASITE_COMPLETE_ADDR((unsigned long)mmaped);
	ctl->addr_cmd			= PARASITE_CMD_ADDR((unsigned long)mmaped);
	ctl->addr_args			= PARASITE_ARGS_ADDR((unsigned long)mmaped);

	ctl->vma_area->vma.start	= (u64)mmaped;
	ctl->vma_area->vma.end		= (u64)(mmaped + parasite_size);

	if (ptrace_poke_area(pid, parasite_blob, mmaped, parasite_size)) {
		pr_error("Can't inject parasite blob (pid: %d)\n", pid);
		goto err_munmap_restore;
	}

	jerr(ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig), err_munmap_restore);

	return ctl;

err_munmap_restore:
	regs = regs_orig;
	regs.ip = vma_area->vma.start;
	if (munmap_seized(pid, &regs, mmaped, parasite_size))
		pr_panic("munmap_seized failed (pid: %d)\n", pid);
err_restore_regs:
	if (ptrace(PTRACE_SETREGS, pid, NULL, &regs_orig))
		pr_panic("PTRACE_SETREGS failed (pid: %d)\n", pid);
err_free:
	free(ctl);
err:
	return NULL;
}
#else /* CONFIG_X86_64 */
# error x86-32 is not yet implemented
#endif /* CONFIG_X86_64 */

339
parasite.c Normal file
View File

@ -0,0 +1,339 @@
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include "compiler.h"
#include "types.h"
#include "syscall.h"
#include "parasite.h"
#include "image.h"
#include "crtools.h"
#ifdef CONFIG_X86_64
/* Bump-allocator state used by brk_init()/brk_alloc()/brk_free(): arena bounds and current tail */
static void *brk_start, *brk_end, *brk_tail;
/* Scratch records restore_core() reads image entries into */
static struct page_entry page;
static struct vma_entry vma;
/*
 * Local memcpy() for the parasite code.
 *
 * Copies @n bytes from @src to @dest: first n >> 3 quadwords, then the
 * remaining n & 7 bytes. Returns @dest.
 */
void *memcpy(void *dest, const void *src, size_t n)
{
	long d0, d1, d2;
	asm volatile(
		/* rcx = n >> 3, rdi = dest, rsi = src (see the input constraints) */
		"rep ; movsq\n\t"
		/* reload rcx with n & 7 and copy the leftover bytes */
		"movq %4,%%rcx\n\t"
		"rep ; movsb\n\t"
		: "=&c" (d0), "=&D" (d1), "=&S" (d2)
		: "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
		: "memory");
	return dest;
}
/*
 * Reset the bump allocator: the arena starts at @brk, is
 * PARASITE_BRK_SIZE bytes long and has nothing allocated.
 */
static void brk_init(void *brk)
{
	brk_start = brk;
	brk_tail = brk;
	brk_end = brk + PARASITE_BRK_SIZE;
}
/*
 * Carve @bytes off the arena tail. Returns the start of the chunk, or
 * NULL when the request does not end strictly below the arena end.
 */
static void *brk_alloc(unsigned long bytes)
{
	void *chunk;

	if ((brk_tail + bytes) >= brk_end)
		return NULL;

	chunk = brk_tail;
	brk_tail = chunk + bytes;

	return chunk;
}
/*
 * Give the last @bytes allocated by brk_alloc() back to the arena.
 *
 * The request is ignored when it exceeds what is currently allocated,
 * so the tail can never move below the arena start. (The previous test,
 * brk_start >= brk_tail - bytes, was inverted: it rejected a valid
 * partial free and accepted one that underflows the arena.)
 */
static void brk_free(unsigned long bytes)
{
	unsigned long used = (unsigned long)(brk_tail - brk_start);

	if (bytes <= used)
		brk_tail -= bytes;
}
/*
 * Length of the NUL-terminated string @str, not counting the
 * terminator.
 */
static unsigned long builtin_strlen(char *str)
{
	char *end = str;

	while (*end != '\0')
		end++;

	return (unsigned long)(end - str);
}
/* Hex digit lookup table */
static const unsigned char hex[] = "0123456789abcdef";

/*
 * Format @v as 2 * sizeof(long) lowercase hex digits, emitting the
 * in-memory bytes of @v from the highest index down to index 0.
 *
 * Returns a pointer to a static buffer which is overwritten by the
 * next call.
 */
static char *long2hex(unsigned long v)
{
	static char buf[32];
	const unsigned char *bytes = (const unsigned char *)&v;
	unsigned int pos = 0;
	int idx = sizeof(long);

	while (idx-- > 0) {
		unsigned char b = bytes[idx];

		buf[pos++] = hex[b >> 4];
		buf[pos++] = hex[b & 0x0f];
	}
	buf[pos] = 0;

	return buf;
}
/*
 * Write the NUL-terminated string @msg (without the terminator) to
 * file descriptor 1 via sys_write().
 */
static void sys_write_msg(const char *msg)
{
	int len;

	for (len = 0; msg[len] != '\0'; len++)
		;

	sys_write(1, msg, len);
}
/*
 * Restore memory from the core image @corefile.
 *
 * After skipping the header, the image is read as a list of vma_entry
 * records ended by an all-zero start/end pair, followed by a list of
 * page entries (address + data) ended by a zero address. Every VMA is
 * mapped with MAP_FIXED at its recorded address, every page is copied
 * to its recorded address.
 *
 * End of file at a record boundary ends the current list as well. The
 * previous code let a zero-length read through and then processed the
 * stale static record from the prior iteration over and over.
 *
 * Returns 0 on success or one of the PARASITE_ERR_* codes.
 */
static int restore_core(char *corefile)
{
	int ret = PARASITE_ERR_FAIL;
	int fd_core;

	fd_core = (int)sys_open(corefile, O_RDONLY, 0600);
	if (fd_core < 0) {
		ret = PARASITE_ERR_OPEN;
		goto err_open;
	}

	/* Skip the header */
	sys_lseek(fd_core, GET_FILE_OFF_AFTER(struct core_entry), SEEK_SET);

	/* First VMA areas */
	while (1) {
		unsigned long addr;

		ret = sys_read(fd_core, &vma, sizeof(vma));
		if (ret == 0)
			break;	/* EOF: nothing more to map */
		if (ret != sizeof(vma)) {
			ret = PARASITE_ERR_CORE_VMA;
			goto err;
		}

		if (vma.start == 0 && vma.end == 0)
			break;

		/* Make sure it's mapped into proper place */
		addr = sys_mmap((void *)vma.start,
				vma.end - vma.start,
				vma.prot,
				vma.flags | MAP_FIXED,
				vma.fd,
				vma.pgoff);
		if (addr != vma.start) {
			ret = PARASITE_ERR_MMAP;
			goto err;
		}
	}

	/* Now pages */
	while (1) {
		ret = sys_read(fd_core, &page.va, sizeof(page.va));
		if (ret == 0)
			break;	/* EOF: no more pages */
		if (ret != sizeof(page.va)) {
			ret = PARASITE_ERR_CORE_PAGE;
			goto err;
		}

		if (page.va == 0)
			break;

		/* An address without its page data is a truncated image */
		ret = sys_read(fd_core, page.data, sizeof(page.data));
		if (ret != sizeof(page.data)) {
			ret = PARASITE_ERR_CORE_PAGE;
			goto err;
		}

		memcpy((void *)page.va, page.data, sizeof(page.data));
	}

	ret = 0;
err:
	sys_close(fd_core);
err_open:
	return ret;
}
/*
 * Dump the pages of one VMA into the pages image.
 *
 * @args->vma_entry describes the area. @args->fd is the image
 * descriptor, or -1UL on the first call, in which case the file
 * @args->open_path is opened and the descriptor is stored back into
 * @args->fd for later calls. @args->nrpages_dumped receives the number
 * of pages written.
 *
 * Pages reported resident by mincore() are written as (address, page
 * data) pairs; with VMA_DUMP_ALL set every page is written. Areas
 * without PROT_READ are temporarily made readable.
 *
 * Returns 0 on success or one of the PARASITE_ERR_* codes.
 */
static int dump_pages(parasite_args_cmd_dumppages_t *args)
{
	int ret = PARASITE_ERR_FAIL;
	unsigned long nrpages, pfn, length;
	unsigned long prot_old, prot_new;
	unsigned char *map_brk = NULL;
	unsigned char *map;
	bool dump_all = false;

	args->nrpages_dumped = 0;
	prot_old = prot_new = 0;

	if (args->fd == -1UL) {
		/*
		 * Check the result in a signed local: args->fd is compared
		 * with -1UL above, so testing it for "< 0" directly may
		 * never fire. On failure args->fd keeps its -1UL marker
		 * instead of holding an error code.
		 */
		long fd = sys_open(args->open_path, args->open_flags, args->open_mode);

		if (fd < 0) {
			sys_write_msg("sys_open failed\n");
			ret = PARASITE_ERR_OPEN;
			goto err;
		}
		args->fd = fd;
	}

	/* Start from the end of file */
	sys_lseek(args->fd, 0, SEEK_END);

	length	= args->vma_entry.end - args->vma_entry.start;
	nrpages	= length / PAGE_SIZE;

	/*
	 * brk should allow us to handle up to 128M of memory,
	 * otherwise call for mmap.
	 *
	 * The map holds one byte per page for mincore().
	 */
	map = brk_alloc(nrpages);
	if (map) {
		map_brk = map;
	} else {
		map = (void *)sys_mmap(NULL, nrpages,
				       PROT_READ | PROT_WRITE,
				       MAP_PRIVATE | MAP_ANONYMOUS,
				       -1, 0);
		if ((long)map < 0) {
			sys_write_msg("sys_mmap failed\n");
			ret = PARASITE_ERR_MMAP;
			goto err;
		}
	}

	dump_all = !!(args->vma_entry.status & VMA_DUMP_ALL);

	/*
	 * Try to change page protection if needed so we would
	 * be able to dump contents.
	 */
	if (!(args->vma_entry.prot & PROT_READ)) {
		prot_old = (unsigned long)args->vma_entry.prot;
		prot_new = prot_old | PROT_READ;
		if (sys_mprotect((unsigned long)args->vma_entry.start,
				 (unsigned long)vma_entry_len(&args->vma_entry),
				 prot_new)) {
			sys_write_msg("sys_mprotect failed\n");
			ret = PARASITE_ERR_MPROTECT;
			/* Nothing was changed, so there is nothing to restore */
			goto err_free;
		}
	}

	/*
	 * Dumping the whole VMA range is not a common operation
	 * so stick for mincore as a basis.
	 */
	if (sys_mincore((unsigned long)args->vma_entry.start, length, map)) {
		sys_write_msg("sys_mincore failed\n");
		ret = PARASITE_ERR_MINCORE;
		goto err_prot;
	}

	ret = 0;
	for (pfn = 0; pfn < nrpages; pfn++) {
		unsigned long vaddr, written;

		if ((map[pfn] & PAGE_RSS) || dump_all) {
			/*
			 * That's the optimized write of
			 * page_entry structure, see image.h
			 */
			vaddr = (unsigned long)args->vma_entry.start + pfn * PAGE_SIZE;
			written = 0;

			written += sys_write(args->fd, &vaddr, sizeof(vaddr));
			written += sys_write(args->fd, (void *)vaddr, PAGE_SIZE);
			if (written != sizeof(vaddr) + PAGE_SIZE) {
				ret = PARASITE_ERR_WRITE;
				sys_write_msg("sys_write on page failed\n");
				goto err_prot;
			}

			args->nrpages_dumped++;
		}
	}

err_prot:
	/*
	 * Don't leave pages readable if they were not. This runs on the
	 * error paths too, so a failed mincore() or write does not leave
	 * the area with the extra PROT_READ.
	 */
	if (prot_old != prot_new) {
		if (sys_mprotect((unsigned long)args->vma_entry.start,
				 (unsigned long)vma_entry_len(&args->vma_entry),
				 prot_old)) {
			sys_write_msg("PANIC: Ouch! sys_mprotect failed on resore\n");
			ret = PARASITE_ERR_MPROTECT;
		}
	}

err_free:
	if (map_brk)
		brk_free(nrpages);
	else
		sys_munmap(map, nrpages);
err:
	return ret;
}
/*
 * Command dispatcher of the parasite: sets up the brk area and runs
 * the handler for @cmd.  Dump/restore commands return the handler's
 * result, every other command (including an unknown one) returns 0.
 */
static int __used parasite_service(unsigned long cmd, void *args, void *brk)
{
	int result = 0;

	brk_init(brk);

	switch (cmd) {
	case PARASITE_CMD_DUMPPAGES:
		result = dump_pages((parasite_args_cmd_dumppages_t *)args);
		break;
	case PARASITE_CMD_RESTORECORE:
		result = restore_core((char *)args);
		break;
	case PARASITE_CMD_KILLME:
		sys_close(0);
		break;
	case PARASITE_CMD_PINGME:
		/* nothing to do, just report success */
		break;
	default:
		sys_write_msg("Unknown command to parasite\n");
		break;
	}

	return result;
}
/*
 * Entry point of the parasite blob.
 *
 * The code points %rsp at the parasite_stack label, pushes a zero and
 * copies %rsp into %rbp, then loads the three arguments of
 * parasite_service(cmd, args, brk) into %edi, %rsi and %rdx (the value
 * stored at parasite_cmd, the address of parasite_args and the address
 * of parasite_brk) and calls it.  After the call returns an int3 is
 * executed -- presumably so the tracer regains control; confirm with
 * the injecting side.
 *
 * The data area follows the code: parasite_cmd, parasite_args followed by
 * PARASITE_ARG_SIZE bytes, PARASITE_STACK_SIZE bytes placed below the
 * parasite_stack label, and parasite_brk followed by PARASITE_BRK_SIZE
 * bytes.  All of it is zero filled.
 */
static void __parasite_head __used parasite_head(void)
{
/*
* The linker will handle the stack allocation.
*/
asm volatile("parasite_head_start: \n\t"
"leaq parasite_stack(%rip), %rsp \n\t"
"pushq $0 \n\t"
"movq %rsp, %rbp \n\t"
"movl parasite_cmd(%rip), %edi \n\t"
"leaq parasite_args(%rip), %rsi \n\t"
"leaq parasite_brk(%rip), %rdx \n\t"
"call parasite_service \n\t"
"parasite_service_complete: \n\t"
"int $0x03 \n\t"
".align 8 \n\t"
"parasite_cmd: \n\t"
".long 0 \n\t"
"parasite_args: \n\t"
".long 0 \n\t"
".skip "__stringify(PARASITE_ARG_SIZE)",0 \n\t"
".skip "__stringify(PARASITE_STACK_SIZE)", 0 \n\t"
"parasite_stack: \n\t"
".long 0 \n\t"
"parasite_brk: \n\t"
".skip "__stringify(PARASITE_BRK_SIZE)", 0 \n\t"
".long 0 \n\t");
}
#else /* CONFIG_X86_64 */
# error x86-32 bit mode not yet implemented
#endif /* CONFIG_X86_64 */

19
parasite.lds.S Normal file
View File

@ -0,0 +1,19 @@
/*
 * Linker script for the parasite blob: the output is a flat binary
 * image for x86-64, laid out starting at address 0.
 */
OUTPUT_FORMAT("binary")
OUTPUT_ARCH(i386:x86-64)
SECTIONS
{
. = 0;
/* Code: the .parasite.head.text input section is placed first. */
.text : {
*(.parasite.head.text)
*(.text)
. = ALIGN(8);
}
/* Data: .data, .rodata, .bss and .parasite.stack, padded to 8 bytes. */
.data : {
*(.data)
*(.rodata)
*(.bss)
*(.parasite.stack)
. = ALIGN(8);
}
}

322
rbtree.c Normal file
View File

@ -0,0 +1,322 @@
/*
* RBtree implementation adopted from the Linux
* kernel sources.
*/
#include "rbtree.h"
/*
 * Left rotation around @node: its right child takes its place and
 * @node becomes the left child of that former right child.
 */
static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *pivot = node->rb_right;
	struct rb_node *up = rb_parent(node);
	struct rb_node *inner = pivot->rb_left;

	/* The pivot's left subtree moves under @node. */
	node->rb_right = inner;
	if (inner)
		rb_set_parent(inner, node);

	pivot->rb_left = node;
	rb_set_parent(pivot, up);

	/* Hook the pivot where @node used to hang. */
	if (!up)
		root->rb_node = pivot;
	else if (up->rb_left == node)
		up->rb_left = pivot;
	else
		up->rb_right = pivot;

	rb_set_parent(node, pivot);
}
/*
 * Right rotation around @node: its left child takes its place and
 * @node becomes the right child of that former left child.
 */
static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
{
	struct rb_node *pivot = node->rb_left;
	struct rb_node *up = rb_parent(node);
	struct rb_node *inner = pivot->rb_right;

	/* The pivot's right subtree moves under @node. */
	node->rb_left = inner;
	if (inner)
		rb_set_parent(inner, node);

	pivot->rb_right = node;
	rb_set_parent(pivot, up);

	/* Hook the pivot where @node used to hang. */
	if (!up)
		root->rb_node = pivot;
	else if (up->rb_right == node)
		up->rb_right = pivot;
	else
		up->rb_left = pivot;

	rb_set_parent(node, pivot);
}
/*
 * Rebalance the tree after @node has been linked into it.
 *
 * Walks up from @node while its parent is red, recolouring and rotating
 * as needed, and finally paints the root black.
 */
void rb_insert_color(struct rb_node *node, struct rb_root *root)
{
struct rb_node *parent, *gparent;
while ((parent = rb_parent(node)) && rb_is_red(parent)) {
gparent = rb_parent(parent);
if (parent == gparent->rb_left) {
{
register struct rb_node *uncle = gparent->rb_right;
/* Red uncle: recolour and continue from the grandparent. */
if (uncle && rb_is_red(uncle)) {
rb_set_black(uncle);
rb_set_black(parent);
rb_set_red(gparent);
node = gparent;
continue;
}
}
/* Node is the right child: rotate so it becomes the left one. */
if (parent->rb_right == node) {
register struct rb_node *tmp;
__rb_rotate_left(parent, root);
tmp = parent;
parent = node;
node = tmp;
}
rb_set_black(parent);
rb_set_red(gparent);
__rb_rotate_right(gparent, root);
} else {
/* Mirror image of the branch above. */
{
register struct rb_node *uncle = gparent->rb_left;
if (uncle && rb_is_red(uncle)) {
rb_set_black(uncle);
rb_set_black(parent);
rb_set_red(gparent);
node = gparent;
continue;
}
}
if (parent->rb_left == node) {
register struct rb_node *tmp;
__rb_rotate_right(parent, root);
tmp = parent;
parent = node;
node = tmp;
}
rb_set_black(parent);
rb_set_red(gparent);
__rb_rotate_left(gparent, root);
}
}
rb_set_black(root->rb_node);
}
/*
 * Rebalance the tree after a black node has been unlinked.
 *
 * @node is the child that took the removed node's place (may be NULL),
 * @parent is its parent.  The loop runs while @node is NULL or black and
 * is not the root; at the end @node, if any, is painted black.
 *
 * NOTE(review): the sibling ("other") is dereferenced without a NULL
 * check, so the code relies on it always existing here -- presumably
 * guaranteed by the red-black invariants; confirm.
 */
static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
struct rb_root *root)
{
struct rb_node *other;
while ((!node || rb_is_black(node)) && node != root->rb_node) {
if (parent->rb_left == node) {
other = parent->rb_right;
/* Red sibling: rotate so that the sibling becomes black. */
if (rb_is_red(other)) {
rb_set_black(other);
rb_set_red(parent);
__rb_rotate_left(parent, root);
other = parent->rb_right;
}
/* Both children of the sibling are black (or absent): recolour, go up. */
if ((!other->rb_left || rb_is_black(other->rb_left)) &&
(!other->rb_right || rb_is_black(other->rb_right))) {
rb_set_red(other);
node = parent;
parent = rb_parent(node);
} else {
/* Far child is black: rotate the sibling so the far child is red. */
if (!other->rb_right || rb_is_black(other->rb_right)) {
rb_set_black(other->rb_left);
rb_set_red(other);
__rb_rotate_right(other, root);
other = parent->rb_right;
}
/* Final rotation; the loop is left with node set to the root. */
rb_set_color(other, rb_color(parent));
rb_set_black(parent);
rb_set_black(other->rb_right);
__rb_rotate_left(parent, root);
node = root->rb_node;
break;
}
} else {
/* Mirror image of the branch above. */
other = parent->rb_left;
if (rb_is_red(other)) {
rb_set_black(other);
rb_set_red(parent);
__rb_rotate_right(parent, root);
other = parent->rb_left;
}
if ((!other->rb_left || rb_is_black(other->rb_left)) &&
(!other->rb_right || rb_is_black(other->rb_right))) {
rb_set_red(other);
node = parent;
parent = rb_parent(node);
} else {
if (!other->rb_left || rb_is_black(other->rb_left)) {
rb_set_black(other->rb_right);
rb_set_red(other);
__rb_rotate_left(other, root);
other = parent->rb_left;
}
rb_set_color(other, rb_color(parent));
rb_set_black(parent);
rb_set_black(other->rb_left);
__rb_rotate_right(parent, root);
node = root->rb_node;
break;
}
}
}
if (node)
rb_set_black(node);
}
/*
 * Unlink @node from the tree rooted at @root.
 *
 * A node with at most one child is replaced by that child.  A node with
 * two children is replaced by the leftmost node of its right subtree,
 * which takes over the links and the colour of the removed node.  When
 * the node that was physically unlinked was black, __rb_erase_color()
 * is called to rebalance.
 */
void rb_erase(struct rb_node *node, struct rb_root *root)
{
struct rb_node *child, *parent;
int color;
if (!node->rb_left)
child = node->rb_right;
else if (!node->rb_right)
child = node->rb_left;
else {
/* Two children: find the leftmost node of the right subtree. */
struct rb_node *old = node, *left;
node = node->rb_right;
while ((left = node->rb_left) != NULL)
node = left;
/* Make the parent of the removed node (or the root) point to it. */
if (rb_parent(old)) {
if (rb_parent(old)->rb_left == old)
rb_parent(old)->rb_left = node;
else
rb_parent(old)->rb_right = node;
} else
root->rb_node = node;
/* Remember the replacement's own child, parent and colour. */
child = node->rb_right;
parent = rb_parent(node);
color = rb_color(node);
if (parent == old) {
/* The replacement is the direct right child of the removed node. */
parent = node;
} else {
if (child)
rb_set_parent(child, parent);
parent->rb_left = child;
node->rb_right = old->rb_right;
rb_set_parent(old->rb_right, node);
}
/* The replacement inherits parent, colour and left subtree. */
node->rb_parent_color = old->rb_parent_color;
node->rb_left = old->rb_left;
rb_set_parent(old->rb_left, node);
goto color;
}
/* Zero or one child: splice the child into the node's place. */
parent = rb_parent(node);
color = rb_color(node);
if (child)
rb_set_parent(child, parent);
if (parent) {
if (parent->rb_left == node)
parent->rb_left = child;
else
parent->rb_right = child;
} else
root->rb_node = child;
color:
if (color == RB_BLACK)
__rb_erase_color(child, parent, root);
}
/*
 * Return the leftmost node of the tree, or NULL if the tree is empty.
 */
struct rb_node *rb_first(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur) {
		for (; cur->rb_left; cur = cur->rb_left)
			;
	}

	return cur;
}
/*
 * Return the rightmost node of the tree, or NULL if the tree is empty.
 */
struct rb_node *rb_last(const struct rb_root *root)
{
	struct rb_node *cur = root->rb_node;

	if (cur) {
		for (; cur->rb_right; cur = cur->rb_right)
			;
	}

	return cur;
}
/*
 * Return the in-order successor of @node, or NULL when there is none
 * (also when @node is its own parent).
 */
struct rb_node *rb_next(const struct rb_node *node)
{
	const struct rb_node *cur = node;
	struct rb_node *up;

	if (rb_parent(cur) == cur)
		return NULL;

	/* With a right subtree the successor is its leftmost node. */
	if (cur->rb_right) {
		for (cur = cur->rb_right; cur->rb_left; cur = cur->rb_left)
			;
		return (struct rb_node *)cur;
	}

	/*
	 * Otherwise climb until we arrive from a left child; the
	 * parent reached that way is the successor.
	 */
	for (up = rb_parent(cur); up && cur == up->rb_right; up = rb_parent(cur))
		cur = up;

	return up;
}
/*
 * Return the in-order predecessor of @node, or NULL when there is none
 * (also when @node is its own parent).
 */
struct rb_node *rb_prev(const struct rb_node *node)
{
	const struct rb_node *cur = node;
	struct rb_node *up;

	if (rb_parent(cur) == cur)
		return NULL;

	/* With a left subtree the predecessor is its rightmost node. */
	if (cur->rb_left) {
		for (cur = cur->rb_left; cur->rb_right; cur = cur->rb_right)
			;
		return (struct rb_node *)cur;
	}

	/*
	 * Otherwise climb until we arrive from a right child; the
	 * parent reached that way is the predecessor.
	 */
	for (up = rb_parent(cur); up && cur == up->rb_left; up = rb_parent(cur))
		cur = up;

	return up;
}
/*
 * Put @new into the tree position held by @victim: the parent (or the
 * root) and both children are re-pointed to @new, which then receives
 * a copy of @victim's links and colour.
 */
void rb_replace_node(struct rb_node *victim, struct rb_node *new,
		     struct rb_root *root)
{
	struct rb_node *up = rb_parent(victim);
	struct rb_node *lchild = victim->rb_left;
	struct rb_node *rchild = victim->rb_right;

	/* Fix the link that leads down to the victim. */
	if (!up)
		root->rb_node = new;
	else if (up->rb_left == victim)
		up->rb_left = new;
	else
		up->rb_right = new;

	/* Children now belong to the replacement. */
	if (lchild)
		rb_set_parent(lchild, new);
	if (rchild)
		rb_set_parent(rchild, new);

	*new = *victim;
}

112
testee-static.c Normal file
View File

@ -0,0 +1,112 @@
/*
* A simple testee program
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sched.h>
/*
 * Testee: creates two 1K+1 byte files, maps them shared/private, adds
 * three anonymous mappings (one of them made PROT_NONE afterwards),
 * fills all of them with known patterns and then prints a ping
 * forever.
 *
 * Returns non-zero if the setup fails; otherwise it never returns.
 */
int main(int argc, char *argv[])
{
	int fd_shared, fd_private;
	const char data_mark[] = "This is a data_mark marker";
	void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable;
	void *mmap_anon_shared;
	const char sep[] = "----------";
	int i;

	(void)data_mark;

	printf("%s pid %d\n", argv[0], getpid());

	fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_shared < 0) {
		perror("Can't open fd_shared file");
		goto err;
	}

	fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_private < 0) {
		perror("Can't open fd_private file");
		goto err;
	}

	if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
	    lseek(fd_private, 1024, SEEK_SET) == -1) {
		perror("Can't llsek");
		goto err;
	}

	/*
	 * One byte written at offset 1024 extends the files so the
	 * 1K mappings below are backed by file data.
	 */
	if (write(fd_shared, "", 1) != 1 ||
	    write(fd_private, "", 1) != 1) {
		perror("Can't write");
		goto err;
	}

	mmap_shared	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
	mmap_private	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);
	mmap_anon	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	map_unreadable	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	mmap_anon_shared= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);

	if (mmap_shared		== MAP_FAILED ||
	    mmap_private	== MAP_FAILED ||
	    mmap_anon_shared	== MAP_FAILED ||
	    mmap_anon		== MAP_FAILED ||
	    map_unreadable	== MAP_FAILED) {
		perror("mmap failed");
		goto err;
	}

	strcpy((char *)mmap_shared,	sep);
	strcpy((char *)mmap_private,	sep);
	strcpy((char *)mmap_anon,	sep);
	strcpy((char *)map_unreadable,	sep);
	strcpy((char *)mmap_anon_shared,sep);

	for (i = 64; i < 128; i++) {
		((char *)mmap_shared)[i]	= 0 + i;
		((char *)mmap_private)[i]	= 64 + i;
		((char *)mmap_anon)[i]		= 128 + i;
		((char *)map_unreadable)[i]	= 190 + i;
		((char *)mmap_anon_shared)[i]	= 0 + i;
	}

	/* Written above while still accessible, now drop all access. */
	if (mprotect(map_unreadable, 1024, PROT_NONE)) {
		perror("mprotect failed");
		goto err;
	}

	asm volatile("" ::: "memory");

	fsync(fd_shared);
	fsync(fd_private);

	sync();

	asm volatile("" ::: "memory");

	while (1) {
		printf("ping: %d\n", getpid());
		sleep(6);
	}

err:
	/* resources are released by kernel */
	return 1;
}

74
testee-threads.c Normal file
View File

@ -0,0 +1,74 @@
/*
* A simple testee program with threads
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static int counter;
/*
 * First worker thread: creates one anonymous mapping (never touched
 * afterwards) and then bumps the shared counter every 2 seconds.
 */
static void *f1(void *arg)
{
	void *map_unreadable;

	map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	(void)map_unreadable;

	for (;;) {
		pthread_mutex_lock(&mtx);
		counter += 1;
		pthread_mutex_unlock(&mtx);

		sleep(2);
	}

	return NULL;
}
/*
 * Second worker thread: creates one anonymous mapping (never touched
 * afterwards) and then bumps the shared counter every 3 seconds.
 */
static void *f2(void *arg)
{
	void *map_unreadable;

	map_unreadable = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	(void)map_unreadable;

	for (;;) {
		pthread_mutex_lock(&mtx);
		counter += 1;
		pthread_mutex_unlock(&mtx);

		sleep(3);
	}

	return NULL;
}
/*
 * Threaded testee: prints its pid, starts both workers and waits for
 * them.  Exits with 1 if either thread could not be created.
 */
int main(int argc, char *argv[])
{
	pthread_t first, second;
	int err_first, err_second;

	printf("%s pid %d\n", argv[0], getpid());

	/* Both threads are started before either result is examined. */
	err_first = pthread_create(&first, NULL, &f1, NULL);
	err_second = pthread_create(&second, NULL, &f2, NULL);
	if (err_first != 0 || err_second != 0)
		exit(1);

	pthread_join(first, NULL);
	pthread_join(second, NULL);

	exit(0);
}

92
testee-unlinked.c Normal file
View File

@ -0,0 +1,92 @@
/*
* A simple testee program
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sched.h>
/*
 * Testee with mappings of unlinked files: creates two files, maps them
 * shared/private, fills the mappings, unlinks both files, rewrites the
 * mappings and then sleeps forever.
 *
 * Returns non-zero if the setup fails; otherwise it never returns.
 */
int main(int argc, char *argv[])
{
	int fd_shared, fd_private;
	const char data_mark[] = "This is a data_mark marker";
	void *mmap_shared, *mmap_private;
	const char sep[] = "----------";
	int i;

	(void)data_mark;

	printf("%s pid %d\n", argv[0], getpid());

	fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_shared < 0) {
		perror("Can't open fd_shared file");
		goto err;
	}

	fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_private < 0) {
		perror("Can't open fd_private file");
		goto err;
	}

	if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
	    lseek(fd_private, 1024, SEEK_SET) == -1) {
		perror("Can't llsek");
		goto err;
	}

	/*
	 * One byte written at offset 1024 extends the files so the
	 * 1K mappings below are backed by file data.
	 */
	if (write(fd_shared, "", 1) != 1 ||
	    write(fd_private, "", 1) != 1) {
		perror("Can't write");
		goto err;
	}

	mmap_shared	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
	mmap_private	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);

	if (mmap_shared		== MAP_FAILED ||
	    mmap_private	== MAP_FAILED) {
		perror("mmap failed");
		goto err;
	}

	strcpy((char *)mmap_shared,	sep);
	strcpy((char *)mmap_private,	sep);

	for (i = 64; i < 128; i++) {
		((char *)mmap_shared)[i]	= 0 + i;
		((char *)mmap_private)[i]	= 64 + i;
	}

	fsync(fd_shared);
	fsync(fd_private);

	close(fd_shared);
	/*
	 * NOTE(review): only fd_shared is closed; fd_private stays open
	 * and is synced a second time -- confirm this is intended.
	 */
	fsync(fd_private);

	unlink("testee-shared.img");
	unlink("testee-private.img");

	/* Touch the mappings again now that the files have no name. */
	for (i = 64; i < 128; i++) {
		((char *)mmap_shared)[i]	= 0 + i;
		((char *)mmap_private)[i]	= 64 + i;
	}

	msync(mmap_shared, 1024, MS_SYNC);
	msync(mmap_private, 1024, MS_SYNC);

	while (1)
		sleep(1);

err:
	/* resources are released by kernel */
	return 1;
}

231
testee.c Normal file
View File

@ -0,0 +1,231 @@
/*
* A simple testee program
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <signal.h>
#include <string.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <sched.h>
/*
 * Body of the cloned child: prints its pid, creates a private
 * grows-down mapping and a shared anonymous mapping and then sleeps
 * forever.  Returns -1 if either mapping fails.
 */
static int do_child(void *arg)
{
	void *stack, *mmap_anon;

	printf("do_child pid: %d\n", getpid());

	/* Anonymous mappings take fd -1, like every other mmap in this file. */
	stack = mmap(NULL, 4 * 4096, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, -1, 0);
	if (stack == MAP_FAILED)
		return -1;

	mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (mmap_anon == MAP_FAILED)
		return -1;

	while (1)
		sleep(6);

	return 0;
}
/*
 * Start two children running do_child() via clone(): the first with
 * CLONE_FS only, the second with CLONE_FS | CLONE_FILES | CLONE_VM.
 * Each child gets its own freshly mapped stack and a shared anonymous
 * mapping is created before each clone.
 *
 * Returns the result of the second clone(), or -1 if a mapping fails.
 */
static int run_clone(void)
{
	enum { CHILD_STACK_SIZE = 4 * 4096 };
	pid_t pid = 0;
	int ret = 0;
	void *stack, *mmap_anon;

	stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, -1, 0);
	if (stack == MAP_FAILED)
		return -1;

	mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (mmap_anon == MAP_FAILED)
		return -1;

	/* clone() takes the top of the stack area. */
	stack = (char *)stack + CHILD_STACK_SIZE;
	ret = clone(do_child, stack, CLONE_FS, NULL, NULL, NULL, &pid);
	if (ret < 0)
		perror("Failed clone");

	printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n",
	       pid, stack, mmap_anon, ret);

	/*
	 * The second child needs a stack of its own: map a new area
	 * instead of advancing the previous pointer past its mapping.
	 */
	stack = mmap(NULL, CHILD_STACK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON | MAP_GROWSDOWN, -1, 0);
	if (stack == MAP_FAILED)
		return -1;

	mmap_anon = mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (mmap_anon == MAP_FAILED)
		return -1;

	stack = (char *)stack + CHILD_STACK_SIZE;
	ret = clone(do_child, stack, CLONE_FS | CLONE_FILES | CLONE_VM, NULL, NULL, NULL, &pid);
	if (ret < 0)
		perror("Failed clone");

	printf("run_clone: %d stack: %p mmap_anon: %p ret %d\n",
	       pid, stack, mmap_anon, ret);

	return ret;
}
/*
 * Testee: sets up file backed and anonymous mappings filled with known
 * patterns, optionally switches uid to argv[1], probes access to its
 * own /proc/<pid>/map_files entry in three open modes and finally
 * forks into three processes that print a ping forever.
 *
 * Returns non-zero if the setup fails; otherwise it never returns.
 */
int main(int argc, char *argv[])
{
	int fd_shared, fd_private;
	const char data_mark[] = "This is a data_mark marker";
	void *mmap_shared, *mmap_private, *mmap_anon, *map_unreadable;
	const char sep[] = "----------";
	pid_t pid, child;
	char suided_path[128];
	int i;

	(void)data_mark;

	printf("%s pid %d\n", argv[0], getpid());

	fd_shared = open("testee-shared.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_shared < 0) {
		perror("Can't open fd_shared file");
		goto err;
	}

	fd_private = open("testee-private.img", O_RDWR | O_CREAT | O_TRUNC, 0600);
	if (fd_private < 0) {
		perror("Can't open fd_private file");
		goto err;
	}

	if (lseek(fd_shared, 1024, SEEK_SET) == -1 ||
	    lseek(fd_private, 1024, SEEK_SET) == -1) {
		perror("Can't llsek");
		goto err;
	}

	/*
	 * One byte written at offset 1024 extends the files so the
	 * 1K mappings below are backed by file data.
	 */
	if (write(fd_shared, "", 1) != 1 ||
	    write(fd_private, "", 1) != 1) {
		perror("Can't write");
		goto err;
	}

	mmap_shared	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_SHARED, fd_shared, 0);
	mmap_private	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_FILE | MAP_PRIVATE, fd_private, 0);
	mmap_anon	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	map_unreadable	= mmap(NULL, 1024, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (mmap_shared		== MAP_FAILED ||
	    mmap_private	== MAP_FAILED ||
	    mmap_anon		== MAP_FAILED ||
	    map_unreadable	== MAP_FAILED) {
		perror("mmap failed");
		goto err;
	}

	/* %lx expects unsigned long. */
	snprintf(suided_path, sizeof(suided_path),
		 "/proc/%d/map_files/%lx-%lx",
		 getpid(), (unsigned long)mmap_shared,
		 (unsigned long)mmap_shared + 0x1000);

	strcpy((char *)mmap_shared,	sep);
	strcpy((char *)mmap_private,	sep);
	strcpy((char *)mmap_anon,	sep);
	strcpy((char *)map_unreadable,	sep);

	for (i = 64; i < 128; i++) {
		((char *)mmap_shared)[i]	= 0 + i;
		((char *)mmap_private)[i]	= 64 + i;
		((char *)mmap_anon)[i]		= 128 + i;
		((char *)map_unreadable)[i]	= 190 + i;
	}

	/* Written above while still accessible, now drop all access. */
	if (mprotect(map_unreadable, 1024, PROT_NONE)) {
		perror("mprotect failed");
		goto err;
	}

	asm volatile("" ::: "memory");

	fsync(fd_shared);
	fsync(fd_private);

	close(fd_shared);

	if (argc > 1) {
		printf("my-uid: %d\n", getuid());
		/* Report a failed switch; the probes below still run. */
		if (setuid(atoi(argv[1])))
			perror("setuid failed");
		printf("my-uid: %d\n", getuid());
	}

	/* Probe the map_files entry of the shared mapping in three modes. */
	fd_shared = open(suided_path, O_RDWR, 0600);
	printf("fd_shared for O_RDWR: %d\n", fd_shared);
	if (fd_shared >= 0) {
		if (write(fd_shared, "aaaa", sizeof("aaaa")) < 0)
			perror("Can't write via map_files");
		close(fd_shared);
	}

	fd_shared = open(suided_path, O_TRUNC, 0600);
	printf("fd_shared for O_TRUNC: %d\n", fd_shared);
	if (fd_shared >= 0) {
		printf("tunc: %d\n", ftruncate(fd_shared, 512));
		close(fd_shared);
	}

	fd_shared = open(suided_path, O_RDONLY, 0600);
	printf("fd_shared for O_RDONLY: %d\n", fd_shared);
	if (fd_shared >= 0)
		close(fd_shared);

	sync();

	asm volatile("" ::: "memory");

	pid = fork();
	if (pid == -1)
		goto err;

	if (pid == 0) {
		child = fork();
		if (child == -1)
			goto err;

		if (child == 0) {
			printf("first child pid: %d\n", getpid());
			while (1) {
				printf("ping: %d\n", getpid());
				sleep(8);
			}
		} else {
			printf("first parent pid: %d\n", getpid());
			// run_clone();
			while (1) {
				printf("ping: %d\n", getpid());
				sleep(9);
			}
		}
	} else {
		while (1) {
			printf("ping: %d\n", getpid());
			sleep(10);
		}
	}

err:
	/* resources are released by kernel */
	return 1;
}

412
util.c Normal file
View File

@ -0,0 +1,412 @@
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdbool.h>
#include <limits.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>
#include <limits.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <dirent.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include "compiler.h"
#include "types.h"
#include "list.h"
#include "util.h"
#include "crtools.h"
static char big_buffer[PATH_MAX];
/*
 * printf-style output to stdout.
 */
void printk(const char *format, ...)
{
	va_list ap;

	va_start(ap, format);
	vprintf(format, ap);
	va_end(ap);
}
/*
 * Print @bytes of the tracee's memory starting at @addr, one long word
 * at a time, with the bytes of every word printed from the highest
 * address to the lowest.
 *
 * Returns 0 on success, -1 if @bytes is negative or not a multiple of
 * sizeof(long), -2 if reading the tracee fails.
 */
int ptrace_show_area_r(pid_t pid, void *addr, long bytes)
{
	unsigned long *a = addr;
	unsigned long w, i, nr_words;

	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		unsigned char *c;
		long v;

		/*
		 * -1 is a valid peeked word, so errno has to be cleared
		 * first and checked together with the result.
		 */
		errno = 0;
		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
		if (v == -1 && errno)
			goto err;

		c = (unsigned char *)&v;
		for (i = sizeof(v)/sizeof(*c); i > 0; i--)
			printk("%02x ", c[i - 1]);
		printk(" ");
	}
	printk("\n");
	return 0;
err:
	return -2;
}
/*
 * Print @addr followed by @bytes of the tracee's memory starting
 * there, one long word at a time, bytes in memory order.
 *
 * Returns 0 on success, -1 if @bytes is negative or not a multiple of
 * sizeof(long), -2 if reading the tracee fails.
 */
int ptrace_show_area(pid_t pid, void *addr, long bytes)
{
	unsigned long *a = addr;
	unsigned long w, i, nr_words;

	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	printk("%016lx: ", (unsigned long)addr);

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		unsigned char *c;
		long v;

		/*
		 * -1 is a valid peeked word, so errno has to be cleared
		 * first and checked together with the result.
		 */
		errno = 0;
		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
		if (v == -1 && errno)
			goto err;

		c = (unsigned char *)&v;
		for (i = 0; i < sizeof(v)/sizeof(*c); i++)
			printk("%02x ", c[i]);
		printk(" ");
	}
	printk("\n");
	return 0;
err:
	return -2;
}
/*
 * Copy @bytes from address @addr of the tracee @pid into @dst, one long
 * word at a time.
 *
 * Returns 0 on success, -1 if @bytes is negative or not a multiple of
 * sizeof(long), -2 if reading the tracee fails.
 */
int ptrace_peek_area(pid_t pid, void *dst, void *addr, long bytes)
{
	unsigned long *d = dst, *a = addr;
	unsigned long w, nr_words;

	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		long v;

		/*
		 * -1 is a valid peeked word, so errno has to be cleared
		 * first and checked together with the result.
		 */
		errno = 0;
		v = ptrace(PTRACE_PEEKDATA, pid, a + w, NULL);
		d[w] = v;
		if (v == -1 && errno)
			goto err;
	}
	return 0;
err:
	return -2;
}
/*
 * Copy @bytes from @src into the tracee @pid at address @addr, one long
 * word at a time.
 *
 * Returns 0 on success, -1 if @bytes is negative or not a multiple of
 * sizeof(long), -2 if writing to the tracee fails.
 */
int ptrace_poke_area(pid_t pid, void *src, void *addr, long bytes)
{
	unsigned long *s = src, *a = addr;
	unsigned long w, nr_words;

	/* A negative length would turn into a huge unsigned word count. */
	if (bytes < 0 || (bytes & (sizeof(long) - 1)))
		return -1;

	nr_words = bytes / sizeof(long);
	for (w = 0; w < nr_words; w++) {
		if (ptrace(PTRACE_POKEDATA, pid, a + w, s[w]))
			goto err;
	}
	return 0;
err:
	return -2;
}
/*
 * Print the register set in @regs as a table, three registers per
 * line, followed by an empty line.
 */
void printk_registers(user_regs_struct_t *regs)
{
	printk("ip : %16lx cs : %16lx ds : %16lx\n",
	       regs->ip, regs->cs, regs->ds);
	printk("es : %16lx fs : %16lx gs : %16lx\n",
	       regs->es, regs->fs, regs->gs);
	printk("sp : %16lx ss : %16lx flags : %16lx\n",
	       regs->sp, regs->ss, regs->flags);
	printk("ax : %16lx cx : %16lx dx : %16lx\n",
	       regs->ax, regs->cx, regs->dx);
	printk("si : %16lx di : %16lx bp : %16lx\n",
	       regs->si, regs->di, regs->bp);
	printk("bx : %16lx r8 : %16lx r9 : %16lx\n",
	       regs->bx, regs->r8, regs->r9);
	printk("r10 : %16lx r11 : %16lx r12 : %16lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk("r13 : %16lx r14 : %16lx r15 : %16lx\n",
	       regs->r13, regs->r14, regs->r15);
	printk("orig_ax: %16lx fs_base: %16lx gs_base: %16lx\n\n",
	       regs->orig_ax, regs->fs_base, regs->gs_base);
}
/*
 * Print signal number, errno value and code of @siginfo on one line.
 */
void printk_siginfo(siginfo_t *siginfo)
{
	const siginfo_t *si = siginfo;

	printk("si_signo %d si_errno %d si_code %d\n",
	       si->si_signo, si->si_errno, si->si_code);
}
/*
 * Print a one-line description of @vma_area: address range, size in
 * KiB, protection, flags, fd, pid, device/inode, whether a map file
 * descriptor is held, the mapping type and the special-area tag.
 * A NULL @vma_area is silently ignored.
 */
void printk_vma(struct vma_area *vma_area)
{
	const char *has_file, *kind, *area;

	if (!vma_area)
		return;

	has_file = "y";
	if (vma_area->vm_file_fd < 0)
		has_file = "n";

	/* Mapping type, first matching status bit wins. */
	if (!vma_area->vma.status)
		kind = "--";
	else if (vma_area->vma.status & VMA_FILE_PRIVATE)
		kind = "FP";
	else if (vma_area->vma.status & VMA_FILE_SHARED)
		kind = "FS";
	else if (vma_area->vma.status & VMA_ANON_SHARED)
		kind = "AS";
	else if (vma_area->vma.status & VMA_ANON_PRIVATE)
		kind = "AP";
	else
		kind = "--";

	/* Special area tag, first matching status bit wins. */
	if (!vma_area->vma.status)
		area = "--";
	else if (vma_area->vma.status & VMA_AREA_STACK)
		area = "stack";
	else if (vma_area->vma.status & VMA_AREA_VSYSCALL)
		area = "vsyscall";
	else if (vma_area->vma.status & VMA_AREA_VDSO)
		area = "vdso";
	else
		area = "n";

	printk("s: %16lx e: %16lx l: %4liK p: %4x f: %4x fd: %4d pid: %4d dev:%02x:%02x:%04lx vf: %s st: %s spc: %s\n",
	       vma_area->vma.start, vma_area->vma.end,
	       (vma_area->vma.end - vma_area->vma.start) >> 10,
	       vma_area->vma.prot,
	       vma_area->vma.flags,
	       vma_area->vma.fd,
	       vma_area->vma.pid,
	       vma_area->vma.dev_maj,
	       vma_area->vma.dev_min,
	       vma_area->vma.ino,
	       has_file, kind, area);
}
/*
 * Detach from the traced task @pid; returns the result of
 * ptrace(PTRACE_DETACH).
 */
int unseize_task(pid_t pid)
{
	int rc;

	rc = ptrace(PTRACE_DETACH, pid, NULL, NULL);
	return rc;
}
/*
 * Attach to the task @pid with PTRACE_SEIZE and stop it.
 *
 * Sequence: PTRACE_SEIZE, PTRACE_INTERRUPT, wait4() for the task, check
 * that it is stopped and that the stop is a PTRACE_EVENT_STOP, then set
 * PTRACE_O_TRACEEXIT.
 *
 * Returns the value left in ret: -10 if wait4() does not return @pid,
 * -20 if the task is not stopped, -30 if the stop is not an event stop.
 * NOTE(review): the value on success and on ptrace failures depends on
 * what the jerr_rc() macro stores in ret -- presumably the call's return
 * code with a jump to the label on error; confirm against its
 * definition.
 */
int seize_task(pid_t pid)
{
siginfo_t si;
int status;
int ret = 0;
jerr_rc(ptrace(PTRACE_SEIZE, pid, NULL,
(void *)(unsigned long)PTRACE_SEIZE_DEVEL), ret, err);
jerr_rc(ptrace(PTRACE_INTERRUPT, pid, NULL, NULL), ret, err);
ret = -10;
if (wait4(pid, &status, __WALL, NULL) != pid)
goto err;
ret = -20;
if (!WIFSTOPPED(status))
goto err;
/* From here on failures go through err_cont, which calls continue_task(). */
jerr_rc(ptrace(PTRACE_GETSIGINFO, pid, NULL, &si), ret, err_cont);
ret = -30;
/* The event number is taken from the bits above the low byte of si_code. */
if ((si.si_code >> 8) != PTRACE_EVENT_STOP)
goto err_cont;
jerr_rc(ptrace(PTRACE_SETOPTIONS, pid, NULL,
(void *)(unsigned long)PTRACE_O_TRACEEXIT), ret, err_cont);
/* The success path falls through to the common return below. */
err:
return ret;
err_cont:
continue_task(pid);
goto err;
}
/*
 * Make @new_fd refer to the file behind @old_fd and close @old_fd.
 *
 * Returns @new_fd on success (also when both numbers are already the
 * same, in which case nothing is done), or the negative result of
 * dup2() on failure.
 */
int reopen_fd_as(int new_fd, int old_fd)
{
	int rc;

	/* Already in place: nothing to duplicate or close. */
	if (new_fd == old_fd)
		return new_fd;

	rc = dup2(old_fd, new_fd);
	if (rc < 0)
		return rc;

	close(old_fd);
	return new_fd;
}
/*
 * Parse /proc/<pid>/maps and append one vma_area per mapping to
 * @vma_area_list.
 *
 * For every line the address range, protection, sharing flag, offset,
 * device and inode are recorded.  When /proc/<pid>/map_files can be
 * opened, the matching entry is opened as well and used to classify the
 * mapping as file/anonymous and shared/private.
 *
 * Returns 0 on success and -1 on any failure.  Entries added to the
 * list before a failure stay on the list.
 */
int parse_maps(pid_t pid, struct list_head *vma_area_list)
{
	struct vma_area *vma_area = NULL;
	u64 start, end, pgoff;
	char map_files_path[64];
	char maps_path[64];
	unsigned long ino;
	char r,w,x,s;
	int dev_maj, dev_min;
	int ret = -1;

	DIR *map_files_dir = NULL;
	FILE *maps = NULL;

	snprintf(maps_path, sizeof(maps_path), "/proc/%d/maps", pid);
	maps = fopen(maps_path, "r");
	if (!maps) {
		pr_perror("Can't open: %s\n", maps_path);
		goto err;
	}

	snprintf(map_files_path, sizeof(map_files_path),
		 "/proc/%d/map_files", pid);

	/*
	 * It might be a problem in kernel, either
	 * I'm debugging it on old kernel ;)
	 */
	map_files_dir = opendir(map_files_path);
	if (!map_files_dir)
		pr_warning("Crap, can't open %s, old kernel?\n",
			   map_files_path);

	while (fgets(big_buffer, sizeof(big_buffer), maps)) {
		char vma_file_path[16+16+2];
		struct stat st_buf;
		int nr_fields;

		/*
		 * The field count lives in its own variable: ret must
		 * stay negative until the whole file has been parsed,
		 * otherwise the error paths would return a positive
		 * value.
		 */
		nr_fields = sscanf(big_buffer, "%lx-%lx %c%c%c%c %lx %02x:%02x %lu",
				   &start, &end, &r, &w, &x, &s, &pgoff, &dev_maj,
				   &dev_min, &ino);
		if (nr_fields != 10) {
			pr_error("Can't parse: %s", big_buffer);
			goto err;
		}

		vma_area = alloc_vma_area();
		if (!vma_area)
			goto err;

		/* Figure out if it's file mapping */
		snprintf(vma_file_path, sizeof(vma_file_path), "%lx-%lx", start, end);

		if (map_files_dir) {
			/*
			 * Note that we "open" it in dumper process space
			 * so later we might refer to it via /proc/self/fd/vm_file_fd
			 * if needed.
			 */
			vma_area->vm_file_fd = openat(dirfd(map_files_dir),
						      vma_file_path, O_RDONLY);
			if (vma_area->vm_file_fd < 0) {
				/* A missing entry just means "no file". */
				if (errno != ENOENT) {
					pr_perror("Failed opening %s/%s\n",
						  map_files_path,
						  vma_file_path);
					goto err;
				}
			}
		}

		vma_area->vma.pid	= pid;
		vma_area->vma.start	= start;
		vma_area->vma.end	= end;
		vma_area->vma.pgoff	= pgoff;
		vma_area->vma.ino	= ino;
		vma_area->vma.dev_maj	= dev_maj;
		vma_area->vma.dev_min	= dev_min;
		vma_area->vma.prot	= PROT_NONE;

		if (r == 'r')
			vma_area->vma.prot |= PROT_READ;
		if (w == 'w')
			vma_area->vma.prot |= PROT_WRITE;
		if (x == 'x')
			vma_area->vma.prot |= PROT_EXEC;

		if (s == 's')
			vma_area->vma.flags = MAP_SHARED;
		else if (s == 'p')
			vma_area->vma.flags = MAP_PRIVATE;

		vma_area->vma.status = 0;

		if (strstr(big_buffer, "[stack]"))
			vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_STACK;
		else if (strstr(big_buffer, "[vsyscall]"))
			vma_area->vma.status |= VMA_AREA_VSYSCALL;
		else if (strstr(big_buffer, "[vdso]"))
			vma_area->vma.status |= VMA_AREA_VDSO;
		else if (strstr(big_buffer, "[heap]"))
			vma_area->vma.status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
		else
			vma_area->vma.status = VMA_AREA_REGULAR;

		/*
		 * Some mapping hints for restore, we save this on
		 * disk and restore might need to analyze it.
		 */
		if (vma_area->vm_file_fd >= 0) {

			if (fstat(vma_area->vm_file_fd, &st_buf) < 0) {
				pr_perror("Failed fstat on %s%s\n",
					  map_files_path,
					  vma_file_path);
				goto err;
			}
			if (!S_ISREG(st_buf.st_mode)) {
				pr_error("Can't handle non-regular "
					 "mapping on %s%s\n",
					 map_files_path,
					 vma_file_path);
				goto err;
			}

			/*
			 * /dev/zero stands for anon-shared mapping
			 * otherwise it's some file mapping.
			 */
			if (MAJOR(st_buf.st_dev) == 0) {
				if (!(vma_area->vma.flags & MAP_SHARED))
					goto err_bogus_mapping;
				vma_area->vma.status |= VMA_ANON_SHARED;
				vma_area->shmid = st_buf.st_ino;
			} else {
				if (vma_area->vma.flags & MAP_PRIVATE)
					vma_area->vma.status |= VMA_FILE_PRIVATE;
				else
					vma_area->vma.status |= VMA_FILE_SHARED;
			}
		} else {
			/*
			 * No file but mapping -- anonymous one.
			 */
			if (vma_area->vma.flags & MAP_SHARED)
				goto err_bogus_mapping;
			else
				vma_area->vma.status |= VMA_ANON_PRIVATE;
		}

		list_add_tail(&vma_area->list, vma_area_list);

		/*
		 * The entry belongs to the list now; forget it so an
		 * error on a later line does not free a listed node.
		 */
		vma_area = NULL;
	}

	ret = 0;

err:
	if (maps)
		fclose(maps);

	if (map_files_dir)
		closedir(map_files_dir);

	xfree(vma_area);
	return ret;

err_bogus_mapping:
	pr_error("Bogus mapping %lx-%lx\n",
		 vma_area->vma.start,
		 vma_area->vma.end);
	goto err;
}

View File

@ -0,0 +1,562 @@
From f7e9d28188e7e2fd0f13f2696f29f20d784cb8fd Mon Sep 17 00:00:00 2001
From: root <root@ovzept.sw.ru>
Date: Fri, 3 Jun 2011 18:16:10 +0400
Subject: [PATCH] Image dumping via proc file
---
fs/proc/Kconfig | 8
fs/proc/Makefile | 1
fs/proc/base.c | 3
fs/proc/img_dump.c | 397 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/binfmt_img.h | 87 +++++++++
include/linux/proc_fs.h | 2
6 files changed, 498 insertions(+)
create mode 100644 fs/proc/img_dump.c
create mode 100644 include/linux/binfmt_img.h
Index: linux-2.6.git/fs/proc/Kconfig
===================================================================
--- linux-2.6.git.orig/fs/proc/Kconfig
+++ linux-2.6.git/fs/proc/Kconfig
@@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_IMG
+ default y
+ depends on PROC_FS
+ bool "Enable /proc/<pid>/dump file"
+ help
+ Say Y here if you want to be able to produce checkpoint-restore images
+ for tasks via proc
Index: linux-2.6.git/fs/proc/Makefile
===================================================================
--- linux-2.6.git.orig/fs/proc/Makefile
+++ linux-2.6.git/fs/proc/Makefile
@@ -28,3 +28,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_IMG) += img_dump.o
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2983,6 +2983,9 @@ static const struct pid_entry tgid_base_
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
+#ifdef CONFIG_PROC_IMG
+ REG("dump", S_IRUSR|S_IWUSR, proc_pid_dump_operations),
+#endif
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_maps_operations),
#ifdef CONFIG_NUMA
Index: linux-2.6.git/fs/proc/img_dump.c
===================================================================
--- /dev/null
+++ linux-2.6.git/fs/proc/img_dump.c
@@ -0,0 +1,397 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/binfmt_img.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/types.h>
+#include "internal.h"
+
+/*
+ * Copy the tail of a kernel object, starting at offset @pos, to user space.
+ *
+ * @ubuf: destination user buffer
+ * @size: room left in @ubuf
+ * @buf:  kernel object being dumped
+ * @len:  total size of @buf in bytes
+ * @pos:  offset inside @buf to start copying from
+ *
+ * Returns the number of bytes copied (never more than @size), 0 when @pos
+ * is outside the object, or -EFAULT when the user buffer is not writable.
+ *
+ * The former function-static "dumped" counter was dropped: it was shared by
+ * all concurrent readers without locking and nothing ever read it.
+ */
+static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos)
+{
+	/* Never read before or past the object being dumped. */
+	if (pos < 0 || pos >= len)
+		return 0;
+
+	len -= pos;
+	if (len > size)
+		len = size;
+
+	if (copy_to_user(ubuf, buf + pos, len))
+		return -EFAULT;
+
+	return len;
+}
+
+/*
+ * Emit the image header (magic and format version), resuming at byte @pos
+ * of the header.  Returns what img_dump_buffer() returns.
+ */
+static int img_dump_header(char __user *buf, size_t size, int pos)
+{
+	struct binfmt_img_header hdr = {
+		.magic   = BINFMT_IMG_MAGIC,
+		.version = BINFMT_IMG_VERS_0,
+	};
+
+	return img_dump_buffer(buf, size, &hdr, sizeof(hdr), pos);
+}
+
+/*
+ * Translate a segment selector into the CKPT_X86_SEG_* encoding stored in
+ * the image: fixed codes for the null and flat user selectors, and
+ * CKPT_X86_SEG_LDT / CKPT_X86_SEG_TLS or-ed with an index for the rest.
+ *
+ * Any selector that fits none of these triggers BUG().  The diagnostic now
+ * names the encoder and prints the full selector; it used to say "(decode)"
+ * and print the value after it had been shifted right by three.
+ */
+static __u16 encode_segment(unsigned short seg)
+{
+	unsigned short idx;
+
+	if (seg == 0)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+
+	/* Bit 2 set: the selector refers to the LDT. */
+	if (seg & 4)
+		return CKPT_X86_SEG_LDT | (seg >> 3);
+
+	idx = seg >> 3;
+	if (GDT_ENTRY_TLS_MIN <= idx && idx <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (idx - GDT_ENTRY_TLS_MIN);
+
+	printk(KERN_ERR "c/r: (encode) bad segment %#hx\n", seg);
+	BUG();
+}
+
+/* Pack a descriptor into one 64-bit word: ->a shifted into the upper half, ->b added on top. */
+static __u64 encode_tls(struct desc_struct *d)
+{
+	__u64 packed = (__u64)d->a << 32;
+
+	packed += d->b;
+	return packed;
+}
+
+/*
+ * Emit the register record of task @p, resuming at byte @pos of the record.
+ *
+ * General purpose registers come from the task's saved pt_regs; segment
+ * selectors are stored in the CKPT_X86_SEG_* encoding; the fs/gs base is
+ * stored only when the matching selector index is zero.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_regs_image regi;
+	struct pt_regs *regs;
+	int i;
+
+	/*
+	 * The record ends with six __u16 fields, so the compiler pads its
+	 * tail.  Zero the whole object first: it is copied to user space
+	 * verbatim and must not carry stale kernel stack bytes.
+	 */
+	memset(&regi, 0, sizeof(regi));
+
+	regs = task_pt_regs(p);
+
+	regi.r15 = regs->r15;
+	regi.r14 = regs->r14;
+	regi.r13 = regs->r13;
+	regi.r12 = regs->r12;
+	regi.r11 = regs->r11;
+	regi.r10 = regs->r10;
+	regi.r9 = regs->r9;
+	regi.r8 = regs->r8;
+	regi.ax = regs->ax;
+	regi.orig_ax = regs->orig_ax;
+	regi.bx = regs->bx;
+	regi.cx = regs->cx;
+	regi.dx = regs->dx;
+	regi.si = regs->si;
+	regi.di = regs->di;
+	regi.ip = regs->ip;
+	regi.flags = regs->flags;
+	regi.bp = regs->bp;
+	regi.sp = regs->sp;
+
+	/* segments */
+	regi.gsindex = encode_segment(p->thread.gsindex);
+	regi.fsindex = encode_segment(p->thread.fsindex);
+	regi.cs = encode_segment(regs->cs);
+	regi.ss = encode_segment(regs->ss);
+	regi.ds = encode_segment(p->thread.ds);
+	regi.es = encode_segment(p->thread.es);
+
+	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		regi.tls[i] = encode_tls(&p->thread.tls_array[i]);
+
+	/* A base is recorded only when no selector index is in use. */
+	if (p->thread.gsindex)
+		regi.gs = 0;
+	else
+		regi.gs = p->thread.gs;
+
+	if (p->thread.fsindex)
+		regi.fs = 0;
+	else
+		regi.fs = p->thread.fs;
+
+	return img_dump_buffer(buf, size, &regi, sizeof(regi), pos);
+}
+
+/*
+ * Emit the address-space summary of @mm (flags and the code/data/brk/
+ * stack/arg/env boundaries), resuming at byte @pos of the record.
+ * exe_fd is always written as 0.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_mm_image mmi;
+
+	/*
+	 * The trailing __u32 exe_fd is followed by alignment padding; clear
+	 * the object so that padding is not leaked to user space.
+	 */
+	memset(&mmi, 0, sizeof(mmi));
+
+	mmi.flags = mm->flags;
+	mmi.def_flags = mm->def_flags;
+	mmi.start_code = mm->start_code;
+	mmi.end_code = mm->end_code;
+	mmi.start_data = mm->start_data;
+	mmi.end_data = mm->end_data;
+	mmi.start_brk = mm->start_brk;
+	mmi.brk = mm->brk;
+	mmi.start_stack = mm->start_stack;
+	mmi.arg_start = mm->arg_start;
+	mmi.arg_end = mm->arg_end;
+	mmi.env_start = mm->env_start;
+	mmi.env_end = mm->env_end;
+	mmi.exe_fd = 0;
+
+	return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos);
+}
+
+/*
+ * Emit one VMA record, resuming at byte @pos of the record.
+ *
+ * @vma == NULL produces an all-zero record, which terminates the VMA list
+ * in the image.  For a real VMA the protection bits are derived from
+ * VM_READ/VM_WRITE/VM_EXEC, and the mapping flags from the presence of a
+ * backing file and from VM_MAYSHARE.  fd is always written as 0.
+ *
+ * Returns what img_dump_buffer() returns.
+ */
+static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_vma_image vmai;
+
+	/*
+	 * Clear the record unconditionally: ->pad used to be left
+	 * uninitialised for real VMAs and was copied to user space as is.
+	 */
+	memset(&vmai, 0, sizeof(vmai));
+	if (vma == NULL)
+		goto dumpit;
+
+	printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm);
+
+	if (vma->vm_flags & VM_READ)
+		vmai.prot |= PROT_READ;
+	if (vma->vm_flags & VM_WRITE)
+		vmai.prot |= PROT_WRITE;
+	if (vma->vm_flags & VM_EXEC)
+		vmai.prot |= PROT_EXEC;
+
+	if (vma->vm_file == NULL)
+		vmai.flags |= MAP_ANONYMOUS;
+	if (vma->vm_flags & VM_MAYSHARE)
+		vmai.flags |= MAP_SHARED;
+	else
+		vmai.flags |= MAP_PRIVATE;
+
+	vmai.start = vma->vm_start;
+	vmai.end = vma->vm_end;
+	vmai.pgoff = vma->vm_pgoff;
+
+dumpit:
+	return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos);
+}
+
+/*
+ * Emit one page record: a binfmt_page_image header holding @addr, followed
+ * by PAGE_SIZE bytes taken from @data.  @pos is the offset inside the
+ * combined record to resume from.
+ *
+ * Returns the number of bytes written (header part plus data part) or a
+ * negative error from img_dump_buffer().
+ */
+static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_page_image pgi;
+	int hdr_len = 0, data_len;
+
+	pgi.vaddr = addr;
+
+	if (pos >= sizeof(pgi)) {
+		/* Header already delivered; continue inside the page data. */
+		pos -= sizeof(pgi);
+	} else {
+		hdr_len = img_dump_buffer(buf, size, &pgi, sizeof(pgi), pos);
+		if (hdr_len < 0)
+			return hdr_len;
+
+		/* User buffer exhausted by the header alone. */
+		if (size <= hdr_len)
+			return hdr_len;
+
+		buf += hdr_len;
+		size -= hdr_len;
+		pos = 0;
+	}
+
+	data_len = img_dump_buffer(buf, size, data, PAGE_SIZE, pos);
+	if (data_len < 0)
+		return data_len;
+
+	return hdr_len + data_len;
+}
+
+/* Returns 1 for VMAs without a backing file or without VM_SHARED, 0 otherwise. */
+static inline int is_private_vma(struct vm_area_struct *vma)
+{
+	return vma->vm_file == NULL || !(vma->vm_flags & VM_SHARED);
+}
+
+/*
+ * Produce the part of task @p's image that starts at file offset *@ppos and
+ * fits into @size bytes of @buf.
+ *
+ * The image is generated on the fly as a sequence of records: header,
+ * registers, mm summary, VMA records (ended by an all-zero one), page
+ * records for private VMAs, and a final all-zero page header.  img_pos
+ * tracks the image offset just past the current record and img_ppos the
+ * offset where that record starts, so each record is (re)generated only if
+ * the requested window reaches into it.
+ *
+ * Returns the number of bytes produced (and advances *@ppos), or a negative
+ * error.  -EACCES is returned when the task has no mm.
+ */
+static ssize_t do_produce_dump(struct task_struct *p, char __user *buf,
+		size_t size, loff_t *ppos)
+{
+	size_t img_pos = 0, img_ppos;
+	size_t produced = 0;
+	int len;
+	loff_t pos = *ppos;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+/* Account for @len bytes just handed to the reader. */
+#define move_pos() do {			\
+		buf += len;		\
+		produced += len;	\
+		size -= len;		\
+		pos += len;		\
+	} while (0)
+
+/* Step over the next record of @__size bytes in the virtual image. */
+#define seek_pos(__size) do {		\
+		img_ppos = img_pos;	\
+		img_pos += (__size);	\
+	} while (0)
+
+	/* header */
+	seek_pos(sizeof(struct binfmt_img_header));
+	if (pos < img_pos) {
+		len = img_dump_header(buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* registers */
+	seek_pos(sizeof(struct binfmt_regs_image));
+	if (pos < img_pos) {
+		len = img_dump_regs(p, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* memory */
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return -EACCES;
+
+	down_read(&mm->mmap_sem);
+
+	seek_pos(sizeof(struct binfmt_mm_image));
+	if (pos < img_pos) {
+		len = img_dump_mm(mm, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err_mm;
+
+		move_pos();
+		if (size == 0)
+			goto out_mm;
+	}
+
+	/* One record per VMA, then one more (vma == NULL) as terminator. */
+	vma = mm->mmap;
+	while (1) {
+		seek_pos(sizeof(struct binfmt_vma_image));
+		if (pos < img_pos) {
+			len = img_dump_vma(vma, buf, size, pos - img_ppos);
+			if (len < 0)
+				goto err_mm;
+
+			move_pos();
+			if (size == 0)
+				goto out_mm;
+		}
+
+		if (vma == NULL)
+			break;
+
+		vma = vma->vm_next;
+	}
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		/* slow and stupid */
+		unsigned long addr;
+		struct page *page;
+		void *pg_data;
+
+		if (!is_private_vma(vma))
+			continue;
+
+		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+			page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+			if (page == NULL)
+				continue;
+			if (IS_ERR(page)) /* huh? */
+				continue;
+
+			seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE);
+			if (pos < img_pos) {
+				pg_data = kmap(page);
+				len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos);
+				kunmap(page);
+
+				if (len < 0) {
+					put_page(page);
+					goto err_mm;
+				}
+
+				move_pos();
+				if (size == 0) {
+					put_page(page);
+					goto out_mm;
+				}
+			}
+
+			put_page(page);
+		}
+	}
+
+	/* All-zero page header: end of the page list. */
+	seek_pos(sizeof(struct binfmt_page_image));
+	if (pos < img_pos) {
+		struct binfmt_page_image zero;
+
+		memset(&zero, 0, sizeof(zero));
+		len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos);
+		/*
+		 * mmap_sem and the mm reference are still held here, so the
+		 * error exit must go through err_mm (it used to jump to err
+		 * and leave both behind).
+		 */
+		if (len < 0)
+			goto err_mm;
+
+		move_pos();
+	}
+
+out_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out:
+	*ppos = pos;
+	return produced;
+
+err_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+err:
+	return len;
+
+#undef seek_pos
+#undef move_pos
+}
+
+/*
+ * read() handler of /proc/<pid>/dump.
+ *
+ * Returns -ESRCH when the task is gone, -EINVAL when it is not stopped,
+ * otherwise whatever do_produce_dump() returns.
+ */
+static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	struct task_struct *p;
+	ssize_t ret;
+
+	p = get_proc_task(file->f_dentry->d_inode);
+	if (p == NULL)
+		return -ESRCH;
+
+	if (p->state & TASK_STOPPED)
+		ret = do_produce_dump(p, buf, size, ppos);
+	else
+		ret = -EINVAL;
+
+	/*
+	 * Drop the reference taken by get_proc_task() on every path; the
+	 * dump path used to return without releasing it.
+	 */
+	put_task_struct(p);
+	return ret;
+}
+
+static int img_dump_open(struct inode *inode, struct file *filp) /* open handler: keeps no per-open state */
+{
+ return 0; /* always succeeds */
+}
+
+static int img_dump_release(struct inode *inode, struct file *filp) /* release handler: nothing to tear down */
+{
+ return 0; /* always succeeds */
+}
+
+const struct file_operations proc_pid_dump_operations = { /* file operations behind the "dump" proc entry */
+ .open = img_dump_open,
+ .read = img_dump_read, /* generates the image on the fly */
+ .release = img_dump_release,
+};
Index: linux-2.6.git/include/linux/binfmt_img.h
===================================================================
--- /dev/null
+++ linux-2.6.git/include/linux/binfmt_img.h
@@ -0,0 +1,87 @@
+#ifndef __BINFMT_IMG_H__
+#define __BINFMT_IMG_H__
+
+#include <linux/types.h>
+
+struct binfmt_img_header { /* first record of an image */
+ __u32 magic; /* BINFMT_IMG_MAGIC */
+ __u32 version; /* BINFMT_IMG_VERS_0 */
+};
+
+#define CKPT_TLS_ENTRIES 3
+
+struct binfmt_regs_image { /* register record, follows the header */
+ __u64 r15;
+ __u64 r14;
+ __u64 r13;
+ __u64 r12;
+ __u64 r11;
+ __u64 r10;
+ __u64 r9;
+ __u64 r8;
+ __u64 ax;
+ __u64 orig_ax;
+ __u64 bx;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 ip;
+ __u64 flags;
+ __u64 bp;
+ __u64 sp;
+
+ __u64 gs; /* gs base; written as 0 when gsindex is non-zero */
+ __u64 fs; /* fs base; written as 0 when fsindex is non-zero */
+ __u64 tls[CKPT_TLS_ENTRIES]; /* TLS descriptors packed as (a << 32) + b */
+ __u16 gsindex; /* selectors below use the CKPT_X86_SEG_* encoding */
+ __u16 fsindex;
+ __u16 cs;
+ __u16 ss;
+ __u16 ds;
+ __u16 es;
+};
+
+#define CKPT_X86_SEG_NULL 0
+#define CKPT_X86_SEG_USER32_CS 1
+#define CKPT_X86_SEG_USER32_DS 2
+#define CKPT_X86_SEG_USER64_CS 3
+#define CKPT_X86_SEG_USER64_DS 4
+#define CKPT_X86_SEG_TLS 0x4000
+#define CKPT_X86_SEG_LDT 0x8000
+
+struct binfmt_mm_image { /* copy of the mm_struct fields of the same names */
+ __u64 flags;
+ __u64 def_flags;
+ __u64 start_code;
+ __u64 end_code;
+ __u64 start_data;
+ __u64 end_data;
+ __u64 start_brk;
+ __u64 brk;
+ __u64 start_stack;
+ __u64 arg_start;
+ __u64 arg_end;
+ __u64 env_start;
+ __u64 env_end;
+ __u32 exe_fd; /* 0 = none; otherwise an fd the loader installs as exe_file */
+};
+
+struct binfmt_vma_image { /* one mapping; an all-zero record ends the list */
+ __u32 prot; /* PROT_* bits */
+ __u32 flags; /* MAP_* bits */
+ __u32 pad;
+ __u32 fd; /* 0 = no backing file */
+ __u64 start;
+ __u64 end;
+ __u64 pgoff;
+};
+
+struct binfmt_page_image { /* header of one page record; PAGE_SIZE bytes of data follow */
+ __u64 vaddr; /* page address; 0 ends the page list */
+};
+
+#define BINFMT_IMG_MAGIC 0xa75b8d43
+#define BINFMT_IMG_VERS_0 0x00000100
+
+#endif
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -102,6 +102,8 @@ struct vmcore {
#ifdef CONFIG_PROC_FS
+extern const struct file_operations proc_pid_dump_operations;
+
extern void proc_root_init(void);
void proc_flush_task(struct task_struct *task);

View File

@ -0,0 +1,371 @@
From 0f8e07457aa91e9461665440ca258eb9f93bf2f9 Mon Sep 17 00:00:00 2001
From: root <root@ovzept.sw.ru>
Date: Fri, 3 Jun 2011 18:16:43 +0400
Subject: [PATCH] Images execution binfmt handler
---
fs/Kconfig.binfmt | 6 +
fs/Makefile | 1 +
fs/binfmt_img.c | 324 +++++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 331 insertions(+), 0 deletions(-)
create mode 100644 fs/binfmt_img.c
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 79e2ca7..0b2f48e 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -161,3 +161,9 @@ config BINFMT_MISC
You may say M here for module support and later load the module when
you have use for it; the module is called binfmt_misc. If you
don't know what to answer at this point, say Y.
+
+config BINFMT_IMG
+ tristate "Kernel support for IMG binaries"
+ depends on X86
+ help
+ Say M/Y here to enable support for checkpoint-restore images execution
diff --git a/fs/Makefile b/fs/Makefile
index fb68c2b..8221719 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_NFSD_DEPRECATED) += nfsctl.o
obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o
obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o
obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o
+obj-$(CONFIG_BINFMT_IMG) += binfmt_img.o
# binfmt_script is always there
obj-y += binfmt_script.o
diff --git a/fs/binfmt_img.c b/fs/binfmt_img.c
new file mode 100644
index 0000000..9b09797
--- /dev/null
+++ b/fs/binfmt_img.c
@@ -0,0 +1,324 @@
+#include <linux/binfmt_img.h>
+#include <linux/module.h>
+#include <linux/binfmts.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <asm/tlbflush.h>
+#include <asm/desc.h>
+
+/*
+ * The binary handler to save and restore a single task state
+ */
+
+/*
+ * Validate the image header at the start of @buf.
+ *
+ * Returns the header size on success, -ENOEXEC when the magic does not
+ * match, -EINVAL when the magic matches but the version does not.
+ */
+static int img_check_header(void *buf)
+{
+	const struct binfmt_img_header *h = buf;
+
+	if (h->magic == BINFMT_IMG_MAGIC)
+		return h->version == BINFMT_IMG_VERS_0 ? (int)sizeof(*h) : -EINVAL;
+
+	return -ENOEXEC;
+}
+
+/*
+ * Turn a CKPT_X86_SEG_* value from the image back into a segment selector.
+ * Fixed codes map to the null / flat user selectors; values tagged with
+ * CKPT_X86_SEG_TLS or CKPT_X86_SEG_LDT carry an index in the remaining
+ * bits.  Anything else hits BUG().
+ */
+static unsigned short decode_segment(__u16 seg)
+{
+	switch (seg) {
+	case CKPT_X86_SEG_NULL:
+		return 0;
+	case CKPT_X86_SEG_USER64_CS:
+		return __USER_CS;
+	case CKPT_X86_SEG_USER64_DS:
+		return __USER_DS;
+#ifdef CONFIG_COMPAT
+	case CKPT_X86_SEG_USER32_CS:
+		return __USER32_CS;
+	case CKPT_X86_SEG_USER32_DS:
+		return __USER32_DS;
+#endif
+	default:
+		break;
+	}
+
+	/* TLS is tested before LDT, as a value could carry both tags. */
+	if (seg & CKPT_X86_SEG_TLS)
+		return ((GDT_ENTRY_TLS_MIN + (seg & ~CKPT_X86_SEG_TLS)) << 3) | 3;
+
+	if (seg & CKPT_X86_SEG_LDT)
+		return ((seg & ~CKPT_X86_SEG_LDT) << 3) | 7;
+
+	BUG();
+}
+
+/* Split a packed 64-bit TLS word: upper half into ->a, lower half into ->b. */
+static void decode_tls(struct desc_struct *d, __u64 val)
+{
+	unsigned int hi = (unsigned int)(val >> 32);
+	unsigned int lo = (unsigned int)(val & 0xFFFFFFFF);
+
+	d->b = lo;
+	d->a = hi;
+}
+
+static int img_restore_regs(struct linux_binprm *bprm, loff_t off, struct pt_regs *regs) /* load the register record at @off into @regs and current->thread */
+{
+ int ret, i;
+ struct binfmt_regs_image regi;
+ struct thread_struct *th = &current->thread;
+ unsigned short seg;
+
+ ret = kernel_read(bprm->file, off, (char *)&regi, sizeof(regi));
+ if (ret != sizeof(regi)) /* short read or read error */
+ return -EIO;
+
+ regs->r15 = regi.r15;
+ regs->r14 = regi.r14;
+ regs->r13 = regi.r13;
+ regs->r12 = regi.r12;
+ regs->r11 = regi.r11;
+ regs->r10 = regi.r10;
+ regs->r9 = regi.r9;
+ regs->r8 = regi.r8;
+ regs->ax = regi.ax;
+ regs->orig_ax = regi.orig_ax;
+ regs->bx = regi.bx;
+ regs->cx = regi.cx;
+ regs->dx = regi.dx;
+ regs->si = regi.si;
+ regs->di = regi.di;
+ regs->ip = regi.ip;
+ regs->flags = regi.flags; /* NOTE(review): taken verbatim from the image - confirm whether any bits must be masked */
+ regs->bp = regi.bp;
+ regs->sp = regi.sp;
+
+ regs->cs = decode_segment(regi.cs);
+ regs->ss = decode_segment(regi.ss);
+
+ th->usersp = regi.sp;
+ th->ds = decode_segment(regi.ds);
+ th->es = decode_segment(regi.es);
+ th->fsindex = decode_segment(regi.fsindex);
+ th->gsindex = decode_segment(regi.gsindex);
+
+ th->fs = regi.fs;
+ th->gs = regi.gs;
+
+ BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+ decode_tls(&th->tls_array[i], regi.tls[i]);
+
+ load_TLS(th, smp_processor_id()); /* TLS descriptors are loaded before the selectors below are */
+
+ seg = th->fsindex;
+ loadsegment(fs, seg);
+ savesegment(fs, seg); /* read the selector back to check that the load took effect */
+ if (seg != th->fsindex) {
+ printk("ERROR saving fs selector want %x, has %x\n",
+ (unsigned int)th->fsindex, (unsigned int)seg);
+ return -EFAULT;
+ }
+
+ if (th->fs) /* a zero base is not written to the MSR */
+ wrmsrl(MSR_FS_BASE, th->fs);
+ load_gs_index(th->gsindex);
+ if (th->gs) /* a zero base is not written to the MSR */
+ wrmsrl(MSR_KERNEL_GS_BASE, th->gs);
+
+ return sizeof(regi); /* bytes consumed from the image */
+}
+
+/*
+ * Load the mm record at @off and copy its fields into current->mm.
+ * When the record carries a non-zero exe_fd, the file behind that
+ * descriptor replaces mm->exe_file.
+ *
+ * Returns the record size on success, -EIO on a short read, -EBADF when
+ * exe_fd does not name an open file.
+ */
+static int img_restore_mm(struct linux_binprm *bprm, loff_t off)
+{
+	int ret;
+	struct binfmt_mm_image mmi;
+	struct mm_struct *mm = current->mm;
+
+	ret = kernel_read(bprm->file, off, (char *)&mmi, sizeof(mmi));
+	if (ret != sizeof(mmi))
+		return -EIO;
+
+	mm->flags = mmi.flags;
+	mm->def_flags = mmi.def_flags;
+	mm->start_code = mmi.start_code;
+	mm->end_code = mmi.end_code;
+	mm->start_data = mmi.start_data;
+	mm->end_data = mmi.end_data;
+	mm->start_brk = mmi.start_brk;
+	mm->brk = mmi.brk;
+	mm->start_stack = mmi.start_stack;
+	mm->arg_start = mmi.arg_start;
+	mm->arg_end = mmi.arg_end;
+	mm->env_start = mmi.env_start;
+	mm->env_end = mmi.env_end;
+
+	if (mmi.exe_fd != 0) {
+		struct file *f;
+
+		f = fget(mmi.exe_fd);
+		if (f == NULL)
+			return -EBADF;
+
+		/* fput() dereferences its argument, so skip it when no file is set. */
+		if (mm->exe_file)
+			fput(mm->exe_file);
+		mm->exe_file = f;
+	}
+
+	return sizeof(mmi);
+}
+
+/*
+ * Drop every existing mapping of current->mm, then recreate the mappings
+ * described by the VMA records starting at @off, up to and including the
+ * all-zero terminator.
+ *
+ * A record with fd == 0 is mapped anonymously; otherwise the descriptor is
+ * used as the backing file and closed afterwards.  The mapping that
+ * contains mm->start_stack is additionally marked MAP_GROWSDOWN.
+ *
+ * Returns the number of image bytes consumed, -EIO on a short read,
+ * -EBADF on a bad descriptor, -ENXIO when a mapping could not be placed at
+ * its recorded address.
+ */
+static int img_restore_vmas(struct linux_binprm *bprm, loff_t off)
+{
+	struct mm_struct *mm = current->mm;
+	int consumed = 0;
+
+	do_munmap(mm, 0, TASK_SIZE);
+
+	for (;;) {
+		struct binfmt_vma_image vmai;
+		struct file *file = NULL;
+		unsigned long mapped;
+
+		consumed += sizeof(vmai);
+
+		if (kernel_read(bprm->file, off, (char *)&vmai, sizeof(vmai)) != sizeof(vmai))
+			return -EIO;
+
+		if (!vmai.start && !vmai.end)
+			break;
+
+		if (vmai.fd == 0) {
+			vmai.flags |= MAP_ANONYMOUS;
+		} else {
+			file = fget(vmai.fd);
+			if (!file)
+				return -EBADF;
+		}
+
+		if (vmai.start <= mm->start_stack && mm->start_stack <= vmai.end)
+			vmai.flags |= MAP_GROWSDOWN;
+
+		mapped = do_mmap_pgoff(file, vmai.start, vmai.end - vmai.start,
+				vmai.prot, vmai.flags | MAP_FIXED, vmai.pgoff);
+
+		if (vmai.fd != 0) {
+			fput(file);
+			do_close(vmai.fd);
+		}
+
+		if ((long)mapped < 0 || mapped != vmai.start)
+			return -ENXIO;
+
+		off += sizeof(vmai);
+	}
+
+	return consumed;
+}
+
+/*
+ * Fill the freshly created mappings with the page records that start at
+ * @off.  Each record is a binfmt_page_image header followed by PAGE_SIZE
+ * bytes; a header with vaddr == 0 ends the list.
+ *
+ * Returns the number of image bytes consumed, -EIO when a header cannot be
+ * read, -ESRCH when find_vma() finds no VMA for the address, -EFAULT when
+ * the page cannot be pinned or its data cannot be read in full.
+ */
+static int img_restore_pages(struct linux_binprm *bprm, loff_t off)
+{
+	struct mm_struct *mm = current->mm;
+	int consumed = 0;
+
+	for (;;) {
+		struct binfmt_page_image pgi;
+		struct page *page;
+		void *kaddr;
+		int n;
+
+		n = kernel_read(bprm->file, off, (char *)&pgi, sizeof(pgi));
+		if (n != sizeof(pgi))
+			return -EIO;
+
+		consumed += sizeof(pgi);
+		if (!pgi.vaddr)
+			break;
+
+		if (!find_vma(mm, pgi.vaddr))
+			return -ESRCH;
+
+		n = get_user_pages(current, current->mm, (unsigned long)pgi.vaddr,
+				1, 1, 1, &page, NULL);
+		if (n != 1)
+			return -EFAULT;
+
+		/* Read the page payload straight into the pinned page. */
+		kaddr = kmap(page);
+		n = kernel_read(bprm->file, off + sizeof(pgi), kaddr, PAGE_SIZE);
+		kunmap(page);
+		put_page(page);
+
+		if (n != PAGE_SIZE)
+			return -EFAULT;
+
+		consumed += PAGE_SIZE;
+		off += sizeof(pgi) + PAGE_SIZE;
+	}
+
+	return consumed;
+}
+
+/*
+ * Restore the memory part of the image starting at @off: the mm record,
+ * the VMA records, then the page records, in that order.
+ *
+ * Returns the first negative error from a stage, otherwise the image
+ * offset just past the last record (@off plus everything consumed).
+ */
+static int img_restore_mem(struct linux_binprm *bprm, loff_t off)
+{
+	loff_t cur = off;
+	int n;
+
+	n = img_restore_mm(bprm, cur);
+	if (n >= 0) {
+		cur += n;
+		n = img_restore_vmas(bprm, cur);
+	}
+	if (n >= 0) {
+		cur += n;
+		n = img_restore_pages(bprm, cur);
+	}
+	if (n < 0)
+		return n;
+
+	return cur + n;
+}
+
+/*
+ * load_binary handler: check the header held in bprm->buf, then restore
+ * registers and memory from the records that follow it.
+ *
+ * Returns 0 on success or the first negative error from a stage.
+ */
+static int img_load_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+{
+	loff_t off = 0;
+	int n;
+
+	n = img_check_header(bprm->buf);
+	if (n < 0)
+		return n;
+	off += n;
+
+	n = img_restore_regs(bprm, off, regs);
+	if (n < 0)
+		return n;
+	off += n;
+
+	n = img_restore_mem(bprm, off);
+	return n < 0 ? n : 0;
+}
+
+static struct linux_binfmt img_binfmt = { /* binfmt descriptor registered at module init */
+ .module = THIS_MODULE,
+ .load_binary = img_load_binary, /* the only handler provided */
+};
+
+static __init int img_binfmt_init(void) /* module init: register the handler */
+{
+ return register_binfmt(&img_binfmt); /* result is passed through to the caller */
+}
+
+static __exit void img_binfmt_exit(void) /* module exit: unregister the handler */
+{
+ unregister_binfmt(&img_binfmt);
+}
+
+module_init(img_binfmt_init);
+module_exit(img_binfmt_exit);
+MODULE_LICENSE("GPL");
--
1.5.5.6

96
xemul/binfmt_img.h Normal file
View File

@ -0,0 +1,96 @@
#ifndef __BINFMT_IMG_H__
#define __BINFMT_IMG_H__
#include <linux/types.h>
#define __packed __attribute__((packed))
/* Image file header (packed layout). */
struct binfmt_img_header {
__u32 magic; /* BINFMT_IMG_MAGIC */
__u32 version; /* BINFMT_IMG_VERS_0 */
__u16 arch;
__u16 flags;
} __packed;
#define CKPT_TLS_ENTRIES 3
/*
 * Register record.  The union with dummy[32] fixes the record size at
 * 32 * 8 = 256 bytes regardless of how many fields "r" holds.
 */
struct binfmt_regs_image {
union {
struct {
__u64 r15;
__u64 r14;
__u64 r13;
__u64 r12;
__u64 r11;
__u64 r10;
__u64 r9;
__u64 r8;
__u64 ax;
__u64 orig_ax;
__u64 bx;
__u64 cx;
__u64 dx;
__u64 si;
__u64 di;
__u64 ip;
__u64 flags;
__u64 bp;
__u64 sp;
__u64 gs;
__u64 fs;
__u64 tls[CKPT_TLS_ENTRIES];
/* segment selectors, 16 bits each */
__u16 gsindex;
__u16 fsindex;
__u16 cs;
__u16 ss;
__u16 ds;
__u16 es;
} r;
__u64 dummy[32]; /* size placeholder only */
};
} __packed;
#define CKPT_X86_SEG_NULL 0
#define CKPT_X86_SEG_USER32_CS 1
#define CKPT_X86_SEG_USER32_DS 2
#define CKPT_X86_SEG_USER64_CS 3
#define CKPT_X86_SEG_USER64_DS 4
#define CKPT_X86_SEG_TLS 0x4000
#define CKPT_X86_SEG_LDT 0x8000
/* Address-space summary record (packed, so no tail padding after exe_fd). */
struct binfmt_mm_image {
__u64 flags;
__u64 def_flags;
__u64 start_code;
__u64 end_code;
__u64 start_data;
__u64 end_data;
__u64 start_brk;
__u64 brk;
__u64 start_stack;
__u64 arg_start;
__u64 arg_end;
__u64 env_start;
__u64 env_end;
__u32 exe_fd;
} __packed;
/* One memory mapping record. */
struct binfmt_vma_image {
__u32 prot;
__u32 flags;
__u32 pad; /* keeps the 64-bit fields below 8-byte aligned */
__u32 fd;
__u64 start;
__u64 end;
__u64 pgoff;
} __packed;
/* Header of one page record; holds the page's virtual address. */
struct binfmt_page_image {
__u64 vaddr;
} __packed;
#define BINFMT_IMG_MAGIC 0xa75b8d43
#define BINFMT_IMG_VERS_0 0x00000100
#endif

781
xemul/cr-dump.c Normal file
View File

@ -0,0 +1,781 @@
#include <stdio.h>
#include <unistd.h>
#include <signal.h>
#include <dirent.h>
#include <string.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <errno.h>
#include <linux/kdev_t.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <sys/vfs.h>
#include <linux/types.h>
#include "img_structs.h"
static int fdinfo_img;
static int pages_img;
static int core_img;
static int shmem_img;
static int pipes_img;
#define PIPEFS_MAGIC 0x50495045
/*
 * Create "<kind>-<pid>.img" (must not exist yet) and, when @magic is not
 * NULL, write that 32-bit magic as the first bytes of the file.
 *
 * Returns the open descriptor, or -1 after printing a diagnostic.
 */
static int open_img_file(const char *kind, int pid, const __u32 *magic)
{
	char name[64];
	int fd;

	snprintf(name, sizeof(name), "%s-%d.img", kind, pid);
	fd = open(name, O_WRONLY | O_CREAT | O_EXCL, 0600);
	if (fd < 0) {
		fprintf(stderr, "Can't open %s: %s\n", kind, strerror(errno));
		return -1;
	}

	if (magic && write(fd, magic, sizeof(*magic)) != (ssize_t)sizeof(*magic)) {
		fprintf(stderr, "Can't write %s magic: %s\n", kind, strerror(errno));
		close(fd);
		return -1;
	}

	return fd;
}

/*
 * Create the per-task image files (fdinfo, pages, core, shmem, pipes) for
 * @pid and store their descriptors in the file-scope *_img variables.
 * Every file except the core image starts with its magic number.
 *
 * Returns 0 on success, 1 on the first failure.  The diagnostic now names
 * the file that actually failed (the pages image used to be reported as
 * "shmem") and a failed magic write is no longer ignored.
 */
static int prep_img_files(int pid)
{
	__u32 type;

	type = FDINFO_MAGIC;
	fdinfo_img = open_img_file("fdinfo", pid, &type);
	if (fdinfo_img < 0)
		return 1;

	type = PAGES_MAGIC;
	pages_img = open_img_file("pages", pid, &type);
	if (pages_img < 0)
		return 1;

	/* The core image gets no magic here. */
	core_img = open_img_file("core", pid, NULL);
	if (core_img < 0)
		return 1;

	type = SHMEM_MAGIC;
	shmem_img = open_img_file("shmem", pid, &type);
	if (shmem_img < 0)
		return 1;

	type = PIPES_MAGIC;
	pipes_img = open_img_file("pipes", pid, &type);
	if (pipes_img < 0)
		return 1;

	return 0;
}
/*
 * Placeholder called on the dump error paths; it currently does nothing,
 * so image files created before the failure stay on disk.
 */
static void kill_imgfiles(int pid)
{
/* FIXME */
}
/* Send SIGSTOP to @pid; returns kill()'s result (0 on success). */
static int stop_task(int pid)
{
	int rc = kill(pid, SIGSTOP);

	return rc;
}
/* Send SIGCONT to @pid; a failure is reported but not propagated. */
static void continue_task(int pid)
{
	if (kill(pid, SIGCONT) == 0)
		return;

	perror("Can't cont task");
}
static char big_tmp_str[PATH_MAX];
/*
 * Read the file position and open flags of descriptor @fd of task @pid
 * from /proc/<pid>/fdinfo/<fd>.
 *
 * @pos and @flags are written only on success.
 * Returns 0 on success, 1 when the file cannot be opened, read or parsed.
 */
static int read_fd_params(int pid, char *fd, unsigned long *pos, unsigned int *flags)
{
	char fd_str[128];
	long long fpos;
	ssize_t len;
	int ifd;

	snprintf(fd_str, sizeof(fd_str), "/proc/%d/fdinfo/%s", pid, fd);
	printf("\tGetting fdinfo for fd %s\n", fd);
	ifd = open(fd_str, O_RDONLY);
	if (ifd < 0) {
		perror("Can't open fdinfo");
		return 1;
	}

	/* Leave room for the terminator: the buffer is shared and may hold stale text. */
	len = read(ifd, big_tmp_str, sizeof(big_tmp_str) - 1);
	if (len < 0) {
		perror("Can't read fdinfo");
		close(ifd);
		return 1;
	}
	close(ifd);
	big_tmp_str[len] = '\0';

	/* %lli needs a long long target; convert to the caller's type afterwards. */
	if (sscanf(big_tmp_str, "pos:\t%lli\nflags:\t%o\n", &fpos, flags) != 2) {
		fprintf(stderr, "Can't parse fdinfo for fd %s\n", fd);
		return 1;
	}

	*pos = fpos;
	return 0;
}
/*
 * Append one fdinfo record for the file behind local descriptor @lfd:
 * a struct fdinfo_entry followed by the file's path, which is resolved
 * through /proc/self/fd/<lfd>.
 *
 * @type:    record type (FDINFO_FD or FDINFO_MAP)
 * @fd_name: descriptor number or mapping start address stored in the record
 * @lclose:  when non-zero, @lfd is closed before returning, on success
 *           and on failure alike
 *
 * Returns 0 on success, 1 on failure.
 */
static int dump_one_reg_file(int type, unsigned long fd_name, int lfd,
		int lclose, unsigned long pos, unsigned int flags)
{
	char fd_str[128];
	struct fdinfo_entry e;
	int len, ret = 1;

	snprintf(fd_str, sizeof(fd_str), "/proc/self/fd/%d", lfd);
	len = readlink(fd_str, big_tmp_str, sizeof(big_tmp_str) - 1);
	if (len < 0) {
		perror("Can't readlink fd");
		goto out;
	}
	big_tmp_str[len] = '\0';

	printf("\tDumping path for %lx fd via self %d [%s]\n", fd_name, lfd, big_tmp_str);

	/* Clear the record first so no uninitialised padding reaches the image. */
	memset(&e, 0, sizeof(e));
	e.type = type;
	e.addr = fd_name;
	e.len = len;
	e.pos = pos;
	e.flags = flags;

	if (write(fdinfo_img, &e, sizeof(e)) != (ssize_t)sizeof(e) ||
	    write(fdinfo_img, big_tmp_str, len) != len) {
		perror("Can't write fdinfo entry");
		goto out;
	}

	ret = 0;
out:
	if (lclose)
		close(lfd);
	return ret;
}
#define MAX_PIPE_BUF_SIZE 1024 /* FIXME - this is not so */
#define SPLICE_F_NONBLOCK 0x2
/*
 * Write the pipe record @e followed by up to MAX_PIPE_BUF_SIZE bytes of
 * the data currently queued in the pipe behind @lfd.  The data is
 * duplicated with tee() into a scratch pipe, so the original pipe keeps
 * its contents, and then spliced into the pipes image.
 *
 * Returns 0 on success, 1 on failure.  The scratch pipe is closed on
 * every path.
 */
static int dump_pipe_and_data(int lfd, struct pipes_entry *e)
{
	int steal_pipe[2];
	int ret, err = 1;

	printf("\tDumping data from pipe %x\n", e->pipeid);
	if (pipe(steal_pipe) < 0) {
		perror("Can't create pipe for stealing data");
		return 1;
	}

	ret = tee(lfd, steal_pipe[1], MAX_PIPE_BUF_SIZE, SPLICE_F_NONBLOCK);
	if (ret < 0) {
		if (errno != EAGAIN) {
			perror("Can't pick pipe data");
			goto out;
		}
		/* EAGAIN from the non-blocking tee() is treated as an empty pipe. */
		ret = 0;
	}

	e->bytes = ret;
	if (write(pipes_img, e, sizeof(*e)) != (ssize_t)sizeof(*e)) {
		perror("Can't write pipe entry");
		goto out;
	}

	if (ret) {
		ret = splice(steal_pipe[0], NULL, pipes_img, NULL, ret, 0);
		if (ret < 0) {
			perror("Can't push pipe data");
			goto out;
		}
	}

	err = 0;
out:
	close(steal_pipe[0]);
	close(steal_pipe[1]);
	return err;
}
/*
 * Dump one pipe end.  Ends opened with O_WRONLY get a record with zero
 * data bytes; any other end is handed to dump_pipe_and_data(), which also
 * saves the queued data.
 */
static int dump_one_pipe(int fd, int lfd, unsigned int id, unsigned int flags)
{
	struct pipes_entry e;

	printf("\tDumping pipe %d/%x flags %x\n", fd, id, flags);

	e.fd = fd;
	e.pipeid = id;
	e.flags = flags;

	if (!(flags & O_WRONLY))
		return dump_pipe_and_data(lfd, &e);

	e.bytes = 0;
	write(pipes_img, &e, sizeof(e));
	return 0;
}
/*
 * Dump descriptor @fd_name of the target task.  @dir is an open
 * /proc/<pid>/fd directory; the descriptor is reopened through it.
 *
 * Regular files and pipefs pipes are dumped; descriptors 0-3 of any other
 * type are skipped with a message; everything else is an error.
 *
 * Returns 0 on success, 1 on failure.  The local descriptor is closed on
 * every path (the regular-file path delegates that to
 * dump_one_reg_file() via lclose == 1).
 */
static int dump_one_fd(int dir, char *fd_name, unsigned long pos, unsigned int flags)
{
	static const struct {
		const char *fd;
		const char *what;
	} skipped[] = {
		{ "0", "stdin" },
		{ "1", "stdout" },
		{ "2", "stderr" },
		{ "3", "tty" },
	};
	struct stat st_buf;
	struct statfs stfs_buf;
	size_t i;
	int fd, ret = 1;

	printf("\tDumping fd %s\n", fd_name);
	fd = openat(dir, fd_name, O_RDONLY);
	if (fd == -1) {
		printf("Tried to openat %d/%d %s\n", getpid(), dir, fd_name);
		perror("Can't open fd");
		return 1;
	}

	if (fstat(fd, &st_buf) < 0) {
		perror("Can't stat one");
		goto out;
	}

	if (S_ISREG(st_buf.st_mode))
		return dump_one_reg_file(FDINFO_FD, atoi(fd_name), fd, 1, pos, flags);

	if (S_ISFIFO(st_buf.st_mode)) {
		if (fstatfs(fd, &stfs_buf) < 0) {
			perror("Can't statfs one");
			goto out;
		}
		if (stfs_buf.f_type == PIPEFS_MAGIC) {
			ret = dump_one_pipe(atoi(fd_name), fd, st_buf.st_ino, flags);
			goto out;
		}
	}

	for (i = 0; i < sizeof(skipped) / sizeof(skipped[0]); i++) {
		if (!strcmp(fd_name, skipped[i].fd)) {
			printf("\tSkipping %s\n", skipped[i].what);
			ret = 0;
			goto out;
		}
	}

	fprintf(stderr, "Can't dump file %s of that type [%x]\n", fd_name, st_buf.st_mode);
out:
	close(fd);
	return ret;
}
/*
 * Dump every open descriptor of task @pid by walking /proc/<pid>/fd.
 *
 * Returns 0 on success, -1 when the directory cannot be opened, 1 when a
 * descriptor fails to dump.  The directory stream is closed on all paths.
 */
static int dump_task_files(int pid)
{
	char pid_fd_dir[64];
	DIR *fd_dir;
	struct dirent *de;
	unsigned long pos;
	unsigned int flags;
	int ret = 0;

	printf("Dumping open files for %d\n", pid);
	snprintf(pid_fd_dir, sizeof(pid_fd_dir), "/proc/%d/fd", pid);
	fd_dir = opendir(pid_fd_dir);
	if (fd_dir == NULL) {
		perror("Can't open fd dir");
		return -1;
	}

	while ((de = readdir(fd_dir)) != NULL) {
		/* Skip "." and ".." (and anything else starting with a dot). */
		if (de->d_name[0] == '.')
			continue;

		if (read_fd_params(pid, de->d_name, &pos, &flags) ||
		    dump_one_fd(dirfd(fd_dir), de->d_name, pos, flags)) {
			ret = 1;
			break;
		}
	}

	closedir(fd_dir);
	return ret;
}
#define PAGE_SIZE 4096
#define PAGE_RSS 0x1
/*
 * Parse an unprefixed hexadecimal number at the start of @str.
 * Both digit cases are accepted and parsing stops at the first character
 * that is not a hex digit.  When @end is not NULL it receives a pointer
 * to that character.  Returns the parsed value (0 if there are no digits).
 */
static unsigned long rawhex(char *str, char **end)
{
	unsigned long value = 0;
	char *p;

	for (p = str; ; p++) {
		unsigned int digit;
		char c = *p;

		if (c >= '0' && c <= '9')
			digit = c - '0';
		else if (c >= 'a' && c <= 'f')
			digit = c - 'a' + 0xA;
		else if (c >= 'A' && c <= 'F')
			digit = c - 'A' + 0xA;
		else
			break;

		value = (value << 4) + digit;
	}

	if (end)
		*end = p;
	return value;
}
/*
 * Extract the file offset and the length of a mapping from one line of
 * /proc/<pid>/maps ("start-end perms offset ...").
 *
 * @pgoff receives the offset field and @len receives end - start; both
 * must be multiples of PAGE_SIZE.  Any malformed line prints "BUG" and
 * terminates the program.
 */
static void map_desc_parm(char *desc, unsigned long *pgoff, unsigned long *len)
{
	char *s;
	unsigned long start, end;

	start = rawhex(desc, &s);
	if (*s != '-')
		goto bug;

	end = rawhex(s + 1, &s);
	if (*s != ' ')
		goto bug;

	/* Skip the permissions field; a line without it is malformed. */
	s = strchr(s + 1, ' ');
	if (s == NULL)
		goto bug;

	*pgoff = rawhex(s + 1, &s);
	if (*s != ' ')
		goto bug;

	if (start > end)
		goto bug;

	*len = end - start;
	if (*len % PAGE_SIZE)
		goto bug;
	if (*pgoff % PAGE_SIZE)
		goto bug;

	return;
bug:
	fprintf(stderr, "BUG\n");
	exit(1);
}
/*
 * Save the resident pages of the object behind @lfd to the pages image.
 *
 * The range [@pgoff, @pgoff + @len) of @lfd is mapped privately,
 * mincore() tells which pages are resident, and each resident page is
 * written as a 64-bit virtual address (@start plus the page's offset)
 * followed by PAGE_SIZE bytes of data.
 *
 * Returns 0 on success, 1 on failure.  The residency vector and the
 * temporary mapping are released on every path.
 */
static int dump_map_pages(int lfd, unsigned long start, unsigned long pgoff, unsigned long len)
{
	unsigned long nrpages, pfn;
	unsigned char *mc;
	char *mem;
	int ret = 1;

	printf("\t\tDumping pages start %lx len %lx off %lx\n", start, len, pgoff);
	mem = mmap(NULL, len, PROT_READ, MAP_FILE | MAP_PRIVATE, lfd, pgoff);
	if (mem == MAP_FAILED) {
		perror("Can't map");
		return 1;
	}

	nrpages = len / PAGE_SIZE;
	mc = malloc(nrpages);
	if (mc == NULL) {
		perror("Can't allocate mincore vector");
		goto out;
	}

	if (mincore(mem, len, mc)) {
		perror("Can't mincore mapping");
		goto out;
	}

	/* pfn is unsigned long so pfn * PAGE_SIZE cannot wrap at 4GB. */
	for (pfn = 0; pfn < nrpages; pfn++) {
		__u64 vaddr;

		if (!(mc[pfn] & PAGE_RSS))
			continue;

		vaddr = start + pfn * PAGE_SIZE;
		if (write(pages_img, &vaddr, sizeof(vaddr)) != (ssize_t)sizeof(vaddr) ||
		    write(pages_img, mem + pfn * PAGE_SIZE, PAGE_SIZE) != PAGE_SIZE) {
			perror("Can't write page");
			goto out;
		}
	}

	ret = 0;
out:
	free(mc);
	munmap(mem, len);
	return ret;
}
/* Anonymous private mappings are not dumped here: report the skip and return 0. */
static int dump_anon_private_map(char *start)
{
	const int ok = 0;

	fprintf(stdout, "\tSkipping anon private mapping at %s\n", start);
	return ok;
}
/*
 * Dump an anonymous shared mapping: write a shmem record (address range
 * plus the inode number of the backing object, used as the shmem id) and
 * then the mapping's resident pages.
 *
 * Takes ownership of @lfd and closes it on every path.
 * Returns 0 on success, 1 on failure.
 */
static int dump_anon_shared_map(char *_start, char *mdesc, int lfd, struct stat *st)
{
	unsigned long pgoff, len;
	struct shmem_entry e;
	unsigned long start;
	int ret = 1;

	map_desc_parm(mdesc, &pgoff, &len);
	start = rawhex(_start, NULL);

	/* Clear the record first so no uninitialised padding reaches the image. */
	memset(&e, 0, sizeof(e));
	e.start = start;
	e.end = start + len;
	e.shmid = st->st_ino;
	if (write(shmem_img, &e, sizeof(e)) != (ssize_t)sizeof(e)) {
		perror("Can't write shmem entry");
		goto out;
	}

	if (dump_map_pages(lfd, start, pgoff, len))
		goto out;

	ret = 0;
out:
	close(lfd);
	return ret;
}
/*
 * File-backed shared mappings are not dumped: report the skip, release
 * the descriptor the caller handed over and return 0.
 */
static int dump_file_shared_map(char *start, char *mdesc, int lfd)
{
	(void)mdesc;

	fprintf(stdout, "\tSkipping file shared mapping at %s\n", start);
	close(lfd);

	return 0;
}
/*
 * Dump a file-backed private mapping as an FDINFO_MAP record keyed by the
 * mapping's start address.
 *
 * Takes ownership of @lfd and closes it on every path.
 * Returns 0 on success, 1 on failure.
 */
static int dump_file_private_map(char *_start, char *mdesc, int lfd)
{
	unsigned long pgoff, len;
	unsigned long start;
	int ret;

	/* Only validates the line here (it exits on a malformed one); the values are unused. */
	map_desc_parm(mdesc, &pgoff, &len);
	start = rawhex(_start, NULL);

	ret = dump_one_reg_file(FDINFO_MAP, start, lfd, 0, 0, O_RDONLY) ? 1 : 0;

	close(lfd);
	return ret;
}
/*
 * Dump the mapping described by @mdesc, one line of /proc/<pid>/maps.
 *
 * The mapping's backing object is opened through @mfd_dir under the name
 * of its start address.  ENOENT there means an anonymous private mapping;
 * an object on a device with major 0 is treated as anonymous shared
 * memory; anything else is a file mapping, private or shared according to
 * the fourth permission character.
 *
 * Returns 0 on success, 1 on failure.  The helpers take over the opened
 * descriptor; on the local error paths it is closed here.
 */
static int dump_one_mapping(char *mdesc, DIR *mfd_dir)
{
	char *flags, *dash;
	char map_start[32];
	struct stat st_buf;
	size_t start_len;
	int lfd;

	/* Expect "start-end perms ...", with at least four permission characters. */
	dash = strchr(mdesc, '-');
	flags = strchr(mdesc, ' ');
	if (dash == NULL || flags == NULL || dash > flags || strlen(flags + 1) < 4) {
		fprintf(stderr, "Bogus mapping [%s]\n", mdesc);
		return 1;
	}
	flags++;

	start_len = dash - mdesc;
	if (start_len >= sizeof(map_start)) {
		fprintf(stderr, "Bogus mapping [%s]\n", mdesc);
		return 1;
	}
	memcpy(map_start, mdesc, start_len);
	map_start[start_len] = '\0';

	printf("\tDumping %s\n", map_start);
	lfd = openat(dirfd(mfd_dir), map_start, O_RDONLY);
	if (lfd == -1) {
		if (errno != ENOENT) {
			perror("Can't open mapping");
			return 1;
		}
		if (flags[3] != 'p') {
			fprintf(stderr, "Bogus mapping [%s]\n", mdesc);
			return 1;
		}
		return dump_anon_private_map(map_start);
	}

	if (fstat(lfd, &st_buf) < 0) {
		perror("Can't stat mapping!");
		goto err_close;
	}

	if (!S_ISREG(st_buf.st_mode)) {
		/* Not an errno condition, so perror() would print a stale error. */
		fprintf(stderr, "Can't handle non-regular mapping\n");
		goto err_close;
	}

	if (MAJOR(st_buf.st_dev) == 0) {
		if (flags[3] != 's') {
			fprintf(stderr, "Bogus mapping [%s]\n", mdesc);
			goto err_close;
		}
		/* FIXME - this can be tmpfs visible file mapping */
		return dump_anon_shared_map(map_start, mdesc, lfd, &st_buf);
	}

	if (flags[3] == 'p')
		return dump_file_private_map(map_start, mdesc, lfd);
	else
		return dump_file_shared_map(map_start, mdesc, lfd);

err_close:
	close(lfd);
	return 1;
}
/*
 * Dump all mappings of task @pid: every line of /proc/<pid>/maps is
 * handed to dump_one_mapping() together with the open /proc/<pid>/mfd
 * directory.
 *
 * Returns 0 on success, -1 when the mfd directory cannot be opened, 1 on
 * any other failure.  Both the directory and the maps file are closed on
 * all paths.
 */
static int dump_task_ext_mm(int pid)
{
	char path[64];
	DIR *mfd_dir;
	FILE *maps;
	int ret = 1;

	printf("Dumping mappings for %d\n", pid);
	snprintf(path, sizeof(path), "/proc/%d/mfd", pid);
	mfd_dir = opendir(path);
	if (mfd_dir == NULL) {
		perror("Can't open mfd dir");
		return -1;
	}

	snprintf(path, sizeof(path), "/proc/%d/maps", pid);
	maps = fopen(path, "r");
	if (maps == NULL) {
		perror("Can't open maps file");
		goto out_dir;
	}

	while (fgets(big_tmp_str, sizeof(big_tmp_str), maps) != NULL)
		if (dump_one_mapping(big_tmp_str, mfd_dir))
			goto out_maps;

	ret = 0;
out_maps:
	fclose(maps);
out_dir:
	closedir(mfd_dir);
	return ret;
}
/*
 * Copy the contents of /proc/<pid>/kstate_dump into the core image,
 * 4096 bytes at a time, retrying short writes.
 *
 * Returns 0 on success, 1 on failure.  The dump descriptor and the
 * bounce buffer are released on every path.
 */
static int dump_task_state(int pid)
{
	char path[64];
	int dump_fd, ret = 1;
	char *mem;

	printf("Dumping task image for %d\n", pid);
	snprintf(path, sizeof(path), "/proc/%d/kstate_dump", pid);
	dump_fd = open(path, O_RDONLY);
	if (dump_fd < 0) {
		perror("Can't open dump file");
		return 1;
	}

	/* Anonymous mapping: fd is -1, the portable value when no file backs it. */
	mem = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0);
	if (mem == MAP_FAILED) {
		perror("Can't get mem");
		goto out_close;
	}

	while (1) {
		int r, w;

		r = read(dump_fd, mem, 4096);
		if (r == 0)
			break;
		if (r < 0) {
			perror("Can't read dump file");
			goto out_unmap;
		}

		w = 0;
		while (w < r) {
			int n;

			n = write(core_img, mem + w, r - w);
			if (n <= 0) {
				perror("Can't write core");
				goto out_unmap;
			}
			w += n;
		}
	}

	ret = 0;
out_unmap:
	munmap(mem, 4096);
out_close:
	close(dump_fd);
	return ret;
}
/*
 * Dump a single task: create its image files, then save open files,
 * mappings and the kernel-provided state, in that order.
 *
 * When @stop is non-zero the task is stopped before the dump and resumed
 * afterwards, whether the dump succeeded or not.  Returns 0 on success
 * and 1 on failure.
 */
static int dump_one_task(int pid, int stop)
{
	int failed;

	printf("Dumping task %d\n", pid);

	if (prep_img_files(pid))
		return 1;

	if (stop && stop_task(pid)) {
		kill_imgfiles(pid);
		return 1;
	}

	/* Short-circuit evaluation keeps the stage order and stops at the first failure. */
	failed = dump_task_files(pid) ||
		 dump_task_ext_mm(pid) ||
		 dump_task_state(pid);

	if (stop)
		continue_task(pid);

	if (failed) {
		kill_imgfiles(pid);
		return 1;
	}

	printf("Dump is complete\n");
	return 0;
}
static int pstree_fd;
static char big_tmp_str[4096];
static int *pids, nr_pids;
/*
 * Return the child list of task @pid as found on the "Children:" line of
 * /proc/<pid>/status.
 *
 * The result is a malloc()ed copy of the text that follows the tag and
 * its one-character separator, with the final character (the newline)
 * replaced by a space, so every pid in the list is followed by a space.
 * The caller frees it.
 *
 * Returns NULL when the file cannot be opened, has no such line, or
 * memory is exhausted.
 */
static char *get_children_pids(int pid)
{
	static const char tag[] = "Children:";
	char *ret = NULL;
	FILE *f;

	snprintf(big_tmp_str, sizeof(big_tmp_str), "/proc/%d/status", pid);
	f = fopen(big_tmp_str, "r");
	if (f == NULL)
		return NULL;

	while (fgets(big_tmp_str, sizeof(big_tmp_str), f) != NULL) {
		char *list;
		size_t len;

		if (strncmp(big_tmp_str, tag, sizeof(tag) - 1))
			continue;

		/* Step over the separator, but never past the terminator. */
		list = big_tmp_str + sizeof(tag) - 1;
		if (*list != '\0')
			list++;

		len = strlen(list);
		ret = malloc(len + 1);
		if (ret == NULL) {
			perror("Can't allocate children list");
			break;
		}

		memcpy(ret, list, len + 1);
		if (len)
			ret[len - 1] = ' ';
		break;
	}

	fclose(f);
	return ret;
}
/*
 * Record @pid and, recursively, all its descendants.
 *
 * For each task a pstree_entry (pid and number of children) followed by
 * the children's pids is appended to the pstree image, and the pid is
 * added to the global pids[] array used later for the actual dump.
 *
 * Returns 0 on success, 1 on failure.  The children list is freed on
 * every path.
 */
static int dump_pid_and_children(int pid)
{
	struct pstree_entry e;
	char *chlist, *tmp, *tmp2;
	int *grown;
	int ret = 1;

	printf("\tReading %d children list\n", pid);
	chlist = get_children_pids(pid);
	if (chlist == NULL)
		return 1;

	printf("\t%d has children %s\n", pid, chlist);

	/* Clear the record first so no uninitialised padding reaches the image. */
	memset(&e, 0, sizeof(e));
	e.pid = pid;
	e.nr_children = 0;

	/* Keep the old array reachable if realloc() fails. */
	grown = realloc(pids, (nr_pids + 1) * sizeof(*pids));
	if (grown == NULL) {
		perror("Can't grow pid list");
		goto out;
	}
	pids = grown;
	pids[nr_pids++] = e.pid;

	/* Every pid in the list is followed by a space, so count the spaces. */
	tmp = chlist;
	while ((tmp = strchr(tmp, ' ')) != NULL) {
		tmp++;
		e.nr_children++;
	}

	if (write(pstree_fd, &e, sizeof(e)) != (ssize_t)sizeof(e)) {
		perror("Can't write pstree entry");
		goto out;
	}

	tmp = chlist;
	while (1) {
		__u32 cpid;

		cpid = strtol(tmp, &tmp, 10);
		if (cpid == 0)
			break;
		if (*tmp != ' ') {
			fprintf(stderr, "Error in string with children!\n");
			goto out;
		}
		if (write(pstree_fd, &cpid, sizeof(cpid)) != (ssize_t)sizeof(cpid)) {
			perror("Can't write child pid");
			goto out;
		}
		tmp++;
	}

	/* Recurse into each child; the list is split in place at the spaces. */
	tmp = chlist;
	while ((tmp2 = strchr(tmp, ' ')) != NULL) {
		*tmp2 = '\0';
		if (dump_pid_and_children(atoi(tmp)))
			goto out;
		tmp = tmp2 + 1;
	}

	ret = 0;
out:
	free(chlist);
	return ret;
}
static int __dump_all_tasks(void)
{
int i, pid;
printf("Dumping tasks' images for");
for (i = 0; i < nr_pids; i++)
printf(" %d", pids[i]);
printf("\n");
printf("Stopping tasks\n");
for (i = 0; i < nr_pids; i++)
if (stop_task(pids[i]))
goto err;
for (i = 0; i < nr_pids; i++) {
if (dump_one_task(pids[i], 0))
goto err;
}
printf("Resuming tasks\n");
for (i = 0; i < nr_pids; i++)
continue_task(pids[i]);
return 0;
err:
for (i = 0; i < nr_pids; i++)
continue_task(pids[i]);
return 1;
}
/*
 * Dump the whole process tree rooted at @pid.
 *
 * Creates "pstree-<pid>.img" (the call fails if the file already exists
 * because of O_EXCL), writes PSTREE_MAGIC followed by the tree description
 * produced by dump_pid_and_children(), closes the image and then dumps every
 * collected task via __dump_all_tasks().
 *
 * Returns 0 on success, 1 on failure.  The pstree descriptor is closed on
 * every path after it has been opened.
 */
static int dump_all_tasks(int pid)
{
	__u32 type;
	int ret;

	pids = NULL;
	nr_pids = 0;

	printf("Dumping process tree, start from %d\n", pid);

	snprintf(big_tmp_str, sizeof(big_tmp_str), "pstree-%d.img", pid);
	pstree_fd = open(big_tmp_str, O_WRONLY | O_CREAT | O_EXCL, 0600);
	if (pstree_fd < 0) {
		perror("Can't create pstree");
		return 1;
	}

	type = PSTREE_MAGIC;
	if (write(pstree_fd, &type, sizeof(type)) != (long)sizeof(type)) {
		perror("Can't write pstree magic");
		close(pstree_fd);
		return 1;
	}

	ret = dump_pid_and_children(pid);
	close(pstree_fd);
	if (ret)
		return 1;

	return __dump_all_tasks();
}
/*
 * Entry point of the dumper.
 *
 *   -p <pid>  dump a single task (it is stopped and resumed around the dump)
 *   -t <pid>  dump the process tree rooted at <pid>
 *
 * Anything else prints the usage line and returns 1.
 */
int main(int argc, char **argv)
{
	if (argc == 3 && argv[1][0] == '-') {
		switch (argv[1][1]) {
		case 'p':
			return dump_one_task(atoi(argv[2]), 1);
		case 't':
			return dump_all_tasks(atoi(argv[2]));
		default:
			break;
		}
	}

	printf("Usage: %s (-p|-t) <pid>\n", argv[0]);
	return 1;
}

1115
xemul/cr-restore.c Normal file

File diff suppressed because it is too large Load Diff

354
xemul/img-show.c Normal file
View File

@ -0,0 +1,354 @@
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <linux/types.h>
#include <string.h>
#include "img_structs.h"
#include "binfmt_img.h"
/*
 * Print every record of an fdinfo image.
 *
 * @fd: descriptor positioned right after the image magic
 *
 * Each record is a struct fdinfo_entry followed by e.len bytes of payload,
 * which is printed as a string.  Reading stops at a clean end of file.
 *
 * Returns 0 on success, 1 on a read error, a truncated record or an unknown
 * entry type.
 */
static int show_fdinfo(int fd)
{
	/* e.len is an 8-bit field (see img_structs.h), so the payload plus NUL always fits. */
	char data[1024];
	struct fdinfo_entry e;

	while (1) {
		int ret;

		ret = read(fd, &e, sizeof(e));
		if (ret == 0)
			break;
		if (ret != (int)sizeof(e)) {
			/* errno is only meaningful when read() itself failed. */
			if (ret < 0)
				perror("Can't read");
			else
				fprintf(stderr, "Can't read: truncated fdinfo entry\n");
			return 1;
		}

		ret = read(fd, data, e.len);
		if (ret != e.len) {
			if (ret < 0)
				perror("Can't read");
			else
				fprintf(stderr, "Can't read: truncated fdinfo data\n");
			return 1;
		}
		data[e.len] = '\0';

		/* Casts make the arguments match the conversion specifiers exactly. */
		switch (e.type) {
		case FDINFO_FD:
			printf("fd %d [%s] pos %lx flags %o\n", (int)e.addr, data,
			       (unsigned long)e.pos, (unsigned int)e.flags);
			break;
		case FDINFO_MAP:
			printf("map %llx [%s] flags %o\n", (unsigned long long)e.addr,
			       data, (unsigned int)e.flags);
			break;
		default:
			fprintf(stderr, "Unknown fdinfo entry type %d\n", e.type);
			return 1;
		}
	}

	return 0;
}
/* Size of one page record's payload in a pages image. */
#define PAGE_SIZE 4096

/*
 * Print a one-line summary for every page stored in an image.
 *
 * @fd: seekable descriptor positioned at the first page record
 *
 * Each record is a 64-bit virtual address followed by PAGE_SIZE bytes of
 * data.  The list ends at a clean end of file or at a record whose address
 * is zero.  Only the first and the last 32-bit words of every page are
 * read and printed; the bytes in between are skipped with lseek().
 *
 * Returns 0 on success and 1 when a record is truncated or cannot be read.
 */
static int show_mem(int fd)
{
	__u64 vaddr;
	unsigned int data[2];

	while (1) {
		long ret;

		ret = read(fd, &vaddr, sizeof(vaddr));
		if (ret == 0)
			break;
		if (ret != (long)sizeof(vaddr)) {
			fprintf(stderr, "Can't read page address\n");
			return 1;
		}
		if (vaddr == 0)
			break;

		/*
		 * Seeking past the end of a file succeeds, so a truncated
		 * page is caught by the read of the last word.
		 */
		if (read(fd, &data[0], sizeof(data[0])) != (long)sizeof(data[0]) ||
		    lseek(fd, PAGE_SIZE - 2 * sizeof(unsigned int), SEEK_CUR) < 0 ||
		    read(fd, &data[1], sizeof(data[1])) != (long)sizeof(data[1])) {
			fprintf(stderr, "Can't read page data\n");
			return 1;
		}

		printf("\tpage 0x%lx [%x...%x]\n", (unsigned long)vaddr, data[0], data[1]);
	}

	return 0;
}
/*
 * Handler for images that start with PAGES_MAGIC: the body is a plain list
 * of page records, so the result of show_mem() is passed through unchanged.
 */
static int show_pages(int fd)
{
	int rc = show_mem(fd);

	return rc;
}
/*
 * Print every record of a shmem image as "<start>-<end> <shmid>".
 *
 * @fd: descriptor positioned right after the image magic
 *
 * Returns 0 at a clean end of file, 1 on a read error or a truncated entry.
 */
static int show_shmem(int fd)
{
	int r;
	struct shmem_entry e;

	while (1) {
		r = read(fd, &e, sizeof(e));
		if (r == 0)
			return 0;
		if (r != (int)sizeof(e)) {
			/* errno is only meaningful when read() itself failed. */
			if (r < 0)
				perror("Can't read shmem entry");
			else
				fprintf(stderr, "Can't read shmem entry: truncated\n");
			return 1;
		}

		/* All three fields are 64 bits wide; print each one in full. */
		printf("%016llx-%016llx %016llx\n",
		       (unsigned long long)e.start,
		       (unsigned long long)e.end,
		       (unsigned long long)e.shmid);
	}
}
/*
 * Map a segment value stored in the image to a short printable name.
 *
 * Exact matches against the CKPT_X86_SEG_* constants are tried first; any
 * other value is then tested against the TLS and LDT bit masks, in that
 * order.  Values that match nothing yield "[unknown]".
 *
 * The returned pointer refers to a string literal and must not be modified
 * or freed.
 */
static char *segval(__u16 seg)
{
	char *name = "[unknown]";

	switch (seg) {
	case CKPT_X86_SEG_NULL:
		name = "nul";
		break;
	case CKPT_X86_SEG_USER32_CS:
		name = "cs32";
		break;
	case CKPT_X86_SEG_USER32_DS:
		name = "ds32";
		break;
	case CKPT_X86_SEG_USER64_CS:
		name = "cs64";
		break;
	case CKPT_X86_SEG_USER64_DS:
		name = "ds64";
		break;
	default:
		if (seg & CKPT_X86_SEG_TLS)
			name = "tls";
		else if (seg & CKPT_X86_SEG_LDT)
			name = "ldt";
		break;
	}

	return name;
}
/*
 * Read one struct binfmt_regs_image from @fd and print it.
 *
 * General-purpose registers are printed as 16-digit hex values, segment
 * fields are printed by name via segval(), and the three TLS slots are
 * printed last.
 *
 * Returns 0 on success, 1 if the structure cannot be read in full.
 */
static int show_regs(int fd)
{
	struct binfmt_regs_image ri;

/* Print one field of ri.r under its own name; the strings match the field names. */
#define SHOW_REG_HEX(field)	printf("\t" #field ": %016lx\n", ri.r.field)
#define SHOW_REG_SEG(field)	printf("\t" #field ": %s\n", segval(ri.r.field))

	if (read(fd, &ri, sizeof(ri)) != sizeof(ri)) {
		perror("Can't read registers from image");
		return 1;
	}

	printf("Registers:\n");

	SHOW_REG_HEX(r15);
	SHOW_REG_HEX(r14);
	SHOW_REG_HEX(r13);
	SHOW_REG_HEX(r12);
	SHOW_REG_HEX(r11);
	SHOW_REG_HEX(r10);
	SHOW_REG_HEX(r9);
	SHOW_REG_HEX(r8);
	SHOW_REG_HEX(ax);
	SHOW_REG_HEX(orig_ax);
	SHOW_REG_HEX(bx);
	SHOW_REG_HEX(cx);
	SHOW_REG_HEX(dx);
	SHOW_REG_HEX(si);
	SHOW_REG_HEX(di);
	SHOW_REG_HEX(ip);
	SHOW_REG_HEX(flags);
	SHOW_REG_HEX(bp);
	SHOW_REG_HEX(sp);
	SHOW_REG_HEX(gs);
	SHOW_REG_HEX(fs);

	SHOW_REG_SEG(gsindex);
	SHOW_REG_SEG(fsindex);
	SHOW_REG_SEG(cs);
	SHOW_REG_SEG(ss);
	SHOW_REG_SEG(ds);
	SHOW_REG_SEG(es);

	/* The TLS lines have no colon after the label, so they are spelled out. */
	printf("\ttls0 %016lx\n", ri.r.tls[0]);
	printf("\ttls1 %016lx\n", ri.r.tls[1]);
	printf("\ttls2 %016lx\n", ri.r.tls[2]);

#undef SHOW_REG_SEG
#undef SHOW_REG_HEX

	return 0;
}
/*
 * Read one struct binfmt_mm_image from @fd and print every field as a
 * 16-digit hex value.
 *
 * @stack: out parameter, set to the image's start_stack on success
 *
 * Returns 0 on success, 1 if the structure cannot be read in full (in which
 * case *stack is left untouched).
 */
static int show_mm(int fd, unsigned long *stack)
{
	struct binfmt_mm_image mi;

/* Print one field of mi under its own name. */
#define SHOW_MM_FIELD(field)	printf("\t" #field ": %016lx\n", mi.field)

	if (read(fd, &mi, sizeof(mi)) != sizeof(mi)) {
		perror("Can't read mm from image");
		return 1;
	}

	printf("MM:\n");

	SHOW_MM_FIELD(flags);
	SHOW_MM_FIELD(def_flags);
	SHOW_MM_FIELD(start_code);
	SHOW_MM_FIELD(end_code);
	SHOW_MM_FIELD(start_data);
	SHOW_MM_FIELD(end_data);
	SHOW_MM_FIELD(start_brk);
	SHOW_MM_FIELD(brk);
	SHOW_MM_FIELD(start_stack);
	SHOW_MM_FIELD(arg_start);
	SHOW_MM_FIELD(arg_end);
	SHOW_MM_FIELD(env_start);
	SHOW_MM_FIELD(env_end);

#undef SHOW_MM_FIELD

	*stack = mi.start_stack;
	return 0;
}
/*
 * Print the list of VMA records that follows the mm section of a core image.
 *
 * @fd:    descriptor positioned at the first struct binfmt_vma_image
 * @stack: stack start address; the VMA whose [start, end] range contains it
 *         is tagged "[stack]" in the output
 *
 * The list is terminated by a record whose start and end are both zero.
 * Returns 0 once that terminator is seen, 1 if a record cannot be read in
 * full (including hitting end of file before the terminator).
 */
static int show_vmas(int fd, unsigned long stack)
{
	struct binfmt_vma_image vi;

	printf("VMAs:\n");

	for (;;) {
		char *note;

		if (read(fd, &vi, sizeof(vi)) != sizeof(vi)) {
			perror("Can't read vma from image");
			return 1;
		}

		if (vi.start == 0 && vi.end == 0)
			break;

		note = (vi.start <= stack && vi.end >= stack) ? "[stack]" : "";

		printf("\t%016lx-%016lx file %d %016lx prot %x flags %x %s\n",
		       vi.start, vi.end, vi.fd, vi.pgoff,
		       vi.prot, vi.flags, note);
	}

	return 0;
}
/*
 * Print the page list embedded in a core image: a "Pages:" heading followed
 * by whatever show_mem() prints.  Returns show_mem()'s result.
 */
static int show_privmem(int fd)
{
	fputs("Pages:\n", stdout);

	return show_mem(fd);
}
/*
 * Print a core image.
 *
 * @fd: descriptor positioned right after the image magic
 *
 * The header is a 32-bit version (only BINFMT_IMG_VERS_0 is accepted)
 * followed by a 32-bit pad.  The sections that follow are printed in order:
 * registers, mm, VMAs and private pages.
 *
 * Returns 0 on success, 1 if the header is truncated, the version is not
 * supported, or any section fails to print.
 */
static int show_core(int fd)
{
	__u32 version = 0;
	__u32 pad;
	unsigned long stack = 0;

	if (read(fd, &version, sizeof(version)) != (long)sizeof(version)) {
		fprintf(stderr, "Can't read core image version\n");
		return 1;
	}
	if (version != BINFMT_IMG_VERS_0) {
		printf("Unsupported version %u\n", version);
		return 1;
	}

	/* the pad */
	if (read(fd, &pad, sizeof(pad)) != (long)sizeof(pad)) {
		fprintf(stderr, "Can't read core image header\n");
		return 1;
	}

	printf("Showing version 0\n");

	if (show_regs(fd))
		return 1;
	/* show_mm() reports the stack address so show_vmas() can tag the stack VMA. */
	if (show_mm(fd, &stack))
		return 1;
	if (show_vmas(fd, stack))
		return 1;
	if (show_privmem(fd))
		return 1;

	return 0;
}
/*
 * Print a pstree image, one line per task: "<pid>: <child> <child> ...".
 *
 * @fd: descriptor positioned right after the image magic
 *
 * Each record is a struct pstree_entry followed by e.nr_children 32-bit
 * child pids.
 *
 * Returns 0 at a clean end of file, 1 on a read error, a truncated record
 * or an allocation failure.
 */
static int show_pstree(int fd)
{
	struct pstree_entry e;

	while (1) {
		__u32 *ch;
		__u32 i;
		size_t len;
		long ret;

		ret = read(fd, &e, sizeof(e));
		if (ret == 0)
			return 0;
		if (ret != (long)sizeof(e)) {
			perror("Can't read processes entry");
			return 1;
		}

		printf("%d:", e.pid);

		len = (size_t)e.nr_children * sizeof(*ch);
		/* Never ask for zero bytes, so NULL always means failure. */
		ch = malloc(len ? len : 1);
		if (ch == NULL) {
			perror("Can't allocate children list");
			return 1;
		}

		ret = read(fd, ch, len);
		if (ret < 0 || (size_t)ret != len) {
			perror("Can't read children list");
			free(ch);
			return 1;
		}

		for (i = 0; i < e.nr_children; i++)
			printf(" %d", ch[i]);
		printf("\n");

		/* The list is per record; release it before the next one. */
		free(ch);
	}
}
/*
 * Print every record of a pipes image.
 *
 * @fd: seekable descriptor positioned right after the image magic
 *
 * Each record is a struct pipes_entry.  Entries with O_WRONLY set in flags
 * carry no data (a non-zero byte count there is reported as "Bogus pipe").
 * Other entries are followed by e.bytes bytes of data, of which at most the
 * first 16 are printed as a string; the rest is skipped.
 *
 * Returns 0 at a clean end of file, 1 on a read/seek error, a truncated
 * record or a bogus entry.
 */
static int show_pipes(int fd)
{
	struct pipes_entry e;
	char buf[17];

	while (1) {
		long ret;
		__u32 shown;

		ret = read(fd, &e, sizeof(e));
		if (ret == 0)
			break;
		if (ret != (long)sizeof(e)) {
			perror("Can't read pipe entry");
			return 1;
		}

		/* pipeid is 32 bits wide; widen it to match the %lx conversion. */
		printf("%d: %lx %o %d ", e.fd, (unsigned long)e.pipeid, e.flags, e.bytes);

		if (e.flags & O_WRONLY) {
			printf("\n");
			if (e.bytes) {
				printf("Bogus pipe\n");
				return 1;
			}
			continue;
		}

		/*
		 * Clamp in unsigned arithmetic: a byte count with the top bit
		 * set must not turn into a negative (i.e. huge) read length.
		 */
		shown = e.bytes;
		if (shown > sizeof(buf) - 1)
			shown = sizeof(buf) - 1;

		memset(buf, 0, sizeof(buf));
		ret = read(fd, buf, shown);
		if (ret < 0 || (__u32)ret != shown) {
			fprintf(stderr, "Can't read pipe data\n");
			return 1;
		}

		printf("\t[%s", buf);
		if (shown < e.bytes)
			printf("...");
		printf("]\n");

		/* Skip the part of the data that was not printed. */
		if (lseek(fd, e.bytes - shown, SEEK_CUR) < 0) {
			perror("Can't skip pipe data");
			return 1;
		}
	}

	return 0;
}
/*
 * Entry point of the image viewer: open the file named by the first
 * argument, read its 32-bit magic and hand the descriptor to the matching
 * show_*() routine.
 *
 * Returns the routine's result, or 1 when no file is given, the file cannot
 * be opened, the magic cannot be read or the magic is not recognised.
 */
int main(int argc, char **argv)
{
	__u32 type;
	int fd;
	int ret;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s <image-file>\n",
			argc > 0 ? argv[0] : "img-show");
		return 1;
	}

	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("Can't open");
		return 1;
	}

	/* A file shorter than the magic would otherwise leave type uninitialized. */
	if (read(fd, &type, sizeof(type)) != (long)sizeof(type)) {
		fprintf(stderr, "Can't read image magic\n");
		close(fd);
		return 1;
	}

	if (type == FDINFO_MAGIC) {
		ret = show_fdinfo(fd);
	} else if (type == PAGES_MAGIC) {
		ret = show_pages(fd);
	} else if (type == SHMEM_MAGIC) {
		ret = show_shmem(fd);
	} else if (type == PSTREE_MAGIC) {
		ret = show_pstree(fd);
	} else if (type == PIPES_MAGIC) {
		ret = show_pipes(fd);
	} else if (type == BINFMT_IMG_MAGIC) {
		ret = show_core(fd);
	} else {
		printf("Unknown file type 0x%x\n", type);
		ret = 1;
	}

	close(fd);
	return ret;
}

39
xemul/img_structs.h Normal file
View File

@ -0,0 +1,39 @@
/* Magic value found in the first 4 bytes of an fdinfo image. */
#define FDINFO_MAGIC 0x01010101
/*
 * Header of one fdinfo record.  In the image it is immediately followed by
 * `len` bytes of payload, which the image viewer prints as a string.
 */
struct fdinfo_entry {
/* Record kind: FDINFO_FD or FDINFO_MAP. */
__u8 type;
/* Number of payload bytes that follow this header. */
__u8 len;
/* Printed in octal by the viewer. */
__u16 flags;
/* Printed as "pos" for FDINFO_FD records. */
__u32 pos;
/* Printed as a descriptor number for FDINFO_FD and as an address for FDINFO_MAP. */
__u64 addr;
};
/* Values of fdinfo_entry.type. */
#define FDINFO_FD 1
#define FDINFO_MAP 2
/*
 * Magic value found in the first 4 bytes of a pages image.  The viewer
 * parses the rest as records of a 64-bit address followed by one 4096-byte
 * page, ending at end of file or at a zero address.
 */
#define PAGES_MAGIC 0x20202020
/* Magic value found in the first 4 bytes of a shmem image. */
#define SHMEM_MAGIC 0x03300330
/*
 * One shmem record; the image is a plain sequence of these.  The viewer
 * prints it as "<start>-<end> <shmid>".
 * NOTE(review): start/end presumably delimit the mapped address range and
 * shmid identifies the shared segment -- confirm against the dumper.
 */
struct shmem_entry {
__u64 start;
__u64 end;
__u64 shmid;
};
/* Magic value found in the first 4 bytes of a pstree image. */
#define PSTREE_MAGIC 0x40044004
/*
 * Header of one process-tree record.  In the image it is immediately
 * followed by `nr_children` 32-bit child pids.
 */
struct pstree_entry {
/* Pid of the task this record describes. */
__u32 pid;
/* Number of 32-bit child pids that follow this header. */
__u32 nr_children;
};
/* Magic value found in the first 4 bytes of a pipes image. */
#define PIPES_MAGIC 0x05055050
/*
 * Header of one pipe record.  Unless O_WRONLY is set in `flags`, the header
 * is followed in the image by `bytes` bytes of pipe data; the viewer treats
 * a record with O_WRONLY set and a non-zero `bytes` as bogus.
 */
struct pipes_entry {
/* Printed first on the record's line by the viewer. */
__u32 fd;
/* Printed in hex by the viewer. */
__u32 pipeid;
/* Tested against O_WRONLY by the viewer; printed in octal. */
__u32 flags;
/* Number of data bytes that follow this header. */
__u32 bytes;
};

2
xemul/readme Normal file
View File

@ -0,0 +1,2 @@
Previous version of C/R, which uses an in-kernel dumper and restorer.
It is kept here for reference only and is not used by crtools itself.