2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-30 13:58:34 +00:00
criu/xemul/0003-Image-dumping-via-proc-file.patch
Cyrill Gorcunov 523de23624 Initial commit
Signed-off-by: Cyrill Gorcunov <gorcunov@gmail.com>
2011-09-23 12:00:45 +04:00

563 lines
12 KiB
Diff

From f7e9d28188e7e2fd0f13f2696f29f20d784cb8fd Mon Sep 17 00:00:00 2001
From: root <root@ovzept.sw.ru>
Date: Fri, 3 Jun 2011 18:16:10 +0400
Subject: [PATCH] Image dumping via proc file
---
fs/proc/Kconfig | 8
fs/proc/Makefile | 1
fs/proc/base.c | 3
fs/proc/img_dump.c | 397 +++++++++++++++++++++++++++++++++++++++++++++
include/linux/binfmt_img.h | 87 +++++++++
include/linux/proc_fs.h | 2
6 files changed, 498 insertions(+)
create mode 100644 fs/proc/img_dump.c
create mode 100644 include/linux/binfmt_img.h
Index: linux-2.6.git/fs/proc/Kconfig
===================================================================
--- linux-2.6.git.orig/fs/proc/Kconfig
+++ linux-2.6.git/fs/proc/Kconfig
@@ -67,3 +67,11 @@ config PROC_PAGE_MONITOR
/proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap,
/proc/kpagecount, and /proc/kpageflags. Disabling these
interfaces will reduce the size of the kernel by approximately 4kb.
+
+config PROC_IMG
+ default y
+ depends on PROC_FS
+ bool "Enable /proc/<pid>/dump file"
+ help
+ Say Y here if you want to be able to produce checkpoint-restore images
+ for tasks by reading /proc/<pid>/dump.
Index: linux-2.6.git/fs/proc/Makefile
===================================================================
--- linux-2.6.git.orig/fs/proc/Makefile
+++ linux-2.6.git/fs/proc/Makefile
@@ -28,3 +28,4 @@ proc-$(CONFIG_PROC_VMCORE) += vmcore.o
proc-$(CONFIG_PROC_DEVICETREE) += proc_devtree.o
proc-$(CONFIG_PRINTK) += kmsg.o
proc-$(CONFIG_PROC_PAGE_MONITOR) += page.o
+proc-$(CONFIG_PROC_IMG) += img_dump.o
Index: linux-2.6.git/fs/proc/base.c
===================================================================
--- linux-2.6.git.orig/fs/proc/base.c
+++ linux-2.6.git/fs/proc/base.c
@@ -2983,6 +2983,9 @@ static const struct pid_entry tgid_base_
#endif
INF("cmdline", S_IRUGO, proc_pid_cmdline),
ONE("stat", S_IRUGO, proc_tgid_stat),
+#ifdef CONFIG_PROC_IMG
+ REG("dump", S_IRUSR|S_IWUSR, proc_pid_dump_operations),
+#endif
ONE("statm", S_IRUGO, proc_pid_statm),
REG("maps", S_IRUGO, proc_maps_operations),
#ifdef CONFIG_NUMA
Index: linux-2.6.git/fs/proc/img_dump.c
===================================================================
--- /dev/null
+++ linux-2.6.git/fs/proc/img_dump.c
@@ -0,0 +1,397 @@
+#include <linux/proc_fs.h>
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+#include <linux/binfmt_img.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/highmem.h>
+#include <linux/types.h>
+#include "internal.h"
+
+/*
+ * Copy bytes [pos, len) of the kernel record @buf to @ubuf, at most @size
+ * of them. Returns the number of bytes copied, -EINVAL when @pos lies
+ * outside the record, or -EFAULT when the user buffer cannot be written.
+ */
+static int img_dump_buffer(char __user *ubuf, size_t size, void *buf, int len, int pos)
+{
+	if (pos < 0 || pos > len)
+		return -EINVAL;	/* never read outside @buf */
+	len -= pos;
+	if ((size_t)len > size)
+		len = size;
+	if (copy_to_user(ubuf, buf + pos, len))
+		return -EFAULT;
+	return len;
+}
+
+/* Emit the image header record (magic and format version), resuming at @pos. */
+static int img_dump_header(char __user *buf, size_t size, int pos)
+{
+	struct binfmt_img_header image_hdr = {
+		.magic   = BINFMT_IMG_MAGIC,
+		.version = BINFMT_IMG_VERS_0,
+	};
+	return img_dump_buffer(buf, size, &image_hdr, sizeof(image_hdr), pos);
+}
+
+/* Map a raw x86 selector to the CKPT_X86_SEG_* encoding stored in the image. */
+static __u16 encode_segment(unsigned short seg)
+{
+	unsigned short idx = seg >> 3;	/* selector with its low three bits dropped */
+
+	if (!seg)
+		return CKPT_X86_SEG_NULL;
+	BUG_ON((seg & 3) != 3);	/* any non-zero selector must have both low bits set */
+
+	if (seg == __USER_CS)
+		return CKPT_X86_SEG_USER64_CS;
+	if (seg == __USER_DS)
+		return CKPT_X86_SEG_USER64_DS;
+#ifdef CONFIG_COMPAT
+	if (seg == __USER32_CS)
+		return CKPT_X86_SEG_USER32_CS;
+	if (seg == __USER32_DS)
+		return CKPT_X86_SEG_USER32_DS;
+#endif
+	if (seg & 4)	/* bit 2 set: encoded as an LDT entry */
+		return CKPT_X86_SEG_LDT | idx;
+	if (GDT_ENTRY_TLS_MIN <= idx && idx <= GDT_ENTRY_TLS_MAX)
+		return CKPT_X86_SEG_TLS | (idx - GDT_ENTRY_TLS_MIN);
+
+	printk(KERN_ERR "c/r: (decode) bad segment %#hx\n", idx);
+	BUG();
+}
+
+static __u64 encode_tls(struct desc_struct *d)
+{
+ return ((__u64)d->a << 32) + d->b; /* pack the descriptor: ->a shifted into the upper 32 bits, plus ->b */
+}
+
+/*
+ * Dump the user-mode register state of @p. The image is cleared first:
+ * struct binfmt_regs_image ends in six __u16 fields, so it carries tail
+ * padding that would otherwise hand stale kernel stack bytes to userspace.
+ */
+static int img_dump_regs(struct task_struct *p, char __user *buf, size_t size, int pos)
+{
+	struct pt_regs *regs = task_pt_regs(p);
+	struct binfmt_regs_image regi;
+	int i;
+
+	memset(&regi, 0, sizeof(regi));
+
+	regi.r15 = regs->r15;
+	regi.r14 = regs->r14;
+	regi.r13 = regs->r13;
+	regi.r12 = regs->r12;
+	regi.r11 = regs->r11;
+	regi.r10 = regs->r10;
+	regi.r9 = regs->r9;
+	regi.r8 = regs->r8;
+	regi.ax = regs->ax;
+	regi.orig_ax = regs->orig_ax;
+	regi.bx = regs->bx;
+	regi.cx = regs->cx;
+	regi.dx = regs->dx;
+	regi.si = regs->si;
+	regi.di = regs->di;
+	regi.ip = regs->ip;
+	regi.flags = regs->flags;
+	regi.bp = regs->bp;
+	regi.sp = regs->sp;
+
+	/* segments */
+	regi.gsindex = encode_segment(p->thread.gsindex);
+	regi.fsindex = encode_segment(p->thread.fsindex);
+	regi.cs = encode_segment(regs->cs);
+	regi.ss = encode_segment(regs->ss);
+	regi.ds = encode_segment(p->thread.ds);
+	regi.es = encode_segment(p->thread.es);
+
+	BUILD_BUG_ON(GDT_ENTRY_TLS_ENTRIES != CKPT_TLS_ENTRIES);
+	for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
+		regi.tls[i] = encode_tls(&p->thread.tls_array[i]);
+
+	/*
+	 * The saved gs/fs value is recorded only when the matching selector
+	 * index is zero; otherwise 0 is stored. */
+	regi.gs = p->thread.gsindex ? 0 : p->thread.gs;
+	regi.fs = p->thread.fsindex ? 0 : p->thread.fs;
+
+	return img_dump_buffer(buf, size, &regi, sizeof(regi), pos);
+}
+
+/* Dump the layout of @mm; the image is zeroed first so its tail padding leaks no stack bytes. */
+static int img_dump_mm(struct mm_struct *mm, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_mm_image mmi;
+
+	memset(&mmi, 0, sizeof(mmi));	/* also leaves exe_fd at 0 */
+	mmi.flags = mm->flags;
+	mmi.def_flags = mm->def_flags;
+	mmi.start_code = mm->start_code;
+	mmi.end_code = mm->end_code;
+	mmi.start_data = mm->start_data;
+	mmi.end_data = mm->end_data;
+	mmi.start_brk = mm->start_brk;
+	mmi.brk = mm->brk;
+	mmi.start_stack = mm->start_stack;
+	mmi.arg_start = mm->arg_start;
+	mmi.arg_end = mm->arg_end;
+	mmi.env_start = mm->env_start;
+	mmi.env_end = mm->env_end;
+	return img_dump_buffer(buf, size, &mmi, sizeof(mmi), pos);
+}
+
+/*
+ * Dump one VMA record; a NULL @vma produces an all-zero record, which the
+ * caller emits after the last VMA. The record is always cleared first:
+ * ->pad is never assigned below and must not carry stack bytes to userspace.
+ */
+static int img_dump_vma(struct vm_area_struct *vma, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_vma_image vmai;
+
+	memset(&vmai, 0, sizeof(vmai));	/* fd, prot, flags and pad start at 0 */
+	if (vma == NULL)
+		goto dumpit;
+
+	printk("Dumping vma %016lx-%016lx %p/%p\n", vma->vm_start, vma->vm_end, vma, vma->vm_mm);
+
+	if (vma->vm_flags & VM_READ)
+		vmai.prot |= PROT_READ;
+	if (vma->vm_flags & VM_WRITE)
+		vmai.prot |= PROT_WRITE;
+	if (vma->vm_flags & VM_EXEC)
+		vmai.prot |= PROT_EXEC;
+
+	if (vma->vm_file == NULL)
+		vmai.flags |= MAP_ANONYMOUS;
+	if (vma->vm_flags & VM_MAYSHARE)
+		vmai.flags |= MAP_SHARED;
+	else
+		vmai.flags |= MAP_PRIVATE;
+
+	vmai.start = vma->vm_start;
+	vmai.end = vma->vm_end;
+	vmai.pgoff = vma->vm_pgoff;
+dumpit:
+	return img_dump_buffer(buf, size, &vmai, sizeof(vmai), pos);
+}
+
+/*
+ * Dump one page record: a binfmt_page_image header carrying the page's
+ * virtual address, immediately followed by PAGE_SIZE bytes of @data.
+ * @pos is the offset inside the record from which to resume.
+ */
+static int img_dump_page(unsigned long addr, void *data, char __user *buf, size_t size, int pos)
+{
+	struct binfmt_page_image hdr;
+	int done = 0, n;
+
+	hdr.vaddr = addr;
+
+	if (pos >= sizeof(hdr)) {
+		pos -= sizeof(hdr);	/* header already delivered */
+	} else {
+		n = img_dump_buffer(buf, size, &hdr, sizeof(hdr), pos);
+		if (n < 0)
+			return n;
+		if (size <= n)
+			return n;	/* no room left for page data */
+		done = n;
+		buf += n;
+		size -= n;
+		pos = 0;
+	}
+
+	n = img_dump_buffer(buf, size, data, PAGE_SIZE, pos);
+	return n < 0 ? n : done + n;
+}
+
+/*
+ * Return 1 when @vma has no backing file or lacks VM_SHARED, 0 otherwise.
+ * Only such VMAs get their page contents written into the image.
+ */
+static inline int is_private_vma(struct vm_area_struct *vma)
+{
+	return vma->vm_file == NULL || !(vma->vm_flags & VM_SHARED);
+}
+
+/*
+ * Produce up to @size bytes of @p's image starting at offset *@ppos.
+ * The image is a fixed sequence of records (header, registers, mm, VMA
+ * list, private pages, all-zero page header) that is walked from the start
+ * on every call; img_ppos/img_pos hold the start/end offset of the record
+ * under consideration. Returns the bytes produced or a negative errno.
+ */
+static ssize_t do_produce_dump(struct task_struct *p, char __user *buf,
+		size_t size, loff_t *ppos)
+{
+	size_t img_pos = 0, img_ppos, produced = 0;
+	int len;
+	loff_t pos = *ppos;
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+
+#define move_pos() do {		\
+		buf += len;	\
+		produced += len;\
+		size -= len;	\
+		pos += len;	\
+	} while (0)
+
+#define seek_pos(__size) do {		\
+		img_ppos = img_pos;	\
+		img_pos += (__size);	\
+	} while (0)
+
+	/* header */
+	seek_pos(sizeof(struct binfmt_img_header));
+	if (pos < img_pos) {
+		len = img_dump_header(buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* registers */
+	seek_pos(sizeof(struct binfmt_regs_image));
+	if (pos < img_pos) {
+		len = img_dump_regs(p, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err;
+
+		move_pos();
+		if (size == 0)
+			goto out;
+	}
+
+	/* memory */
+	mm = get_task_mm(p);
+	if (mm == NULL)
+		return -EACCES;
+	down_read(&mm->mmap_sem);
+
+	seek_pos(sizeof(struct binfmt_mm_image));
+	if (pos < img_pos) {
+		len = img_dump_mm(mm, buf, size, pos - img_ppos);
+		if (len < 0)
+			goto err_mm;
+
+		move_pos();
+		if (size == 0)
+			goto out_mm;
+	}
+
+	vma = mm->mmap;
+	while (1) {
+		seek_pos(sizeof(struct binfmt_vma_image));
+		if (pos < img_pos) {
+			len = img_dump_vma(vma, buf, size, pos - img_ppos);
+			if (len < 0)
+				goto err_mm;
+
+			move_pos();
+			if (size == 0)
+				goto out_mm;
+		}
+
+		if (vma == NULL)
+			break;
+
+		vma = vma->vm_next;
+	}
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		/* slow and stupid */
+		unsigned long addr;
+		struct page *page;
+		void *pg_data;
+
+		if (!is_private_vma(vma))
+			continue;
+
+		for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+			page = follow_page(vma, addr, FOLL_FORCE | FOLL_DUMP | FOLL_GET);
+			if (page == NULL || IS_ERR(page))
+				continue;
+			seek_pos(sizeof(struct binfmt_page_image) + PAGE_SIZE);
+			if (pos < img_pos) {
+				pg_data = kmap(page);
+				len = img_dump_page(addr, pg_data, buf, size, pos - img_ppos);
+				kunmap(page);
+				if (len < 0) {
+					put_page(page);
+					goto err_mm;
+				}
+				move_pos();
+				if (size == 0) {
+					put_page(page);
+					goto out_mm;
+				}
+			}
+
+			put_page(page);
+		}
+	}
+
+	seek_pos(sizeof(struct binfmt_page_image));
+	if (pos < img_pos) {
+		struct binfmt_page_image zero;
+
+		memset(&zero, 0, sizeof(zero));
+		len = img_dump_buffer(buf, size, &zero, sizeof(zero), pos - img_ppos);
+		if (len < 0)
+			goto err_mm;	/* mmap_sem and the mm reference are still held */
+
+		move_pos();
+	}
+
+out_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+out:
+	*ppos = pos;
+	return produced;
+
+err_mm:
+	up_read(&mm->mmap_sem);
+	mmput(mm);
+err:
+	return len;
+}
+
+/* read() handler of the dump file: fails with -EINVAL unless the task is stopped. */
+static ssize_t img_dump_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
+{
+	struct task_struct *p;
+	ssize_t ret = -EINVAL;
+
+	p = get_proc_task(file->f_dentry->d_inode);
+	if (p == NULL)
+		return -ESRCH;
+
+	if (p->state & TASK_STOPPED)
+		ret = do_produce_dump(p, buf, size, ppos);
+	put_task_struct(p);	/* drop the get_proc_task() reference on every path */
+	return ret;
+}
+
+static int img_dump_open(struct inode *inode, struct file *filp)
+{
+ return 0; /* no per-open state: img_dump_read() looks the task up on every call */
+}
+
+static int img_dump_release(struct inode *inode, struct file *filp)
+{
+ return 0; /* nothing to release: img_dump_open() acquires nothing */
+}
+
+const struct file_operations proc_pid_dump_operations = { /* NOTE(review): base.c registers "dump" with S_IWUSR, yet no .write is set here */
+ .open = img_dump_open,
+ .read = img_dump_read, /* the only handler that does any work */
+ .release = img_dump_release,
+};
Index: linux-2.6.git/include/linux/binfmt_img.h
===================================================================
--- /dev/null
+++ linux-2.6.git/include/linux/binfmt_img.h
@@ -0,0 +1,87 @@
+#ifndef __BINFMT_IMG_H__
+#define __BINFMT_IMG_H__
+
+#include <linux/types.h>
+
+struct binfmt_img_header { /* first record of the image */
+ __u32 magic; /* BINFMT_IMG_MAGIC */
+ __u32 version; /* BINFMT_IMG_VERS_0 */
+};
+
+#define CKPT_TLS_ENTRIES 3
+
+struct binfmt_regs_image { /* register record; the first 19 fields are copied from the task's pt_regs */
+ __u64 r15;
+ __u64 r14;
+ __u64 r13;
+ __u64 r12;
+ __u64 r11;
+ __u64 r10;
+ __u64 r9;
+ __u64 r8;
+ __u64 ax;
+ __u64 orig_ax;
+ __u64 bx;
+ __u64 cx;
+ __u64 dx;
+ __u64 si;
+ __u64 di;
+ __u64 ip;
+ __u64 flags;
+ __u64 bp;
+ __u64 sp;
+
+ __u64 gs; /* thread.gs, or 0 when thread.gsindex is non-zero */
+ __u64 fs; /* thread.fs, or 0 when thread.fsindex is non-zero */
+ __u64 tls[CKPT_TLS_ENTRIES]; /* one packed descriptor per thread.tls_array entry */
+ __u16 gsindex; /* this and the five selectors below hold CKPT_X86_SEG_* encoded values */
+ __u16 fsindex;
+ __u16 cs;
+ __u16 ss;
+ __u16 ds;
+ __u16 es;
+};
+
+#define CKPT_X86_SEG_NULL 0
+#define CKPT_X86_SEG_USER32_CS 1
+#define CKPT_X86_SEG_USER32_DS 2
+#define CKPT_X86_SEG_USER64_CS 3
+#define CKPT_X86_SEG_USER64_DS 4
+#define CKPT_X86_SEG_TLS 0x4000
+#define CKPT_X86_SEG_LDT 0x8000
+
+struct binfmt_mm_image { /* mm record; every field except exe_fd mirrors the mm_struct member of the same name */
+ __u64 flags;
+ __u64 def_flags;
+ __u64 start_code;
+ __u64 end_code;
+ __u64 start_data;
+ __u64 end_data;
+ __u64 start_brk;
+ __u64 brk;
+ __u64 start_stack;
+ __u64 arg_start;
+ __u64 arg_end;
+ __u64 env_start;
+ __u64 env_end;
+ __u32 exe_fd; /* img_dump_mm() always writes 0 */
+};
+
+struct binfmt_vma_image { /* one record per VMA; an all-zero record follows the last one */
+ __u32 prot; /* PROT_READ / PROT_WRITE / PROT_EXEC bits derived from vm_flags */
+ __u32 flags; /* MAP_ANONYMOUS when there is no vm_file, plus MAP_SHARED or MAP_PRIVATE */
+ __u32 pad; /* padding, carries no information */
+ __u32 fd; /* img_dump_vma() always writes 0 */
+ __u64 start; /* vm_start */
+ __u64 end; /* vm_end */
+ __u64 pgoff; /* vm_pgoff */
+};
+
+struct binfmt_page_image { /* page header; an all-zero header ends the page list */
+ __u64 vaddr; /* user virtual address; img_dump_page() emits PAGE_SIZE data bytes right after this header */
+};
+
+#define BINFMT_IMG_MAGIC 0xa75b8d43
+#define BINFMT_IMG_VERS_0 0x00000100
+
+#endif
Index: linux-2.6.git/include/linux/proc_fs.h
===================================================================
--- linux-2.6.git.orig/include/linux/proc_fs.h
+++ linux-2.6.git/include/linux/proc_fs.h
@@ -102,6 +102,8 @@ struct vmcore {
#ifdef CONFIG_PROC_FS
+extern const struct file_operations proc_pid_dump_operations;
+
extern void proc_root_init(void);
void proc_flush_task(struct task_struct *task);