From b50ee4a1750b9eadd9d6c3d4e424f14c7f2bade4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 8 Nov 2011 15:16:06 +0400 Subject: [PATCH] Update kernel area Signed-off-by: Cyrill Gorcunov --- README | 19 +- ..._get_link-to-use-dentry-instead-of-.patch} | 39 +- ...-the-proc-pid-map_files-directory-v.patch} | 37 +- ...-the-Children-line-in-proc-pid-stat.patch} | 25 +- ..._data-end_data-start_brk-members-to.patch} | 23 +- kernel/binfmt-elf-for-cr-5 | 976 ------------------ kernel/cr-clone-with-pid-support | 183 ---- kernel/cr-statfs-callback-for-pipefs | 27 - kernel/fs-add-do-close | 85 -- kernel/fs-proc-add-tls | 49 - ...es-against-execve-of-proc-pid-fd-fix.patch | 28 - ...-races-against-execve-of-proc-pid-fd.patch | 255 ----- ...e-dcache-drop-on-unauthorized-access.patch | 118 --- ...dir-when-reading-sysctl-dirs-in-proc.patch | 26 - kernel/readme | 13 - kernel/series | 13 - 16 files changed, 92 insertions(+), 1824 deletions(-) rename kernel/{fs-proc-switch-to-dentry => 0001-fs-proc-Make-proc_get_link-to-use-dentry-instead-of-.patch} (70%) rename kernel/{cr-proc-map-files-21 => 0002-fs-proc-Introduce-the-proc-pid-map_files-directory-v.patch} (94%) rename kernel/{cr-proc-add-children => 0003-fs-proc-Introduce-the-Children-line-in-proc-pid-stat.patch} (61%) rename kernel/{fs-proc-add-mm-task-stat => 0004-fs-proc-Add-start_data-end_data-start_brk-members-to.patch} (57%) delete mode 100644 kernel/binfmt-elf-for-cr-5 delete mode 100644 kernel/cr-clone-with-pid-support delete mode 100644 kernel/cr-statfs-callback-for-pipefs delete mode 100644 kernel/fs-add-do-close delete mode 100644 kernel/fs-proc-add-tls delete mode 100644 kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch delete mode 100644 kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch delete mode 100644 kernel/proc-force-dcache-drop-on-unauthorized-access.patch delete mode 100644 kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch delete mode 100644 kernel/readme delete 
mode 100644 kernel/series diff --git a/README b/README index 0c845315a..88645bcea 100644 --- a/README +++ b/README @@ -1,7 +1,7 @@ crtools ======= -An utility to to checkpoint/restore tasks. +An utility to checkpoint/restore tasks. Some code snippets are borrowed from @@ -13,3 +13,20 @@ Some code snippets are borrowed from Many thanks to these projects. Licensed under GPLv2 (http://www.gnu.org/licenses/gpl-2.0.txt) + +Kernel patching +=============== + +To have crtools up and running either + +1) use patches from kernel/ directory +2) or clone git://github.com/cyrillos/linux-2.6.git + and switch to branch "crtools". + +It's based on Linux + + | commit 1ea6b8f48918282bdca0b32a34095504ee65bab5 + | Author: Linus Torvalds + | Date: Mon Nov 7 16:16:02 2011 -0800 + | + | Linux 3.2-rc1 diff --git a/kernel/fs-proc-switch-to-dentry b/kernel/0001-fs-proc-Make-proc_get_link-to-use-dentry-instead-of-.patch similarity index 70% rename from kernel/fs-proc-switch-to-dentry rename to kernel/0001-fs-proc-Make-proc_get_link-to-use-dentry-instead-of-.patch index f79c466d4..2cbb06d1d 100644 --- a/kernel/fs-proc-switch-to-dentry +++ b/kernel/0001-fs-proc-Make-proc_get_link-to-use-dentry-instead-of-.patch @@ -1,4 +1,8 @@ -fs, proc: Make proc_get_link to use dentry instead of inode +From fc4504ee8f471ac1ac8162ec68e98f2c09d53411 Mon Sep 17 00:00:00 2001 +From: Cyrill Gorcunov +Date: Tue, 8 Nov 2011 14:57:10 +0400 +Subject: [PATCH 1/4] fs, proc: Make proc_get_link to use dentry instead of + inode This patch prepares the ground for the next "map_files" patch which needs a name of a link file to analyse. 
@@ -16,11 +20,11 @@ CC: Andrew Morton include/linux/proc_fs.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) -Index: linux-2.6.git/fs/proc/base.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/base.c -+++ linux-2.6.git/fs/proc/base.c -@@ -165,9 +165,9 @@ static int get_task_root(struct task_str +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 2db1bd3..93c81aa 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -165,9 +165,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } @@ -32,7 +36,7 @@ Index: linux-2.6.git/fs/proc/base.c int result = -ENOENT; if (task) { -@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *i +@@ -182,9 +182,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } @@ -44,7 +48,7 @@ Index: linux-2.6.git/fs/proc/base.c int result = -ENOENT; if (task) { -@@ -1580,13 +1580,13 @@ static const struct file_operations proc +@@ -1567,13 +1567,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; @@ -60,7 +64,7 @@ Index: linux-2.6.git/fs/proc/base.c if (!task) return -ENOENT; mm = get_task_mm(task); -@@ -1616,7 +1616,7 @@ static void *proc_pid_follow_link(struct +@@ -1603,7 +1603,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; @@ -69,7 +73,7 @@ Index: linux-2.6.git/fs/proc/base.c out: return ERR_PTR(error); } -@@ -1655,7 +1655,7 @@ static int proc_pid_readlink(struct dent +@@ -1642,7 +1642,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; @@ -78,7 +82,7 @@ Index: linux-2.6.git/fs/proc/base.c if (error) goto out; -@@ -1959,9 +1959,9 @@ out_task: +@@ -1980,9 +1980,9 @@ out_task: return rc; } @@ -90,11 +94,11 @@ Index: linux-2.6.git/fs/proc/base.c } static int tid_fd_revalidate(struct dentry *dentry, 
struct nameidata *nd) -Index: linux-2.6.git/include/linux/proc_fs.h -=================================================================== ---- linux-2.6.git.orig/include/linux/proc_fs.h -+++ linux-2.6.git/include/linux/proc_fs.h -@@ -253,7 +253,7 @@ extern const struct proc_ns_operations u +diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h +index 643b96c..c3d11ff 100644 +--- a/include/linux/proc_fs.h ++++ b/include/linux/proc_fs.h +@@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; union proc_op { @@ -103,3 +107,6 @@ Index: linux-2.6.git/include/linux/proc_fs.h int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, +-- +1.7.6.4 + diff --git a/kernel/cr-proc-map-files-21 b/kernel/0002-fs-proc-Introduce-the-proc-pid-map_files-directory-v.patch similarity index 94% rename from kernel/cr-proc-map-files-21 rename to kernel/0002-fs-proc-Introduce-the-proc-pid-map_files-directory-v.patch index f67baf86c..f6bb18491 100644 --- a/kernel/cr-proc-map-files-21 +++ b/kernel/0002-fs-proc-Introduce-the-proc-pid-map_files-directory-v.patch @@ -1,6 +1,8 @@ -fs, proc: Introduce the /proc/<pid>/map_files/ directory v14 - +From d23bde31590a7679aa2be7960848b0fedd0ce032 Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov +Date: Tue, 8 Nov 2011 14:58:01 +0400 +Subject: [PATCH 2/4] fs, proc: Introduce the /proc/<pid>/map_files/ directory + v14 This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks one for each mapping with file, the name of a symlink is "vma->vm_start-vma->vm_end", @@ -115,14 +117,14 @@ CC: Al Viro CC: Andrew Morton CC: Pavel Machek --- - fs/proc/base.c | 345 +++++++++++++++++++++++++++++++++++++++++++++++++++++ - include/linux/mm.h | 12 + - 2 files changed, 357 insertions(+) + fs/proc/base.c | 345 ++++++++++++++++++++++++++++++++++++++++++++++++++++ + include/linux/mm.h | 12 ++ + 2 files changed, 
357 insertions(+), 0 deletions(-) -Index: linux-2.6.git/fs/proc/base.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/base.c -+++ linux-2.6.git/fs/proc/base.c +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 93c81aa..9b7a9cd 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include @@ -140,7 +142,7 @@ Index: linux-2.6.git/fs/proc/base.c /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. -@@ -2201,6 +2204,347 @@ static const struct file_operations proc +@@ -2217,6 +2220,347 @@ static const struct file_operations proc_fd_operations = { }; /* @@ -488,7 +490,7 @@ Index: linux-2.6.git/fs/proc/base.c * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). */ -@@ -2815,6 +3159,7 @@ static const struct inode_operations pro +@@ -2832,6 +3176,7 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), @@ -496,11 +498,11 @@ Index: linux-2.6.git/fs/proc/base.c DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET -Index: linux-2.6.git/include/linux/mm.h -=================================================================== ---- linux-2.6.git.orig/include/linux/mm.h -+++ linux-2.6.git/include/linux/mm.h -@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(st +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 3dc3a8c..14159d3 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1491,6 +1491,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> 
PAGE_SHIFT; } @@ -519,3 +521,6 @@ Index: linux-2.6.git/include/linux/mm.h #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); #else +-- +1.7.6.4 + diff --git a/kernel/cr-proc-add-children b/kernel/0003-fs-proc-Introduce-the-Children-line-in-proc-pid-stat.patch similarity index 61% rename from kernel/cr-proc-add-children rename to kernel/0003-fs-proc-Introduce-the-Children-line-in-proc-pid-stat.patch index d307a6024..9aa09dd71 100644 --- a/kernel/cr-proc-add-children +++ b/kernel/0003-fs-proc-Introduce-the-Children-line-in-proc-pid-stat.patch @@ -1,8 +1,10 @@ -proc: Introduce the Children: line in /proc/<pid>/status - +From 9e489dbc4f796b76adb4440ccf4888d934ede61d Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov +Date: Tue, 8 Nov 2011 14:59:40 +0400 +Subject: [PATCH 3/4] fs, proc: Introduce the Children: line in + /proc/<pid>/status -Although we can get the pids of some task's issue, this is just +Although we can get the pids of some task's issue, this is just more convenient to have them this way. 
Signed-off-by: Pavel Emelyanov @@ -10,13 +12,13 @@ Acked-by: Serge Hallyn Signed-off-by: Cyrill Gorcunov --- fs/proc/array.c | 14 ++++++++++++++ - 1 file changed, 14 insertions(+) + 1 files changed, 14 insertions(+), 0 deletions(-) -Index: linux-2.6.git/fs/proc/array.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/array.c -+++ linux-2.6.git/fs/proc/array.c -@@ -158,6 +158,18 @@ static inline const char *get_task_state +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 3a1dafd..8f33329 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -158,6 +158,18 @@ static inline const char *get_task_state(struct task_struct *tsk) return *p; } @@ -35,7 +37,7 @@ Index: linux-2.6.git/fs/proc/array.c static inline void task_state(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *p) { -@@ -192,6 +204,8 @@ static inline void task_state(struct seq +@@ -192,6 +204,8 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns, cred->uid, cred->euid, cred->suid, cred->fsuid, cred->gid, cred->egid, cred->sgid, cred->fsgid); @@ -44,3 +46,6 @@ Index: linux-2.6.git/fs/proc/array.c task_lock(p); if (p->files) fdt = files_fdtable(p->files); +-- +1.7.6.4 + diff --git a/kernel/fs-proc-add-mm-task-stat b/kernel/0004-fs-proc-Add-start_data-end_data-start_brk-members-to.patch similarity index 57% rename from kernel/fs-proc-add-mm-task-stat rename to kernel/0004-fs-proc-Add-start_data-end_data-start_brk-members-to.patch index 4a068dc9c..b18c4f146 100644 --- a/kernel/fs-proc-add-mm-task-stat +++ b/kernel/0004-fs-proc-Add-start_data-end_data-start_brk-members-to.patch @@ -1,17 +1,21 @@ -fs, proc: Add start_data, end_data, start_brk members to /proc/$pid/stat +From e46fc1fa01faea36ad4c5608436f5900e66c9529 Mon Sep 17 00:00:00 2001 +From: Cyrill Gorcunov +Date: Tue, 8 Nov 2011 15:00:56 +0400 +Subject: [PATCH 4/4] fs, proc: Add start_data, end_data, start_brk members to + /proc/$pid/stat It 
helps to dump and restore this mm_struct members at chekpoint/restore time. Signed-off-by: Cyrill Gorcunov --- fs/proc/array.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) + 1 files changed, 5 insertions(+), 2 deletions(-) -Index: linux-2.6.git/fs/proc/array.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/array.c -+++ linux-2.6.git/fs/proc/array.c -@@ -478,7 +478,7 @@ static int do_task_stat(struct seq_file +diff --git a/fs/proc/array.c b/fs/proc/array.c +index 8f33329..8248682 100644 +--- a/fs/proc/array.c ++++ b/fs/proc/array.c +@@ -478,7 +478,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \ %lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \ @@ -20,7 +24,7 @@ Index: linux-2.6.git/fs/proc/array.c pid_nr_ns(pid, ns), tcomm, state, -@@ -525,7 +525,10 @@ static int do_task_stat(struct seq_file +@@ -525,7 +525,10 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, task->policy, (unsigned long long)delayacct_blkio_ticks(task), cputime_to_clock_t(gtime), @@ -32,3 +36,6 @@ Index: linux-2.6.git/fs/proc/array.c if (mm) mmput(mm); return 0; +-- +1.7.6.4 + diff --git a/kernel/binfmt-elf-for-cr-5 b/kernel/binfmt-elf-for-cr-5 deleted file mode 100644 index 0e7a613d1..000000000 --- a/kernel/binfmt-elf-for-cr-5 +++ /dev/null @@ -1,976 +0,0 @@ -elf: Add support for loading ET_CKPT files - -This patch add ability to run that named "checkpoint" files by -enhancing Elf file format, which includes - - - new Elf file type ET_CKPT - - - three additional program header types PT_CKPT_VMA, PT_CKPT_CORE - and PT_CKPT_PAGES. - - PT_CKPT_VMA -- holds 'vma_entry' structure, which describes the - memory area the kernel should map. It also might contain a file descriptor - so the kernel will be mapping a file povided. 
Usually such file get - opened by user-space helper tool which prepares 'vma_entry' structure - for the kernel. - - PT_CKPT_CORE -- 'core_entry' structure (registers, tls, tasks specific - settings). The structure is defined as a 16K container which should be - enough for most cases. 8K of it is reserved for arch specific settings. - - PT_CKPT_PAGES -- a set of all pages which contents we should restored. - -Apart from Elf extension flush_old_exec() has been splitted to two -functions -- the former flush_old_exec() and flush_exec_keep_thread(). -The later doesn't call for de_thread() allowing to keep threads -relationship. Also arch_setup_additional_pages_at() helper added -to setup vdso at predefined address. - -At moment only pure x86-64 architecture is supported. - -Signed-off-by: Cyrill Gorcunov -CC: Andrew Vagin -CC: Pavel Emelyanov -CC: James Bottomley -CC: Glauber Costa -CC: H. Peter Anvin -CC: Ingo Molnar -CC: Tejun Heo -CC: Dave Hansen -CC: Eric W. Biederman -CC: Daniel Lezcano -CC: Alexey Dobriyan ---- - arch/x86/include/asm/elf.h | 3 - arch/x86/include/asm/elf_ckpt.h | 80 ++++++++ - arch/x86/kernel/Makefile | 2 - arch/x86/kernel/elf_ckpt.c | 161 ++++++++++++++++++ - arch/x86/vdso/vma.c | 22 ++ - fs/Kconfig.binfmt | 11 + - fs/Makefile | 1 - fs/binfmt_elf.c | 17 + - fs/binfmt_elf_ckpt.c | 356 ++++++++++++++++++++++++++++++++++++++++ - fs/exec.c | 27 +-- - include/linux/binfmts.h | 1 - include/linux/elf_ckpt.h | 103 +++++++++++ - 12 files changed, 772 insertions(+), 12 deletions(-) - -Index: linux-2.6.git/arch/x86/include/asm/elf.h -=================================================================== ---- linux-2.6.git.orig/arch/x86/include/asm/elf.h -+++ linux-2.6.git/arch/x86/include/asm/elf.h -@@ -314,7 +314,8 @@ struct linux_binprm; - #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 - extern int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp); -- -+extern int arch_setup_additional_pages_at(struct linux_binprm *bprm, -+ void *addr, int 
uses_interp); - extern int syscall32_setup_pages(struct linux_binprm *, int exstack); - #define compat_arch_setup_additional_pages syscall32_setup_pages - -Index: linux-2.6.git/arch/x86/include/asm/elf_ckpt.h -=================================================================== ---- /dev/null -+++ linux-2.6.git/arch/x86/include/asm/elf_ckpt.h -@@ -0,0 +1,80 @@ -+#ifndef _LINUX_ELF_X86_CHECKPOINT_H -+#define _LINUX_ELF_X86_CHECKPOINT_H -+ -+#include -+ -+#include -+#include -+ -+#define CKPT_GDT_ENTRY_TLS_ENTRIES 3 -+ -+struct user_regs_entry { -+ __u64 r15; -+ __u64 r14; -+ __u64 r13; -+ __u64 r12; -+ __u64 bp; -+ __u64 bx; -+ __u64 r11; -+ __u64 r10; -+ __u64 r9; -+ __u64 r8; -+ __u64 ax; -+ __u64 cx; -+ __u64 dx; -+ __u64 si; -+ __u64 di; -+ __u64 orig_ax; -+ __u64 ip; -+ __u64 cs; -+ __u64 flags; -+ __u64 sp; -+ __u64 ss; -+ __u64 fs_base; -+ __u64 gs_base; -+ __u64 ds; -+ __u64 es; -+ __u64 fs; -+ __u64 gs; -+} __packed; -+ -+struct desc_struct_entry { -+ __u32 a; -+ __u32 b; -+} __packed; -+ -+struct user_fpregs_entry { -+ __u16 cwd; -+ __u16 swd; -+ __u16 twd; -+ __u16 fop; -+ __u64 rip; -+ __u64 rdp; -+ __u32 mxcsr; -+ __u32 mxcsr_mask; -+ __u32 st_space[32]; -+ __u32 xmm_space[64]; -+ __u32 padding[24]; -+} __packed; -+ -+struct ckpt_arch_entry { -+ struct user_regs_entry gpregs; -+ struct user_fpregs_entry fpregs; -+ struct desc_struct tls_array[CKPT_GDT_ENTRY_TLS_ENTRIES]; -+}; -+ -+struct core_entry; -+ -+#ifdef CONFIG_X86_64 -+extern int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs, -+ struct core_entry *core_entry); -+#else -+static inline int -+load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs, -+ struct core_entry *core_entry) -+{ -+ return -ENOEXEC; -+} -+#endif -+ -+#endif /* _LINUX_ELF_X86_CHECKPOINT_H */ -Index: linux-2.6.git/arch/x86/kernel/Makefile -=================================================================== ---- linux-2.6.git.orig/arch/x86/kernel/Makefile -+++ linux-2.6.git/arch/x86/kernel/Makefile -@@ 
-99,6 +99,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) - obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o - obj-$(CONFIG_OF) += devicetree.o - -+obj-$(CONFIG_BINFMT_ELF_CKPT) += elf_ckpt.o -+ - ### - # 64 bit specific files - ifeq ($(CONFIG_X86_64),y) -Index: linux-2.6.git/arch/x86/kernel/elf_ckpt.c -=================================================================== ---- /dev/null -+++ linux-2.6.git/arch/x86/kernel/elf_ckpt.c -@@ -0,0 +1,161 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+ -+#ifdef CONFIG_X86_64 -+ -+#define cp_reg(d, s, r) d.r = s.r -+ -+int load_elf_ckpt_arch(struct task_struct *tsk, struct pt_regs *regs, -+ struct core_entry *core_entry) -+{ -+ struct ckpt_arch_entry *arch = (struct ckpt_arch_entry *)core_entry->arch; -+ struct thread_struct *thread = &current->thread; -+ -+ struct user_regs_struct gpregs; -+ struct user_i387_struct fpregs; -+ -+ mm_segment_t old_fs; -+ int i, ret; -+ -+ if (core_entry->header.arch != CKPT_HEADER_ARCH_X86_64) { -+ pr_err("elf-ckpt-x86: Unsupported or corrupted header\n"); -+ return -ENOEXEC; -+ } -+ -+ BUILD_BUG_ON(CKPT_GDT_ENTRY_TLS_ENTRIES != GDT_ENTRY_TLS_ENTRIES); -+ BUILD_BUG_ON(sizeof(struct ckpt_arch_entry) > CKPT_ARCH_SIZE); -+ -+ memset(&gpregs, 0, sizeof(gpregs)); -+ memset(&fpregs, 0, sizeof(fpregs)); -+ -+ /* -+ * General purpose registers -+ */ -+ cp_reg(gpregs, arch->gpregs, r15); -+ cp_reg(gpregs, arch->gpregs, r14); -+ cp_reg(gpregs, arch->gpregs, r13); -+ cp_reg(gpregs, arch->gpregs, r12); -+ cp_reg(gpregs, arch->gpregs, bp); -+ cp_reg(gpregs, arch->gpregs, bx); -+ cp_reg(gpregs, arch->gpregs, r11); -+ cp_reg(gpregs, arch->gpregs, r10); -+ cp_reg(gpregs, arch->gpregs, r9); -+ 
cp_reg(gpregs, arch->gpregs, r8); -+ cp_reg(gpregs, arch->gpregs, ax); -+ cp_reg(gpregs, arch->gpregs, cx); -+ cp_reg(gpregs, arch->gpregs, dx); -+ cp_reg(gpregs, arch->gpregs, si); -+ cp_reg(gpregs, arch->gpregs, di); -+ cp_reg(gpregs, arch->gpregs, orig_ax); -+ cp_reg(gpregs, arch->gpregs, ip); -+ cp_reg(gpregs, arch->gpregs, cs); -+ cp_reg(gpregs, arch->gpregs, flags); -+ cp_reg(gpregs, arch->gpregs, sp); -+ cp_reg(gpregs, arch->gpregs, ss); -+ cp_reg(gpregs, arch->gpregs, fs_base); -+ cp_reg(gpregs, arch->gpregs, gs_base); -+ cp_reg(gpregs, arch->gpregs, ds); -+ cp_reg(gpregs, arch->gpregs, es); -+ cp_reg(gpregs, arch->gpregs, fs); -+ cp_reg(gpregs, arch->gpregs, gs); -+ -+ old_fs = get_fs(); -+ set_fs(KERNEL_DS); -+ ret = arch_ptrace(current, PTRACE_SETREGS, 0, (unsigned long)&gpregs); -+ set_fs(old_fs); -+ if (ret) -+ goto out; -+ -+ *regs = *task_pt_regs(current); -+ -+ thread->usersp = arch->gpregs.sp; -+ thread->ds = arch->gpregs.ds; -+ thread->es = arch->gpregs.es; -+ thread->fs = arch->gpregs.fs; -+ thread->gs = arch->gpregs.gs; -+ -+ thread->fsindex = thread->fs; -+ thread->gsindex = thread->gs; -+ -+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) { -+ thread->tls_array[i].a = arch->tls_array[i].a; -+ thread->tls_array[i].b = arch->tls_array[i].b; -+ } -+ -+ if (arch->gpregs.fs_base) { -+ ret = do_arch_prctl(current, ARCH_SET_FS, arch->gpregs.fs_base); -+ if (ret) -+ goto out; -+ } -+ -+ if (arch->gpregs.gs_base) { -+ ret = do_arch_prctl(current, ARCH_SET_GS, arch->gpregs.gs_base); -+ if (ret) -+ goto out; -+ } -+ -+ /* Restoring FPU */ -+ if (core_entry->task_flags & PF_USED_MATH) { -+ -+ cp_reg(fpregs, arch->fpregs, cwd); -+ cp_reg(fpregs, arch->fpregs, swd); -+ cp_reg(fpregs, arch->fpregs, twd); -+ cp_reg(fpregs, arch->fpregs, fop); -+ cp_reg(fpregs, arch->fpregs, rip); -+ cp_reg(fpregs, arch->fpregs, rdp); -+ cp_reg(fpregs, arch->fpregs, mxcsr); -+ cp_reg(fpregs, arch->fpregs, mxcsr_mask); -+ -+ for (i = 0; i < ARRAY_SIZE(arch->fpregs.st_space); i++) 
-+ cp_reg(fpregs, arch->fpregs, st_space[i]); -+ -+ for (i = 0; i < ARRAY_SIZE(arch->fpregs.xmm_space); i++) -+ cp_reg(fpregs, arch->fpregs, xmm_space[i]); -+ -+ old_fs = get_fs(); -+ set_fs(KERNEL_DS); -+ ret = arch_ptrace(current, PTRACE_SETFPREGS, 0, (unsigned long)&fpregs); -+ set_fs(old_fs); -+ if (ret) -+ goto out; -+ } -+ -+out: -+ return ret; -+} -+ -+#endif /* CONFIG_X86_64 */ -Index: linux-2.6.git/arch/x86/vdso/vma.c -=================================================================== ---- linux-2.6.git.orig/arch/x86/vdso/vma.c -+++ linux-2.6.git/arch/x86/vdso/vma.c -@@ -137,6 +137,28 @@ up_fail: - return ret; - } - -+int arch_setup_additional_pages_at(struct linux_binprm *bprm, void *addr, int uses_interp) -+{ -+ struct mm_struct *mm = current->mm; -+ int ret; -+ -+ if (!vdso_enabled) -+ return 0; -+ -+ down_write(&mm->mmap_sem); -+ current->mm->context.vdso = addr; -+ ret = install_special_mapping(mm, (unsigned long)addr, vdso_size, -+ VM_READ | VM_EXEC | -+ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | -+ VM_ALWAYSDUMP, -+ vdso_pages); -+ if (ret) -+ current->mm->context.vdso = NULL; -+ -+ up_write(&mm->mmap_sem); -+ return ret; -+} -+ - static __init int vdso_setup(char *s) - { - vdso_enabled = simple_strtoul(s, NULL, 0); -Index: linux-2.6.git/fs/Kconfig.binfmt -=================================================================== ---- linux-2.6.git.orig/fs/Kconfig.binfmt -+++ linux-2.6.git/fs/Kconfig.binfmt -@@ -23,6 +23,17 @@ config BINFMT_ELF - ld.so (check the file for location and - latest version). - -+config BINFMT_ELF_CKPT -+ tristate "Kernel support for CKPT ELF binaries" -+ default n -+ depends on BINFMT_ELF && X86_64 -+ help -+ ELF CKPT (checkpoint) is an extension to ELF format to restore -+ checkpointed processes. It's not confirmed yet and highly -+ experimental. -+ -+ If unsure, say N. 
-+ - config COMPAT_BINFMT_ELF - bool - depends on COMPAT && BINFMT_ELF -Index: linux-2.6.git/fs/Makefile -=================================================================== ---- linux-2.6.git.orig/fs/Makefile -+++ linux-2.6.git/fs/Makefile -@@ -37,6 +37,7 @@ obj-$(CONFIG_BINFMT_MISC) += binfmt_misc - obj-y += binfmt_script.o - - obj-$(CONFIG_BINFMT_ELF) += binfmt_elf.o -+obj-$(CONFIG_BINFMT_ELF_CKPT) += binfmt_elf_ckpt.o - obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o - obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o - obj-$(CONFIG_BINFMT_SOM) += binfmt_som.o -Index: linux-2.6.git/fs/binfmt_elf.c -=================================================================== ---- linux-2.6.git.orig/fs/binfmt_elf.c -+++ linux-2.6.git/fs/binfmt_elf.c -@@ -30,6 +30,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -592,7 +593,11 @@ static int load_elf_binary(struct linux_ - if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) - goto out; - -- if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) -+ if (loc->elf_ex.e_type != ET_EXEC && -+#ifdef CONFIG_BINFMT_ELF_CKPT -+ loc->elf_ex.e_type != ET_CKPT && -+#endif -+ loc->elf_ex.e_type != ET_DYN) - goto out; - if (!elf_check_arch(&loc->elf_ex)) - goto out; -@@ -619,6 +624,16 @@ static int load_elf_binary(struct linux_ - goto out_free_ph; - } - -+#ifdef CONFIG_BINFMT_ELF_CKPT -+ if (loc->elf_ex.e_type == ET_CKPT) { -+ retval = load_elf_ckpt(bprm, regs, &loc->elf_ex, -+ (struct elf_phdr *)elf_phdata); -+ if (!retval) -+ set_binfmt(&elf_format); -+ goto out_free_ph; -+ } -+#endif -+ - elf_ppnt = elf_phdata; - elf_bss = 0; - elf_brk = 0; -Index: linux-2.6.git/fs/binfmt_elf_ckpt.c -=================================================================== ---- /dev/null -+++ linux-2.6.git/fs/binfmt_elf_ckpt.c -@@ -0,0 +1,356 @@ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include 
-+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include -+#include -+ -+#include -+#include -+#include -+ -+int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs, -+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr) -+{ -+ struct elf_phdr *elf_phdr_pages; -+ struct flex_array *fa = NULL; -+ struct vma_entry *vma_entry_ptr; -+ int nr_vma_found, nr_vma_mapped; -+ struct vma_entry vma_entry; -+ struct file *file = NULL; -+ unsigned long map_addr; -+ -+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES -+ unsigned long vdso = -1UL; -+#endif -+ -+ struct core_entry *core_entry = NULL; -+ unsigned long start_stack = -1UL; -+ -+ int i, ret = -ENOEXEC; -+ loff_t off; -+ -+ BUILD_BUG_ON(CKPT_TASK_COMM_LEN != TASK_COMM_LEN); -+ BUILD_BUG_ON(CKPT_PAGE_SIZE != PAGE_SIZE); -+ BUILD_BUG_ON(CKPT_CORE_SIZE != sizeof(*core_entry)); -+ -+ elf_phdr_pages = NULL; -+ nr_vma_found = 0; -+ nr_vma_mapped = 0; -+ -+ /* -+ * An early check for header version so if we fail here -+ * we would not need to use flex array at all. 
-+ */ -+ for (i = 0; i < elf_ex->e_phnum; i++) { -+ if (elf_phdr[i].p_type != PT_CKPT_CORE) -+ continue; -+ -+ core_entry = vmalloc(sizeof(*core_entry)); -+ if (!core_entry) { -+ ret = -ENOMEM; -+ goto out; -+ } -+ -+ ret = kernel_read(bprm->file, elf_phdr[i].p_offset, -+ (char *)core_entry, sizeof(*core_entry)); -+ if (ret != sizeof(*core_entry)) { -+ pr_err("elf-ckpt: Can't read core_entry\n"); -+ ret = -EIO; -+ goto out; -+ } -+ -+ if (core_entry->header.version != CKPT_HEADER_VERSION) { -+ pr_err("elf-ckpt: Unsupported or corrupted header\n"); -+ ret = -ENOEXEC; -+ goto out; -+ } -+ -+ break; -+ } -+ -+ if (i == elf_ex->e_phnum) { -+ pr_err("elf-ckpt: No header found\n"); -+ ret = -ENOEXEC; -+ goto out; -+ } -+ -+ -+ fa = flex_array_alloc(sizeof(vma_entry), elf_ex->e_phnum, GFP_KERNEL); -+ if (!fa || flex_array_prealloc(fa, 0, elf_ex->e_phnum, GFP_KERNEL)) { -+ ret = -ENOMEM; -+ if (fa) { -+ flex_array_free(fa); -+ fa = NULL; -+ goto out; -+ } -+ } -+ -+ ret = flush_exec_keep_thread(bprm); -+ if (ret) -+ goto out; -+ -+ current->flags &= ~PF_FORKNOEXEC; -+ current->mm->def_flags = 0; -+ -+ /* -+ * We don't care about parameters passed (such as argc, argv, env) -+ * when execute checkpoint file because we're to substitute -+ * all things anyway. 
-+ */ -+ do_munmap(current->mm, 0, TASK_SIZE); -+ -+ SET_PERSONALITY(loc->elf_ex); -+ -+ for (i = 0; i < elf_ex->e_phnum; i++) { -+ -+ switch (elf_phdr[i].p_type) { -+ case PT_CKPT_VMA: -+ ret = kernel_read(bprm->file, elf_phdr[i].p_offset, -+ (char *)&vma_entry, sizeof(vma_entry)); -+ if (ret != sizeof(vma_entry)) { -+ pr_err("elf-ckpt: Can't read vma_entry\n"); -+ ret = -EIO; -+ goto out; -+ } -+ if (flex_array_put(fa, i, &vma_entry, GFP_KERNEL)) -+ BUG(); -+ -+ /* We need to know if there is executable stack */ -+ if (vma_entry.status & VMA_AREA_STACK) { -+ if (vma_entry.flags & PROT_EXEC) -+ current->personality |= READ_IMPLIES_EXEC; -+ } -+ -+ nr_vma_found++; -+ continue; -+ case PT_CKPT_PAGES: -+ elf_phdr_pages = &elf_phdr[i]; -+ continue; -+ default: -+ continue; -+ } -+ } -+ -+ /* Be sure it has the file structure we expected to see. */ -+ if (!elf_phdr_pages || !nr_vma_found) { -+ ret = -ENOEXEC; -+ goto out; -+ } -+ -+ /* -+ * VMA randomization still needs to be set (just in case if -+ * the program we restore will exec() something else later). -+ */ -+ if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) -+ current->flags |= PF_RANDOMIZE; -+ -+ /* -+ * FIXME: Note it flushes signal handlers as well, -+ * so we need to dump queued signals and restore -+ * them here. 
-+ */ -+ setup_new_exec(bprm); -+ -+ current->mm->free_area_cache = current->mm->mmap_base; -+ current->mm->cached_hole_size = 0; -+ -+ for (i = 0; i < nr_vma_found; i++) { -+ vma_entry_ptr = flex_array_get(fa, i); -+ -+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES -+ if (vma_entry_ptr->status & VMA_AREA_VDSO) -+ vdso = vma_entry_ptr->start; -+#endif -+ -+ if (vma_entry_ptr->status & VMA_AREA_STACK) { -+ /* Note if stack is VM_GROWSUP -- it should be reversed */ -+ start_stack = vma_entry_ptr->start; -+ } -+ -+ /* Anything special should be ignored */ -+ if (!(vma_entry_ptr->status & VMA_AREA_REGULAR)) -+ continue; -+ -+ /* It's a file mmap'ed */ -+ if (vma_entry_ptr->fd != -1) { -+ file = fget((unsigned int)vma_entry_ptr->fd); -+ if (!file) { -+ ret = -EBADF; -+ goto out_unmap; -+ } -+ -+ /* Reuse this field to handle error cases */ -+ vma_entry_ptr->fd = (__u64)file; -+ } else -+ file = NULL; -+ -+ down_write(&current->mm->mmap_sem); -+ map_addr = do_mmap(file, -+ vma_entry_ptr->start, -+ vma_entry_ptr->end - vma_entry_ptr->start, -+ vma_entry_ptr->prot, -+ vma_entry_ptr->flags | MAP_FIXED, -+ vma_entry_ptr->pgoff); -+ up_write(&current->mm->mmap_sem); -+ -+ if (file) { -+ fput(file); -+ do_close((unsigned int)vma_entry_ptr->fd); -+ } -+ -+ if ((unsigned long)(map_addr) >= TASK_SIZE) { -+ ret = IS_ERR((void *)map_addr) ? 
PTR_ERR((void*)map_addr) : -EINVAL; -+ goto out_unmap; -+ } -+ -+ nr_vma_mapped++; -+ } -+ -+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES -+ if (vdso == -1UL) { -+ pr_err("elf-ckpt: Can't find VDSO address\n"); -+ ret = -ENOEXEC; -+ goto out_unmap; -+ } -+#endif -+ -+ if (start_stack == -1UL) { -+ pr_err("elf-ckpt: Can't find stack VMA\n"); -+ ret = -ENOEXEC; -+ goto out_unmap; -+ } -+ -+ /* The name it has before */ -+ set_task_comm(current, core_entry->task_comm); -+ -+ bprm->p = core_entry->mm_start_stack; -+ -+ current->mm->start_code = core_entry->mm_start_code; -+ current->mm->end_code = core_entry->mm_end_code; -+ current->mm->start_data = core_entry->mm_start_data; -+ current->mm->end_data = core_entry->mm_end_data; -+ current->mm->start_stack = core_entry->mm_start_stack; -+ current->mm->start_brk = core_entry->mm_start_brk; -+ current->mm->brk = core_entry->mm_brk; -+ -+#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES -+ ret = arch_setup_additional_pages_at(bprm, (void *)vdso, 0); -+ if (ret) { -+ pr_err("elf-ckpt: Can't setup additional pages at %lx with %d\n", -+ vdso, ret); -+ goto out_unmap; -+ } -+#endif -+ -+ /* -+ * Restore pages -+ */ -+ off = elf_phdr_pages->p_offset; -+ while (1) { -+ struct vm_area_struct *vma; -+ struct page *page; -+ void *page_data; -+ __u64 va; -+ -+ ret = kernel_read(bprm->file, off, (char *)&va, sizeof(va)); -+ if (ret != sizeof(va)) { -+ pr_err("elf-ckpt: Can't read page virtual address: " -+ "ret = %d off = %lx\n", ret, (unsigned long)off); -+ ret = -EIO; -+ goto out_unmap; -+ } -+ -+ /* End of pages reached */ -+ if (!va) -+ break; -+ -+ vma = find_vma(current->mm, (unsigned long)va); -+ if (!vma) { -+ pr_err("elf-ckpt: No VMA for page: %16lx\n", (unsigned long)va); -+ ret = -ESRCH; -+ goto out_unmap; -+ } -+ -+ ret = get_user_pages(current, current->mm, (unsigned long)va, -+ 1, 1, 1, &page, NULL); -+ if (ret != 1) { -+ pr_err("elf-ckpt: Can't get user page: %16lx\n", (unsigned long)va); -+ ret = -EFAULT; -+ goto out_unmap; -+ } -+ -+ 
page_data = kmap(page); -+ ret = kernel_read(bprm->file, off + sizeof(va), page_data, PAGE_SIZE); -+ kunmap(page); -+ put_page(page); -+ -+ if (ret != PAGE_SIZE) { -+ pr_err("elf-ckpt: Can't read data on page: %16lx\n", (unsigned long)va); -+ ret = -EFAULT; -+ goto out_unmap; -+ } -+ -+ off += sizeof(va) + PAGE_SIZE; -+ } -+ -+ /* -+ * Architecture specific setup for registers -+ * and friends, it's done lately since if -+ * an error happened before there is no much -+ * point to setup arch-specific things at all. -+ */ -+ ret = load_elf_ckpt_arch(current, regs, core_entry); -+ if (ret) -+ goto out_unmap; -+ -+ /* We're done */ -+ ret = 0; -+out: -+ if (core_entry) -+ vfree(core_entry); -+ -+ if (fa) -+ flex_array_free(fa); -+ return ret; -+ -+out_unmap: -+ for (i = 0; i < nr_vma_mapped; i++) { -+ vma_entry_ptr = flex_array_get(fa, i); -+ down_write(¤t->mm->mmap_sem); -+ do_munmap(current->mm, vma_entry_ptr->start, -+ vma_entry_ptr->end - vma_entry_ptr->start); -+ up_write(¤t->mm->mmap_sem); -+ } -+ -+ send_sig(SIGKILL, current, 0); -+ goto out; -+} -Index: linux-2.6.git/fs/exec.c -=================================================================== ---- linux-2.6.git.orig/fs/exec.c -+++ linux-2.6.git/fs/exec.c -@@ -1071,18 +1071,10 @@ void set_task_comm(struct task_struct *t - perf_event_comm(tsk); - } - --int flush_old_exec(struct linux_binprm * bprm) -+int flush_exec_keep_thread(struct linux_binprm * bprm) - { - int retval; - -- /* -- * Make sure we have a private signal table and that -- * we are unassociated from the previous thread group. 
-- */ -- retval = de_thread(current); -- if (retval) -- goto out; -- - set_mm_exe_file(bprm->mm, bprm->file); - - /* -@@ -1101,10 +1093,25 @@ int flush_old_exec(struct linux_binprm * - current->personality &= ~bprm->per_clear; - - return 0; -- - out: - return retval; - } -+EXPORT_SYMBOL(flush_exec_keep_thread); -+ -+int flush_old_exec(struct linux_binprm * bprm) -+{ -+ int retval; -+ -+ /* -+ * Make sure we have a private signal table and that -+ * we are unassociated from the previous thread group. -+ */ -+ retval = de_thread(current); -+ if (retval) -+ return retval; -+ -+ return flush_exec_keep_thread(bprm); -+} - EXPORT_SYMBOL(flush_old_exec); - - void would_dump(struct linux_binprm *bprm, struct file *file) -Index: linux-2.6.git/include/linux/binfmts.h -=================================================================== ---- linux-2.6.git.orig/include/linux/binfmts.h -+++ linux-2.6.git/include/linux/binfmts.h -@@ -110,6 +110,7 @@ extern int prepare_binprm(struct linux_b - extern int __must_check remove_arg_zero(struct linux_binprm *); - extern int search_binary_handler(struct linux_binprm *, struct pt_regs *); - extern int flush_old_exec(struct linux_binprm * bprm); -+extern int flush_exec_keep_thread(struct linux_binprm * bprm); - extern void setup_new_exec(struct linux_binprm * bprm); - extern void would_dump(struct linux_binprm *, struct file *); - -Index: linux-2.6.git/include/linux/elf_ckpt.h -=================================================================== ---- /dev/null -+++ linux-2.6.git/include/linux/elf_ckpt.h -@@ -0,0 +1,103 @@ -+#ifndef _LINUX_ELF_CHECKPOINT_H -+#define _LINUX_ELF_CHECKPOINT_H -+ -+#ifdef __KERNEL__ -+ -+#include -+#include -+ -+#include -+#include -+ -+/* -+ * Elf extension includes new Elf file type -+ * and program header types as well. 
-+ */ -+#define ET_CKPT 5 -+ -+#define PT_CKPT_OFFSET 0x01010101 -+ -+#define PT_CKPT_VMA (PT_LOOS + PT_CKPT_OFFSET + 1) -+#define PT_CKPT_CORE (PT_LOOS + PT_CKPT_OFFSET + 2) -+#define PT_CKPT_PAGES (PT_LOOS + PT_CKPT_OFFSET + 3) -+ -+#define CKPT_PAGE_SIZE 4096 -+#define CKPT_TASK_COMM_LEN 16 -+ -+#define CKPT_HEADER_VERSION 1 -+#define CKPT_HEADER_ARCH_X86_64 1 -+ -+#define VMA_AREA_REGULAR (1 << 0) -+#define VMA_AREA_STACK (1 << 1) -+#define VMA_AREA_VSYSCALL (1 << 2) -+#define VMA_AREA_VDSO (1 << 3) -+#define VMA_FORCE_READ (1 << 4) -+#define VMA_AREA_HEAP (1 << 5) -+#define VMA_FILE_PRIVATE (1 << 6) -+#define VMA_FILE_SHARED (1 << 7) -+#define VMA_ANON_SHARED (1 << 8) -+#define VMA_ANON_PRIVATE (1 << 9) -+#define VMA_FORCE_WRITE (1 << 10) -+ -+struct vma_entry { -+ __u64 start; -+ __u64 end; -+ __u64 pgoff; -+ __u32 prot; -+ __u32 flags; -+ __u32 status; /* from VMA_x above */ -+ __u32 pid; /* pid VMA belongs to */ -+ __s64 fd; -+ __u64 ino; -+ __u32 dev_maj; -+ __u32 dev_min; -+} __packed; -+ -+struct page_entry { -+ __u64 va; /* page virtual address */ -+ __u8 data[CKPT_PAGE_SIZE]; /* page contents */ -+} __packed; -+ -+struct image_header { -+ __u16 version; -+ __u16 arch; -+ __u32 flags; -+} __packed; -+ -+#define CKPT_ARCH_SIZE (2 * 4096) -+#define CKPT_CORE_SIZE (4 * 4096) -+ -+struct core_entry { -+ union { -+ struct { -+ struct image_header header; -+ __u8 arch[CKPT_ARCH_SIZE]; /* should be enough for all archs */ -+ __u32 task_personality; -+ __u8 task_comm[CKPT_TASK_COMM_LEN]; -+ __u32 task_flags; -+ __u64 mm_start_code; -+ __u64 mm_end_code; -+ __u64 mm_start_data; -+ __u64 mm_end_data; -+ __u64 mm_start_stack; -+ __u64 mm_start_brk; -+ __u64 mm_brk; -+ }; -+ __u8 __core_pad[CKPT_CORE_SIZE]; -+ }; -+} __packed; -+ -+#ifdef CONFIG_BINFMT_ELF_CKPT -+extern int load_elf_ckpt(struct linux_binprm *bprm, struct pt_regs *regs, -+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr); -+#else -+static inline int load_elf_ckpt(struct linux_binprm *bprm, struct 
pt_regs *regs, -+ struct elfhdr *elf_ex, struct elf_phdr *elf_phdr) -+{ -+ return -ENOEXEC; -+} -+#endif -+ -+#endif /* __KERNEL__ */ -+ -+#endif /* _LINUX_ELF_CHECKPOINT_H */ diff --git a/kernel/cr-clone-with-pid-support b/kernel/cr-clone-with-pid-support deleted file mode 100644 index 8625e54b2..000000000 --- a/kernel/cr-clone-with-pid-support +++ /dev/null @@ -1,183 +0,0 @@ -clone: Introduce the CLONE_CHILD_USEPID functionality - -From: Pavel Emelyanov - -When restoring a task (or a set of tasks) we need to recreate them with -exactly the same pid as they had before. Thus we need the ability to create -a task with specified pid. - -The proposal is to reuse the already free CLONE_STOPPED clone flag. - -About the security implication - this can create some problems with pids -wraparound and similar, so this approach can be restricted with the "don't -allow for CLONE_CHILD_USEPID when the current pid namespace has ever done -real pid allocation". This will work perfectly for checkpoint-restore and -will not give anyone chances for screwing pids up on a living system. 
- -Signed-off-by: Pavel Emelyanov ---- - include/linux/pid.h | 2 - - include/linux/sched.h | 1 - kernel/fork.c | 10 ++++++- - kernel/pid.c | 70 ++++++++++++++++++++++++++++++++++++-------------- - 4 files changed, 62 insertions(+), 21 deletions(-) - -Index: linux-2.6.git/include/linux/pid.h -=================================================================== ---- linux-2.6.git.orig/include/linux/pid.h -+++ linux-2.6.git/include/linux/pid.h -@@ -119,7 +119,7 @@ extern struct pid *find_get_pid(int nr); - extern struct pid *find_ge_pid(int nr, struct pid_namespace *); - int next_pidmap(struct pid_namespace *pid_ns, unsigned int last); - --extern struct pid *alloc_pid(struct pid_namespace *ns); -+extern struct pid *alloc_pid(struct pid_namespace *ns, int pid); - extern void free_pid(struct pid *pid); - - /* -Index: linux-2.6.git/include/linux/sched.h -=================================================================== ---- linux-2.6.git.orig/include/linux/sched.h -+++ linux-2.6.git/include/linux/sched.h -@@ -23,6 +23,7 @@ - #define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */ - /* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state) - and is now available for re-use. */ -+#define CLONE_CHILD_USEPID 0x02000000 /* use the given pid */ - #define CLONE_NEWUTS 0x04000000 /* New utsname group? 
*/ - #define CLONE_NEWIPC 0x08000000 /* New ipcs */ - #define CLONE_NEWUSER 0x10000000 /* New user namespace */ -Index: linux-2.6.git/kernel/fork.c -=================================================================== ---- linux-2.6.git.orig/kernel/fork.c -+++ linux-2.6.git/kernel/fork.c -@@ -1253,8 +1253,16 @@ static struct task_struct *copy_process( - goto bad_fork_cleanup_io; - - if (pid != &init_struct_pid) { -+ int want_pid = 0; -+ -+ if (clone_flags & CLONE_CHILD_USEPID) { -+ retval = get_user(want_pid, child_tidptr); -+ if (retval) -+ goto bad_fork_cleanup_io; -+ } -+ - retval = -ENOMEM; -- pid = alloc_pid(p->nsproxy->pid_ns); -+ pid = alloc_pid(p->nsproxy->pid_ns, want_pid); - if (!pid) - goto bad_fork_cleanup_io; - } -Index: linux-2.6.git/kernel/pid.c -=================================================================== ---- linux-2.6.git.orig/kernel/pid.c -+++ linux-2.6.git/kernel/pid.c -@@ -159,11 +159,55 @@ static void set_last_pid(struct pid_name - } while ((prev != last_write) && (pid_before(base, last_write, pid))); - } - --static int alloc_pidmap(struct pid_namespace *pid_ns) -+static int alloc_pidmap_page(struct pidmap *map) -+{ -+ if (unlikely(!map->page)) { -+ void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); -+ /* -+ * Free the page if someone raced with us -+ * installing it: -+ */ -+ spin_lock_irq(&pidmap_lock); -+ if (!map->page) { -+ map->page = page; -+ page = NULL; -+ } -+ spin_unlock_irq(&pidmap_lock); -+ kfree(page); -+ if (unlikely(!map->page)) -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static int set_pidmap(struct pid_namespace *pid_ns, int pid) -+{ -+ int offset; -+ struct pidmap *map; -+ -+ offset = pid & BITS_PER_PAGE_MASK; -+ map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; -+ -+ if (alloc_pidmap_page(map) < 0) -+ return -ENOMEM; -+ -+ if (!test_and_set_bit(offset, map->page)) { -+ atomic_dec(&map->nr_free); -+ return pid; -+ } -+ -+ return -EBUSY; -+} -+ -+static int alloc_pidmap(struct pid_namespace *pid_ns, int desired_pid) - { - int i, 
offset, max_scan, pid, last = pid_ns->last_pid; - struct pidmap *map; - -+ if (desired_pid) -+ return set_pidmap(pid_ns, desired_pid); -+ - pid = last + 1; - if (pid >= pid_max) - pid = RESERVED_PIDS; -@@ -176,22 +220,9 @@ static int alloc_pidmap(struct pid_names - */ - max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; - for (i = 0; i <= max_scan; ++i) { -- if (unlikely(!map->page)) { -- void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); -- /* -- * Free the page if someone raced with us -- * installing it: -- */ -- spin_lock_irq(&pidmap_lock); -- if (!map->page) { -- map->page = page; -- page = NULL; -- } -- spin_unlock_irq(&pidmap_lock); -- kfree(page); -- if (unlikely(!map->page)) -- break; -- } -+ if (alloc_pidmap_page(map) < 0) -+ break; -+ - if (likely(atomic_read(&map->nr_free))) { - do { - if (!test_and_set_bit(offset, map->page)) { -@@ -277,7 +308,7 @@ void free_pid(struct pid *pid) - call_rcu(&pid->rcu, delayed_put_pid); - } - --struct pid *alloc_pid(struct pid_namespace *ns) -+struct pid *alloc_pid(struct pid_namespace *ns, int this_ns_pid) - { - struct pid *pid; - enum pid_type type; -@@ -291,13 +322,14 @@ struct pid *alloc_pid(struct pid_namespa - - tmp = ns; - for (i = ns->level; i >= 0; i--) { -- nr = alloc_pidmap(tmp); -+ nr = alloc_pidmap(tmp, this_ns_pid); - if (nr < 0) - goto out_free; - - pid->numbers[i].nr = nr; - pid->numbers[i].ns = tmp; - tmp = tmp->parent; -+ this_ns_pid = 0; - } - - get_pid_ns(ns); diff --git a/kernel/cr-statfs-callback-for-pipefs b/kernel/cr-statfs-callback-for-pipefs deleted file mode 100644 index 6fae692af..000000000 --- a/kernel/cr-statfs-callback-for-pipefs +++ /dev/null @@ -1,27 +0,0 @@ -vfs: Add ->statfs callback for pipefs - -From: Pavel Emelyanov - -This is done to make it possible to distinguish pipes -from fifos when opening one via /proc//fd/ link. 
- -Signed-off-by: Pavel Emelyanov -Reviewed-by: Tejun Heo -Acked-by: Serge Hallyn -Signed-off-by: Cyrill Gorcunov ---- - fs/pipe.c | 1 + - 1 file changed, 1 insertion(+) - -Index: linux-2.6.git/fs/pipe.c -=================================================================== ---- linux-2.6.git.orig/fs/pipe.c -+++ linux-2.6.git/fs/pipe.c -@@ -1254,6 +1254,7 @@ out: - - static const struct super_operations pipefs_ops = { - .destroy_inode = free_inode_nonrcu, -+ .statfs = simple_statfs, - }; - - /* diff --git a/kernel/fs-add-do-close b/kernel/fs-add-do-close deleted file mode 100644 index c6ca7e4f7..000000000 --- a/kernel/fs-add-do-close +++ /dev/null @@ -1,85 +0,0 @@ -fs: Add do_close helper - -To be able to close file descriptors right from inside -kernel space do_close() helper is added. - -Signed-off-by: Pavel Emelyanov -Signed-off-by: Cyrill Gorcunov ---- - fs/open.c | 32 ++++++++++++++++++++------------ - include/linux/fs.h | 1 + - 2 files changed, 21 insertions(+), 12 deletions(-) - -Index: linux-2.6.git/fs/open.c -=================================================================== ---- linux-2.6.git.orig/fs/open.c -+++ linux-2.6.git/fs/open.c -@@ -1056,17 +1056,11 @@ int filp_close(struct file *filp, fl_own - - EXPORT_SYMBOL(filp_close); - --/* -- * Careful here! We test whether the file pointer is NULL before -- * releasing the fd. This ensures that one clone task can't release -- * an fd while another clone is opening it. 
-- */ --SYSCALL_DEFINE1(close, unsigned int, fd) -+int do_close(unsigned int fd) - { - struct file * filp; - struct files_struct *files = current->files; - struct fdtable *fdt; -- int retval; - - spin_lock(&files->file_lock); - fdt = files_fdtable(files); -@@ -1079,7 +1073,25 @@ SYSCALL_DEFINE1(close, unsigned int, fd) - FD_CLR(fd, fdt->close_on_exec); - __put_unused_fd(files, fd); - spin_unlock(&files->file_lock); -- retval = filp_close(filp, files); -+ -+ return filp_close(filp, files); -+ -+out_unlock: -+ spin_unlock(&files->file_lock); -+ return -EBADF; -+} -+EXPORT_SYMBOL_GPL(do_close); -+ -+/* -+ * Careful here! We test whether the file pointer is NULL before -+ * releasing the fd. This ensures that one clone task can't release -+ * an fd while another clone is opening it. -+ */ -+SYSCALL_DEFINE1(close, unsigned int, fd) -+{ -+ int retval; -+ -+ retval = do_close(fd); - - /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || -@@ -1089,10 +1101,6 @@ SYSCALL_DEFINE1(close, unsigned int, fd) - retval = -EINTR; - - return retval; -- --out_unlock: -- spin_unlock(&files->file_lock); -- return -EBADF; - } - EXPORT_SYMBOL(sys_close); - -Index: linux-2.6.git/include/linux/fs.h -=================================================================== ---- linux-2.6.git.orig/include/linux/fs.h -+++ linux-2.6.git/include/linux/fs.h -@@ -2025,6 +2025,7 @@ extern struct file *file_open_root(struc - extern struct file * dentry_open(struct dentry *, struct vfsmount *, int, - const struct cred *); - extern int filp_close(struct file *, fl_owner_t id); -+extern int do_close(unsigned int fd); - extern char * getname(const char __user *); - - /* fs/ioctl.c */ diff --git a/kernel/fs-proc-add-tls b/kernel/fs-proc-add-tls deleted file mode 100644 index c0ddc10e3..000000000 --- a/kernel/fs-proc-add-tls +++ /dev/null @@ -1,49 +0,0 @@ -fs, proc: Add /proc/$pid/tls entry - -To be able to restart checkpointed tasks we need -to know 
TLS status at dumping time. Export this -information by /proc/$pid/tls entry. - -Signed-off-by: Cyrill Gorcunov ---- - fs/proc/base.c | 20 ++++++++++++++++++++ - 1 file changed, 20 insertions(+) - -Index: linux-2.6.git/fs/proc/base.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/base.c -+++ linux-2.6.git/fs/proc/base.c -@@ -3150,6 +3150,23 @@ static int proc_pid_personality(struct s - return err; - } - -+#ifdef CONFIG_X86 -+static int proc_pid_tls(struct seq_file *m, struct pid_namespace *ns, -+ struct pid *pid, struct task_struct *task) -+{ -+ int err = lock_trace(task); -+ if (!err) { -+ int i; -+ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) -+ seq_printf(m, "%x %x\n", -+ task->thread.tls_array[i].a, -+ task->thread.tls_array[i].b); -+ unlock_trace(task); -+ } -+ return err; -+} -+#endif -+ - /* - * Thread groups - */ -@@ -3169,6 +3186,9 @@ static const struct pid_entry tgid_base_ - INF("auxv", S_IRUSR, proc_pid_auxv), - ONE("status", S_IRUGO, proc_pid_status), - ONE("personality", S_IRUGO, proc_pid_personality), -+#ifdef CONFIG_X86 -+ ONE("tls", S_IRUGO, proc_pid_tls), -+#endif - INF("limits", S_IRUGO, proc_pid_limits), - #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch deleted file mode 100644 index 70d259330..000000000 --- a/kernel/proc-fix-races-against-execve-of-proc-pid-fd-fix.patch +++ /dev/null @@ -1,28 +0,0 @@ -From: Vasiliy Kulikov - -In the patch "proc: fix races against execve() of /proc/PID/fd**" -proc_pid_fd_link_getattr() leaked task_struct if ptrace check fails. 
- -Signed-off-by: Vasiliy Kulikov -Reported-by: Cyrill Gorcunov -Signed-off-by: Andrew Morton ---- - - fs/proc/base.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff -puN fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix fs/proc/base.c ---- a/fs/proc/base.c~proc-fix-races-against-execve-of-proc-pid-fd-fix -+++ a/fs/proc/base.c -@@ -1681,9 +1681,9 @@ static int proc_pid_fd_link_getattr(stru - - generic_fillattr(inode, stat); - unlock_trace(task); -- put_task_struct(task); - rc = 0; - out_task: -+ put_task_struct(task); - return rc; - } - -_ diff --git a/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch b/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch deleted file mode 100644 index d2ef913a7..000000000 --- a/kernel/proc-fix-races-against-execve-of-proc-pid-fd.patch +++ /dev/null @@ -1,255 +0,0 @@ -From: Vasiliy Kulikov - -fd* files are restricted to the task's owner, and other users may not get -direct access to them. But one may open any of these files and run any -setuid program, keeping opened file descriptors. As there are permission -checks on open(), but not on readdir() and read(), operations on the kept -file descriptors will not be checked. It makes it possible to violate -procfs permission model. - -Reading fdinfo/* may disclosure current fds' position and flags, reading -directory contents of fdinfo/ and fd/ may disclosure the number of opened -files by the target task. This information is not sensible per se, but it -can reveal some private information (like length of a password stored in a -file) under certain conditions. - -Used existing (un)lock_trace functions to check for ptrace_may_access(), -but instead of using EPERM return code from it use EACCES to be consistent -with existing proc_pid_follow_link()/proc_pid_readlink() return code. If -they differ, attacker can guess what fds exist by analyzing stat() return -code. 
Patched handlers: stat() for fd/*, stat() and read() for fdindo/*, -readdir() and lookup() for fd/ and fdinfo/. - -Signed-off-by: Vasiliy Kulikov -Cc: Cyrill Gorcunov -Cc: -Signed-off-by: Andrew Morton ---- - - fs/proc/base.c | 146 ++++++++++++++++++++++++++++++++++++++++----------------- - 1 file changed, 103 insertions(+), 43 deletions(-) - -Index: linux-2.6.git/fs/proc/base.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/base.c -+++ linux-2.6.git/fs/proc/base.c -@@ -1665,12 +1665,46 @@ out: - return error; - } - -+static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, -+ struct kstat *stat) -+{ -+ struct inode *inode = dentry->d_inode; -+ struct task_struct *task = get_proc_task(inode); -+ int rc; -+ -+ if (task == NULL) -+ return -ESRCH; -+ -+ rc = -EACCES; -+ if (lock_trace(task)) -+ goto out_task; -+ -+ generic_fillattr(inode, stat); -+ unlock_trace(task); -+ put_task_struct(task); -+ rc = 0; -+out_task: -+ return rc; -+} -+ - static const struct inode_operations proc_pid_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - }; - -+static const struct inode_operations proc_fdinfo_link_inode_operations = { -+ .setattr = proc_setattr, -+ .getattr = proc_pid_fd_link_getattr, -+}; -+ -+static const struct inode_operations proc_fd_link_inode_operations = { -+ .readlink = proc_pid_readlink, -+ .follow_link = proc_pid_follow_link, -+ .setattr = proc_setattr, -+ .getattr = proc_pid_fd_link_getattr, -+}; -+ - - /* building an inode */ - -@@ -1902,49 +1936,61 @@ out: - - static int proc_fd_info(struct inode *inode, struct path *path, char *info) - { -- struct task_struct *task = get_proc_task(inode); -- struct files_struct *files = NULL; -+ struct task_struct *task; -+ struct files_struct *files; - struct file *file; - int fd = proc_fd(inode); -+ int rc; - -- if (task) { -- files = get_files_struct(task); -- 
put_task_struct(task); -- } -- if (files) { -- /* -- * We are not taking a ref to the file structure, so we must -- * hold ->file_lock. -- */ -- spin_lock(&files->file_lock); -- file = fcheck_files(files, fd); -- if (file) { -- unsigned int f_flags; -- struct fdtable *fdt; -- -- fdt = files_fdtable(files); -- f_flags = file->f_flags & ~O_CLOEXEC; -- if (FD_ISSET(fd, fdt->close_on_exec)) -- f_flags |= O_CLOEXEC; -- -- if (path) { -- *path = file->f_path; -- path_get(&file->f_path); -- } -- if (info) -- snprintf(info, PROC_FDINFO_MAX, -- "pos:\t%lli\n" -- "flags:\t0%o\n", -- (long long) file->f_pos, -- f_flags); -- spin_unlock(&files->file_lock); -- put_files_struct(files); -- return 0; -+ task = get_proc_task(inode); -+ if (!task) -+ return -ENOENT; -+ -+ rc = -EACCES; -+ if (lock_trace(task)) -+ goto out_task; -+ -+ rc = -ENOENT; -+ files = get_files_struct(task); -+ if (files == NULL) -+ goto out_unlock; -+ -+ /* -+ * We are not taking a ref to the file structure, so we must -+ * hold ->file_lock. 
-+ */ -+ spin_lock(&files->file_lock); -+ file = fcheck_files(files, fd); -+ if (file) { -+ unsigned int f_flags; -+ struct fdtable *fdt; -+ -+ fdt = files_fdtable(files); -+ f_flags = file->f_flags & ~O_CLOEXEC; -+ if (FD_ISSET(fd, fdt->close_on_exec)) -+ f_flags |= O_CLOEXEC; -+ -+ if (path) { -+ *path = file->f_path; -+ path_get(&file->f_path); - } -- spin_unlock(&files->file_lock); -- put_files_struct(files); -- } -- return -ENOENT; -+ if (info) -+ snprintf(info, PROC_FDINFO_MAX, -+ "pos:\t%lli\n" -+ "flags:\t0%o\n", -+ (long long) file->f_pos, -+ f_flags); -+ rc = 0; -+ } else -+ rc = -ENOENT; -+ spin_unlock(&files->file_lock); -+ put_files_struct(files); -+ -+out_unlock: -+ unlock_trace(task); -+out_task: -+ put_task_struct(task); -+ return rc; - } - - static int proc_fd_link(struct inode *inode, struct path *path) -@@ -2039,7 +2085,7 @@ static struct dentry *proc_fd_instantiat - spin_unlock(&files->file_lock); - put_files_struct(files); - -- inode->i_op = &proc_pid_link_inode_operations; -+ inode->i_op = &proc_fd_link_inode_operations; - inode->i_size = 64; - ei->op.proc_get_link = proc_fd_link; - d_set_d_op(dentry, &tid_fd_dentry_operations); -@@ -2071,7 +2117,12 @@ static struct dentry *proc_lookupfd_comm - if (fd == ~0U) - goto out; - -+ result = ERR_PTR(-EACCES); -+ if (lock_trace(task)) -+ goto out; -+ - result = instantiate(dir, dentry, task, &fd); -+ unlock_trace(task); - out: - put_task_struct(task); - out_no_task: -@@ -2091,23 +2142,28 @@ static int proc_readfd_common(struct fil - retval = -ENOENT; - if (!p) - goto out_no_task; -+ -+ retval = -EACCES; -+ if (lock_trace(p)) -+ goto out; -+ - retval = 0; - - fd = filp->f_pos; - switch (fd) { - case 0: - if (filldir(dirent, ".", 1, 0, inode->i_ino, DT_DIR) < 0) -- goto out; -+ goto out_unlock; - filp->f_pos++; - case 1: - ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) -- goto out; -+ goto out_unlock; - filp->f_pos++; - default: - files = get_files_struct(p); - if (!files) 
-- goto out; -+ goto out_unlock; - rcu_read_lock(); - for (fd = filp->f_pos-2; - fd < files_fdtable(files)->max_fds; -@@ -2131,6 +2187,9 @@ static int proc_readfd_common(struct fil - rcu_read_unlock(); - put_files_struct(files); - } -+ -+out_unlock: -+ unlock_trace(p); - out: - put_task_struct(p); - out_no_task: -@@ -2208,6 +2267,7 @@ static struct dentry *proc_fdinfo_instan - ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; - inode->i_fop = &proc_fdinfo_file_operations; -+ inode->i_op = &proc_fdinfo_link_inode_operations; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ diff --git a/kernel/proc-force-dcache-drop-on-unauthorized-access.patch b/kernel/proc-force-dcache-drop-on-unauthorized-access.patch deleted file mode 100644 index bfe6bf1a8..000000000 --- a/kernel/proc-force-dcache-drop-on-unauthorized-access.patch +++ /dev/null @@ -1,118 +0,0 @@ -From: Vasiliy Kulikov - -The patch "proc: fix races against execve() of /proc/PID/fd**" is still a -partial fix for a setxid problem. link(2) is a yet another way to -identify whether a specific fd is opened by a privileged process. By -calling link(2) against /proc/PID/fd/* an attacker may identify whether -the fd number is valid for PID by analysing link(2) return code. - -Both getattr() and link() can be used by the attacker iff the dentry is -present in the dcache. In this case ->lookup() is not called and the only -way to check ptrace permissions is either operation handler or -->revalidate(). The easiest solution to prevent any unauthorized access -to /proc/PID/fd*/ files is to force the dentry drop on each unauthorized -access attempt. - -If an attacker keeps opened fd of /proc/PID/fd/ and dcache contains a -specific dentry for some /proc/PID/fd/XXX, any future attemp to use the -dentry by the attacker would lead to the dentry drop as a result of a -failed ptrace check in ->revalidate(). 
Then the attacker cannot spawn a -dentry for the specific fd number because of ptrace check in ->lookup(). - -The dentry drop can be still observed by an attacker by analysing -information from /proc/slabinfo, which is addressed in the successive -patch. - -Signed-off-by: Vasiliy Kulikov -Cc: Cyrill Gorcunov -Cc: Al Viro -Cc: Christoph Lameter -Cc: Pekka Enberg -Cc: Matt Mackall -Cc: Alexey Dobriyan -Signed-off-by: Andrew Morton ---- - - fs/proc/base.c | 42 ++++++------------------------------------ - 1 file changed, 6 insertions(+), 36 deletions(-) - -Index: linux-2.6.git/fs/proc/base.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/base.c -+++ linux-2.6.git/fs/proc/base.c -@@ -1665,46 +1665,12 @@ out: - return error; - } - --static int proc_pid_fd_link_getattr(struct vfsmount *mnt, struct dentry *dentry, -- struct kstat *stat) --{ -- struct inode *inode = dentry->d_inode; -- struct task_struct *task = get_proc_task(inode); -- int rc; -- -- if (task == NULL) -- return -ESRCH; -- -- rc = -EACCES; -- if (lock_trace(task)) -- goto out_task; -- -- generic_fillattr(inode, stat); -- unlock_trace(task); -- rc = 0; --out_task: -- put_task_struct(task); -- return rc; --} -- - static const struct inode_operations proc_pid_link_inode_operations = { - .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, - .setattr = proc_setattr, - }; - --static const struct inode_operations proc_fdinfo_link_inode_operations = { -- .setattr = proc_setattr, -- .getattr = proc_pid_fd_link_getattr, --}; -- --static const struct inode_operations proc_fd_link_inode_operations = { -- .readlink = proc_pid_readlink, -- .follow_link = proc_pid_follow_link, -- .setattr = proc_setattr, -- .getattr = proc_pid_fd_link_getattr, --}; -- - - /* building an inode */ - -@@ -2013,6 +1979,11 @@ static int tid_fd_revalidate(struct dent - task = get_proc_task(inode); - fd = proc_fd(inode); - -+ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { 
-+ put_task_struct(task); -+ task = NULL; -+ } -+ - if (task) { - files = get_files_struct(task); - if (files) { -@@ -2085,7 +2056,7 @@ static struct dentry *proc_fd_instantiat - spin_unlock(&files->file_lock); - put_files_struct(files); - -- inode->i_op = &proc_fd_link_inode_operations; -+ inode->i_op = &proc_pid_link_inode_operations; - inode->i_size = 64; - ei->op.proc_get_link = proc_fd_link; - d_set_d_op(dentry, &tid_fd_dentry_operations); -@@ -2267,7 +2238,6 @@ static struct dentry *proc_fdinfo_instan - ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; - inode->i_fop = &proc_fdinfo_file_operations; -- inode->i_op = &proc_fdinfo_link_inode_operations; - d_set_d_op(dentry, &tid_fd_dentry_operations); - d_add(dentry, inode); - /* Close the race of the process dying before we return the dentry */ diff --git a/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch b/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch deleted file mode 100644 index 2aed2c557..000000000 --- a/kernel/procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch +++ /dev/null @@ -1,26 +0,0 @@ -From: Pavel Emelyanov - -On reading sysctl dirs we should return -EISDIR instead of -EINVAL. 
- -Signed-off-by: Pavel Emelyanov -Signed-off-by: Cyrill Gorcunov -Cc: Alexey Dobriyan -Cc: Al Viro -Signed-off-by: Andrew Morton ---- - - fs/proc/proc_sysctl.c | 1 + - 1 file changed, 1 insertion(+) - -Index: linux-2.6.git/fs/proc/proc_sysctl.c -=================================================================== ---- linux-2.6.git.orig/fs/proc/proc_sysctl.c -+++ linux-2.6.git/fs/proc/proc_sysctl.c -@@ -370,6 +370,7 @@ static const struct file_operations proc - }; - - static const struct file_operations proc_sys_dir_file_operations = { -+ .read = generic_read_dir, - .readdir = proc_sys_readdir, - .llseek = generic_file_llseek, - }; diff --git a/kernel/readme b/kernel/readme deleted file mode 100644 index a638c1e1f..000000000 --- a/kernel/readme +++ /dev/null @@ -1,13 +0,0 @@ -The kernel patches series. See "series" file to obtain -order of appliance. Not all patches do address C/R directly -but some of them are needed due to dependencies. - -The following patches are known to be in -mm tree already - -procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch -proc-fix-races-against-execve-of-proc-pid-fd.patch -proc-fix-races-against-execve-of-proc-pid-fd-fix.patch -proc-force-dcache-drop-on-unauthorized-access.patch -cr-statfs-callback-for-pipefs - -Has been tested on Linux 3.1-rc3. diff --git a/kernel/series b/kernel/series deleted file mode 100644 index 80d8eb663..000000000 --- a/kernel/series +++ /dev/null @@ -1,13 +0,0 @@ -procfs-report-eisdir-when-reading-sysctl-dirs-in-proc.patch -proc-fix-races-against-execve-of-proc-pid-fd.patch -proc-fix-races-against-execve-of-proc-pid-fd-fix.patch -proc-force-dcache-drop-on-unauthorized-access.patch -cr-statfs-callback-for-pipefs -fs-proc-switch-to-dentry -cr-proc-map-files-21 -cr-clone-with-pid-support -cr-proc-add-children -fs-add-do-close -fs-proc-add-tls -fs-proc-add-mm-task-stat -binfmt-elf-for-cr-5