mirror of
https://github.com/checkpoint-restore/criu
synced 2025-08-31 14:25:49 +00:00
kernel: Update kernel patches
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
This commit is contained in:
46
kernel/c-r-introduce-checkpoint_restore-symbol.patch
Normal file
46
kernel/c-r-introduce-checkpoint_restore-symbol.patch
Normal file
@@ -0,0 +1,46 @@
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: c/r: introduce CHECKPOINT_RESTORE symbol
|
||||
|
||||
For checkpoint/restore we need auxiliary features being compiled into the
|
||||
kernel, such as additional prctl codes, /proc/<pid>/map_files and etc...
|
||||
but at the same time these features are not mandatory for a regular kernel so
|
||||
CHECKPOINT_RESTORE config symbol should bring a way to disable them all at
|
||||
once if one wishes to get rid of additional functionality.
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Andrew Vagin <avagin@openvz.org>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: Vasiliy Kulikov <segoon@openwall.com>
|
||||
Reviewed-by: Kees Cook <keescook@chromium.org>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Alexey Dobriyan <adobriyan@gmail.com>
|
||||
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
|
||||
init/Kconfig | 11 +++++++++++
|
||||
1 file changed, 11 insertions(+)
|
||||
|
||||
Index: linux-2.6.git/init/Kconfig
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/init/Kconfig
|
||||
+++ linux-2.6.git/init/Kconfig
|
||||
@@ -773,6 +773,17 @@ config DEBUG_BLK_CGROUP
|
||||
|
||||
endif # CGROUPS
|
||||
|
||||
+config CHECKPOINT_RESTORE
|
||||
+ bool "Checkpoint/restore support" if EXPERT
|
||||
+ default n
|
||||
+ help
|
||||
+ Enables additional kernel features in a sake of checkpoint/restore.
|
||||
+ In particular it adds auxiliary prctl codes to setup process text,
|
||||
+ data and heap segment sizes, and a few additional /proc filesystem
|
||||
+ entries.
|
||||
+
|
||||
+ If unsure, say N here.
|
||||
+
|
||||
menuconfig NAMESPACES
|
||||
bool "Namespaces support" if EXPERT
|
||||
default !EXPERT
|
@@ -0,0 +1,117 @@
|
||||
From: Andrew Morton <akpm@linux-foundation.org>
|
||||
Subject: c-r-prctl-add-pr_set_mm-codes-to-set-up-mm_struct-entries-fix
|
||||
|
||||
cache current->mm in a local, saving 200 bytes text
|
||||
|
||||
Cc: Andrew Vagin <avagin@openvz.org>
|
||||
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Kees Cook <keescook@chromium.org>
|
||||
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
|
||||
Cc: Pavel Emelyanov <xemul@parallels.com>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vasiliy Kulikov <segoon@openwall.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
|
||||
kernel/sys.c | 33 +++++++++++++++++----------------
|
||||
1 file changed, 17 insertions(+), 16 deletions(-)
|
||||
|
||||
Index: linux-2.6.git/kernel/sys.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/kernel/sys.c
|
||||
+++ linux-2.6.git/kernel/sys.c
|
||||
@@ -1701,6 +1701,7 @@ static int prctl_set_mm(int opt, unsigne
|
||||
unsigned long vm_bad_flags;
|
||||
struct vm_area_struct *vma;
|
||||
int error = 0;
|
||||
+ struct mm_struct *mm = current->mm;
|
||||
|
||||
if (arg4 | arg5)
|
||||
return -EINVAL;
|
||||
@@ -1711,8 +1712,8 @@ static int prctl_set_mm(int opt, unsigne
|
||||
if (addr >= TASK_SIZE)
|
||||
return -EINVAL;
|
||||
|
||||
- down_read(¤t->mm->mmap_sem);
|
||||
- vma = find_vma(current->mm, addr);
|
||||
+ down_read(&mm->mmap_sem);
|
||||
+ vma = find_vma(mm, addr);
|
||||
|
||||
if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
|
||||
/* It must be existing VMA */
|
||||
@@ -1732,9 +1733,9 @@ static int prctl_set_mm(int opt, unsigne
|
||||
goto out;
|
||||
|
||||
if (opt == PR_SET_MM_START_CODE)
|
||||
- current->mm->start_code = addr;
|
||||
+ mm->start_code = addr;
|
||||
else
|
||||
- current->mm->end_code = addr;
|
||||
+ mm->end_code = addr;
|
||||
break;
|
||||
|
||||
case PR_SET_MM_START_DATA:
|
||||
@@ -1747,9 +1748,9 @@ static int prctl_set_mm(int opt, unsigne
|
||||
goto out;
|
||||
|
||||
if (opt == PR_SET_MM_START_DATA)
|
||||
- current->mm->start_data = addr;
|
||||
+ mm->start_data = addr;
|
||||
else
|
||||
- current->mm->end_data = addr;
|
||||
+ mm->end_data = addr;
|
||||
break;
|
||||
|
||||
case PR_SET_MM_START_STACK:
|
||||
@@ -1762,31 +1763,31 @@ static int prctl_set_mm(int opt, unsigne
|
||||
if ((vma->vm_flags & vm_req_flags) != vm_req_flags)
|
||||
goto out;
|
||||
|
||||
- current->mm->start_stack = addr;
|
||||
+ mm->start_stack = addr;
|
||||
break;
|
||||
|
||||
case PR_SET_MM_START_BRK:
|
||||
- if (addr <= current->mm->end_data)
|
||||
+ if (addr <= mm->end_data)
|
||||
goto out;
|
||||
|
||||
if (rlim < RLIM_INFINITY &&
|
||||
- (current->mm->brk - addr) +
|
||||
- (current->mm->end_data - current->mm->start_data) > rlim)
|
||||
+ (mm->brk - addr) +
|
||||
+ (mm->end_data - mm->start_data) > rlim)
|
||||
goto out;
|
||||
|
||||
- current->mm->start_brk = addr;
|
||||
+ mm->start_brk = addr;
|
||||
break;
|
||||
|
||||
case PR_SET_MM_BRK:
|
||||
- if (addr <= current->mm->end_data)
|
||||
+ if (addr <= mm->end_data)
|
||||
goto out;
|
||||
|
||||
if (rlim < RLIM_INFINITY &&
|
||||
- (addr - current->mm->start_brk) +
|
||||
- (current->mm->end_data - current->mm->start_data) > rlim)
|
||||
+ (addr - mm->start_brk) +
|
||||
+ (mm->end_data - mm->start_data) > rlim)
|
||||
goto out;
|
||||
|
||||
- current->mm->brk = addr;
|
||||
+ mm->brk = addr;
|
||||
break;
|
||||
|
||||
default:
|
||||
@@ -1797,7 +1798,7 @@ static int prctl_set_mm(int opt, unsigne
|
||||
error = 0;
|
||||
|
||||
out:
|
||||
- up_read(¤t->mm->mmap_sem);
|
||||
+ up_read(&mm->mmap_sem);
|
||||
|
||||
return error;
|
||||
}
|
@@ -1,28 +1,48 @@
|
||||
prctl: Add PR_SET_MM codes to tune up mm_struct entries v2
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: c/r: prctl: add PR_SET_MM codes to set up mm_struct entries
|
||||
|
||||
A few members of mm_struct such as start_code, end_code,
|
||||
start_data, end_data, start_stack, start_brk, brk provided
|
||||
by the kernel via /proc/$pid/stat and we use it at checkpoint
|
||||
time.
|
||||
When we restore a task we need to set up text, data and data heap sizes
|
||||
from userspace to the values a task had at checkpoint time. This patch
|
||||
adds auxiliary prctl codes for that.
|
||||
|
||||
At restore time we need a mechanism to restore those values
|
||||
back and for this sake PR_SET_MM prctl code is introduced.
|
||||
While most of them have a statistical nature (their values are involved
|
||||
into calculation of /proc/<pid>/statm output) the start_brk and brk values
|
||||
are used to compute an allowed size of program data segment expansion.
|
||||
Which means arbitrary changes of these values might be a dangerous
|
||||
operation. So to restrict access the following requirements applied to
|
||||
prctl calls:
|
||||
|
||||
Note because of being a dangerous operation this interface
|
||||
is allowed for CAP_SYS_ADMIN only.
|
||||
- The process has to have CAP_SYS_ADMIN capability granted.
|
||||
- For all opcodes except start_brk/brk members an appropriate
|
||||
VMA area must exist and should fit certain VMA flags,
|
||||
such as:
|
||||
- code segment must be executable but not writable;
|
||||
- data segment must not be executable.
|
||||
|
||||
v2:
|
||||
- Add a check for vma start address, testing for vma ending
|
||||
address is not enough. From Kees Cook.
|
||||
start_brk/brk values must not intersect with data segment and must not
|
||||
exceed RLIMIT_DATA resource limit.
|
||||
|
||||
- Add some sanity tests for assigned addresses.
|
||||
Still the main guard is CAP_SYS_ADMIN capability check.
|
||||
|
||||
Note the kernel should be compiled with CONFIG_CHECKPOINT_RESTORE support
|
||||
otherwise these prctl calls will return -EINVAL.
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
CC: Kees Cook <keescook@chromium.org>
|
||||
Cc: Andrew Morton <akpm@linux-foundation.org>
|
||||
Reviewed-by: Kees Cook <keescook@chromium.org>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Andrew Vagin <avagin@openvz.org>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: Pavel Emelyanov <xemul@parallels.com>
|
||||
Cc: Vasiliy Kulikov <segoon@openwall.com>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
|
||||
include/linux/prctl.h | 12 +++++
|
||||
kernel/sys.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 130 insertions(+)
|
||||
kernel/sys.c | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++
|
||||
2 files changed, 132 insertions(+)
|
||||
|
||||
Index: linux-2.6.git/include/linux/prctl.h
|
||||
===================================================================
|
||||
@@ -49,34 +69,33 @@ Index: linux-2.6.git/kernel/sys.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/kernel/sys.c
|
||||
+++ linux-2.6.git/kernel/sys.c
|
||||
@@ -1692,6 +1692,118 @@ SYSCALL_DEFINE1(umask, int, mask)
|
||||
@@ -1692,6 +1692,123 @@ SYSCALL_DEFINE1(umask, int, mask)
|
||||
return mask;
|
||||
}
|
||||
|
||||
+static int prctl_set_mm(int opt, unsigned long addr)
|
||||
+#ifdef CONFIG_CHECKPOINT_RESTORE
|
||||
+static int prctl_set_mm(int opt, unsigned long addr,
|
||||
+ unsigned long arg4, unsigned long arg5)
|
||||
+{
|
||||
+ unsigned long rlim = rlimit(RLIMIT_DATA);
|
||||
+ unsigned long vm_req_flags;
|
||||
+ unsigned long vm_bad_flags;
|
||||
+ struct vm_area_struct *vma;
|
||||
+ struct mm_struct *mm;
|
||||
+ int error = 0;
|
||||
+
|
||||
+ if (arg4 | arg5)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ if (!capable(CAP_SYS_ADMIN))
|
||||
+ return -EPERM;
|
||||
+
|
||||
+ if (addr >= TASK_SIZE)
|
||||
+ return -EINVAL;
|
||||
+
|
||||
+ mm = get_task_mm(current);
|
||||
+ if (!mm)
|
||||
+ return -ENOENT;
|
||||
+ down_read(¤t->mm->mmap_sem);
|
||||
+ vma = find_vma(current->mm, addr);
|
||||
+
|
||||
+ down_read(&mm->mmap_sem);
|
||||
+ vma = find_vma(mm, addr);
|
||||
+
|
||||
+ if (opt != PR_SET_MM_START_BRK &&
|
||||
+ opt != PR_SET_MM_BRK) {
|
||||
+ if (opt != PR_SET_MM_START_BRK && opt != PR_SET_MM_BRK) {
|
||||
+ /* It must be existing VMA */
|
||||
+ if (!vma || vma->vm_start > addr)
|
||||
+ goto out;
|
||||
@@ -86,7 +105,6 @@ Index: linux-2.6.git/kernel/sys.c
|
||||
+ switch (opt) {
|
||||
+ case PR_SET_MM_START_CODE:
|
||||
+ case PR_SET_MM_END_CODE:
|
||||
+
|
||||
+ vm_req_flags = VM_READ | VM_EXEC;
|
||||
+ vm_bad_flags = VM_WRITE | VM_MAYSHARE;
|
||||
+
|
||||
@@ -102,7 +120,6 @@ Index: linux-2.6.git/kernel/sys.c
|
||||
+
|
||||
+ case PR_SET_MM_START_DATA:
|
||||
+ case PR_SET_MM_END_DATA:
|
||||
+
|
||||
+ vm_req_flags = VM_READ | VM_WRITE;
|
||||
+ vm_bad_flags = VM_EXEC | VM_MAYSHARE;
|
||||
+
|
||||
@@ -130,22 +147,24 @@ Index: linux-2.6.git/kernel/sys.c
|
||||
+ break;
|
||||
+
|
||||
+ case PR_SET_MM_START_BRK:
|
||||
+ if (addr <= mm->end_data)
|
||||
+ if (addr <= current->mm->end_data)
|
||||
+ goto out;
|
||||
+
|
||||
+ if (rlim < RLIM_INFINITY &&
|
||||
+ (mm->brk - addr) + (mm->end_data - mm->start_data) > rlim)
|
||||
+ (current->mm->brk - addr) +
|
||||
+ (current->mm->end_data - current->mm->start_data) > rlim)
|
||||
+ goto out;
|
||||
+
|
||||
+ current->mm->start_brk = addr;
|
||||
+ break;
|
||||
+
|
||||
+ case PR_SET_MM_BRK:
|
||||
+ if (addr <= mm->end_data)
|
||||
+ if (addr <= current->mm->end_data)
|
||||
+ goto out;
|
||||
+
|
||||
+ if (rlim < RLIM_INFINITY &&
|
||||
+ (addr - mm->start_brk) + (mm->end_data - mm->start_data) > rlim)
|
||||
+ (addr - current->mm->start_brk) +
|
||||
+ (current->mm->end_data - current->mm->start_data) > rlim)
|
||||
+ goto out;
|
||||
+
|
||||
+ current->mm->brk = addr;
|
||||
@@ -159,25 +178,28 @@ Index: linux-2.6.git/kernel/sys.c
|
||||
+ error = 0;
|
||||
+
|
||||
+out:
|
||||
+ up_read(&mm->mmap_sem);
|
||||
+ mmput(mm);
|
||||
+ up_read(¤t->mm->mmap_sem);
|
||||
+
|
||||
+ return error;
|
||||
+}
|
||||
+#else /* CONFIG_CHECKPOINT_RESTORE */
|
||||
+static int prctl_set_mm(int opt, unsigned long addr,
|
||||
+ unsigned long arg4, unsigned long arg5)
|
||||
+{
|
||||
+ return -EINVAL;
|
||||
+}
|
||||
+#endif
|
||||
+
|
||||
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
|
||||
unsigned long, arg4, unsigned long, arg5)
|
||||
{
|
||||
@@ -1841,6 +1953,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
|
||||
@@ -1841,6 +1958,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsi
|
||||
else
|
||||
error = PR_MCE_KILL_DEFAULT;
|
||||
break;
|
||||
+ case PR_SET_MM: {
|
||||
+ if (arg4 | arg5)
|
||||
+ return -EINVAL;
|
||||
+ error = prctl_set_mm(arg2, arg3);
|
||||
+ case PR_SET_MM:
|
||||
+ error = prctl_set_mm(arg2, arg3, arg4, arg5);
|
||||
+ break;
|
||||
+ }
|
||||
default:
|
||||
error = -EINVAL;
|
||||
break;
|
@@ -0,0 +1,31 @@
|
||||
From: Andrew Morton <akpm@linux-foundation.org>
|
||||
Subject: c-r-procfs-add-start_data-end_data-start_brk-members-to-proc-pid-stat-v4-fix
|
||||
|
||||
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
|
||||
Cc: Alexey Dobriyan <adobriyan@gmail.com>
|
||||
Cc: Andrew Vagin <avagin@openvz.org>
|
||||
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Kees Cook <keescook@chromium.org>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Vasiliy Kulikov <segoon@openwall.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
|
||||
Documentation/filesystems/proc.txt | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
Index: linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/Documentation/filesystems/proc.txt
|
||||
+++ linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
@@ -307,7 +307,7 @@ Table 1-4: Contents of the stat files (a
|
||||
cgtime guest time of the task children in jiffies
|
||||
start_data address above which program data+bss is placed
|
||||
end_data address below which program data+bss is placed
|
||||
- start_brk address above which program heap can be expaned with brk() call
|
||||
+ start_brk address above which program heap can be expanded with brk()
|
||||
..............................................................................
|
||||
|
||||
The /proc/PID/maps file containing the currently mapped memory regions and
|
@@ -0,0 +1,70 @@
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: c/r: procfs: add start_data, end_data, start_brk members to /proc/$pid/stat v4
|
||||
|
||||
The mm->start_code/end_code, mm->start_data/end_data, mm->start_brk are
|
||||
involved into calculation of program text/data segment sizes (which might
|
||||
be seen in /proc/<pid>/statm) and into brk() call final address.
|
||||
|
||||
For restore we need to know all these values. While
|
||||
mm->start_code/end_code already present in /proc/$pid/stat, the rest
|
||||
members are not, so this patch brings them in.
|
||||
|
||||
The restore procedure of these members is addressed in another patch using
|
||||
prctl().
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Reviewed-by: Kees Cook <keescook@chromium.org>
|
||||
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Alexey Dobriyan <adobriyan@gmail.com>
|
||||
Cc: Tejun Heo <tj@kernel.org>
|
||||
Cc: Andrew Vagin <avagin@openvz.org>
|
||||
Cc: Vasiliy Kulikov <segoon@openwall.com>
|
||||
Cc: Alexey Dobriyan <adobriyan@gmail.com>
|
||||
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
|
||||
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
|
||||
---
|
||||
|
||||
Documentation/filesystems/proc.txt | 3 +++
|
||||
fs/proc/array.c | 7 +++++--
|
||||
2 files changed, 8 insertions(+), 2 deletions(-)
|
||||
|
||||
Index: linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/Documentation/filesystems/proc.txt
|
||||
+++ linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
@@ -305,6 +305,9 @@ Table 1-4: Contents of the stat files (a
|
||||
blkio_ticks time spent waiting for block IO
|
||||
gtime guest time of the task in jiffies
|
||||
cgtime guest time of the task children in jiffies
|
||||
+ start_data address above which program data+bss is placed
|
||||
+ end_data address below which program data+bss is placed
|
||||
+ start_brk address above which program heap can be expaned with brk() call
|
||||
..............................................................................
|
||||
|
||||
The /proc/PID/maps file containing the currently mapped memory regions and
|
||||
Index: linux-2.6.git/fs/proc/array.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/array.c
|
||||
+++ linux-2.6.git/fs/proc/array.c
|
||||
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file
|
||||
|
||||
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
|
||||
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
|
||||
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
|
||||
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
|
||||
pid_nr_ns(pid, ns),
|
||||
tcomm,
|
||||
state,
|
||||
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file
|
||||
task->policy,
|
||||
(unsigned long long)delayacct_blkio_ticks(task),
|
||||
cputime_to_clock_t(gtime),
|
||||
- cputime_to_clock_t(cgtime));
|
||||
+ cputime_to_clock_t(cgtime),
|
||||
+ (mm && permitted) ? mm->start_data : 0,
|
||||
+ (mm && permitted) ? mm->end_data : 0,
|
||||
+ (mm && permitted) ? mm->start_brk : 0);
|
||||
if (mm)
|
||||
mmput(mm);
|
||||
return 0;
|
@@ -1,37 +0,0 @@
|
||||
fs, proc: Add start_data, end_data, start_brk members to /proc/$pid/stat
|
||||
|
||||
It helps to dump and restore this mm_struct members.
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Acked-by: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Reviewed-by: Kees Cook <keescook@chromium.org>
|
||||
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
---
|
||||
fs/proc/array.c | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
Index: linux-2.6.git/fs/proc/array.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/array.c
|
||||
+++ linux-2.6.git/fs/proc/array.c
|
||||
@@ -464,7 +464,7 @@ static int do_task_stat(struct seq_file
|
||||
|
||||
seq_printf(m, "%d (%s) %c %d %d %d %d %d %u %lu \
|
||||
%lu %lu %lu %lu %lu %ld %ld %ld %ld %d 0 %llu %lu %ld %lu %lu %lu %lu %lu \
|
||||
-%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld\n",
|
||||
+%lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld %lu %lu %lu\n",
|
||||
pid_nr_ns(pid, ns),
|
||||
tcomm,
|
||||
state,
|
||||
@@ -511,7 +511,10 @@ static int do_task_stat(struct seq_file
|
||||
task->policy,
|
||||
(unsigned long long)delayacct_blkio_ticks(task),
|
||||
cputime_to_clock_t(gtime),
|
||||
- cputime_to_clock_t(cgtime));
|
||||
+ cputime_to_clock_t(cgtime),
|
||||
+ mm ? (permitted ? mm->start_data : 1) : 0,
|
||||
+ mm ? (permitted ? mm->end_data : 1) : 0,
|
||||
+ mm ? (permitted ? mm->start_brk : 1) : 0);
|
||||
if (mm)
|
||||
mmput(mm);
|
||||
return 0;
|
@@ -1,4 +1,5 @@
|
||||
fs, proc: Introduce the /proc/<pid>/map_files/ directory v14
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] fs, proc: Introduce the /proc/<pid>/map_files/ directory v14
|
||||
|
||||
From: Pavel Emelyanov <xemul@parallels.com>
|
||||
|
||||
|
@@ -1,4 +1,5 @@
|
||||
fs, proc: Make proc_get_link to use dentry instead of inode
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] fs, proc: Make proc_get_link to use dentry instead of inode
|
||||
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
|
||||
|
@@ -1,69 +0,0 @@
|
||||
fs, proc: Introduce the /proc/<pid>/children entry
|
||||
|
||||
There is no easy way to make a reverse parent->children chain
|
||||
from the task status, in turn children->parent provided with "PPid"
|
||||
field.
|
||||
|
||||
So instead of walking over all pids in system to figure out what
|
||||
children the task has -- we add explicit /proc/<pid>/children entry,
|
||||
since kernel already knows this kind of information but it was not
|
||||
yet exported.
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Cc: Pavel Emelyanov <xemul@parallels.com>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
---
|
||||
fs/proc/array.c | 14 ++++++++++++++
|
||||
fs/proc/base.c | 1 +
|
||||
fs/proc/internal.h | 3 +++
|
||||
3 files changed, 18 insertions(+)
|
||||
|
||||
Index: linux-2.6.git/fs/proc/array.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/array.c
|
||||
+++ linux-2.6.git/fs/proc/array.c
|
||||
@@ -547,3 +547,17 @@ int proc_pid_statm(struct seq_file *m, s
|
||||
|
||||
return 0;
|
||||
}
|
||||
+
|
||||
+int proc_pid_children(struct seq_file *m, struct pid_namespace *ns,
|
||||
+ struct pid *pid, struct task_struct *task)
|
||||
+{
|
||||
+ struct task_struct *c;
|
||||
+
|
||||
+ read_lock(&tasklist_lock);
|
||||
+ list_for_each_entry(c, &task->children, sibling)
|
||||
+ seq_printf(m, " %d", pid_nr_ns(task_pid(c), ns));
|
||||
+ read_unlock(&tasklist_lock);
|
||||
+ seq_putc(m, '\n');
|
||||
+
|
||||
+ return 0;
|
||||
+}
|
||||
Index: linux-2.6.git/fs/proc/base.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/base.c
|
||||
+++ linux-2.6.git/fs/proc/base.c
|
||||
@@ -3204,6 +3204,7 @@ static const struct pid_entry tgid_base_
|
||||
INF("cmdline", S_IRUGO, proc_pid_cmdline),
|
||||
ONE("stat", S_IRUGO, proc_tgid_stat),
|
||||
ONE("statm", S_IRUGO, proc_pid_statm),
|
||||
+ ONE("children", S_IRUGO, proc_pid_children),
|
||||
REG("maps", S_IRUGO, proc_maps_operations),
|
||||
#ifdef CONFIG_NUMA
|
||||
REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
|
||||
Index: linux-2.6.git/fs/proc/internal.h
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/internal.h
|
||||
+++ linux-2.6.git/fs/proc/internal.h
|
||||
@@ -51,6 +51,9 @@ extern int proc_pid_status(struct seq_fi
|
||||
struct pid *pid, struct task_struct *task);
|
||||
extern int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task);
|
||||
+extern int proc_pid_children(struct seq_file *m, struct pid_namespace *ns,
|
||||
+ struct pid *pid, struct task_struct *task);
|
||||
+
|
||||
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
|
||||
|
||||
extern const struct file_operations proc_maps_operations;
|
279
kernel/fs-proc-add-children-entry-11
Normal file
279
kernel/fs-proc-add-children-entry-11
Normal file
@@ -0,0 +1,279 @@
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] fs, proc: Introduce the /proc/<pid>/children entry v4
|
||||
|
||||
There is no easy way to make a reverse parent->children chain
|
||||
from arbitrary <pid> (while parent pid is provided in "PPid"
|
||||
field of /proc/<pid>/status).
|
||||
|
||||
So instead of walking over all pids in the system to figure out which
|
||||
children a task has -- we add explicit /proc/<pid>/children entry,
|
||||
because kernel already has this kind of information but it is not
|
||||
yet exported. This is a first level children, not the whole process
|
||||
tree, neither the process threads are identified with this interface.
|
||||
|
||||
v2:
|
||||
- Kame suggested to use a separated /proc/<pid>/children entry
|
||||
instead of poking /proc/<pid>/status
|
||||
- Andrew suggested to use rcu facility instead of locking
|
||||
tasklist_lock
|
||||
- Tejun pointed that non-seekable seq file might not be
|
||||
enough for tasks with large number of children
|
||||
|
||||
v3:
|
||||
- To be on a safe side use %lu format for pid_t printing
|
||||
|
||||
v4:
|
||||
- New line get printed when sequence ends not at seq->stop,
|
||||
a nit pointed by Tejun
|
||||
- Documentation update
|
||||
- tasklist_lock is back, Oleg pointed that ->children list
|
||||
is actually not rcu-safe
|
||||
|
||||
Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Cc: Andrew Morton <akpm@linux-foundation.org>
|
||||
Cc: Pavel Emelyanov <xemul@parallels.com>
|
||||
Cc: Serge Hallyn <serge.hallyn@canonical.com>
|
||||
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
|
||||
Cc: Pavel Emelyanov <xemul@parallels.com>
|
||||
---
|
||||
Documentation/filesystems/proc.txt | 20 ++++
|
||||
fs/proc/array.c | 163 +++++++++++++++++++++++++++++++++++++
|
||||
fs/proc/base.c | 1
|
||||
fs/proc/internal.h | 6 +
|
||||
4 files changed, 190 insertions(+)
|
||||
|
||||
Index: linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/Documentation/filesystems/proc.txt
|
||||
+++ linux-2.6.git/Documentation/filesystems/proc.txt
|
||||
@@ -40,6 +40,7 @@ Table of Contents
|
||||
3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
|
||||
3.5 /proc/<pid>/mountinfo - Information about mounts
|
||||
3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
|
||||
+ 3.7 /proc/<pid>/children - Information about task children
|
||||
|
||||
|
||||
------------------------------------------------------------------------------
|
||||
@@ -1545,3 +1546,22 @@ a task to set its own or one of its thre
|
||||
is limited in size compared to the cmdline value, so writing anything longer
|
||||
then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
|
||||
comm value.
|
||||
+
|
||||
+3.7 /proc/<pid>/children - Information about task children
|
||||
+--------------------------------------------------------------
|
||||
+This file provides a fast way to retrieve first level children pids
|
||||
+of a task pointed by <pid>. The format is a stream of pids separated
|
||||
+by space with a new line at the end. If a task has no children at
|
||||
+all -- only a new line returned.
|
||||
+
|
||||
+Note the "first level" here -- if a child has own children they will
|
||||
+not be printed there, one need to read /proc/<children-pid>/children
|
||||
+to obtain descendants. The same applies to threads -- they are not
|
||||
+counted here.
|
||||
+
|
||||
+Because this interface is intended to be fast and cheap it doesn't
|
||||
+guarantee to provide the precise results, which means if a child is
|
||||
+exiting it might or might not be counted. The same applies to freshly
|
||||
+created children -- they might or might not be counted. If one needs
|
||||
+precise pids -- the task and children should be either stopped or
|
||||
+frozen.
|
||||
Index: linux-2.6.git/fs/proc/array.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/array.c
|
||||
+++ linux-2.6.git/fs/proc/array.c
|
||||
@@ -547,3 +547,166 @@ int proc_pid_statm(struct seq_file *m, s
|
||||
|
||||
return 0;
|
||||
}
|
||||
+
|
||||
+static struct list_head *
|
||||
+children_get_at(struct proc_pid_children_iter *iter, loff_t pos)
|
||||
+{
|
||||
+ struct task_struct *t = iter->group_leader;
|
||||
+ struct task_struct *task;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ do {
|
||||
+ list_for_each_entry(task, &t->children, sibling) {
|
||||
+ if (list_empty(&task->sibling))
|
||||
+ break;
|
||||
+ if (pos-- == 0) {
|
||||
+ put_task_struct(iter->last_group);
|
||||
+ iter->last_group = t;
|
||||
+ get_task_struct(iter->last_group);
|
||||
+ get_task_struct(task);
|
||||
+ rcu_read_unlock();
|
||||
+ return &task->sibling;
|
||||
+ }
|
||||
+ }
|
||||
+ } while_each_thread(iter->group_leader, t);
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ return NULL;
|
||||
+}
|
||||
+
|
||||
+static int children_seq_show(struct seq_file *seq, void *v)
|
||||
+{
|
||||
+ struct task_struct *task = container_of(v, struct task_struct, sibling);
|
||||
+ unsigned long pid;
|
||||
+ int ret = -1;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+ if (pid_alive(task)) {
|
||||
+ pid = (unsigned long)pid_vnr(task_pid(task));
|
||||
+ ret = 0;
|
||||
+ }
|
||||
+ rcu_read_unlock();
|
||||
+
|
||||
+ if (!ret)
|
||||
+ ret = seq_printf(seq, " %lu", pid);
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+static void *children_seq_start(struct seq_file *seq, loff_t *pos)
|
||||
+{
|
||||
+ return children_get_at(seq->private, *pos);
|
||||
+}
|
||||
+
|
||||
+static void *children_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
+{
|
||||
+ struct proc_pid_children_iter *iter = seq->private;
|
||||
+ struct task_struct *task = container_of(v, struct task_struct, sibling);
|
||||
+ struct list_head *next = NULL;
|
||||
+
|
||||
+ if (!iter->last_group)
|
||||
+ goto out;
|
||||
+
|
||||
+ rcu_read_lock();
|
||||
+
|
||||
+ if (list_empty(&task->sibling) ||
|
||||
+ list_is_last(v, &iter->last_group->children)) {
|
||||
+ struct task_struct *t = iter->last_group;
|
||||
+
|
||||
+ while_each_thread(iter->group_leader, t) {
|
||||
+ if (!list_empty(&t->children)) {
|
||||
+ put_task_struct(task);
|
||||
+ next = t->children.next;
|
||||
+ task = container_of(next, struct task_struct, sibling);
|
||||
+ get_task_struct(task);
|
||||
+ put_task_struct(iter->last_group);
|
||||
+ iter->last_group = t;
|
||||
+ get_task_struct(iter->last_group);
|
||||
+ goto out_unlock;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ put_task_struct(task);
|
||||
+ put_task_struct(iter->last_group);
|
||||
+ iter->last_group = NULL;
|
||||
+ } else {
|
||||
+ next = ((struct list_head *)v)->next;
|
||||
+ put_task_struct(task);
|
||||
+ task = container_of(next, struct task_struct, sibling);
|
||||
+ get_task_struct(task);
|
||||
+ }
|
||||
+out_unlock:
|
||||
+ rcu_read_unlock();
|
||||
+out:
|
||||
+ ++*pos;
|
||||
+ if (!next)
|
||||
+ seq_printf(seq, "\n");
|
||||
+ return next;
|
||||
+}
|
||||
+
|
||||
+static void children_seq_stop(struct seq_file *seq, void *v)
|
||||
+{
|
||||
+ struct proc_pid_children_iter *iter = seq->private;
|
||||
+ if (iter->last_group)
|
||||
+ put_task_struct(iter->last_group);
|
||||
+ iter->last_group = NULL;
|
||||
+}
|
||||
+
|
||||
+static const struct seq_operations children_seq_ops = {
|
||||
+ .start = children_seq_start,
|
||||
+ .next = children_seq_next,
|
||||
+ .stop = children_seq_stop,
|
||||
+ .show = children_seq_show,
|
||||
+};
|
||||
+
|
||||
+static int children_seq_open(struct inode *inode, struct file *file)
|
||||
+{
|
||||
+ struct proc_pid_children_iter *iter = NULL;
|
||||
+ struct task_struct *task = NULL;
|
||||
+ int ret;
|
||||
+
|
||||
+ ret = -ENOMEM;
|
||||
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
|
||||
+ if (!iter)
|
||||
+ goto err;
|
||||
+
|
||||
+ ret = -ENOENT;
|
||||
+ task = get_proc_task(inode);
|
||||
+ if (!task)
|
||||
+ goto err;
|
||||
+
|
||||
+ ret = seq_open(file, &children_seq_ops);
|
||||
+ if (!ret) {
|
||||
+ struct seq_file *m = file->private_data;
|
||||
+ m->private = iter;
|
||||
+ iter->group_leader = task;
|
||||
+ iter->last_group = task;
|
||||
+ get_task_struct(iter->last_group);
|
||||
+ }
|
||||
+
|
||||
+err:
|
||||
+ if (ret) {
|
||||
+ if (task)
|
||||
+ put_task_struct(task);
|
||||
+ kfree(iter);
|
||||
+ }
|
||||
+
|
||||
+ return ret;
|
||||
+}
|
||||
+
|
||||
+int children_seq_release(struct inode *inode, struct file *file)
|
||||
+{
|
||||
+ struct seq_file *m = file->private_data;
|
||||
+ struct proc_pid_children_iter *iter = m->private;
|
||||
+
|
||||
+ put_task_struct(iter->group_leader);
|
||||
+ kfree(iter);
|
||||
+ seq_release(inode, file);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+const struct file_operations proc_pid_children_operations = {
|
||||
+ .open = children_seq_open,
|
||||
+ .read = seq_read,
|
||||
+ .llseek = seq_lseek,
|
||||
+ .release = children_seq_release,
|
||||
+};
|
||||
Index: linux-2.6.git/fs/proc/base.c
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/base.c
|
||||
+++ linux-2.6.git/fs/proc/base.c
|
||||
@@ -3204,6 +3204,7 @@ static const struct pid_entry tgid_base_
|
||||
INF("cmdline", S_IRUGO, proc_pid_cmdline),
|
||||
ONE("stat", S_IRUGO, proc_tgid_stat),
|
||||
ONE("statm", S_IRUGO, proc_pid_statm),
|
||||
+ REG("children", S_IRUGO, proc_pid_children_operations),
|
||||
REG("maps", S_IRUGO, proc_maps_operations),
|
||||
#ifdef CONFIG_NUMA
|
||||
REG("numa_maps", S_IRUGO, proc_numa_maps_operations),
|
||||
Index: linux-2.6.git/fs/proc/internal.h
|
||||
===================================================================
|
||||
--- linux-2.6.git.orig/fs/proc/internal.h
|
||||
+++ linux-2.6.git/fs/proc/internal.h
|
||||
@@ -53,6 +53,12 @@ extern int proc_pid_statm(struct seq_fil
|
||||
struct pid *pid, struct task_struct *task);
|
||||
extern loff_t mem_lseek(struct file *file, loff_t offset, int orig);
|
||||
|
||||
+struct proc_pid_children_iter { /* per-open iterator state stored in seq_file->private by children_seq_open() */
|
||||
+ struct task_struct *group_leader; /* task resolved at open; reference put in children_seq_release() */
|
||||
+ struct task_struct *last_group; /* starts as the same task with its own reference; put and set to NULL in children_seq_stop() */
|
||||
+};
|
||||
+
|
||||
+extern const struct file_operations proc_pid_children_operations;
|
||||
extern const struct file_operations proc_maps_operations;
|
||||
extern const struct file_operations proc_numa_maps_operations;
|
||||
extern const struct file_operations proc_smaps_operations;
|
@@ -1,4 +1,5 @@
|
||||
mincore: Add named constant for reported present bit
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] mincore: Add named constant for reported present bit
|
||||
|
||||
From: Pavel Emelyanov <xemul@parallels.com>
|
||||
|
||||
|
@@ -1,4 +1,5 @@
|
||||
mincore: Report whether page is anon or not
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] mincore: Report whether page is anon or not
|
||||
|
||||
From: Pavel Emelyanov <xemul@parallels.com>
|
||||
|
||||
|
@@ -1,4 +1,5 @@
|
||||
procfs-introduce-the-proc-pid-map_files-directory-checkpatch-fixes
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] procfs-introduce-the-proc-pid-map_files-directory-checkpatch-fixes
|
||||
|
||||
From: Andrew Morton <akpm@linux-foundation.org>
|
||||
|
||||
|
@@ -1,9 +1,12 @@
|
||||
fs-proc-Make-proc_get_link-to-use-dentry
|
||||
fs-proc-Introduce-the-proc-pid-map_files-directory
|
||||
procfs-introduce-the-proc-pid-map_files-directory-checkpatch
|
||||
fs-proc-Add-start_data-end_data-start_brk-members
|
||||
fs-proc-add-children-entry
|
||||
prctl-tune-up-mm_struct-members-2
|
||||
sysfs-add-kernel.ns_last_pid
|
||||
c-r-introduce-checkpoint_restore-symbol.patch
|
||||
c-r-procfs-add-start_data-end_data-start_brk-members-to-proc-pid-stat-v4.patch
|
||||
c-r-procfs-add-start_data-end_data-start_brk-members-to-proc-pid-stat-v4-fix.patch
|
||||
c-r-prctl-add-pr_set_mm-codes-to-set-up-mm_struct-entries.patch
|
||||
c-r-prctl-add-pr_set_mm-codes-to-set-up-mm_struct-entries-fix.patch
|
||||
mincore-Add-named-constant-for-reported-present-bit
|
||||
mincore-Report-whether-page-is-anon-or-not
|
||||
sysfs-add-kernel.ns_last_pid
|
||||
fs-proc-add-children-entry-11
|
||||
|
@@ -1,4 +1,5 @@
|
||||
sysctl: Add the kernel.ns_last_pid control
|
||||
From: Cyrill Gorcunov <gorcunov@openvz.org>
|
||||
Subject: [PATCH] sysctl: Add the kernel.ns_last_pid control
|
||||
|
||||
From: Pavel Emelyanov <xemul@parallels.com>
|
||||
|
||||
|
Reference in New Issue
Block a user