2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-27 04:18:27 +00:00
criu/criu/proc_parse.c
Pavel Tikhomirov 0a7c5fd1bd string: use our own __strlcpy and __strlcat to remove bsd headers
We see that libbsd redefines __has_include to be always true, which
breaks such checks for rseq. The idea behind this patch is remove the
use of libbsd functions and always export our replacement functions.

Using __strlcat and __strlcpy everywhere in existing code:
git grep --files-with-matches "strlcat" | xargs sed -i 's/strlcat/__strlcat/g'
git grep --files-with-matches "strlcpy" | xargs sed -i 's/strlcpy/__strlcpy/g'

Fixes: #2036
Suggested-by: Andrei Vagin <avagin@google.com>
Signed-off-by: Pavel Tikhomirov <ptikhomirov@virtuozzo.com>
2023-04-15 21:17:21 -07:00

2845 lines
62 KiB
C

#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include <string.h>
#include <ctype.h>
#include <linux/fs.h>
#include <sys/sysmacros.h>
#include "types.h"
#include "common/list.h"
#include "util.h"
#include "mount.h"
#include "filesystems.h"
#include "mman.h"
#include "cpu.h"
#include "file-lock.h"
#include "pstree.h"
#include "fsnotify.h"
#include "posix-timer.h"
#include "kerndat.h"
#include "vdso.h"
#include "vma.h"
#include "mem.h"
#include "bfd.h"
#include "proc_parse.h"
#include "fdinfo.h"
#include "parasite.h"
#include "cr_options.h"
#include "sysfs_parse.h"
#include "seccomp.h"
#include "string.h"
#include "namespaces.h"
#include "cgroup.h"
#include "cgroup-props.h"
#include "timerfd.h"
#include "path.h"
#include "fault-injection.h"
#include "memfd.h"
#include "hugetlb.h"
#include "protobuf.h"
#include "images/fdinfo.pb-c.h"
#include "images/mnt.pb-c.h"
#include "plugin.h"
#include <stdlib.h>
#ifndef SIGEV_SIGNAL
#define SIGEV_SIGNAL 0 /* notify via signal */
#endif
#ifndef SIGEV_NONE
#define SIGEV_NONE 1 /* other notification: meaningless */
#endif
#ifndef SIGEV_THREAD
#define SIGEV_THREAD 2 /* deliver via thread creation */
#endif
#ifndef SIGEV_THREAD_ID
#define SIGEV_THREAD_ID 4 /* deliver to thread */
#endif
#define BUF_SIZE 4096 /* Good enough value - can be changed */
/*
 * Scratch storage shared by the parsers in this file. @end sits right
 * after @buf; as part of a static object it starts out as '\0'.
 */
struct buffer {
char buf[BUF_SIZE];
char end; /* '\0' */
};
/* The single static instance; its contents are overwritten by each user. */
static struct buffer __buf;
/* File-wide shorthand for the data area of the scratch buffer. */
static char *buf = __buf.buf;
/*
* This is what AIO ring buffers look like in proc
*/
#define AIO_FNAME "/[aio]"
/*
 * Check that @line starts with the "%lx-%lx " prefix of a maps/smaps
 * header: a run of lowercase hex digits, '-', another run of lowercase
 * hex digits and a space. Either run may be empty.
 */
static bool __is_vma_range_fmt(char *line)
{
	const char *p = line;
	int half;

	/* Two digit runs: the first is closed by '-', the second by ' '. */
	for (half = 0; half < 2; half++) {
		while ((*p >= '0' && *p <= '9') || (*p >= 'a' && *p <= 'f'))
			p++;

		if (*p != (half == 0 ? '-' : ' '))
			return false;
		p++;
	}

	return true;
}
/* Non-static entry point that simply forwards to __is_vma_range_fmt(). */
bool is_vma_range_fmt(char *line)
{
return __is_vma_range_fmt(line);
}
/*
 * Offer the mapping's file (@fd, with its @stat) to the HANDLE_DEVICE_VMA
 * plugin hook. Returns false, after logging, when run_plugins() reports a
 * negative result; true otherwise.
 */
bool handle_vma_plugin(int *fd, struct stat *stat)
{
	const int rc = run_plugins(HANDLE_DEVICE_VMA, *fd, stat);

	if (rc >= 0)
		return true;

	pr_perror("handle_device_vma plugin failed");
	return false;
}
/*
 * Decode the two-letter codes of a "VmFlags:" value (@buf, modified in
 * place by strtok()).
 *
 * @flags receives MAP_* bits, @madv receives one bit per MADV_* advice
 * (bit number == advice value) and @io_pf is set to 1 when an "io" or "pf"
 * code is seen. Codes that match nothing are ignored.
 */
static void __parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf)
{
#define _vmflag_match(_t, _s) (_t[0] == _s[0] && _t[1] == _s[1])
	static const char delim[] = " \n";
	char *code;

	if (buf[0] == '\0')
		return;

	for (code = strtok(buf, delim); code != NULL; code = strtok(NULL, delim)) {
		/* mmap() block */
		if (_vmflag_match(code, "gd"))
			*flags |= MAP_GROWSDOWN;
		else if (_vmflag_match(code, "lo"))
			*flags |= MAP_LOCKED;
		else if (_vmflag_match(code, "nr"))
			*flags |= MAP_NORESERVE;
		else if (_vmflag_match(code, "ht"))
			*flags |= MAP_HUGETLB;

		/* madvise() block */
		if (_vmflag_match(code, "sr"))
			*madv |= (1ul << MADV_SEQUENTIAL);
		else if (_vmflag_match(code, "rr"))
			*madv |= (1ul << MADV_RANDOM);
		else if (_vmflag_match(code, "dc"))
			*madv |= (1ul << MADV_DONTFORK);
		else if (_vmflag_match(code, "dd"))
			*madv |= (1ul << MADV_DONTDUMP);
		else if (_vmflag_match(code, "mg"))
			*madv |= (1ul << MADV_MERGEABLE);
		else if (_vmflag_match(code, "hg"))
			*madv |= (1ul << MADV_HUGEPAGE);
		else if (_vmflag_match(code, "nh"))
			*madv |= (1ul << MADV_NOHUGEPAGE);

		/* vmsplice doesn't work for VM_IO and VM_PFNMAP mappings. */
		if (_vmflag_match(code, "io") || _vmflag_match(code, "pf"))
			*io_pf = 1;
	}
#undef _vmflag_match
}
/* Non-static entry point that simply forwards to __parse_vmflags(). */
void parse_vmflags(char *buf, u32 *flags, u64 *madv, int *io_pf)
{
__parse_vmflags(buf, flags, madv, io_pf);
}
/*
 * Apply a "VmFlags:" value to @vma_area: fill e->flags and e->madv, mark
 * the area VMA_UNSUPP when it is an IO/PFNMAP mapping that is neither the
 * VVAR area nor a shared file mapping, and raise has_madv when any advice
 * bit was collected.
 */
static void parse_vma_vmflags(char *buf, struct vma_area *vma_area)
{
	int saw_io_or_pfnmap = 0;

	__parse_vmflags(buf, &vma_area->e->flags, &vma_area->e->madv, &saw_io_or_pfnmap);

	/*
	 * vmsplice doesn't work for VM_IO and VM_PFNMAP mappings, the
	 * only exception is VVAR area that mapped by the kernel as
	 * VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP
	 */
	if (saw_io_or_pfnmap) {
		bool exempt = vma_area_is(vma_area, VMA_AREA_VVAR) || vma_entry_is(vma_area->e, VMA_FILE_SHARED);

		if (!exempt)
			vma_area->e->status |= VMA_UNSUPP;
	}

	if (vma_area->e->madv != 0)
		vma_area->e->has_madv = true;
}
/* Non-zero when @dev equals the device number recorded in kdat.shmem_dev. */
static inline int is_anon_shmem_map(dev_t dev)
{
	return dev == kdat.shmem_dev;
}
/*
 * Identity of the file behind a VMA as read from its smaps header line
 * (device major:minor and inode number), plus the VMA it was recorded for.
 */
struct vma_file_info {
int dev_maj;
int dev_min;
unsigned long ino;
struct vma_area *vma;
};
/* Non-zero when @a and @b name the same inode on the same device. */
static inline int vfi_equal(struct vma_file_info *a, struct vma_file_info *b)
{
	return a->ino == b->ino && a->dev_maj == b->dev_maj && a->dev_min == b->dev_min;
}
/*
 * Derive the open mode of a mapped file from the owner rw bits of its
 * map_files link (@path, relative to @mfd) and store it in e->fdflags.
 *
 * Returns 0 on success or when the link does not exist, -1 when fstatat()
 * fails for any other reason.
 */
static int vma_get_mapfile_flags(struct vma_area *vma, DIR *mfd, char *path)
{
	struct stat link_st;
	unsigned int rw_bits;

	if (fstatat(dirfd(mfd), path, &link_st, AT_SYMLINK_NOFOLLOW) < 0) {
		if (errno != ENOENT) {
			pr_perror("Failed fstatat on map %" PRIx64 "", vma->e->start);
			return -1;
		}

		/* Just mapping w/o map_files link */
		return 0;
	}

	rw_bits = link_st.st_mode & 0600;
	if (rw_bits == 0200)
		vma->e->fdflags = O_WRONLY;
	else if (rw_bits == 0400)
		vma->e->fdflags = O_RDONLY;
	else if (rw_bits == 0600)
		vma->e->fdflags = O_RDWR;

	vma->e->has_fdflags = true;
	return 0;
}
/*
 * Allocate vma->vmst and fill it with the stat of the mapped file.
 *
 * With opts.aufs set, fixup_aufs_vma_fd() gets the first go: a negative
 * result is an error, a positive one means it has already stat()'ed the
 * file by full pathname, and zero means the link does not point to an
 * AUFS branch so a plain fstat() on @fd is correct.
 *
 * Returns 0 on success, -1 on failure.
 */
static int vma_stat(struct vma_area *vma, int fd)
{
	vma->vmst = xmalloc(sizeof(struct stat));
	if (vma->vmst == NULL)
		return -1;

	if (opts.aufs) {
		int aufs_ret = fixup_aufs_vma_fd(vma, fd);

		if (aufs_ret != 0)
			return aufs_ret < 0 ? -1 : 0;
	}

	if (fstat(fd, vma->vmst) >= 0)
		return 0;

	pr_perror("Failed fstat on map %" PRIx64 "", vma->e->start);
	return -1;
}
/*
 * Fallback for the case where the map_files link could not be opened
 * (the caller invokes this on EPERM).
 *
 * @fname is the path column of the smaps line, @vfi the dev/ino parsed
 * from it and @path the map_files entry name (used for logging only).
 * For a regular path the file is opened by name, stat()'ed into
 * vma->vmst and checked against @vfi; on success the descriptor is
 * stored in *@vm_file_fd.
 *
 * Returns 0 on success (possibly without opening anything), -1 on error.
 */
static int vma_get_mapfile_user(const char *fname, struct vma_area *vma, struct vma_file_info *vfi, int *vm_file_fd,
const char *path)
{
int fd, hugetlb_flag = 0;
dev_t vfi_dev;
/*
* Kernel prohibits reading map_files for users. The
* best we can do here is fill stat using the information
* from smaps file and ... hope for the better :\
*
* Here we'll miss AIO-s and sockets :(
*/
if (fname[0] == '\0') {
/*
* Another bad thing is that kernel first checks
* for permission access to ANY map_files link,
* then checks for its existence. So we have to
* check for file path being empty to "emulate"
* the ENOENT case.
*/
if (vfi->dev_maj != 0 || vfi->dev_min != 0 || vfi->ino != 0) {
/* NOTE(review): vfi->ino is unsigned long but printed with %ld. */
pr_err("Strange file mapped at %lx [%s]:%d.%d.%ld\n", (unsigned long)vma->e->start, fname,
vfi->dev_maj, vfi->dev_min, vfi->ino);
return -1;
}
return 0;
} else if (fname[0] != '/') {
/*
* This should be some kind of
* special mapping like [heap], [vdso]
* and such, the caller should take care
* of the @fname and vma status.
*/
return 0;
}
vfi_dev = makedev(vfi->dev_maj, vfi->dev_min);
/* memfd-backed mappings cannot be handled on this path: report and fail. */
if (is_memfd(vfi_dev)) {
char tmp[PATH_MAX];
__strlcpy(tmp, fname, PATH_MAX);
strip_deleted(tmp, strlen(tmp));
/*
* The error EPERM will be shown in the following pr_perror().
* It comes from the previous open() call.
*/
pr_perror("Can't open mapped [%s]", tmp);
/*
* TODO Perhaps we could do better than failing and dump the
* memory like what is being done in shmem.c
*/
return -1;
}
/* hugetlb / anonymous shmem: treat as anonymous, keyed by the inode number. */
if (is_hugetlb_dev(vfi_dev, &hugetlb_flag) || is_anon_shmem_map(vfi_dev)) {
if (!(vma->e->flags & MAP_SHARED))
vma->e->status |= VMA_ANON_PRIVATE;
else
vma->e->status |= VMA_ANON_SHARED;
vma->e->flags |= MAP_ANONYMOUS;
vma->e->shmid = vfi->ino;
vma->e->flags |= hugetlb_flag;
if (!strncmp(fname, "/SYSV", 5)) {
vma->e->status |= VMA_AREA_SYSVIPC;
} else if (vma->e->flags & MAP_SHARED) {
if (fault_injected(FI_HUGE_ANON_SHMEM_ID))
vma->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE;
}
return 0;
}
pr_info("Failed to open map_files/%s, try to go via [%s] path\n", path, fname);
fd = open(fname, O_RDONLY);
if (fd < 0) {
pr_perror("Can't open mapped [%s]", fname);
return -1;
}
if (vma_stat(vma, fd)) {
close(fd);
return -1;
}
/* The path may resolve to a different file by now; dev/ino must match smaps. */
if (vma->vmst->st_dev != vfi_dev || vma->vmst->st_ino != vfi->ino) {
pr_err("Failed to resolve mapping %lx filename\n", (unsigned long)vma->e->start);
close(fd);
return -1;
}
*vm_file_fd = fd;
return 0;
}
/*
 * Find and open the file backing @vma through the map_files directory
 * @mfd.
 *
 * If @vfi equals @prev_vfi the descriptor already held in *@vm_file_fd is
 * reused and the vma is marked file_borrowed. Otherwise the old descriptor
 * is closed and the "<start>-<end>" link is opened; ENOENT means there is
 * no backing file, ENXIO is examined for sockets and AIO rings, and EPERM
 * (without opts.aufs) falls back to vma_get_mapfile_user().
 *
 * Returns 0 on success, -1 on error.
 */
static int vma_get_mapfile(const char *fname, struct vma_area *vma, DIR *mfd, struct vma_file_info *vfi,
struct vma_file_info *prev_vfi, int *vm_file_fd)
{
char path[32];
int flags;
/* Figure out if it's file mapping */
snprintf(path, sizeof(path), "%" PRIx64 "-%" PRIx64, vma->e->start, vma->e->end);
if (vma_get_mapfile_flags(vma, mfd, path))
return -1;
if (prev_vfi->vma && vfi_equal(vfi, prev_vfi)) {
struct vma_area *prev = prev_vfi->vma;
/*
* If vfi is equal (!) and negative @vm_file_fd --
* we have nothing to borrow for sure.
*/
if (*vm_file_fd < 0)
return 0;
pr_debug("vma %" PRIx64 " borrows vfi from previous %" PRIx64 "\n", vma->e->start, prev->e->start);
if (prev->e->status & VMA_AREA_SOCKET)
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
/*
* FIXME -- in theory there can be vmas that have
* dev:ino match, but live in different mount
* namespaces. However, we only borrow files for
* subsequent vmas. These are _very_ likely to
* have files from the same namespaces.
*/
vma->file_borrowed = true;
return 0;
}
close_safe(vm_file_fd);
/*
* Note that we "open" it in dumper process space
* so later we might refer to it via /proc/self/fd/vm_file_fd
* if needed.
*/
flags = O_PATH;
if (vfi->dev_maj == 0)
/*
* Opening with O_PATH omits calling kernel ->open
* method, thus for some special files their type
* detection might be broken. Thus we open those with
* the O_RDONLY to potentially get ENXIO and check
* it below.
*/
flags = O_RDONLY;
*vm_file_fd = openat(dirfd(mfd), path, flags);
if (*vm_file_fd < 0) {
if (errno == ENOENT)
/* Just mapping w/o map_files link */
return 0;
if (errno == ENXIO) {
struct stat buf;
if (fstatat(dirfd(mfd), path, &buf, 0))
return -1;
if (S_ISSOCK(buf.st_mode)) {
pr_info("Found socket mapping @%" PRIx64 "\n", vma->e->start);
vma->vm_socket_id = buf.st_ino;
vma->e->status |= VMA_AREA_SOCKET | VMA_AREA_REGULAR;
return 0;
}
/* No file type bits set and the name starts with AIO_FNAME. */
if ((buf.st_mode & S_IFMT) == 0 && !strncmp(fname, AIO_FNAME, sizeof(AIO_FNAME) - 1)) {
/* AIO ring, let's try */
close_safe(vm_file_fd);
vma->e->status = VMA_AREA_AIORING;
return 0;
}
pr_err("Unknown shit %o (%s)\n", buf.st_mode, fname);
return -1;
}
if (errno == EPERM && !opts.aufs)
return vma_get_mapfile_user(fname, vma, vfi, vm_file_fd, path);
pr_perror("Can't open map_files");
return -1;
}
return vma_stat(vma, *vm_file_fd);
}
/*
 * Build a coarse picture of our own address space from /proc/self/maps.
 *
 * Only start/end of each line are read, and a line that begins exactly
 * where the previous area ended is merged into it: the list is only meant
 * to tell which parts of the address space are busy.
 *
 * Returns 0 on success, -1 on failure.
 */
int parse_self_maps_lite(struct vm_area_list *vms)
{
	struct vma_area *last = NULL;
	struct bfd maps;
	int ret = -1;

	vm_area_list_init(vms);

	maps.fd = open_proc(PROC_SELF, "maps");
	if (maps.fd < 0)
		return -1;

	if (bfdopenr(&maps))
		return -1;

	for (;;) {
		unsigned long from, to;
		char *sep;
		char *line;

		line = breadline(&maps);
		if (!line) {
			/* end of file */
			ret = 0;
			break;
		}
		if (IS_ERR(line))
			break;

		from = strtoul(line, &sep, 16);
		to = strtoul(sep + 1, NULL, 16);

		if (last && last->e->end == from) {
			/* Adjacent to the previous area: just extend it. */
			last->e->end = to;
		} else {
			struct vma_area *area = alloc_vma_area();

			if (!area)
				break;

			area->e->start = from;
			area->e->end = to;
			list_add_tail(&area->list, &vms->h);
			vms->nr++;
			last = area;
		}

		pr_debug("Parsed %" PRIx64 "-%" PRIx64 " vma\n", last->e->start, last->e->end);
	}

	bclose(&maps);
	return ret;
}
/*
 * Mark a "[vdso]" area as regular, and additionally as VMA_AREA_VDSO when
 * all VDSO_PROT bits are present in its protection. Always returns 0.
 */
static inline int handle_vdso_vma(struct vma_area *vma)
{
	const bool has_vdso_prot = (vma->e->prot & VDSO_PROT) == VDSO_PROT;

	vma->e->status |= VMA_AREA_REGULAR;
	if (has_vdso_prot)
		vma->e->status |= VMA_AREA_VDSO;

	return 0;
}
/*
 * Mark a "[vvar]" area as regular, and additionally as VMA_AREA_VVAR when
 * all VVAR_PROT bits are present in its protection. Always returns 0.
 */
static inline int handle_vvar_vma(struct vma_area *vma)
{
	const bool has_vvar_prot = (vma->e->prot & VVAR_PROT) == VVAR_PROT;

	vma->e->status |= VMA_AREA_REGULAR;
	if (has_vvar_prot)
		vma->e->status |= VMA_AREA_VVAR;

	return 0;
}
/*
 * Classify one smaps entry.
 *
 * The backing file is resolved with vma_get_mapfile(); if that already
 * set a status there is nothing more to do. Otherwise the status is
 * seeded from @file_path ([vsyscall], [vdso], [vvar], [heap] or regular)
 * and then refined in one of three ways: copied from the previous vma
 * when the file was borrowed, derived from the stat of the opened file,
 * or set to anonymous when there is no file at all.
 *
 * Returns 0 on success, -1 on error.
 */
static int handle_vma(pid_t pid, struct vma_area *vma_area, const char *file_path, DIR *map_files_dir,
struct vma_file_info *vfi, struct vma_file_info *prev_vfi, int *vm_file_fd)
{
if (vma_get_mapfile(file_path, vma_area, map_files_dir, vfi, prev_vfi, vm_file_fd))
goto err_bogus_mapfile;
/* vma_get_mapfile() has fully classified the area (socket, AIO ring, ...). */
if (vma_area->e->status != 0)
return 0;
if (!strcmp(file_path, "[vsyscall]") || !strcmp(file_path, "[vectors]")) {
vma_area->e->status |= VMA_AREA_VSYSCALL;
} else if (!strcmp(file_path, "[vdso]")) {
if (handle_vdso_vma(vma_area))
goto err;
} else if (!strcmp(file_path, "[vvar]")) {
if (handle_vvar_vma(vma_area))
goto err;
} else if (!strcmp(file_path, "[heap]")) {
vma_area->e->status |= VMA_AREA_REGULAR | VMA_AREA_HEAP;
} else {
vma_area->e->status = VMA_AREA_REGULAR;
}
/*
* Some mapping hints for restore, we save this on
* disk and restore might need to analyze it.
*/
if (vma_area->file_borrowed) {
struct vma_area *prev = prev_vfi->vma;
/*
* Pick-up flags that might be set in the branch below.
* Status is copied as-is as it should be zero here,
* and have full match with the previous.
*/
vma_area->e->flags |= (prev->e->flags & MAP_ANONYMOUS);
vma_area->e->status = prev->e->status;
vma_area->e->shmid = prev->e->shmid;
vma_area->vmst = prev->vmst;
vma_area->mnt_id = prev->mnt_id;
/* Private/shared is a property of this vma, so recompute it from its own flags. */
if (!(vma_area->e->status & VMA_AREA_SYSVIPC)) {
vma_area->e->status &= ~(VMA_FILE_PRIVATE | VMA_FILE_SHARED);
if (vma_area->e->flags & MAP_PRIVATE)
vma_area->e->status |= VMA_FILE_PRIVATE;
else
vma_area->e->status |= VMA_FILE_SHARED;
}
} else if (*vm_file_fd >= 0) {
struct stat *st_buf = vma_area->vmst;
int hugetlb_flag = 0;
if (S_ISREG(st_buf->st_mode)) {
/* regular file mapping -- supported */;
pr_debug("Found regular file mapping, OK\n");
} else if (S_ISCHR(st_buf->st_mode) && (st_buf->st_rdev == DEVZERO)) {
/* devzero mapping -- also makes sense */;
pr_debug("Found devzero mapping, OK\n");
} else if (handle_vma_plugin(vm_file_fd, st_buf)) {
pr_info("Found device file mapping, plugin is available\n");
vma_area->e->status |= VMA_EXT_PLUGIN;
} else {
/* non-regular mapping with no supporting plugin */
pr_err("Can't handle non-regular mapping on %d's map %" PRIx64 "\n", pid, vma_area->e->start);
goto err;
}
/* A "/SYSV..." name on the shmem or a hugetlb device is handled as SysV IPC shared memory. */
if ((is_anon_shmem_map(st_buf->st_dev) || is_hugetlb_dev(st_buf->st_dev, NULL)) &&
!strncmp(file_path, "/SYSV", 5)) {
vma_area->e->flags |= MAP_ANONYMOUS;
vma_area->e->status |= VMA_ANON_SHARED;
vma_area->e->shmid = st_buf->st_ino;
if (!(vma_area->e->flags & MAP_SHARED))
goto err_bogus_mapping;
pr_info("path: %s\n", file_path);
vma_area->e->status |= VMA_AREA_SYSVIPC;
} else {
/* We dump memfd backed mapping, both normal and hugepage anonymous share
* mapping using memfd approach when possible.
*/
if (is_memfd(st_buf->st_dev) || is_anon_shmem_map(st_buf->st_dev) ||
can_dump_with_memfd_hugetlb(st_buf->st_dev, &hugetlb_flag, file_path, vma_area)) {
vma_area->e->status |= VMA_AREA_MEMFD;
vma_area->e->flags |= hugetlb_flag;
if (fault_injected(FI_HUGE_ANON_SHMEM_ID))
vma_area->e->shmid += FI_HUGE_ANON_SHMEM_ID_BASE;
} else if (is_hugetlb_dev(st_buf->st_dev, &hugetlb_flag)) {
/* hugetlb that is not dumped via memfd: handled as an anonymous mapping. */
vma_area->e->flags |= hugetlb_flag;
vma_area->e->flags |= MAP_ANONYMOUS;
if (vma_area->e->flags & MAP_SHARED) {
vma_area->e->status |= VMA_ANON_SHARED;
vma_area->e->shmid = st_buf->st_ino;
} else {
vma_area->e->status |= VMA_ANON_PRIVATE;
}
close_safe(vm_file_fd);
return 0;
}
if (vma_area->e->flags & MAP_PRIVATE)
vma_area->e->status |= VMA_FILE_PRIVATE;
else
vma_area->e->status |= VMA_FILE_SHARED;
}
/*
* We cannot use the mnt_id value provided by the kernel
* for vm_file_fd if it is an AUFS file (the value is
* wrong). In such a case, fixup_aufs_vma_fd() has set
* mnt_id to -1 to mimic pre-3.15 kernels that didn't
* have mnt_id.
*/
if (vma_area->mnt_id != -1 && get_fd_mntid(*vm_file_fd, &vma_area->mnt_id))
return -1;
} else {
/*
* No file but mapping -- anonymous one.
*/
if (vma_area->e->flags & MAP_SHARED) {
vma_area->e->status |= VMA_ANON_SHARED;
vma_area->e->shmid = vfi->ino;
} else {
vma_area->e->status |= VMA_ANON_PRIVATE;
}
vma_area->e->flags |= MAP_ANONYMOUS;
}
return 0;
err:
return -1;
err_bogus_mapping:
pr_err("Bogus mapping 0x%" PRIx64 "-0x%" PRIx64 " (flags: %#x vm_file_fd: %d)\n", vma_area->e->start,
vma_area->e->end, vma_area->e->flags, *vm_file_fd);
goto err;
err_bogus_mapfile:
pr_perror("Can't open %d's mapfile link %" PRIx64, pid, vma_area->e->start);
goto err;
}
/*
 * Append a fully parsed @vma_area to @vma_area_list and update the list
 * statistics.
 *
 * Areas flagged VMA_UNSUPP are rejected with -1 unless a plugin claimed
 * them (VMA_EXT_PLUGIN). On success @prev_end is advanced to the end of
 * this area and @prev_vfi remembers @vfi together with the area, so the
 * next entry can compare against it. Returns 0 on success.
 */
static int vma_list_add(struct vma_area *vma_area, struct vm_area_list *vma_area_list, unsigned long *prev_end,
			struct vma_file_info *vfi, struct vma_file_info *prev_vfi)
{
	unsigned long nr_pages;

	if (vma_area->e->status & VMA_EXT_PLUGIN) {
		/*
		 * Unsupported VMAs that provide special plugins for
		 * backup can be treated as regular VMAs and criu
		 * should only save their metadata in the dump files.
		 * There can be several special backup plugins hooks
		 * that might run at different stages during checkpoint
		 * and restore.
		 */
		pr_debug("Device file mapping %016" PRIx64 "-%016" PRIx64 " supported via device plugins\n",
			 vma_area->e->start, vma_area->e->end);
	} else if (vma_area->e->status & VMA_UNSUPP) {
		pr_err("Unsupported mapping found %016" PRIx64 "-%016" PRIx64 "\n", vma_area->e->start,
		       vma_area->e->end);
		return -1;
	}

	/* Add a guard page only if there is enough space for it */
	if (vma_has_guard_gap_hidden(vma_area) && *prev_end < vma_area->e->start)
		vma_area->e->start -= PAGE_SIZE; /* Guard page */
	*prev_end = vma_area->e->end;

	list_add_tail(&vma_area->list, &vma_area_list->h);
	vma_area_list->nr++;

	if (vma_area_is_private(vma_area, kdat.task_size)) {
		nr_pages = vma_area_len(vma_area) / PAGE_SIZE;
		vma_area_list->nr_priv_pages += nr_pages;
		vma_area_list->nr_priv_pages_longest = max(vma_area_list->nr_priv_pages_longest, nr_pages);
	} else if (vma_area_is(vma_area, VMA_ANON_SHARED)) {
		nr_pages = vma_area_len(vma_area) / PAGE_SIZE;
		vma_area_list->nr_shared_pages_longest = max(vma_area_list->nr_shared_pages_longest, nr_pages);
	}

	*prev_vfi = *vfi;
	prev_vfi->vma = vma_area;
	return 0;
}
/*
* On s390 we have old kernels where the global task size assumption of
* criu does not work. See also compel_task_size() for s390.
*
* Returns 0 when @entry may be dumped. On s390x an entry whose end lies
* above kdat.task_size is rejected with -1 and an error message; on every
* other architecture the check always passes.
*/
static int task_size_check(pid_t pid, VmaEntry *entry)
{
#ifdef __s390x__
if (entry->end <= kdat.task_size)
return 0;
pr_err("Can't dump high memory region %lx-%lx of task %d because kernel commit ee71d16d22bb is missing\n",
entry->start, entry->end, pid);
return -1;
#else
return 0;
#endif
}
/*
 * Parse /proc/<pid>/smaps into @vma_area_list.
 *
 * Every header line ("start-end perms pgoff maj:min ino [path]") starts a
 * new vma; the "VmFlags:" line that follows is applied to it, and the vma
 * is only added to the list once the next header (or EOF) is reached. For
 * file-backed areas @dump_filemap, if not NULL, is called with the opened
 * map file descriptor.
 *
 * Returns 0 on success, -1 on error.
 */
int parse_smaps(pid_t pid, struct vm_area_list *vma_area_list, dump_filemap_t dump_filemap)
{
struct vma_area *vma_area = NULL;
unsigned long start, end, pgoff, prev_end = 0;
char r, w, x, s;
int ret = -1, vm_file_fd = -1;
struct vma_file_info vfi;
struct vma_file_info prev_vfi = {};
DIR *map_files_dir = NULL;
struct bfd f;
vm_area_list_init(vma_area_list);
f.fd = open_proc(pid, "smaps");
if (f.fd < 0)
goto err_n;
if (bfdopenr(&f))
goto err_n;
map_files_dir = opendir_proc(pid, "map_files");
if (!map_files_dir) /* old kernel? */
goto err;
while (1) {
int num, path_off;
bool eof;
char *str;
str = breadline(&f);
if (IS_ERR(str))
goto err;
eof = (str == NULL);
/* Anything that is not a vma header is skipped, except for VmFlags. */
if (!eof && !__is_vma_range_fmt(str)) {
if (!strncmp(str, "VmFlags: ", 9)) {
BUG_ON(!vma_area);
parse_vma_vmflags(&str[9], vma_area);
continue;
} else
continue;
}
/* A new header or EOF: the vma collected so far is complete, commit it. */
if (vma_area && vma_list_add(vma_area, vma_area_list, &prev_end, &vfi, &prev_vfi))
goto err;
if (eof)
break;
vma_area = alloc_vma_area();
if (!vma_area)
goto err;
/* %n stores the offset of the path column; it is not counted in the result. */
num = sscanf(str, "%lx-%lx %c%c%c%c %lx %x:%x %lu %n", &start, &end, &r, &w, &x, &s, &pgoff,
&vfi.dev_maj, &vfi.dev_min, &vfi.ino, &path_off);
if (num < 10) {
pr_err("Can't parse: %s\n", str);
goto err;
}
vma_area->e->start = start;
vma_area->e->end = end;
vma_area->e->pgoff = pgoff;
vma_area->e->prot = PROT_NONE;
if (task_size_check(pid, vma_area->e))
goto err;
if (r == 'r')
vma_area->e->prot |= PROT_READ;
if (w == 'w')
vma_area->e->prot |= PROT_WRITE;
if (x == 'x')
vma_area->e->prot |= PROT_EXEC;
if (s == 's')
vma_area->e->flags = MAP_SHARED;
else if (s == 'p')
vma_area->e->flags = MAP_PRIVATE;
else {
pr_err("Unexpected VMA met (%c)\n", s);
goto err;
}
if (handle_vma(pid, vma_area, str + path_off, map_files_dir, &vfi, &prev_vfi, &vm_file_fd))
goto err;
if (vma_entry_is(vma_area->e, VMA_FILE_PRIVATE) || vma_entry_is(vma_area->e, VMA_FILE_SHARED)) {
if (dump_filemap && dump_filemap(vma_area, vm_file_fd))
goto err;
} else if (vma_entry_is(vma_area->e, VMA_AREA_AIORING))
vma_area_list->nr_aios++;
}
/* The last vma is on the list now, so it must not be freed below. */
vma_area = NULL;
ret = 0;
err:
bclose(&f);
err_n:
close_safe(&vm_file_fd);
if (map_files_dir)
closedir(map_files_dir);
xfree(vma_area);
return ret;
}
int parse_pid_stat(pid_t pid, struct proc_pid_stat *s)
{
char *tok, *p;
int fd;
int n;
fd = open_proc(pid, "stat");
if (fd < 0)
return -1;
n = read(fd, buf, BUF_SIZE);
close(fd);
if (n < 1) {
pr_err("stat for %d is corrupted\n", pid);
return -1;
}
memset(s, 0, sizeof(*s));
tok = strchr(buf, ' ');
if (!tok)
goto err;
*tok++ = '\0';
if (*tok != '(')
goto err;
s->pid = atoi(buf);
p = strrchr(tok + 1, ')');
if (!p)
goto err;
*tok = '\0';
*p = '\0';
__strlcpy(s->comm, tok + 1, sizeof(s->comm));
n = sscanf(p + 1,
" %c %d %d %d %d %d %u %lu %lu %lu %lu "
"%lu %lu %ld %ld %ld %ld %d %d %llu %lu %ld %lu %lu %lu %lu "
"%lu %lu %lu %lu %lu %lu %lu %lu %lu %d %d %u %u %llu %lu %ld "
"%lu %lu %lu %lu %lu %lu %lu %d",
&s->state, &s->ppid, &s->pgid, &s->sid, &s->tty_nr, &s->tty_pgrp, &s->flags, &s->min_flt,
&s->cmin_flt, &s->maj_flt, &s->cmaj_flt, &s->utime, &s->stime, &s->cutime, &s->cstime, &s->priority,
&s->nice, &s->num_threads, &s->zero0, &s->start_time, &s->vsize, &s->mm_rss, &s->rsslim,
&s->start_code, &s->end_code, &s->start_stack, &s->esp, &s->eip, &s->sig_pending, &s->sig_blocked,
&s->sig_ignored, &s->sig_handled, &s->wchan, &s->zero1, &s->zero2, &s->exit_signal, &s->task_cpu,
&s->rt_priority, &s->policy, &s->delayacct_blkio_ticks, &s->gtime, &s->cgtime, &s->start_data,
&s->end_data, &s->start_brk, &s->arg_start, &s->arg_end, &s->env_start, &s->env_end, &s->exit_code);
if (n < 50)
goto err;
return 0;
err:
pr_err("Parsing %d's stat failed (#fields do not match)\n", pid);
return -1;
}
/*
 * Write @value, formatted as a decimal number, to /proc/self/loginuid.
 *
 * Returns 0 on success, -1 if the file can't be opened or the write
 * fails (the latter is only logged as a warning).
 */
int prepare_loginuid(unsigned int value)
{
	int fd, len, ret = 0;
	char buf[11]; /* 4294967295 is maximum for u32 */

	fd = open_proc_rw(PROC_SELF, "loginuid");
	if (fd < 0)
		return -1;

	len = snprintf(buf, sizeof(buf), "%u", value);

	/*
	 * Write only the digits: the bytes after the terminator are
	 * uninitialized stack and must not be handed to the kernel.
	 */
	if (write(fd, buf, len) < 0) {
		pr_warn("Write %s to /proc/self/loginuid failed: %s\n", buf, strerror(errno));
		ret = -1;
	}

	close(fd);
	return ret;
}
/*
 * Read /proc/<pid>/loginuid.
 *
 * @err is set to 0 on success and to -1 on failure, in which case
 * INVALID_UID is returned. With @ignore_noent set, ENOENT is passed to
 * __open_proc() as the second argument (presumably the errno it should
 * not complain about -- confirm against __open_proc()).
 */
unsigned int parse_pid_loginuid(pid_t pid, int *err, bool ignore_noent)
{
	int fd;
	ssize_t num;

	*err = 0;
	fd = __open_proc(pid, (ignore_noent) ? ENOENT : 0, O_RDONLY, "loginuid");
	if (fd < 0)
		goto out;

	num = read(fd, buf, 10);
	close(fd);
	if (num < 0) {
		pr_perror("Unable to read /proc/%d/loginuid", pid);
		goto out;
	}
	buf[num] = '\0';

	/*
	 * The value is an unsigned 32-bit number and "4294967295" is a
	 * legal content; strtol() would saturate it to LONG_MAX where long
	 * is 32 bits wide, so parse it as unsigned.
	 */
	return strtoul(buf, NULL, 10);

out:
	*err = -1;
	return INVALID_UID; /* unset value */
}
/*
 * Read /proc/<pid>/oom_score_adj.
 *
 * On success the parsed number is returned and @err is 0; on failure 0 is
 * returned and @err is -1.
 */
int parse_pid_oom_score_adj(pid_t pid, int *err)
{
	ssize_t len;
	int fd;

	/* Assume failure until the value has actually been read. */
	*err = -1;

	fd = open_proc(pid, "oom_score_adj");
	if (fd < 0)
		return 0;

	len = read(fd, buf, 10);
	close(fd);
	if (len < 0) {
		pr_perror("Unable to read /proc/%d/oom_score_adj", pid);
		return 0;
	}
	buf[len] = '\0';

	*err = 0;
	return strtol(buf, NULL, 10);
}
/*
 * Parse the four ids of a "Uid:"/"Gid:" status line into @arr.
 *
 * @str points at the first number; consecutive numbers are separated by
 * exactly one character. Returns 0 when four numbers were read and nothing
 * follows the last one, -1 otherwise.
 */
static int ids_parse(char *str, unsigned int *arr)
{
	char *end = str;
	int i;

	for (i = 0; i < 4; i++) {
		if (i > 0) {
			/* Don't step over the terminator on a short line. */
			if (*end == '\0')
				return -1;
			str = end + 1;
		}

		/* ids are unsigned 32-bit; strtol() would clip them on 32-bit long */
		arr[i] = strtoul(str, &end, 10);
		if (end == str)
			return -1; /* no digits where a number was expected */
	}

	return *end ? -1 : 0;
}
/*
 * Parse a capability mask printed as PROC_CAP_SIZE groups of eight hex
 * digits. The first group in the string lands in the highest index of
 * @res. Returns 0 on success, -1 if a group can't be scanned.
 */
static int cap_parse(char *str, unsigned int *res)
{
	int idx;

	for (idx = PROC_CAP_SIZE - 1; idx >= 0; idx--, str += 8) {
		if (sscanf(str, "%08x", &res[idx]) != 1)
			return -1;
	}

	return 0;
}
/*
 * Parse the fields of /proc/<pid>/status that are needed when seizing a
 * task: state, ppid, the pid in its own pid namespace, uids/gids,
 * capability sets, seccomp mode and the pending/blocked signal masks.
 *
 * @ss must be embedded in a struct proc_status_creds (container_of() is
 * applied to it); @data is not used here.
 *
 * "Seccomp:" and "NSpid:" are optional: the number of fields found must
 * be 11, plus one if Seccomp was seen, plus one if kdat.has_nspid is set.
 *
 * Returns 0 on success, -1 on error.
 */
int parse_pid_status(pid_t pid, struct seize_task_status *ss, void *data)
{
struct proc_status_creds *cr = container_of(ss, struct proc_status_creds, s);
struct bfd f;
int done = 0;
int ret = -1;
char *str;
bool parsed_seccomp = false;
int expected_done;
f.fd = open_proc(pid, "status");
if (f.fd < 0)
return -1;
cr->s.sigpnd = 0;
cr->s.shdpnd = 0;
cr->s.sigblk = 0;
cr->s.seccomp_mode = SECCOMP_MODE_DISABLED;
if (bfdopenr(&f))
return -1;
/*
 * NOTE(review): only 13 distinct fields are recognized below, so with a
 * bound of 14 this loop always runs until end of file -- confirm whether
 * the bound was meant to allow an early exit.
 */
while (done < 14) {
str = breadline(&f);
if (str == NULL)
break;
if (IS_ERR(str))
goto err_parse;
if (!strncmp(str, "State:", 6)) {
/* The state letter is the character right after "State:" and one separator. */
cr->s.state = str[7];
done++;
continue;
}
if (!strncmp(str, "PPid:", 5)) {
if (sscanf(str, "PPid:\t%d", &cr->s.ppid) != 1) {
pr_err("Unable to parse: %s\n", str);
goto err_parse;
}
done++;
continue;
}
if (!strncmp(str, "NSpid:", 6)) {
/* Get a thread ID in the thread PID namespace. */
char *last;
last = strrchr(str, '\t');
if (!last || sscanf(last, "%d", &cr->s.vpid) != 1) {
pr_err("Unable to parse: %s\n", str);
goto err_parse;
}
done++;
continue;
}
if (!strncmp(str, "Uid:", 4)) {
if (ids_parse(str + 5, cr->uids))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "Gid:", 4)) {
if (ids_parse(str + 5, cr->gids))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "CapInh:", 7)) {
if (cap_parse(str + 8, cr->cap_inh))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "CapEff:", 7)) {
if (cap_parse(str + 8, cr->cap_eff))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "CapPrm:", 7)) {
if (cap_parse(str + 8, cr->cap_prm))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "CapBnd:", 7)) {
if (cap_parse(str + 8, cr->cap_bnd))
goto err_parse;
done++;
continue;
}
if (!strncmp(str, "Seccomp:", 8)) {
if (sscanf(str + 9, "%d", &cr->s.seccomp_mode) != 1) {
goto err_parse;
}
parsed_seccomp = true;
done++;
continue;
}
if (!strncmp(str, "ShdPnd:", 7)) {
unsigned long long sigpnd;
if (sscanf(str + 7, "%llx", &sigpnd) != 1)
goto err_parse;
cr->s.shdpnd |= sigpnd;
done++;
continue;
}
if (!strncmp(str, "SigPnd:", 7)) {
unsigned long long sigpnd;
if (sscanf(str + 7, "%llx", &sigpnd) != 1)
goto err_parse;
cr->s.sigpnd |= sigpnd;
done++;
continue;
}
if (!strncmp(str, "SigBlk:", 7)) {
unsigned long long sigblk = 0;
if (sscanf(str + 7, "%llx", &sigblk) != 1)
goto err_parse;
cr->s.sigblk |= sigblk;
done++;
continue;
}
}
/* seccomp and nspids are optional */
expected_done = (parsed_seccomp ? 12 : 11);
if (kdat.has_nspid)
expected_done++;
if (done == expected_done)
ret = 0;
err_parse:
if (ret)
pr_err("Error parsing proc status file\n");
bclose(&f);
return ret;
}
/* One row of an option-name to flag-bit table; a NULL @opt ends the table. */
struct opt2flag {
char *opt;
unsigned flag;
};
/*
 * Callback for do_opt2flag(): handles "gid=<n>" and "uid=<n>" options.
 *
 * A matching option is re-emitted at @unknown + *@uoff with the id passed
 * through userns_gid()/userns_uid(), followed by ',', and *@uoff is
 * advanced past it. Returns true when the option was consumed here,
 * false when the caller has to deal with it.
 *
 * NOTE(review): the translated id may print wider than the original text
 * and sprintf() is not bounded here -- confirm @unknown is sized for that.
 * NOTE(review): @id is unsigned but is scanned and printed with %d.
 */
static bool sb_opt_cb(char *opt, char *unknown, size_t *uoff)
{
unsigned int id;
if (sscanf(opt, "gid=%d", &id) == 1) {
*uoff += sprintf(unknown + *uoff, "gid=%d", userns_gid(id));
unknown[*uoff] = ',';
(*uoff)++;
return true;
} else if (sscanf(opt, "uid=%d", &id) == 1) {
*uoff += sprintf(unknown + *uoff, "uid=%d", userns_uid(id));
unknown[*uoff] = ',';
(*uoff)++;
return true;
}
return false;
}
/*
 * Split the comma-separated list @opt (modified in place) and translate
 * each item through the table @opts, OR-ing the matching bits into @flags.
 *
 * An item that is not in the table is offered to @cb; if @cb declines it,
 * the item is appended to @unknown, comma-separated. @unknown is
 * NUL-terminated at the end with the trailing comma removed.
 *
 * NOTE(review): the unknown-item branch is only entered when @cb is
 * non-NULL, so with cb == NULL unrecognized items are silently skipped
 * and the "Unknown option" error cannot trigger -- confirm this is the
 * intended behaviour.
 *
 * Returns 0 on success, -1 on an unknown option without @unknown.
 */
static int do_opt2flag(char *opt, unsigned *flags, const struct opt2flag *opts, char *unknown,
bool (*cb)(char *opt, char *unknown, size_t *uoff))
{
int i;
char *end;
size_t uoff = 0;
while (1) {
/* Cut the current item out of the list. */
end = strchr(opt, ',');
if (end)
*end = '\0';
for (i = 0; opts[i].opt != NULL; i++)
if (!strcmp(opts[i].opt, opt)) {
(*flags) |= opts[i].flag;
break;
}
if (opts[i].opt == NULL && cb && !cb(opt, unknown, &uoff)) {
if (!unknown) {
pr_err("Unknown option [%s]\n", opt);
return -1;
}
strcpy(unknown + uoff, opt);
uoff += strlen(opt);
unknown[uoff] = ',';
uoff++;
}
if (!end) {
/* Last item: drop the trailing comma, if any, and terminate. */
if (uoff)
uoff--;
if (unknown)
unknown[uoff] = '\0';
break;
} else
opt = end + 1;
}
return 0;
}
/*
 * Translate the per-mount option list of a mountinfo line (@opt, modified
 * in place) into MS_* bits in @flags. When neither relatime nor noatime
 * is present, MS_STRICTATIME is added.
 *
 * Returns 0 on success, -1 if do_opt2flag() fails.
 */
static int parse_mnt_flags(char *opt, unsigned *flags)
{
	static const struct opt2flag mnt_opt2flag[] = {
		{ "rw", 0 },
		{ "ro", MS_RDONLY },
		{ "nosuid", MS_NOSUID },
		{ "nodev", MS_NODEV },
		{ "noexec", MS_NOEXEC },
		{ "noatime", MS_NOATIME },
		{ "nodiratime", MS_NODIRATIME },
		{ "relatime", MS_RELATIME },
		{},
	};
	int rc;

	rc = do_opt2flag(opt, flags, mnt_opt2flag, NULL, NULL);
	if (rc)
		return -1;

	/* Otherwise the kernel assumes RELATIME by default */
	if (!(*flags & (MS_RELATIME | MS_NOATIME)))
		*flags |= MS_STRICTATIME;

	return 0;
}
/*
 * Translate the superblock option list of a mountinfo line (@opt,
 * modified in place) into MS_* bits in @flags. Options that are not
 * generic superblock flags are collected in @uopt via do_opt2flag(),
 * with uid=/gid= values rewritten by sb_opt_cb().
 *
 * Returns whatever do_opt2flag() returns (0 on success, -1 on error).
 */
static int parse_sb_opt(char *opt, unsigned *flags, char *uopt)
{
	static const struct opt2flag sb_opt2flag[] = {
		{ "rw", 0 },
		{ "ro", MS_RDONLY },
		/*
		 * "sync" is the synchronous-writes superblock flag.
		 * MS_SYNC is the unrelated msync(2) flag and has the same
		 * numeric value as MS_NODEV.
		 */
		{ "sync", MS_SYNCHRONOUS },
		{ "dirsync", MS_DIRSYNC },
		/* The kernel prints mandatory locking as "mand". */
		{ "mand", MS_MANDLOCK },
		{},
	};

	return do_opt2flag(opt, flags, sb_opt2flag, uopt, sb_opt_cb);
}
/*
 * Parse the optional fields of a mountinfo line (shared:N, master:N,
 * propagate_from:N, unbindable) up to and including the "-" separator.
 *
 * @str is modified in place: every space that ends a field is replaced
 * with '\0'. On success @off receives the offset, relative to the original
 * @str, of the first character after the separator field.
 *
 * Returns 0 on success, -1 on a missing separator or an unknown field.
 */
static int parse_mnt_opt(char *str, struct mount_info *mi, int *off)
{
	char *field = str;
	char *sp;

	for (;; field = sp + 1) {
		sp = strchr(field, ' ');
		if (sp == NULL) {
			pr_err("Error parsing mount options\n");
			return -1;
		}
		*sp = '\0';

		/* The separator ends the optional fields. */
		if (!strncmp(field, "-", 1))
			break;

		if (!strncmp(field, "shared:", 7)) {
			mi->flags |= MS_SHARED;
			mi->shared_id = atoi(field + 7);
		} else if (!strncmp(field, "master:", 7)) {
			mi->flags |= MS_SLAVE;
			mi->master_id = atoi(field + 7);
		} else if (!strncmp(field, "propagate_from:", 15)) {
			/* skip */
		} else if (!strncmp(field, "unbindable", 11)) {
			mi->flags |= MS_UNBINDABLE;
		} else {
			pr_err("Unknown option [%s]\n", field);
			return -1;
		}
	}

	*off = sp - str + 1;
	return 0;
}
/*
 * mountinfo contains mangled paths: space, tab, newline and backslash
 * (and '#' in the source field on newer kernels) are replaced with a
 * backslash followed by three octal digits. This function decodes such
 * escapes in place.
 *
 * A literal backslash is itself always escaped, so any backslash that is
 * followed by three octal digits is an escape. Sequences that are
 * incomplete, that decode to 0 or that don't fit in a byte are copied
 * unchanged.
 */
static void cure_path(char *path)
{
	char *src, *dst;

	src = strchr(path, '\\');
	if (src == NULL) /* fast path */
		return;

	/* Nothing before the first backslash moves, so start compacting there. */
	dst = src;
	while (*src != '\0') {
		/* && stops at the first mismatch, so we never look past the NUL */
		if (src[0] == '\\' && src[1] >= '0' && src[1] <= '7' && src[2] >= '0' && src[2] <= '7' &&
		    src[3] >= '0' && src[3] <= '7') {
			unsigned int val = ((unsigned int)(src[1] - '0') << 6) | ((unsigned int)(src[2] - '0') << 3) |
					   (unsigned int)(src[3] - '0');

			if (val != 0 && val <= 0377) {
				*dst++ = (char)val;
				src += 4;
				continue;
			}
		}
		*dst++ = *src++;
	}
	*dst = '\0';
}
/*
 * Parse one line of a mountinfo file (@str, modified in place) into @new.
 *
 * The fields before the "-" separator give ids, device, root, mountpoint
 * (stored with a leading '.'), per-mount flags and propagation options;
 * the fields after it give the filesystem type (also returned through
 * @fsname), the source and the superblock options.
 *
 * Returns 0 on success, -1 on error. Strings already stored in @new are
 * not released here on failure.
 */
static int parse_mountinfo_ent(char *str, struct mount_info *new, char **fsname)
{
unsigned int kmaj, kmin;
int ret, n, len;
char *sub, *opt = NULL;
char link_path[PATH_MAX];
new->mountpoint = xmalloc(PATH_MAX);
if (new->mountpoint == NULL)
goto err;
new->mountpoint[0] = '.';
/*
 * NOTE(review): %s has no field width, so the mountpoint is written into
 * the PATH_MAX buffer without a bound -- confirm input can't exceed it.
 */
ret = sscanf(str, "%i %i %u:%u %ms %s %ms %n", &new->mnt_id, &new->parent_mnt_id, &kmaj, &kmin, &new->root,
new->mountpoint + 1, &opt, &n);
if (ret != 7)
goto err;
cure_path(new->mountpoint);
cure_path(new->root);
len = strlen(new->root);
if (len >= PATH_MAX - 1) {
pr_err("new root path (%s) exceeds %d\n", new->root, PATH_MAX);
goto err;
}
/* Work on a copy; it is written back to ->root only if strip_deleted() reports a change. */
strcpy(link_path, new->root);
if (strip_deleted(link_path, len)) {
strcpy(new->root, link_path);
new->deleted = true;
}
/*
 * Shrink the buffer to the actual length.
 * NOTE(review): on failure the original pointer is overwritten with NULL.
 */
new->mountpoint = xrealloc(new->mountpoint, strlen(new->mountpoint) + 1);
if (!new->mountpoint)
goto err;
new->ns_mountpoint = new->mountpoint;
new->is_ns_root = is_root(new->ns_mountpoint + 1);
new->s_dev = new->s_dev_rt = MKKDEV(kmaj, kmin);
new->flags = 0;
if (parse_mnt_flags(opt, &new->flags))
goto err;
free(opt); /* we are going to reallocate/reuse this buffer */
opt = NULL;
str += n;
if (parse_mnt_opt(str, new, &n))
goto err;
str += n;
ret = sscanf(str, "%ms %ms %ms", fsname, &new->source, &opt);
if (ret == 2) {
/* src may be empty */
opt = new->source;
new->source = xstrdup("");
if (new->source == NULL)
goto err;
} else if (ret != 3)
goto err;
cure_path(new->source);
/* Keep the full type name in @new before the subtype is cut off *fsname. */
new->fsname = xstrdup(*fsname);
if (!new->fsname)
goto err;
/*
* The kernel reports "subtypes" sometimes and the valid
* type-vs-subtype delimiter is the dot symbol. We disregard
* any subtypes for the purpose of finding the fstype.
*/
sub = strchr(*fsname, '.');
if (sub)
*sub = 0;
new->fstype = find_fstype_by_name(*fsname);
/* Sized after the raw option string; parse_sb_opt() fills it. */
new->options = xmalloc(strlen(opt) + 1);
if (!new->options)
goto err;
if (parse_sb_opt(opt, &new->sb_flags, new->options))
goto err;
ret = 0;
ret:
xfree(opt);
return ret;
err:
ret = -1;
goto ret;
}
/* Mountpoints registered through add_skip_mount(). */
static LIST_HEAD(skip_mount_list);
/* List element carrying a NUL-terminated string in its flexible tail. */
struct str_node {
struct list_head node;
char string[];
};
/*
 * Register @mountpoint in skip_mount_list.
 *
 * The string is copied into a freshly allocated node, so the caller keeps
 * ownership of @mountpoint. Returns true on success and false when the
 * allocation fails.
 */
bool add_skip_mount(const char *mountpoint)
{
	size_t bytes = strlen(mountpoint) + 1;
	struct str_node *entry;

	entry = xmalloc(sizeof(struct str_node) + bytes);
	if (entry == NULL)
		return false;

	/* bytes includes the terminating NUL */
	memcpy(entry->string, mountpoint, bytes);
	list_add(&entry->node, &skip_mount_list);

	return true;
}
/*
 * Tell whether @mountpoint matches, according to is_same_path(), any entry
 * previously registered with add_skip_mount().
 */
static bool should_skip_mount(char *mountpoint)
{
	struct str_node *entry;
	bool found = false;

	list_for_each_entry(entry, &skip_mount_list, node) {
		if (!is_same_path(mountpoint, entry->string))
			continue;

		found = true;
		break;
	}

	return found;
}
/*
 * Read the timens_offsets proc file of the current task.
 *
 * Each line is "<clockid> <sec> <nsec>". The monotonic line is stored in
 * @moff and the boottime line in @boff; the clock may be spelled either by
 * name or by its numeric CLOCK_* value. Any other clock id, or a line that
 * does not scan, is an error.
 *
 * Returns 0 on success, -1 otherwise.
 */
int parse_timens_offsets(struct timespec *boff, struct timespec *moff)
{
	int rc = -1;
	FILE *fp;

	fp = fopen_proc(PROC_SELF, "timens_offsets");
	if (fp == NULL) {
		pr_perror("Unable to open /proc/self/timens_offsets");
		return rc;
	}

	while (fgets(buf, BUF_SIZE, fp) != NULL) {
		struct timespec *dst;
		int64_t sec, nsec;
		char clockid[10];

		if (sscanf(buf, "%9s %" PRId64 " %" PRId64 "\n", clockid, &sec, &nsec) != 3) {
			pr_err("Unable to parse: %s\n", buf);
			goto done;
		}
		clockid[sizeof(clockid) - 1] = 0;

		/* Pick the destination first, store the pair once below. */
		if (!strcmp(clockid, "monotonic") || !strcmp(clockid, __stringify(CLOCK_MONOTONIC))) {
			dst = moff;
		} else if (!strcmp(clockid, "boottime") || !strcmp(clockid, __stringify(CLOCK_BOOTTIME))) {
			dst = boff;
		} else {
			pr_err("Unknown clockid: %s\n", clockid);
			goto done;
		}

		dst->tv_sec = sec;
		dst->tv_nsec = nsec;
	}

	rc = 0;
done:
	fclose(fp);
	return rc;
}
/*
 * Scan the current task's mountinfo for the mount with id @mnt_id and store
 * its device number, built with MKKDEV() from the "major:minor" column, in
 * @sdev.
 *
 * Returns 0 when the mount was found; -1 when the file cannot be opened, a
 * line does not scan, or no line carries @mnt_id.
 */
static int get_mountinfo_sdev_from_mntid(int mnt_id, unsigned int *sdev)
{
	int rc = -1;
	FILE *fp;

	fp = fopen_proc(PROC_SELF, "mountinfo");
	if (fp == NULL)
		return -1;

	while (fgets(buf, BUF_SIZE, fp) != NULL) {
		unsigned int dev_major, dev_minor;
		int cur_id;

		if (sscanf(buf, "%i %*i %u:%u", &cur_id, &dev_major, &dev_minor) != 3) {
			pr_err("Failed to parse mountinfo line %s\n", buf);
			break;
		}

		if (cur_id != mnt_id)
			continue;

		*sdev = MKKDEV(dev_major, dev_minor);
		rc = 0;
		break;
	}

	fclose(fp);
	return rc;
}
/*
 * Find the device number behind @fd and store it in @sdev.
 * This works even on btrfs where stat does not show right sdev.
 *
 * The mount id of @fd is looked up among the known mounts first. Only when
 * that fails and @parse_mountinfo is set is the current task's mountinfo
 * scanned as a fallback.
 *
 * Returns 0 on success, -1 otherwise.
 */
int get_sdev_from_fd(int fd, unsigned int *sdev, bool parse_mountinfo)
{
	struct mount_info *known;
	int mnt_id;

	if (get_fd_mntid(fd, &mnt_id) < 0)
		return -1;

	/* Simple case mnt_id is in dumped mntns */
	known = lookup_mnt_id(mnt_id);
	if (known != NULL) {
		*sdev = known->s_dev_rt;
		return 0;
	}

	/* Complex case mnt_id is in mntns created by criu */
	return parse_mountinfo ? get_mountinfo_sdev_from_mntid(mnt_id, sdev) : -1;
}
/*
 * Build the list of mounts from the mountinfo proc file of @pid.
 *
 * Every line becomes one mount_info tagged with @nsid. An entry is dropped
 * instead of being linked when
 *  - @for_dump is set and its mountpoint was registered for skipping, or
 *  - the filesystem specific ->parse() callback returns a positive value.
 * A negative ->parse() result or an unparsable line fails the whole call.
 *
 * Returns the list head, or NULL when the file cannot be opened or on error
 * (every entry collected so far is freed in that case).
 */
struct mount_info *parse_mountinfo(pid_t pid, struct ns_id *nsid, bool for_dump)
{
	struct mount_info *list = NULL;
	FILE *fp;

	fp = fopen_proc(pid, "mountinfo");
	if (fp == NULL)
		return NULL;

	while (fgets(buf, BUF_SIZE, fp)) {
		struct mount_info *mi;
		char *fsname = NULL;
		int rc = -1;

		mi = mnt_entry_alloc(false);
		if (mi) {
			mi->nsid = nsid;

			rc = parse_mountinfo_ent(buf, mi, &fsname);
			if (rc < 0) {
				/* mi stays set: it is linked below and released together with the list */
				pr_err("Bad format in %d mountinfo: '%s'\n", pid, buf);
			} else if (for_dump && should_skip_mount(mi->ns_mountpoint)) {
				/*
				 * Drop this mountpoint early, so that lookup_mnt_id/etc will
				 * fail loudly at "dump" stage if an opened file or another mnt
				 * depends on this one.
				 */
				pr_info("\tskip %s @ %s\n", fsname, mi->ns_mountpoint);
				mnt_entry_free(mi);
				mi = NULL;
			} else {
				pr_info("\ttype %s source %s mnt_id %d s_dev %#x %s @ %s flags %#x options %s\n", fsname,
					mi->source, mi->mnt_id, mi->s_dev, mi->root, mi->ns_mountpoint, mi->flags,
					mi->options);

				if (mi->fstype->parse) {
					rc = mi->fstype->parse(mi);
					if (rc < 0) {
						pr_err("Failed to parse FS specific data on %s\n", service_mountpoint(mi));
						mnt_entry_free(mi);
						mi = NULL;
					} else if (rc > 0) {
						/* A positive result asks to leave this mount out, it is not an error. */
						pr_info("\tskipping fs mounted at %s\n", service_mountpoint(mi) + 1);
						mnt_entry_free(mi);
						mi = NULL;
						rc = 0;
					}
				}
			}
		}

		free(fsname);
		if (mi)
			mntinfo_add_list_before(&list, mi);

		if (rc) {
			/* Tear down everything gathered so far and report failure as NULL. */
			while (list) {
				struct mount_info *next = list->next;

				mnt_entry_free(list);
				list = next;
			}
			break;
		}
	}

	fclose(fp);
	return list;
}
/*
 * Convert one hexadecimal digit character to its value (0..15).
 * Both letter cases are accepted; any other character yields 0.
 */
static char nybble(const char n)
{
	if ('0' <= n && n <= '9')
		return n - '0';

	if ('A' <= n && n <= 'F')
		return 10 + (n - 'A');

	if ('a' <= n && n <= 'f')
		return 10 + (n - 'a');

	return 0;
}
/*
 * Decode the hex string @tok into the handle array of @fh.
 *
 * The handle storage is zeroed first and leading spaces in @tok are skipped.
 * Each pair of characters then becomes one byte, high nibble first. Decoding
 * stops at the end of the string or once the handle storage is full; a
 * trailing single digit is stored as the high nibble of the last byte.
 */
static void parse_fhandle_encoded(char *tok, FhEntry *fh)
{
	char *out = (char *)fh->handle;
	size_t cap = pb_repeated_size(fh, handle);
	int pos;

	memzero(out, pb_repeated_size(fh, handle));

	while (*tok == ' ')
		tok++;

	for (pos = 0; *tok && pos < cap; pos++, tok += 2) {
		out[pos] = (nybble(tok[0]) << 4) | nybble(tok[1]);

		/* Odd number of digits: nothing more to read after this byte. */
		if (!tok[1])
			break;
	}
}
/*
 * Parse the timerfd part of an fdinfo file into @tfy.
 *
 * @str already holds the "clockid:" line; the four following lines are
 * pulled from @f with breadline(). The clock id is checked with
 * verify_timerfd() before reading further.
 *
 * Returns 0 on success, -1 when a line does not scan, the clock id is
 * rejected, or the file ends early.
 */
static int parse_timerfd(struct bfd *f, char *str, TimerfdEntry *tfy)
{
	/*
	 * Format is
	 * clockid: 0
	 * ticks: 0
	 * settime flags: 01
	 * it_value: (0, 49406829)
	 * it_interval: (1, 0)
	 */
	if (sscanf(str, "clockid: %d", &tfy->clockid) != 1)
		return -1;

	if (verify_timerfd(tfy) < 0)
		return -1;

	str = breadline(f);
	if (IS_ERR_OR_NULL(str))
		goto nodata;
	if (sscanf(str, "ticks: %llu", (unsigned long long *)&tfy->ticks) != 1)
		return -1;

	str = breadline(f);
	if (IS_ERR_OR_NULL(str))
		goto nodata;
	if (sscanf(str, "settime flags: 0%o", &tfy->settime_flags) != 1)
		return -1;

	str = breadline(f);
	if (IS_ERR_OR_NULL(str))
		goto nodata;
	if (sscanf(str, "it_value: (%llu, %llu)", (unsigned long long *)&tfy->vsec,
		   (unsigned long long *)&tfy->vnsec) != 2)
		return -1;

	str = breadline(f);
	if (IS_ERR_OR_NULL(str))
		goto nodata;
	if (sscanf(str, "it_interval: (%llu, %llu)", (unsigned long long *)&tfy->isec,
		   (unsigned long long *)&tfy->insec) != 2)
		return -1;

	return 0;

nodata:
	pr_err("No data left in proc file while parsing timerfd\n");
	return -1;
}
/*
 * One row of the table parse_bpfmap() uses to scan fdinfo lines: a format
 * string with a single conversion and the address that conversion stores to.
 */
typedef struct bpfmap_fmt {
/* sscanf() format for the line, containing exactly one conversion. */
char *fmt;
/* Destination handed to sscanf() for that conversion. */
void *value;
/*
 * If newer kernels are adding additional entries, these entries need
 * to be marked as optional in the protobuf definition and the parsing
 * must be able to ignore it if running on an older kernel.
 */
/* NULL for mandatory lines; otherwise points at a boolean belonging to the entry being filled. */
protobuf_c_boolean *optional;
} bpfmap_fmt;
/*
 * Parse the BPF map part of an fdinfo file into @bpf.
 *
 * @str already holds the first ("map_type:") line; further lines are pulled
 * from @f with breadline(). Lines are matched, in order, against a table of
 * formats. A table row with a non-NULL ->optional pointer may be missing
 * from the file: the flag it points to records whether the line was seen,
 * and on a miss the same line is retried against the next row.
 *
 * Returns 0 on success, -1 when a mandatory line does not scan or the file
 * ends early.
 */
static int parse_bpfmap(struct bfd *f, char *str, BpfmapFileEntry *bpf)
{
	/*
	 * Format is:
	 *
	 * uint32_t map_type
	 * uint32_t key_size
	 * uint32_t value_size
	 * uint32_t max_entries
	 * uint32_t map_flags
	 * uint64_t map_extra
	 * uint64_t memlock
	 * uint32_t map_id
	 * boolean frozen
	 */
	/* This needs to be in the same order as in the fdinfo entry. */
	bpfmap_fmt map[] = {
		{ "map_type: %u", &bpf->map_type, NULL },
		{ "key_size: %u", &bpf->key_size, NULL },
		{ "value_size: %u", &bpf->value_size, NULL },
		{ "max_entries: %u", &bpf->max_entries, NULL },
		{ "map_flags: %" PRIx32 "", &bpf->map_flags, NULL },
		{ "map_extra: %" PRIx64 "", &bpf->map_extra, &bpf->has_map_extra },
		{ "memlock: %" PRIu64 "", &bpf->memlock, NULL },
		{ "map_id: %u", &bpf->map_id, NULL },
		{ "frozen: %d", &bpf->frozen, NULL },
	};
	size_t n = sizeof(map) / sizeof(bpfmap_fmt);
	size_t i;

	for (i = 0; i < n; i++) {
		if (sscanf(str, map[i].fmt, map[i].value) != 1) {
			if (map[i].optional) {
				/*
				 * The line is absent on this kernel: say so in the
				 * presence flag and try the same line against the
				 * next format.
				 */
				*map[i].optional = false;
				continue;
			}
			return -1;
		}

		/*
		 * Record that the optional line was present. Without this the
		 * flag is never written here and the check after the loop
		 * cannot see a parsed value.
		 */
		if (map[i].optional)
			*map[i].optional = true;

		if (i == n - 1)
			break;

		str = breadline(f);
		if (IS_ERR_OR_NULL(str)) {
			pr_err("No data left in proc file while parsing bpfmap\n");
			return -1;
		}
	}

	if (bpf->has_map_extra && bpf->map_extra)
		pr_warn("Non-zero value for fdinfo map_extra entry found. This will not be restored.\n");

	return 0;
}
/*
 * True when @str begins with @field immediately followed by ':'.
 * @field has to be a string literal: it is pasted together with ":" and
 * sizeof(field), the literal's length plus its NUL, is exactly the length
 * of "field:".
 */
#define fdinfo_field(str, field) !strncmp(str, field ":", sizeof(field))
/* Forward declaration; the definition comes later in this file. */
static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked);
/*
 * Parse the fdinfo proc file of descriptor @fd in task @pid.
 *
 * @type selects which lines are accepted and how @arg is interpreted:
 *   FD_TYPES__UND       - struct fdinfo_common; "pos", "flags" and "mnt_id"
 *                         are stored, "lock" lines are parsed and queued on
 *                         file_lock_list
 *   FD_TYPES__EVENTFD   - EventfdFileEntry
 *   FD_TYPES__TIMERFD   - TimerfdEntry
 *   FD_TYPES__EVENTPOLL - EventpollFileEntry (one tfd entry per "tfd" line)
 *   FD_TYPES__SIGNALFD  - SignalfdEntry
 *   FD_TYPES__FANOTIFY  - FanotifyFileEntry (flags plus one mark per line)
 *   FD_TYPES__INOTIFY   - InotifyFileEntry (one wd entry per line)
 *   FD_TYPES__BPFMAP    - BpfmapFileEntry
 *
 * A type specific line met while parsing for another non-UND type is a
 * parse error. Finding no matching line at all is an error as well, except
 * for eventpoll and inotify files which may legitimately be empty.
 *
 * Array counters (n_tfd, n_mark, n_wd) are only advanced once the array has
 * really been grown, so on failure they still describe valid slots.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int parse_fdinfo_pid_s(int pid, int fd, int type, void *arg)
{
	struct bfd f;
	char *str;
	bool entry_met = false;
	int ret, exit_code = -1;

	f.fd = open_proc(pid, "fdinfo/%d", fd);
	if (f.fd < 0)
		return -1;

	if (bfdopenr(&f))
		return -1;

	while (1) {
		str = breadline(&f);
		if (!str)
			break;
		if (IS_ERR(str))
			goto out;

		if (fdinfo_field(str, "pos") || fdinfo_field(str, "flags") || fdinfo_field(str, "mnt_id")) {
			unsigned long long val;
			struct fdinfo_common *fdinfo = arg;

			/* The common fields are only wanted by FD_TYPES__UND callers. */
			if (type != FD_TYPES__UND)
				continue;

			/* %lli detects the base from the prefix. */
			ret = sscanf(str, "%*s %lli", &val);
			if (ret != 1)
				goto parse_err;

			if (fdinfo_field(str, "pos"))
				fdinfo->pos = val;
			else if (fdinfo_field(str, "flags"))
				fdinfo->flags = val;
			else if (fdinfo_field(str, "mnt_id"))
				fdinfo->mnt_id = val;

			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "lock")) {
			struct file_lock *fl;
			struct fdinfo_common *fdinfo = arg;
			char *flock_status = str + sizeof("lock:\t") - 1;

			if (type != FD_TYPES__UND)
				continue;

			/*
			 * The lock status can be empty when the owner of the
			 * lock is invisible from our PID namespace.
			 * This unfortunate behavior is fixed in kernels v4.19
			 * and up (see commit 1cf8e5de40).
			 */
			if (flock_status[0] == '\0')
				continue;

			fl = alloc_file_lock();
			if (!fl) {
				pr_perror("Alloc file lock failed!");
				goto out;
			}

			if (parse_file_lock_buf(flock_status, fl, 0)) {
				xfree(fl);
				goto parse_err;
			}

			pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", fl->fl_id, fl->fl_kind, fl->fl_ltype,
				fl->fl_owner, fl->maj, fl->min, fl->i_no, fl->start, fl->end);

			if (fl->fl_kind == FL_UNKNOWN) {
				pr_err("Unknown file lock!\n");
				xfree(fl);
				goto out;
			}

			fl->real_owner = fdinfo->owner;
			fl->fl_holder = pid;
			fl->owners_fd = fd;
			list_add_tail(&fl->list, &file_lock_list);
		}

		/* Nothing below applies to FD_TYPES__UND. */
		if (type == FD_TYPES__UND)
			continue;

		if (fdinfo_field(str, "eventfd-count")) {
			EventfdFileEntry *efd = arg;

			if (type != FD_TYPES__EVENTFD)
				goto parse_err;
			ret = sscanf(str, "eventfd-count: %" PRIx64, &efd->counter);
			if (ret != 1)
				goto parse_err;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "clockid")) {
			TimerfdEntry *tfe = arg;

			if (type != FD_TYPES__TIMERFD)
				goto parse_err;
			ret = parse_timerfd(&f, str, tfe);
			if (ret)
				goto parse_err;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "tfd")) {
			EventpollFileEntry *epfe = arg;
			EventpollTfdEntry *e;

			if (type != FD_TYPES__EVENTPOLL)
				goto parse_err;

			e = xmalloc(sizeof(EventpollTfdEntry));
			if (!e)
				goto out;

			eventpoll_tfd_entry__init(e);

			ret = sscanf(str,
				     "tfd: %d events: %x data: %llx"
				     " pos:%lli ino:%lx sdev:%x",
				     &e->tfd, &e->events, (long long *)&e->data, (long long *)&e->pos,
				     (long *)&e->inode, &e->dev);
			/* Either only the three leading fields or all six are acceptable. */
			if (ret < 3 || ret > 6) {
				eventpoll_tfd_entry__free_unpacked(e, NULL);
				goto parse_err;
			} else if (ret == 3) {
				e->has_dev = false;
				e->has_inode = false;
				e->has_pos = false;
			} else if (ret == 6) {
				e->has_dev = true;
				e->has_inode = true;
				e->has_pos = true;
			} else if (ret < 6) {
				eventpoll_tfd_entry__free_unpacked(e, NULL);
				goto parse_err;
			}

			/* Do not leak the entry when the array cannot be grown. */
			if (xrealloc_safe(&epfe->tfd, (epfe->n_tfd + 1) * sizeof(EventpollTfdEntry *))) {
				eventpoll_tfd_entry__free_unpacked(e, NULL);
				goto out;
			}

			epfe->tfd[epfe->n_tfd++] = e;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "sigmask")) {
			SignalfdEntry *sfd = arg;

			if (type != FD_TYPES__SIGNALFD)
				goto parse_err;
			ret = sscanf(str, "sigmask: %llx", (unsigned long long *)&sfd->sigmask);
			if (ret != 1)
				goto parse_err;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "fanotify flags")) {
			FanotifyFileEntry *fe = arg;

			if (type != FD_TYPES__FANOTIFY)
				goto parse_err;

			ret = sscanf(str, "fanotify flags:%x event-flags:%x", &fe->faflags, &fe->evflags);
			if (ret != 2)
				goto parse_err;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "fanotify ino")) {
			void *buf, *ob;
			FanotifyFileEntry *fe = arg;
			FanotifyMarkEntry *me;
			int hoff = 0;

			if (type != FD_TYPES__FANOTIFY)
				goto parse_err;

			/* One allocation carved up below; ob remembers its start for freeing. */
			ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyInodeMarkEntry) +
					   sizeof(FhEntry) + FH_ENTRY_SIZES__min_entries * sizeof(uint64_t));
			if (!buf)
				goto out;

			me = xptr_pull(&buf, FanotifyMarkEntry);
			fanotify_mark_entry__init(me);
			me->ie = xptr_pull(&buf, FanotifyInodeMarkEntry);
			fanotify_inode_mark_entry__init(me->ie);
			me->ie->f_handle = xptr_pull(&buf, FhEntry);
			fh_entry__init(me->ie->f_handle);
			me->ie->f_handle->n_handle = FH_ENTRY_SIZES__min_entries;
			me->ie->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t));

			ret = sscanf(str,
				     "fanotify ino:%" PRIx64 " sdev:%x mflags:%x mask:%x ignored_mask:%x "
				     "fhandle-bytes:%x fhandle-type:%x f_handle: %n",
				     &me->ie->i_ino, &me->s_dev, &me->mflags, &me->mask, &me->ignored_mask,
				     &me->ie->f_handle->bytes, &me->ie->f_handle->type, &hoff);
			/* hoff stays 0 when the scan stopped before reaching %n. */
			if (ret != 7 || hoff == 0) {
				xfree(ob);
				goto parse_err;
			}

			parse_fhandle_encoded(str + hoff, me->ie->f_handle);
			me->type = MARK_TYPE__INODE;

			if (xrealloc_safe(&fe->mark, (fe->n_mark + 1) * sizeof(FanotifyMarkEntry *))) {
				xfree(ob);
				goto out;
			}

			fe->mark[fe->n_mark++] = me;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "fanotify mnt_id")) {
			void *buf, *ob;
			FanotifyFileEntry *fe = arg;
			FanotifyMarkEntry *me;

			if (type != FD_TYPES__FANOTIFY)
				goto parse_err;

			ob = buf = xmalloc(sizeof(FanotifyMarkEntry) + sizeof(FanotifyMountMarkEntry));
			if (!buf)
				goto out;

			me = xptr_pull(&buf, FanotifyMarkEntry);
			fanotify_mark_entry__init(me);
			me->me = xptr_pull(&buf, FanotifyMountMarkEntry);
			fanotify_mount_mark_entry__init(me->me);

			ret = sscanf(str, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x", &me->me->mnt_id,
				     &me->mflags, &me->mask, &me->ignored_mask);
			if (ret != 4) {
				xfree(ob);
				goto parse_err;
			}

			me->type = MARK_TYPE__MOUNT;

			if (xrealloc_safe(&fe->mark, (fe->n_mark + 1) * sizeof(FanotifyMarkEntry *))) {
				xfree(ob);
				goto out;
			}

			fe->mark[fe->n_mark++] = me;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "inotify wd")) {
			void *buf, *ob;
			InotifyFileEntry *ie = arg;
			InotifyWdEntry *ify;
			int hoff = 0;

			if (type != FD_TYPES__INOTIFY)
				goto parse_err;

			ob = buf = xmalloc(sizeof(InotifyWdEntry) + sizeof(FhEntry) +
					   FH_ENTRY_SIZES__min_entries * sizeof(uint64_t));
			if (!buf)
				goto out;

			ify = xptr_pull(&buf, InotifyWdEntry);
			inotify_wd_entry__init(ify);
			ify->f_handle = xptr_pull(&buf, FhEntry);
			fh_entry__init(ify->f_handle);
			ify->f_handle->n_handle = FH_ENTRY_SIZES__min_entries;
			ify->f_handle->handle = xptr_pull_s(&buf, FH_ENTRY_SIZES__min_entries * sizeof(uint64_t));

			ret = sscanf(str,
				     "inotify wd:%x ino:%" PRIx64 " sdev:%x "
				     "mask:%x ignored_mask:%x "
				     "fhandle-bytes:%x fhandle-type:%x "
				     "f_handle: %n",
				     &ify->wd, &ify->i_ino, &ify->s_dev, &ify->mask, &ify->ignored_mask,
				     &ify->f_handle->bytes, &ify->f_handle->type, &hoff);
			/*
			 * Seven conversions may succeed while the trailing
			 * "f_handle: " literal does not match; %n is then never
			 * stored, so hoff must be checked just like in the
			 * fanotify inode branch.
			 */
			if (ret != 7 || hoff == 0) {
				xfree(ob);
				goto parse_err;
			}

			parse_fhandle_encoded(str + hoff, ify->f_handle);

			if (xrealloc_safe(&ie->wd, (ie->n_wd + 1) * sizeof(InotifyWdEntry *))) {
				xfree(ob);
				goto out;
			}

			ie->wd[ie->n_wd++] = ify;
			entry_met = true;
			continue;
		}

		if (fdinfo_field(str, "map_type")) {
			BpfmapFileEntry *bpf = arg;

			if (type != FD_TYPES__BPFMAP)
				goto parse_err;
			ret = parse_bpfmap(&f, str, bpf);
			if (ret)
				goto parse_err;
			entry_met = true;
			continue;
		}
	}

	exit_code = 0;
	if (entry_met)
		goto out;
	/*
	 * An eventpoll/inotify file may have no target fds set thus
	 * resulting in no tfd: lines in proc. This is normal.
	 */
	if (type == FD_TYPES__EVENTPOLL || type == FD_TYPES__INOTIFY)
		goto out;

	pr_err("No records of type %d found in fdinfo file\n", type);
parse_err:
	exit_code = -1;
	/* str is NULL when we arrive here after running out of lines. */
	pr_perror("%s: error parsing [%s] for %d", __func__, str ? str : "(null)", type);
out:
	bclose(&f);
	return exit_code;
}
/*
 * Public entry point: parse the fdinfo file of descriptor @fd in task @pid.
 * All arguments are passed through to parse_fdinfo_pid_s() and its result is
 * returned unchanged.
 */
int parse_fdinfo_pid(int pid, int fd, int type, void *arg)
{
	int rc = parse_fdinfo_pid_s(pid, fd, type, arg);

	return rc;
}
/*
 * Same as parse_fdinfo_pid(), but for a descriptor of the calling task:
 * PROC_SELF is passed as the pid.
 */
int parse_fdinfo(int fd, int type, void *arg)
{
	int rc = parse_fdinfo_pid_s(PROC_SELF, fd, type, arg);

	return rc;
}
int get_fd_mntid(int fd, int *mnt_id)
{
struct fdinfo_common fdinfo = { .mnt_id = -1 };
if (parse_fdinfo(fd, FD_TYPES__UND, &fdinfo))
return -1;
*mnt_id = fdinfo.mnt_id;
return 0;
}
/*
 * Parse one file lock description into @fl.
 *
 * @buf:        the text to parse; "<id>:<kind> <type> <option> <owner>
 *              <maj>:<min>:<ino> <start> <end>", with an extra " -> " after
 *              the id when @is_blocked is set.
 * @fl:         lock to fill. fl_kind is assigned; fl_ltype only has bits
 *              OR-ed in, so it is expected to arrive initialised.
 * @is_blocked: selects the format that contains the "->" marker.
 *
 * An unrecognised lock kind is reported as FL_UNKNOWN rather than as an
 * error; an unrecognised lock option is an error.
 *
 * Returns 0 on success, -1 otherwise.
 */
static int parse_file_lock_buf(char *buf, struct file_lock *fl, bool is_blocked)
{
	int num;
	char fl_flag[10], fl_type[15], fl_option[10];

	/*
	 * The three word conversions carry explicit widths (array size minus
	 * one) so that an unexpectedly long token cannot run past the local
	 * buffers.
	 * NOTE(review): fl->end is still scanned with a bare %s because its
	 * size is declared outside this block - confirm it is large enough.
	 */
	if (is_blocked) {
		num = sscanf(buf, "%lld: -> %9s %14s %9s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type,
			     fl_option, &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end);
	} else {
		num = sscanf(buf, "%lld:%9s %14s %9s %d %x:%x:%ld %lld %s", &fl->fl_id, fl_flag, fl_type, fl_option,
			     &fl->fl_owner, &fl->maj, &fl->min, &fl->i_no, &fl->start, fl->end);
	}

	if (num < 10) {
		pr_err("Invalid file lock info (%d): %s\n", num, buf);
		return -1;
	}

	if (!strcmp(fl_flag, "POSIX"))
		fl->fl_kind = FL_POSIX;
	else if (!strcmp(fl_flag, "FLOCK"))
		fl->fl_kind = FL_FLOCK;
	else if (!strcmp(fl_flag, "OFDLCK"))
		fl->fl_kind = FL_OFD;
	else if (!strcmp(fl_flag, "LEASE"))
		fl->fl_kind = FL_LEASE;
	else
		fl->fl_kind = FL_UNKNOWN;

	if (fl->fl_kind == FL_LEASE && !strcmp(fl_type, "BREAKING")) {
		fl->fl_ltype |= LEASE_BREAKING;
	}

	/* The option word is mapped to LOCK_* bits for MSNFS locks and to F_* lock types otherwise. */
	if (!strcmp(fl_type, "MSNFS")) {
		fl->fl_ltype |= LOCK_MAND;

		if (!strcmp(fl_option, "READ")) {
			fl->fl_ltype |= LOCK_READ;
		} else if (!strcmp(fl_option, "RW")) {
			fl->fl_ltype |= LOCK_RW;
		} else if (!strcmp(fl_option, "WRITE")) {
			fl->fl_ltype |= LOCK_WRITE;
		} else {
			pr_err("Unknown lock option!\n");
			return -1;
		}
	} else {
		if (!strcmp(fl_option, "UNLCK")) {
			fl->fl_ltype |= F_UNLCK;
		} else if (!strcmp(fl_option, "WRITE")) {
			fl->fl_ltype |= F_WRLCK;
		} else if (!strcmp(fl_option, "READ")) {
			fl->fl_ltype |= F_RDLCK;
		} else {
			pr_err("Unknown lock option!\n");
			return -1;
		}
	}

	return 0;
}
/* Tell whether pstree_item_by_real() finds an item for @pid. */
static bool pid_in_pstree(pid_t pid)
{
	if (pstree_item_by_real(pid) == NULL)
		return false;

	return true;
}
/*
 * Collect file locks from the global "locks" proc file onto file_lock_list.
 *
 * Nothing is done (and 0 is returned) when kdat.has_fdinfo_lock is set.
 * Blocked entries and POSIX locks whose owner is not in the process tree are
 * parsed but then discarded. A line that does not parse, or a lock of
 * unknown kind, fails the call.
 *
 * Returns 0 on success, -1 otherwise.
 */
int parse_file_locks(void)
{
	FILE *locks_fp;
	int rc = -1;

	if (kdat.has_fdinfo_lock)
		return 0;

	locks_fp = fopen_proc(PROC_GEN, "locks");
	if (locks_fp == NULL)
		return -1;

	while (fgets(buf, BUF_SIZE, locks_fp)) {
		bool blocked = strstr(buf, "->") != NULL;
		struct file_lock *lock;
		bool keep;

		lock = alloc_file_lock();
		if (lock == NULL) {
			pr_perror("Alloc file lock failed!");
			goto done;
		}

		if (parse_file_lock_buf(buf, lock, blocked)) {
			xfree(lock);
			goto done;
		}

		pr_info("lockinfo: %lld:%d %x %d %02x:%02x:%ld %lld %s\n", lock->fl_id, lock->fl_kind, lock->fl_ltype,
			lock->fl_owner, lock->maj, lock->min, lock->i_no, lock->start, lock->end);

		if (lock->fl_kind == FL_UNKNOWN) {
			pr_err("Unknown file lock: %s!\n", buf);
			xfree(lock);
			goto done;
		}

		if (blocked) {
			/*
			 * All target processes are stopped in this moment and
			 * can't wait any locks.
			 */
			pr_debug("Skip blocked processes\n");
			keep = false;
		} else if (lock->fl_kind == FL_POSIX && !pid_in_pstree(lock->fl_owner)) {
			/*
			 * We only care about tasks which are taken
			 * into dump, so we only collect file locks
			 * belong to these tasks.
			 */
			keep = false;
		} else {
			keep = true;
		}

		if (keep)
			list_add_tail(&lock->list, &file_lock_list);
		else
			xfree(lock);
	}

	rc = 0;
done:
	fclose(locks_fp);
	return rc;
}
void free_posix_timers(struct proc_posix_timers_stat *st)
{
while (!list_empty(&st->timers)) {
struct proc_posix_timer *timer;
timer = list_first_entry(&st->timers, struct proc_posix_timer, list);
list_del(&timer->list);
xfree(timer);
}
}
/*
 * Parse the "timers" proc file of @pid into @args.
 *
 * The file is read as records of four lines each ("ID:", "signal:",
 * "notify:", "ClockID:"). Every complete record becomes one
 * proc_posix_timer added to @args->timers, and @args->timer_n counts them.
 *
 * Returns 0 on success. On failure -1 is returned and the timers collected
 * so far are freed.
 */
int parse_posix_timers(pid_t pid, struct proc_posix_timers_stat *args)
{
	int exit_code = -1;
	int notify_tid = 0;
	int i = 0;
	struct bfd f;
	char *s;
	char sigpid[7];
	char tidpid[4];
	/*
	 * Filled from the "signal:" line and consumed when the "ClockID:" line
	 * arrives two iterations later, so it has to live outside the loop
	 * body: an object declared inside it becomes indeterminate on every
	 * iteration.
	 */
	char pbuf[17]; /* 16 + eol */
	struct proc_posix_timer *timer = NULL;

	INIT_LIST_HEAD(&args->timers);
	args->timer_n = 0;

	f.fd = open_proc(pid, "timers");
	if (f.fd < 0)
		return -1;

	if (bfdopenr(&f))
		return -1;

	while (1) {
		s = breadline(&f);
		if (!s)
			break;
		if (IS_ERR(s))
			goto err;

		/* i counts lines; i % 4 is the position inside the current record. */
		switch (i % 4) {
		case 0:
			timer = xzalloc(sizeof(struct proc_posix_timer));
			if (timer == NULL)
				goto err;

			if (sscanf(s, "ID: %ld", &timer->spt.it_id) != 1)
				goto err;
			break;
		case 1:
			if (sscanf(s, "signal: %d/%16s", &timer->spt.si_signo, pbuf) != 2)
				goto err;
			break;
		case 2:
			if (sscanf(s, "notify: %6[a-z]/%3[a-z].%d\n", sigpid, tidpid, &notify_tid) != 3)
				goto err;
			break;
		case 3:
			if (sscanf(s, "ClockID: %d\n", &timer->spt.clock_id) != 1)
				goto err;

			/* The pointer text is accepted either as %p output or as the literal "(null)". */
			timer->spt.sival_ptr = NULL;
			if (sscanf(pbuf, "%p", &timer->spt.sival_ptr) != 1 && strcmp(pbuf, "(null)")) {
				pr_err("Unable to parse '%s'\n", pbuf);
				goto err;
			}

			if (tidpid[0] == 't') {
				timer->spt.it_sigev_notify = SIGEV_THREAD_ID;
				timer->spt.notify_thread_id = notify_tid;
			} else {
				switch (sigpid[0]) {
				case 's':
					timer->spt.it_sigev_notify = SIGEV_SIGNAL;
					break;
				case 't':
					timer->spt.it_sigev_notify = SIGEV_THREAD;
					break;
				default:
					timer->spt.it_sigev_notify = SIGEV_NONE;
					break;
				}
			}

			/* The list owns the timer from here on. */
			list_add(&timer->list, &args->timers);
			timer = NULL;
			args->timer_n++;
			break;
		}
		i++;
	}

	/* Release a record that was started but cut short by the end of the file. */
	xfree(timer);

	exit_code = 0;
out:
	bclose(&f);
	return exit_code;
err:
	xfree(timer);
	free_posix_timers(args);
	pr_perror("Parse error in posix timers proc file!");
	goto out;
}
/*
 * List the threads of @pid from its "task" proc directory.
 *
 * Two modes, selected by *_t on entry:
 *  - *_t == NULL: an array is allocated and grown here; on success *_t and
 *    *_n receive the array and its length. New slots get ns[0].virt = -1.
 *  - *_t != NULL: the caller's array of *_n entries is refilled in place and
 *    the number of threads found must be exactly *_n (enforced by BUG_ON).
 *
 * In both modes every entry gets its ->real id and state TASK_THREAD.
 *
 * Returns 0 on success, -1 when the directory cannot be opened or memory
 * runs out.
 */
int parse_threads(int pid, struct pid **_t, int *_n)
{
	struct dirent *de;
	DIR *dir;
	struct pid *t = NULL;
	int nr = 1;

	if (*_t)
		t = *_t;

	dir = opendir_proc(pid, "task");
	if (!dir)
		return -1;

	while ((de = readdir(dir))) {
		struct pid *tmp;

		/* We expect numbers only here */
		if (de->d_name[0] == '.')
			continue;

		if (*_t == NULL) {
			tmp = xrealloc(t, nr * sizeof(struct pid));
			if (!tmp) {
				xfree(t);
				closedir(dir);
				return -1;
			}
			t = tmp;
			t[nr - 1].ns[0].virt = -1;
		} else {
			/*
			 * The caller's array has exactly *_n slots. Catch a
			 * surplus thread before writing past the end instead
			 * of only noticing it after the loop.
			 */
			BUG_ON(nr > *_n);
		}
		t[nr - 1].real = atoi(de->d_name);
		t[nr - 1].state = TASK_THREAD;
		nr++;
	}

	closedir(dir);

	if (*_t == NULL) {
		*_t = t;
		*_n = nr - 1;
	} else
		BUG_ON(nr - 1 != *_n);

	return 0;
}
int parse_cgroup_file(FILE *f, struct list_head *retl, unsigned int *n)
{
while (fgets(buf, BUF_SIZE, f)) {
struct cg_ctl *ncc, *cc;
char *name, *path = NULL, *e;
ncc = xmalloc(sizeof(*cc));
if (!ncc)
goto err;
/*
* Typical output (':' is a separator here)
*
* 4:cpu,cpuacct:/
* 3:cpuset:/
* 2:name=systemd:/user.slice/user-1000.slice/session-1.scope
*/
name = strchr(buf, ':');
if (!name) {
pr_err("Failed parsing cgroup %s\n", buf);
xfree(ncc);
goto err;
}
path = strchr(++name, ':');
if (!path) {
pr_err("Failed parsing cgroup %s\n", buf);
xfree(ncc);
goto err;
}
e = strchr(name, '\n');
*path++ = '\0';
if (e)
*e = '\0';
/*
* Controllers and their props might be
* configured the way some of them are
* not taken into the image for migration
* sake or container specifics.
*/
if (cgp_should_skip_controller(name)) {
pr_debug("cg-prop: Skipping controller %s\n", name);
xfree(ncc);
continue;
}
ncc->name = xstrdup(name);
ncc->path = xstrdup(path);
ncc->cgns_prefix = 0;
if (!ncc->name || !ncc->path) {
xfree(ncc->name);
xfree(ncc->path);
xfree(ncc);
goto err;
}
list_for_each_entry(cc, retl, l)
if (strcmp(cc->name, name) >= 0)
break;
list_add_tail(&ncc->l, &cc->l);
(*n)++;
}
return 0;
err:
put_ctls(retl);
return -1;
}
/*
 * Collect the cgroups of thread @tid of task @pid into @retl / @n and, when
 * @args is given, work out the cgroup namespace prefix of every controller.
 *
 * @args->contents is parsed as a second cgroup listing (the "internal"
 * view). For each controller present in both lists whose paths differ, the
 * internal path minus its leading '/' has to be a suffix of the external
 * path; the length of what precedes that suffix is stored in the external
 * entry's cgns_prefix.
 *
 * Returns 0 on success and -1 on failure.
 */
int parse_thread_cgroup(int pid, int tid, struct parasite_dump_cgroup_args *args, struct list_head *retl,
			unsigned int *n)
{
	FILE *f;
	int ret;
	LIST_HEAD(internal);
	unsigned int n_internal = 0;
	struct cg_ctl *intern, *ext;

	f = fopen_proc(pid, "task/%d/cgroup", tid);
	if (!f)
		return -1;

	ret = parse_cgroup_file(f, retl, n);
	fclose(f);
	if (ret < 0)
		return -1;

	/* No parasite args, we're dumping criu's cg set, so we don't need to
	 * try and parse the "internal" cgroup set to find namespace
	 * boundaries.
	 */
	if (!args)
		return 0;

	f = fmemopen(args->contents, strlen(args->contents), "r");
	if (!f) {
		pr_perror("couldn't fmemopen cgroup buffer %s", args->contents);
		return -1;
	}

	ret = parse_cgroup_file(f, &internal, &n_internal);
	fclose(f);
	if (ret < 0) {
		pr_err("couldn't parse internal cgroup file\n");
		return -1;
	}

	/* Here's where we actually compute the cgns prefix. Consider a task
	 * in /foo/bar which has unshared its namespace at /foo. The internal
	 * path is /bar, but the external path is /foo/bar, and the cgns
	 * prefix is /foo. The algorithm is:
	 *
	 * // no cg ns unshare in this case
	 * if (internal == external)
	 *         continue;
	 * idx = find_suffix_pos(external, internal)
	 * cgns_prefix = external[:idx]
	 */
	list_for_each_entry(intern, &internal, l) {
		list_for_each_entry(ext, retl, l) {
			const char *suffix;
			size_t ext_len, suffix_len;
			bool is_suffix = false;

			if (strcmp(ext->name, intern->name))
				continue;

			/* If the cgroup namespace was unshared at / (or there
			 * is no cgroup namespace relative to criu), the paths
			 * are equal and we don't need to set a prefix.
			 */
			if (!strcmp(ext->path, intern->path))
				continue;

			/*
			 * Only form the pointer into ext->path after checking
			 * that the suffix fits; otherwise it would point (and
			 * strcmp would read) before the start of the string.
			 * An empty internal path has no byte after its
			 * terminator and cannot match either.
			 */
			if (intern->path[0] != '\0') {
				/* +1 here to chop off the leading / */
				suffix = intern->path + 1;
				ext_len = strlen(ext->path);
				suffix_len = strlen(suffix);

				if (suffix_len <= ext_len && !strcmp(ext->path + ext_len - suffix_len, suffix)) {
					ext->cgns_prefix = ext_len - suffix_len;
					is_suffix = true;
				}
			}

			if (!is_suffix) {
				pr_err("invalid cgroup configuration, %s is not a suffix of %s\n", intern->path,
				       ext->path);
				ret = -1;
				goto out;
			}

			/* Drop the '/' that separates prefix and suffix; never index before the string. */
			if (ext->cgns_prefix > 0 && ext->path[ext->cgns_prefix - 1] == '/')
				ext->cgns_prefix--;
		}
	}

out:
	put_ctls(&internal);
	return ret;
}
void put_ctls(struct list_head *l)
{
struct cg_ctl *c, *n;
list_for_each_entry_safe(c, n, l, l) {
xfree(c->name);
xfree(c->path);
xfree(c);
}
INIT_LIST_HEAD(l);
}
/* Parse and create all the real controllers. This does not include things with
 * the "name=" prefix, e.g. systemd.
 *
 * The cgroup proc file of the current task is read line by line. The
 * comma-separated controllers of one line share a single cg_controller: the
 * first name creates it through new_controller() and appends it to @cgroups
 * (bumping *@n_cgroups), the remaining names are added to its ->controllers
 * array.
 *
 * Returns 0 on success, -1 otherwise. Controllers already appended to
 * @cgroups are left there on failure.
 */
int collect_controllers(struct list_head *cgroups, unsigned int *n_cgroups)
{
	int exit_code = -1;
	FILE *f;

	f = fopen_proc(PROC_SELF, "cgroup");
	if (f == NULL)
		return -1;

	while (fgets(buf, BUF_SIZE, f)) {
		struct cg_controller *nc = NULL;
		char *controllers, *off;

		/* The controllers field sits between the first and the second ':'. */
		controllers = strchr(buf, ':');
		if (!controllers) {
			pr_err("Unable to parse \"%s\"\n", buf);
			goto err;
		}
		controllers++;

		off = strchr(controllers, ':');
		if (!off) {
			pr_err("Unable to parse \"%s\"\n", buf);
			goto err;
		}
		*off = '\0';

		if (cgp_should_skip_controller(controllers)) {
			pr_debug("cg-prop: Skipping controller %s\n", controllers);
			continue;
		}

		while (1) {
			off = strchr(controllers, ',');
			if (off)
				*off = '\0';

			if (!strncmp("name=", controllers, 5))
				goto skip;

			if (!nc) {
				nc = new_controller(controllers);
				if (!nc)
					goto err;
				list_add_tail(&nc->l, cgroups);
				(*n_cgroups)++;
			} else {
				void *m;
				char *n;

				/*
				 * Grow the array and duplicate the name first and
				 * count the new slot only once it is filled, so
				 * that n_controllers never covers an unallocated
				 * or uninitialised entry when we bail out.
				 */
				m = xrealloc(nc->controllers, sizeof(char *) * (nc->n_controllers + 1));
				if (!m)
					goto err;

				nc->controllers = m;

				n = xstrdup(controllers);
				if (!n)
					goto err;

				nc->controllers[nc->n_controllers++] = n;
			}

skip:
			if (!off)
				break;
			controllers = off + 1;
		}
	}

	exit_code = 0;
err:
	fclose(f);
	return exit_code;
}
/*
 * Filesystem callback for OverlayFS mounts.
 *
 * If an OverlayFS mountpoint is found in the mountinfo table,
 * we enable opts.overlayfs, which is a workaround for the
 * OverlayFS Kernel bug.
 *
 * See fixup_overlayfs for details.
 *
 * @new itself is not inspected. Always returns 0.
 */
int overlayfs_parse(struct mount_info *new)
{
	opts.overlayfs = true;

	return 0;
}
/*
 * AUFS callback function to "fix up" the root pathname.
 * See sysfs_parse.c for details.
 *
 * Only the mount whose ns_mountpoint is "./" is acted upon: opts.aufs is
 * switched on and the result of parse_aufs_branches() is returned. Every
 * other mount yields 0 without side effects.
 */
int aufs_parse(struct mount_info *new)
{
	if (strcmp(new->ns_mountpoint, "./") != 0)
		return 0;

	opts.aufs = true;
	return parse_aufs_branches(new);
}
/*
 * Gather the pids listed in the "children" file of every thread of @pid.
 *
 * On success *_c receives a heap array with *_n pids (NULL when there are
 * none) and 0 is returned. On failure -1 is returned, the partial array is
 * freed and *_c / *_n are left untouched.
 */
int parse_children(pid_t pid, pid_t **_c, int *_n)
{
	struct dirent *thread_ent;
	pid_t *kids = NULL;
	int count = 0;
	struct bfd f;
	DIR *tasks;

	tasks = opendir_proc(pid, "task");
	if (tasks == NULL)
		return -1;

	while ((thread_ent = readdir(tasks)) != NULL) {
		if (dir_dots(thread_ent))
			continue;

		f.fd = open_proc(pid, "task/%s/children", thread_ent->d_name);
		if (f.fd < 0)
			goto fail;

		if (bfdopenr(&f))
			goto fail;

		/* The file is a list of pids separated by spaces. */
		for (;;) {
			char *token, *stop;
			pid_t child, *grown;

			token = breadchr(&f, ' ');
			if (IS_ERR(token))
				goto fail_close;
			if (token == NULL)
				break;

			child = strtol(token, &stop, 0);
			if (*stop != 0 && *stop != ' ') {
				pr_err("Unable to parse %s\n", stop);
				goto fail_close;
			}

			grown = xrealloc(kids, (count + 1) * sizeof(pid_t));
			if (!grown)
				goto fail_close;

			kids = grown;
			kids[count++] = child;
		}

		bclose(&f);
	}

	*_c = kids;
	*_n = count;

	closedir(tasks);
	return 0;

fail_close:
	bclose(&f);
fail:
	closedir(tasks);
	xfree(kids);
	return -1;
}
/* Number of centiseconds (the unit of the two fractional digits read below) per second. */
#define CSEC_PER_SEC 100

/*
 * Read the first value of /proc/uptime and store it, converted to
 * microseconds, in @upt.
 *
 * The value is scanned as "<seconds>.<two fractional digits>".
 *
 * Returns 0 on success and -1 when the file cannot be opened or parsed;
 * @upt is left untouched on failure.
 */
int parse_uptime(uint64_t *upt)
{
	unsigned long sec, csec;
	FILE *f;

	f = fopen("/proc/uptime", "r");
	if (!f) {
		pr_perror("Failed to fopen /proc/uptime");
		return -1;
	}

	if (fscanf(f, "%lu.%2lu", &sec, &csec) != 2) {
		pr_perror("Failed to parse /proc/uptime");
		fclose(f);
		return -1;
	}

	/*
	 * Widen before multiplying: where unsigned long is 32 bits wide the
	 * product would otherwise wrap before being stored in the 64-bit
	 * result.
	 */
	*upt = (uint64_t)sec * USEC_PER_SEC + (uint64_t)csec * (USEC_PER_SEC / CSEC_PER_SEC);

	fclose(f);
	return 0;
}