/*
 * criu/sysfs_parse.c
 *
 * Mirror of https://github.com/checkpoint-restore/criu
 */

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <errno.h>
#include <sys/types.h>
#include <dirent.h>
#include <sys/stat.h>
#include "cr_options.h"
#include "criu-log.h"
#include "xmalloc.h"
#include "files.h"
#include "proc_parse.h"
#include "util.h"
#include "sysfs_parse.h"
#include "namespaces.h"
/*
* Currently, there are two kernel problems dealing with AUFS
* filesystems. Until these problems are fixed in the kernel,
* we have AUFS support in CRIU to handle the following issues:
*
* 1) /proc/<pid>/mountinfo: The problem is that for AUFS the root field
* of the root entry is missing the pathname (it's only /). For example:
*
* 90 61 0:33 / / rw,relatime - aufs none rw,si=4476a910a24617e6
*
* To handle this issue, the user has to specify the root of the AUFS
* filesystem with the --root command line option.
*
* 2) /proc/<pid>/map_files: The symlinks are absolute pathnames of the
* corresponding *physical* files in the branch they exist. For example,
* for a Docker container using AUFS, a symlink would look like:
* 400000-489000 -> /var/lib/docker/aufs/diff/<LAYER_ID>/bin/<cmd>
*
* Therefore, when we use the link file descriptor vm_file_fd in
* dump_one_reg_file() to read the link, we get the file's physical
* absolute pathname which does not exist relative to the root of the
* mount namespace and even if we used its relative pathname, the dev:ino
* values would be different from the physical file's dev:ino causing the
* dump to fail.
*
* To handle this issue, we figure out the "correct" paths when parsing
* map_files and save it for later use. See fixup_aufs_vma_fd() for
* details.
*/
/*
 * Mount namespace whose AUFS branches are currently cached in
 * aufs_branches. Set by parse_aufs_branches() on success and reset
 * to NULL by free_aufs_branches().
 */
struct ns_id *aufs_nsid;
/*
 * NULL-terminated array of heap-allocated branch pathnames, indexed
 * by branch number. NULL when nothing is cached.
 */
static char **aufs_branches;
/*
 * Parse out and save the AUFS superblock info in the
 * given buffer.
 *
 * The "si=<hex>" option is looked up in mi->options and stored in
 * sbinfo as the string "si_<hex>", i.e. the name of the matching
 * directory under /sys/fs/aufs.
 *
 * @mi:     mount whose option string is searched
 * @sbinfo: output buffer
 * @len:    size of @sbinfo in bytes, including the terminating NUL
 *
 * Returns 0 on success, -1 if the option is missing or the buffer
 * is too small.
 */
static int parse_aufs_sbinfo(struct mount_info *mi, char *sbinfo, int len)
{
	const char *optstr = mi->options;
	char *cp;
	int n;

	/*
	 * Only accept "si=" at the start of an option (beginning of the
	 * string or right after a comma), so that an option which merely
	 * ends in "si=" is not mistaken for the superblock id.
	 */
	cp = strstr(optstr, "si=");
	while (cp && cp != optstr && cp[-1] != ',')
		cp = strstr(cp + 3, "si=");
	if (!cp) {
		pr_err("Cannot find sbinfo in option string %s\n", mi->options);
		return -1;
	}

	/* all ok, copy */
	if (len < 4) { /* 4 for "si_" */
		pr_err("Buffer of %d bytes too small for sbinfo\n", len);
		return -1;
	}
	strcpy(sbinfo, "si_");
	n = 3;
	sbinfo += n;
	cp += n;

	/* <ctype.h> functions require a value representable as unsigned char */
	while (isxdigit((unsigned char)*cp) && n < len) {
		*sbinfo++ = *cp++;
		n++;
	}
	/* n == len means there is no room left for the terminating NUL */
	if (n >= len) {
		pr_err("Sbinfo in options string %s too long\n", mi->options);
		return -1;
	}
	*sbinfo = '\0';
	return 0;
}
/*
 * If the specified path is in a branch, replace it
 * with pathname from root.
 *
 * @path: NUL-terminated pathname, rewritten in place on a match
 * @size: size of the buffer holding @path
 *
 * Returns -1 on error, 0 if the path is not inside any cached branch
 * (path is left untouched), otherwise the length of the rewritten
 * path. Note that a path equal to a branch directory itself is
 * rewritten to the empty string and also yields 0.
 */
static int fixup_aufs_path(char *path, int size)
{
	char rpath[PATH_MAX];
	int n;
	int blen = 0;

	if (aufs_branches == NULL) {
		pr_err("No aufs branches to search for %s\n", path);
		return -1;
	}

	for (n = 0; aufs_branches[n] != NULL; n++) {
		blen = strlen(aufs_branches[n]);
		if (strncmp(path, aufs_branches[n], blen))
			continue;
		/*
		 * A plain prefix match is not enough: the branch
		 * ".../diff/<ID>" must not claim a file that lives in
		 * ".../diff/<ID>-init". Require the prefix to end on a
		 * path component boundary (or the branch itself to end
		 * with a slash).
		 */
		if (path[blen] == '/' || path[blen] == '\0' ||
		    (blen > 0 && aufs_branches[n][blen - 1] == '/'))
			break;
	}

	if (aufs_branches[n] == NULL)
		return 0; /* not in a branch */

	/* Stage the suffix in rpath because source and target overlap. */
	n = snprintf(rpath, PATH_MAX, "%s", &path[blen]);
	if (n >= min(PATH_MAX, size)) {
		pr_err("Not enough space to replace %s\n", path);
		return -1;
	}

	pr_debug("Replacing %s with %s\n", path, rpath);
	strcpy(path, rpath);
	return n;
}
/*
 * Kernel stores pathnames to AUFS branches in the br<n> files in
 * the /sys/fs/aufs/si_<sbinfo> directory where <n> denotes a branch
 * number and <sbinfo> is a hexadecimal number in %lx format. For
 * example:
 *
 * $ cat /sys/fs/aufs/si_f598876b087ed883/br0
 * /path/to/branch0/directory=rw
 *
 * This function sets up an array of pointers to branch pathnames.
 *
 * The result is cached per mount namespace: when mi->nsid matches the
 * namespace of the previous successful call nothing is re-read, and
 * when it differs the previous cache is released first.
 *
 * Returns 0 on success and -1 on any error; on error no branch array
 * is left allocated.
 */
int parse_aufs_branches(struct mount_info *mi)
{
	char path[AUFSBR_PATH_LEN];
	char *cp;
	int n;
	int ret;
	int rd_err;
	unsigned int br_num;
	unsigned int br_max;
	DIR *dp;
	FILE *fp;
	struct dirent *de;

	pr_info("Collecting AUFS branch pathnames ...\n");

	if (mi->nsid == 0) {
		pr_err("No nsid to parse its aufs branches\n");
		return -1;
	}

	if (mi->nsid == aufs_nsid) {
		pr_debug("Using cached aufs branch paths for nsid %p\n", aufs_nsid);
		return 0;
	}

	if (aufs_nsid)
		free_aufs_branches();

	strcpy(path, SYSFS_AUFS); /* /sys/fs/aufs/ */
	if (parse_aufs_sbinfo(mi, &path[sizeof SYSFS_AUFS - 1], SBINFO_LEN) < 0)
		return -1;

	if ((dp = opendir(path)) == NULL) {
		pr_perror("Cannot opendir %s", path);
		return -1;
	}

	/*
	 * Find out how many branches we have.
	 *
	 * The readdir error status is kept apart from the sscanf result:
	 * sscanf returns EOF (-1) for an entry such as "br", and that
	 * must not be mistaken for a readdir failure.
	 */
	br_max = 0;
	rd_err = 0;
	while (1) {
		/* readdir returns NULL for both end-of-directory and error */
		errno = 0;
		if ((de = readdir(dp)) == NULL) {
			if (errno) {
				pr_perror("Cannot readdir %s", path);
				rd_err = -1;
			}
			break;
		}

		if (sscanf(de->d_name, "br%u", &br_num) == 1 && br_num > br_max)
			br_max = br_num;
	}
	closedir(dp);
	if (rd_err)
		return -1;

	/*
	 * Default AUFS maximum is 127, so 1000 should be plenty.
	 * If you increase the maximum to more than 3 digits,
	 * make sure to change AUFSBR_PATH_LEN accordingly.
	 */
	if (br_max > 999) {
		pr_err("Too many branches %u\n", br_max);
		return -1;
	}

	/*
	 * Allocate an array of pointers to branch pathnames to be read.
	 * Branches are indexed from 0 and we need a NULL pointer at the end.
	 */
	aufs_branches = xzalloc((br_max + 2) * sizeof (char *));
	if (!aufs_branches)
		return -1;

	/*
	 * Now read branch pathnames from the branch files.
	 */
	n = strlen(path);
	for (br_num = 0; br_num <= br_max; br_num++) {
		fp = NULL;

		ret = snprintf(&path[n], sizeof path - n, "/br%u", br_num);
		if (ret < 0 || (size_t)ret >= sizeof path - n) {
			pr_err("Buffer overrun creating path for branch %u\n", br_num);
			goto err;
		}

		if ((fp = fopen(path, "r")) == NULL) {
			pr_perror("Cannot fopen %s", path);
			goto err;
		}

		/* %ms makes fscanf allocate the buffer; it is freed with the array */
		if (fscanf(fp, "%ms=", &aufs_branches[br_num]) != 1 ||
		    aufs_branches[br_num] == NULL) {
			pr_perror("Parse error reading %s", path);
			goto err;
		}

		/*
		 * chop off the trailing "=..." stuff; search from the end
		 * so that a '=' inside the pathname itself is preserved
		 */
		if ((cp = strrchr(aufs_branches[br_num], '=')) == NULL) {
			pr_err("Bad format in branch pathname %s\n", aufs_branches[br_num]);
			goto err;
		}
		*cp = '\0';
		fclose(fp);

		/*
		 * Log branch information for external utilities that
		 * want to recreate the process's AUFS filesystem
		 * before calling criu restore.
		 *
		 * DO NOT CHANGE this format!
		 */
		pr_info("%s : %s\n", path, aufs_branches[br_num]);
	}

	aufs_nsid = mi->nsid;
	return 0;

err:
	if (fp)
		fclose(fp);
	free_aufs_branches();
	return -1;
}
/*
 * AUFS support to compensate for the kernel bug
 * exposing branch pathnames in map_files and providing
 * a wrong mnt_id value in /proc/<pid>/fdinfo/<fd>
 * (the reported mount may not exist in the task's mount
 * namespace, so looking it up fails).
 *
 * If the link points inside a branch, save the
 * relative pathname from the root of the mount
 * namespace as well as the full pathname from
 * global root (/) for later use in dump_filemap()
 * and parse_smaps().
 *
 * Returns -1 on error, 0 if the link is not inside a known AUFS
 * branch (vma is left untouched), and a positive path length when
 * the paths were saved in vma->aufs_rpath and vma->aufs_fpath.
 */
int fixup_aufs_vma_fd(struct vma_area *vma)
{
	char path[PATH_MAX];
	int len;

	/* path is built as "." + <path from mount root>, e.g. "./bin/cmd" */
	path[0] = '.';
	len = read_fd_link(vma->vm_file_fd, &path[1], sizeof path - 1);
	if (len < 0)
		return -1;

	len = fixup_aufs_path(&path[1], sizeof path - 1);
	if (len <= 0)
		return len;

	/* len + 2: leading '.' and the terminating NUL */
	vma->aufs_rpath = xmalloc(len + 2);
	if (!vma->aufs_rpath)
		return -1;
	strcpy(vma->aufs_rpath, path);

	if (opts.root) {
		size_t fsize = strlen(opts.root) + 1 + len + 1;

		vma->aufs_fpath = xmalloc(fsize);
		if (!vma->aufs_fpath)
			return -1;
		/* skip ./ in path */
		snprintf(vma->aufs_fpath, fsize, "%s/%s", opts.root, &path[2]);
	}

	/*
	 * Without a root there is no full pathname to stat. Fail with a
	 * clear message instead of handing NULL to pr_debug() and stat().
	 */
	if (!vma->aufs_fpath) {
		pr_err("No root specified to resolve AUFS path %s\n", vma->aufs_rpath);
		return -1;
	}

	pr_debug("Saved AUFS paths %s and %s\n", vma->aufs_rpath, vma->aufs_fpath);

	if (stat(vma->aufs_fpath, vma->vmst) < 0) {
		pr_perror("Failed stat on map %"PRIx64" (%s)",
			  vma->e->start, vma->aufs_fpath);
		return -1;
	}

	/* tell parse_smap() not to call get_fd_mntid() */
	vma->mnt_id = -1;
	return len;
}
/*
 * Release the cached branch pathnames, if any, and forget which
 * mount namespace they belonged to.
 */
void free_aufs_branches(void)
{
	char **br;

	if (aufs_branches != NULL) {
		/* the array is NULL-terminated */
		for (br = aufs_branches; *br != NULL; br++)
			xfree(*br);

		xfree(aufs_branches);
		aufs_branches = NULL;
	}

	aufs_nsid = NULL;
}