2
0
mirror of https://github.com/checkpoint-restore/criu synced 2025-08-22 09:58:09 +00:00

img-streamer: introduction of criu-image-streamer

This adds the ability to stream images with criu-image-streamer

The workflow is the following:
1) criu-image-streamer is started, and starts listening on a UNIX
   socket.
2) CRIU is started. img_streamer_init() is invoked, which connects to the
   socket. During dump/restore operations, instead of using local disk to
   open an image file, img_streamer_open() is called to provide a UNIX pipe
   that is sent over the UNIX socket.
3) Once the operation is done, img_streamer_finish() is called, and the
   UNIX socket is disconnected.

criu-image-streamer can be found at:
https://github.com/checkpoint-restore/criu-image-streamer

Signed-off-by: Nicolas Viennot <Nicolas.Viennot@twosigma.com>
This commit is contained in:
Nicolas Viennot 2020-05-21 17:40:17 +00:00 committed by Andrei Vagin
parent 51c3f8a908
commit 7d79a58f4d
22 changed files with 429 additions and 26 deletions

View File

@ -76,6 +76,11 @@ The following levels are available:
*-D*, *--images-dir* 'path'::
Use 'path' as a base directory where to look for sets of image files.
*--stream*::
dump/restore images using criu-image-streamer.
See https://github.com/checkpoint-restore/criu-image-streamer for detailed
usage.
*--prev-images-dir* 'path'::
Use 'path' as a parent directory where to look for sets of image files.
This option makes sense in case of incremental dumps.

View File

@ -29,6 +29,7 @@ obj-y += files-reg.o
obj-y += fsnotify.o
obj-y += image-desc.o
obj-y += image.o
obj-y += img-streamer.o
obj-y += ipc_ns.o
obj-y += irmap.o
obj-y += kcmp-ids.o

View File

@ -510,6 +510,7 @@ int parse_options(int argc, char **argv, bool *usage_error,
BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close),
{ "verbosity", optional_argument, 0, 'v' },
{ "ps-socket", required_argument, 0, 1091},
BOOL_OPT("stream", &opts.stream),
{ "config", required_argument, 0, 1089},
{ "no-default-config", no_argument, 0, 1090},
{ "tls-cacert", required_argument, 0, 1092},

View File

@ -82,6 +82,7 @@
#include "eventpoll.h"
#include "memfd.h"
#include "timens.h"
#include "img-streamer.h"
/*
* Architectures can overwrite this function to restore register sets that
@ -1759,6 +1760,7 @@ static int cr_dump_finish(int ret)
free_userns_maps();
close_service_fd(CR_PROC_FD_OFF);
close_image_dir();
if (ret) {
pr_err("Dumping FAILED.\n");

View File

@ -29,6 +29,7 @@
#include "cr_options.h"
#include "servicefd.h"
#include "image.h"
#include "img-streamer.h"
#include "util.h"
#include "util-pie.h"
#include "criu-log.h"
@ -2355,6 +2356,9 @@ skip_ns_bouncing:
pr_info("Restore finished successfully. Tasks resumed.\n");
write_stats(RESTORE_STATS);
/* This has the effect of dismissing the image streamer */
close_image_dir();
ret = run_scripts(ACT_POST_RESUME);
if (ret != 0)
pr_err("Post-resume script ret code %d\n", ret);

View File

@ -343,7 +343,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req)
if (req->parent_img)
SET_CHAR_OPTS(img_parent, req->parent_img);
if (open_image_dir(images_dir_path) < 0) {
/*
* Image streaming is not supported with CRIU's service feature as
* the streamer must be started for each dump/restore operation.
* It is unclear how to do that with RPC, so we punt for now.
* This explains why we provide the argument mode=-1 instead of
* O_RSTR or O_DUMP.
*/
if (open_image_dir(images_dir_path, -1) < 0) {
pr_perror("Can't open images directory");
goto err;
}

View File

@ -54,6 +54,20 @@ void flush_early_log_to_stderr(void)
flush_early_log_buffer(STDERR_FILENO);
}
/*
 * Map the CRIU command found at argv[optind] to the mode that is handed to
 * open_image_dir(): O_DUMP for "dump", "pre-dump" and "cpuinfo dump", O_RSTR
 * for "restore" and "cpuinfo restore", and -1 for every other command.
 *
 * "cpuinfo" takes its action in argv[optind + 1]. When the action (or the
 * command itself) is missing, the slot holds the NULL that terminates argv,
 * which must not be passed to strcmp(). Such invocations map to -1.
 */
static int image_dir_mode(char *argv[], int optind)
{
	const char *cmd = argv[optind];
	const char *action;

	if (!cmd)
		return -1;

	if (!strcmp(cmd, "dump") || !strcmp(cmd, "pre-dump"))
		return O_DUMP;
	if (!strcmp(cmd, "restore"))
		return O_RSTR;

	/* Only "cpuinfo" is left as a command that may touch images */
	if (strcmp(cmd, "cpuinfo"))
		return -1;

	action = argv[optind + 1];
	if (!action)
		return -1;

	if (!strcmp(action, "dump"))
		return O_DUMP;
	if (!strcmp(action, "restore"))
		return O_RSTR;
	return -1;
}
int main(int argc, char *argv[], char *envp[])
{
int ret = -1;
@ -148,13 +162,30 @@ int main(int argc, char *argv[], char *envp[])
}
}
if (opts.stream && image_dir_mode(argv, optind) == -1) {
pr_err("--stream cannot be used with the %s command\n", argv[optind]);
goto usage;
}
/* We must not open imgs dir, if service is called */
if (strcmp(argv[optind], "service")) {
ret = open_image_dir(opts.imgs_dir);
ret = open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind));
if (ret < 0)
return 1;
}
/*
* The kernel might send us lethal signals when writing to a pipe
* whose reader has disappeared. We deal with write() failures on our
* own, and prefer not to get killed. So we ignore SIGPIPEs.
*
* Pipes are used in various places:
* 1) Receiving application page data
* 2) Transmitting data to the image streamer
* 3) Emitting logs (potentially to a pipe).
*/
signal(SIGPIPE, SIG_IGN);
/*
* When a process group becomes an orphan,
* its processes are sent a SIGHUP signal
@ -322,6 +353,7 @@ usage:
" this requires running a second instance of criu\n"
" in lazy-pages mode: 'criu lazy-pages -D DIR'\n"
" --lazy-pages and lazy-pages mode require userfaultfd\n"
" --stream dump/restore images using criu-image-streamer\n"
"\n"
"* External resources support:\n"
" --external RES dump objects from this list as external resources:\n"

View File

@ -154,7 +154,6 @@ static int trim_last_parent(char *path)
static int copy_chunk_from_file(int fd, int img, off_t off, size_t len)
{
char *buf = NULL;
int ret;
while (len > 0) {
@ -167,7 +166,6 @@ static int copy_chunk_from_file(int fd, int img, off_t off, size_t len)
len -= ret;
}
xfree(buf);
return 0;
}
@ -213,7 +211,6 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size)
static int copy_chunk_to_file(int img, int fd, off_t off, size_t len)
{
char *buf = NULL;
int ret;
while (len > 0) {
@ -221,7 +218,11 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len)
pr_perror("Can't seek file");
return -1;
}
ret = sendfile(fd, img, NULL, len);
if (opts.stream)
ret = splice(img, NULL, fd, NULL, len, SPLICE_F_MOVE);
else
ret = sendfile(fd, img, NULL, len);
if (ret < 0) {
pr_perror("Can't send data");
return -1;
@ -231,7 +232,6 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len)
len -= ret;
}
xfree(buf);
return 0;
}

View File

@ -17,6 +17,7 @@
#include "images/inventory.pb-c.h"
#include "images/pagemap.pb-c.h"
#include "proc_parse.h"
#include "img-streamer.h"
#include "namespaces.h"
bool ns_per_id = false;
@ -415,13 +416,16 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of
flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL);
/*
* For pages images dedup we need to open images read-write on
* restore, that may require proper capabilities, so we ask
* usernsd to do it for us
*/
if (root_ns_mask & CLONE_NEWUSER &&
type == CR_FD_PAGES && oflags & O_RDWR) {
if (opts.stream && !(oflags & O_FORCE_LOCAL)) {
ret = img_streamer_open(path, flags);
errno = EIO; /* errno value is meaningless, only the ret value is meaningful */
} else if (root_ns_mask & CLONE_NEWUSER &&
type == CR_FD_PAGES && oflags & O_RDWR) {
/*
* For pages images dedup we need to open images read-write on
* restore, that may require proper capabilities, so we ask
* usernsd to do it for us
*/
struct openat_args pa = {
.flags = flags,
.err = 0,
@ -520,7 +524,12 @@ struct cr_img *img_from_fd(int fd)
return img;
}
int open_image_dir(char *dir)
/*
* `mode` should be O_RSTR or O_DUMP depending on the intent.
* This is used when opts.stream is enabled for picking the right streamer
* socket name. `mode` is ignored when opts.stream is not enabled.
*/
int open_image_dir(char *dir, int mode)
{
int fd, ret;
@ -535,7 +544,10 @@ int open_image_dir(char *dir)
return -1;
fd = ret;
if (opts.img_parent) {
if (opts.stream) {
if (img_streamer_init(dir, mode) < 0)
goto err;
} else if (opts.img_parent) {
ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK);
if (ret < 0 && errno != EEXIST) {
pr_perror("Can't link parent snapshot");
@ -556,6 +568,8 @@ err:
/*
 * Release the images directory. When streaming is enabled, the image
 * streamer is dismissed first.
 */
void close_image_dir(void)
{
	if (opts.stream) {
		img_streamer_finish();
	}

	close_service_fd(IMG_FD_OFF);
}

232
criu/img-streamer.c Normal file
View File

@ -0,0 +1,232 @@
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>
#include <stdio.h>
#include "cr_options.h"
#include "img-streamer.h"
#include "image.h"
#include "images/img-streamer.pb-c.h"
#include "protobuf.h"
#include "servicefd.h"
#include "rst-malloc.h"
#include "common/scm.h"
#include "common/lock.h"
/*
* We use different path names for the dump and restore sockets because:
* 1) The user may want to perform both at the same time (akin to live
* migration). Specifying the same images-dir is convenient.
* 2) It fails quickly when the user mixes up the streamer and CRIU operations.
* (e.g., streamer is in capture mode, while CRIU is in restore mode).
*/
#define IMG_STREAMER_CAPTURE_SOCKET_NAME "streamer-capture.sock"
#define IMG_STREAMER_SERVE_SOCKET_NAME "streamer-serve.sock"
/* All requests go through the same socket connection. We must synchronize */
static mutex_t *img_streamer_fd_lock;
/* Either O_DUMP or O_RSTR */
static int img_streamer_mode;
/*
 * Return the file name of the streamer socket matching `mode`
 * (O_DUMP or O_RSTR). Any other value is a bug.
 */
static const char *socket_name_for_mode(int mode)
{
	if (mode == O_DUMP)
		return IMG_STREAMER_CAPTURE_SOCKET_NAME;

	if (mode == O_RSTR)
		return IMG_STREAMER_SERVE_SOCKET_NAME;

	BUG();
	return NULL;
}
/*
 * img_streamer_init() connects to the image streamer socket, which is
 * expected at "<image_dir>/<socket name for mode>".
 * mode should be either O_DUMP or O_RSTR.
 *
 * On success the connected socket is installed as the IMG_STREAMER_FD_OFF
 * service fd and 0 is returned. On any failure -1 is returned.
 */
int img_streamer_init(const char *image_dir, int mode)
{
	struct sockaddr_un addr;
	const char *sock_name;
	int sockfd, len;

	img_streamer_mode = mode;

	sockfd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (sockfd < 0) {
		pr_perror("Unable to instantiate UNIX socket");
		return -1;
	}

	memset(&addr, 0, sizeof(addr));
	addr.sun_family = AF_UNIX;

	/*
	 * sun_path is a small fixed-size buffer. A truncated path would make
	 * us connect to the wrong place (or fail with a misleading error), so
	 * reject paths that do not fit.
	 */
	sock_name = socket_name_for_mode(mode);
	len = snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s",
		       image_dir, sock_name);
	if (len < 0 || (size_t)len >= sizeof(addr.sun_path)) {
		pr_err("Image streamer socket path is too long: %s/%s\n",
		       image_dir, sock_name);
		goto err;
	}

	if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		pr_perror("Unable to connect to image streamer socket: %s", addr.sun_path);
		goto err;
	}

	/* The lock serializes requests issued over the single connection */
	img_streamer_fd_lock = shmalloc(sizeof(*img_streamer_fd_lock));
	if (!img_streamer_fd_lock) {
		pr_err("Failed to allocate memory\n");
		goto err;
	}
	mutex_init(img_streamer_fd_lock);

	if (install_service_fd(IMG_STREAMER_FD_OFF, sockfd) < 0)
		goto err;

	return 0;

err:
	close(sockfd);
	return -1;
}
/*
 * img_streamer_finish() indicates that no more files will be opened.
 * In other words, img_streamer_open() will no longer be called.
 * It is a no-op when the streamer service fd is not installed.
 */
void img_streamer_finish(void)
{
	if (get_service_fd(IMG_STREAMER_FD_OFF) < 0)
		return;

	pr_info("Dismissing the image streamer\n");
	close_service_fd(IMG_STREAMER_FD_OFF);
}
/*
* The regular protobuf APIs pb_write_one() and pb_read_one() operate over a
* `struct cr_img` object. Sadly, we don't have such object. We just have a
* file descriptor. The following pb_write_one_fd() and pb_read_one_fd()
* provide a protobuf API over a file descriptor. The implementation is a bit
* of a hack, but should be fine. At some point we can revisit to have a
* proper protobuf API over fds.
*/
static int pb_write_one_fd(int fd, void *obj, int type)
{
int ret;
struct cr_img img;
memset(&img, 0, sizeof(img));
img._x.fd = fd;
ret = pb_write_one(&img, obj, type);
if (ret < 0)
pr_perror("Failed to communicate with the image streamer");
return ret;
}
static int pb_read_one_fd(int fd, void **pobj, int type)
{
int ret;
struct cr_img img;
memset(&img, 0, sizeof(img));
img._x.fd = fd;
ret = pb_read_one(&img, pobj, type);
if (ret < 0)
pr_perror("Failed to communicate with the image streamer");
return ret;
}
/*
 * Send the streamer a request entry naming the image file.
 * Returns the result of the protobuf write.
 */
static int send_file_request(char *filename)
{
	ImgStreamerRequestEntry request = IMG_STREAMER_REQUEST_ENTRY__INIT;
	int sk = get_service_fd(IMG_STREAMER_FD_OFF);

	request.filename = filename;

	return pb_write_one_fd(sk, &request, PB_IMG_STREAMER_REQUEST);
}
/*
 * Read the streamer's reply entry and store its `exists` field in *exists.
 * Returns 0 on success, or the negative value reported by the protobuf read.
 */
static int recv_file_reply(bool *exists)
{
	ImgStreamerReplyEntry *reply;
	int sk = get_service_fd(IMG_STREAMER_FD_OFF);
	int err;

	err = pb_read_one_fd(sk, (void **)&reply, PB_IMG_STREAMER_REPLY);
	if (err < 0)
		return err;

	*exists = reply->exists;
	free(reply);

	return 0;
}
/*
* Using a pipe for image file transfers allows the data to be spliced by the
* image streamer, greatly improving performance.
* Transfer rates of up to 15GB/s can be seen with this technique.
*/
#define READ_PIPE 0 /* index of the read pipe returned by pipe() */
#define WRITE_PIPE 1
/*
 * Create a pipe, pass one end to the image streamer over the streamer
 * socket, and return the end CRIU keeps: the write end when dumping, the
 * read end otherwise. Returns -1 on failure.
 */
static int establish_streamer_file_pipe(void)
{
	/*
	 * If the other end of the pipe closes, the kernel will want to kill
	 * us with a SIGPIPE. This signal must be ignored, which we do in
	 * crtools.c:main() with signal(SIGPIPE, SIG_IGN).
	 */
	int pipe_ends[2];
	int ours, theirs;
	int result;

	if (img_streamer_mode == O_DUMP) {
		ours = WRITE_PIPE;
		theirs = READ_PIPE;
	} else {
		ours = READ_PIPE;
		theirs = WRITE_PIPE;
	}

	if (pipe(pipe_ends) < 0) {
		pr_perror("Unable to create pipe");
		return -1;
	}

	if (send_fd(get_service_fd(IMG_STREAMER_FD_OFF),
		    NULL, 0, pipe_ends[theirs]) < 0) {
		close(pipe_ends[ours]);
		result = -1;
	} else {
		result = pipe_ends[ours];
	}

	/* Our copy of the streamer's end is closed whether or not it was sent */
	close(pipe_ends[theirs]);

	return result;
}
/*
 * Request `filename` from the streamer and set up the transfer pipe.
 * Must be called with img_streamer_fd_lock held, as all requests share one
 * socket connection.
 *
 * Returns a pipe fd on success, -ENOENT when restoring and the streamer
 * reports that the file does not exist, -1 on any other error.
 */
static int _img_streamer_open(char *filename)
{
	if (send_file_request(filename) < 0)
		return -1;

	/* Only on restore does the streamer reply whether the file exists */
	if (img_streamer_mode == O_RSTR) {
		bool found;
		int err = recv_file_reply(&found);

		if (err < 0)
			return -1;

		if (!found)
			return -ENOENT;
	}

	/*
	 * When the image streamer encounters a fatal error, it won't report
	 * errors via protobufs. Instead, CRIU will get a broken pipe error
	 * when trying to access a streaming pipe. This behavior is similar to
	 * what would happen if we were connecting criu and
	 * criu-image-streamer via a shell pipe.
	 */
	return establish_streamer_file_pipe();
}
/*
 * Opens an image file via a UNIX pipe with the image streamer.
 * `flags` must match the mode that was given to img_streamer_init().
 * The request is performed under img_streamer_fd_lock.
 *
 * Return:
 * A file descriptor on success
 * -ENOENT when the file was not found.
 * -1 on any other error.
 */
int img_streamer_open(char *filename, int flags)
{
	int pipe_fd;

	BUG_ON(flags != img_streamer_mode);

	mutex_lock(img_streamer_fd_lock);
	pipe_fd = _img_streamer_open(filename);
	mutex_unlock(img_streamer_fd_lock);

	return pipe_fd;
}

View File

@ -143,6 +143,7 @@ struct cr_options {
int weak_sysctls;
int status_fd;
bool orphan_pts_master;
int stream;
pid_t tree_id;
int log_level;
char *imgs_dir;

View File

@ -145,7 +145,7 @@ static inline int img_raw_fd(struct cr_img *img)
extern off_t img_raw_size(struct cr_img *img);
extern int open_image_dir(char *dir);
extern int open_image_dir(char *dir, int mode);
extern void close_image_dir(void);
extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...);

View File

@ -0,0 +1,8 @@
/* Interface between CRIU and criu-image-streamer. */
#ifndef IMAGE_STREAMER_H
#define IMAGE_STREAMER_H
/*
 * Connect to the image streamer UNIX socket located in image_dir.
 * mode is either O_DUMP or O_RSTR. Returns 0 on success, -1 on failure.
 */
extern int img_streamer_init(const char *image_dir, int mode);
/* Close the connection to the image streamer, if one is open. */
extern void img_streamer_finish(void);
/*
 * Open an image file through a pipe shared with the image streamer.
 * flags must equal the mode passed to img_streamer_init().
 * Returns a file descriptor on success, -ENOENT when the file was not
 * found, -1 on any other error.
 */
extern int img_streamer_open(char *filename, int flags);
#endif /* IMAGE_STREAMER_H */

View File

@ -62,8 +62,10 @@ enum {
PB_GHOST_CHUNK,
PB_FILE,
PB_MEMFD_FILE,
PB_MEMFD_INODE, /* 60 */
PB_MEMFD_INODE,
PB_TIMENS,
PB_IMG_STREAMER_REQUEST,
PB_IMG_STREAMER_REPLY,
/* PB_AUTOGEN_STOP */

View File

@ -14,6 +14,7 @@ enum sfd_type {
LOG_FD_OFF,
IMG_FD_OFF,
IMG_STREAMER_FD_OFF,
PROC_FD_OFF, /* fd with /proc for all proc_ calls */
PROC_PID_FD_OFF,
CR_PROC_FD_OFF, /* some other's proc fd:

View File

@ -1406,9 +1406,9 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta)
/*
* We optimize the case when rsti(t)->vma_io is empty.
*
* This is useful for for remote images, where all VMAs are premapped
* (pr->pieok is false). This avoids re-opening the CR_FD_PAGES file,
* which could be no longer be available.
* This is useful when using the image streamer, where all VMAs are
* premapped (pr->pieok is false). This avoids re-opening the
* CR_FD_PAGES file, which may be readable only once.
*/
if (list_empty(&rsti(t)->vma_io)) {
ta->vma_ios = NULL;

View File

@ -382,6 +382,10 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo
int pfd;
int pr_flags = (fd_type == CR_FD_PAGEMAP) ? PR_TASK : PR_SHMEM;
/* Image streaming lacks support for incremental images */
if (opts.stream)
goto out;
pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
if (pfd < 0 && errno == ENOENT)
goto out;
@ -928,6 +932,10 @@ int check_parent_local_xfer(int fd_type, unsigned long img_id)
struct stat st;
int ret, pfd;
/* Image streaming lacks support for incremental images */
if (opts.stream)
return 0;
pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY);
if (pfd < 0 && errno == ENOENT)
return 0;

View File

@ -406,6 +406,49 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr,
return ret;
}
/*
 * We cannot use maybe_read_page_local() for streaming images as it uses
 * pread(), seeking in the file. Instead, we use this custom page reader,
 * which consumes the pages image sequentially with read().
 *
 * Reads `nr` pages into `buf`, looping over short reads. Once all the data
 * has been read, pr->io_complete() is invoked if it is set.
 *
 * Returns 0 on success (or the value returned by pr->io_complete()), and -1
 * on a read error or a premature EOF.
 */
static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr,
					int nr, void *buf, unsigned flags)
{
	unsigned long len = nr * PAGE_SIZE;
	int fd = img_raw_fd(pr->pi);
	int ret;
	size_t curr = 0;

	pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n",
		 pr->img_id, pr->id, pr->cvaddr, pr->pi_off);

	/* We can't seek. The requested address better match */
	BUG_ON(pr->cvaddr != vaddr);

	while (1) {
		ret = read(fd, buf + curr, len - curr);
		if (ret == 0) {
			pr_err("Reached EOF unexpectedly while reading page from image\n");
			return -1;
		} else if (ret < 0) {
			pr_perror("Can't read mapping page %d", ret);
			return -1;
		}
		curr += ret;
		if (curr == len)
			break;
	}

	/*
	 * At this point ret holds the byte count of the last read(), which is
	 * always positive. Reset it so that success is reported as 0 and the
	 * completion callback below actually runs.
	 */
	ret = 0;

	if (opts.auto_dedup)
		pr_warn_once("Can't dedup when streaming images\n");

	if (pr->io_complete)
		ret = pr->io_complete(pr, vaddr, nr);

	pr->pi_off += len;

	return ret;
}
static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv)
{
int ret = 0;
@ -601,6 +644,10 @@ static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int
int pfd, ret;
struct page_read *parent = NULL;
/* Image streaming lacks support for incremental images */
if (opts.stream)
goto out;
pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY);
if (pfd < 0 && errno == ENOENT)
goto out;
@ -657,7 +704,19 @@ static int init_pagemaps(struct page_read *pr)
off_t fsize;
int nr_pmes, nr_realloc;
fsize = img_raw_size(pr->pmi);
if (opts.stream) {
/*
* TODO - There is no easy way to estimate the size of the
* pagemap that is still to be read from the pipe. Possible
* solution is to ask the image streamer for the size of the
* image. 1024 is a wild guess (more space is allocated if
* needed).
*/
fsize = 1024;
} else {
fsize = img_raw_size(pr->pmi);
}
if (fsize < 0)
return -1;
@ -781,6 +840,8 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p
if (remote)
pr->maybe_read_page = maybe_read_page_remote;
else if (opts.stream)
pr->maybe_read_page = maybe_read_page_img_streamer;
else {
pr->maybe_read_page = maybe_read_page_local;
if (!pr->parent && !opts.lazy_pages)

View File

@ -63,6 +63,7 @@
#include "images/seccomp.pb-c.h"
#include "images/binfmt-misc.pb-c.h"
#include "images/autofs.pb-c.h"
#include "images/img-streamer.pb-c.h"
struct cr_pb_message_desc cr_pb_descs[PB_MAX];

View File

@ -423,13 +423,19 @@ int copy_file(int fd_in, int fd_out, size_t bytes)
{
ssize_t written = 0;
size_t chunk = bytes ? bytes : 4096;
ssize_t ret;
while (1) {
ssize_t ret;
ret = sendfile(fd_out, fd_in, NULL, chunk);
/*
* When fd_out is a pipe, sendfile() returns -EINVAL, so we
* fall back to splice(). Not sure why.
*/
if (opts.stream)
ret = splice(fd_in, NULL, fd_out, NULL, chunk, SPLICE_F_MOVE);
else
ret = sendfile(fd_out, fd_in, NULL, chunk);
if (ret < 0) {
pr_perror("Can't send data to ghost file");
pr_perror("Can't transfer data to ghost file from image");
return -1;
}

View File

@ -65,6 +65,7 @@ proto-obj-y += macvlan.o
proto-obj-y += sit.o
proto-obj-y += memfd.o
proto-obj-y += timens.o
proto-obj-y += img-streamer.o
CFLAGS += -iquote $(obj)/

16
images/img-streamer.proto Normal file
View File

@ -0,0 +1,16 @@
syntax = "proto2";
// This message is sent from CRIU to the streamer.
// * During dump, it communicates the name of the file that is about to be sent
// to the streamer.
// * During restore, CRIU requests image files from the streamer. The message is
// used to communicate the name of the desired file.
message img_streamer_request_entry {
// Name of the image file about to be sent (dump) or being requested (restore).
required string filename = 1;
}
// This message is sent from the streamer to CRIU. It is only used during
// restore to report whether the requested file exists.
message img_streamer_reply_entry {
// True when the file named in the preceding request exists.
required bool exists = 1;
}