From 7d79a58f4daa9ddda015fc0aaca2a60d156d12cc Mon Sep 17 00:00:00 2001 From: Nicolas Viennot Date: Thu, 21 May 2020 17:40:17 +0000 Subject: [PATCH] img-streamer: introduction of criu-image-streamer This adds the ability to stream images with criu-image-streamer The workflow is the following: 1) criu-image-streamer is started, and starts listening on a UNIX socket. 2) CRIU is started. img_streamer_init() is invoked, which connects to the socket. During dump/restore operations, instead of using local disk to open an image file, img_streamer_open() is called to provide a UNIX pipe that is sent over the UNIX socket. 3) Once the operation is done, img_streamer_finish() is called, and the UNIX socket is disconnected. criu-image-streamer can be found at: https://github.com/checkpoint-restore/criu-image-streamer Signed-off-by: Nicolas Viennot --- Documentation/criu.txt | 5 + criu/Makefile.crtools | 1 + criu/config.c | 1 + criu/cr-dump.c | 2 + criu/cr-restore.c | 4 + criu/cr-service.c | 9 +- criu/crtools.c | 34 ++++- criu/files-reg.c | 10 +- criu/image.c | 32 +++-- criu/img-streamer.c | 232 +++++++++++++++++++++++++++++++++++ criu/include/cr_options.h | 1 + criu/include/image.h | 2 +- criu/include/img-streamer.h | 8 ++ criu/include/protobuf-desc.h | 4 +- criu/include/servicefd.h | 1 + criu/mem.c | 6 +- criu/page-xfer.c | 8 ++ criu/pagemap.c | 63 +++++++++- criu/protobuf-desc.c | 1 + criu/util.c | 14 ++- images/Makefile | 1 + images/img-streamer.proto | 16 +++ 22 files changed, 429 insertions(+), 26 deletions(-) create mode 100644 criu/img-streamer.c create mode 100644 criu/include/img-streamer.h create mode 100644 images/img-streamer.proto diff --git a/Documentation/criu.txt b/Documentation/criu.txt index ab63e461c..4e9b4132e 100644 --- a/Documentation/criu.txt +++ b/Documentation/criu.txt @@ -76,6 +76,11 @@ The following levels are available: *-D*, *--images-dir* 'path':: Use 'path' as a base directory where to look for sets of image files. 
+*--stream*:: + dump/restore images using criu-image-streamer. + See https://github.com/checkpoint-restore/criu-image-streamer for detailed + usage. + *--prev-images-dir* 'path':: Use 'path' as a parent directory where to look for sets of image files. This option makes sense in case of incremental dumps. diff --git a/criu/Makefile.crtools b/criu/Makefile.crtools index 5c25b8928..dc92c2ea2 100644 --- a/criu/Makefile.crtools +++ b/criu/Makefile.crtools @@ -29,6 +29,7 @@ obj-y += files-reg.o obj-y += fsnotify.o obj-y += image-desc.o obj-y += image.o +obj-y += img-streamer.o obj-y += ipc_ns.o obj-y += irmap.o obj-y += kcmp-ids.o diff --git a/criu/config.c b/criu/config.c index b84b7da28..e78b534a9 100644 --- a/criu/config.c +++ b/criu/config.c @@ -510,6 +510,7 @@ int parse_options(int argc, char **argv, bool *usage_error, BOOL_OPT(SK_CLOSE_PARAM, &opts.tcp_close), { "verbosity", optional_argument, 0, 'v' }, { "ps-socket", required_argument, 0, 1091}, + BOOL_OPT("stream", &opts.stream), { "config", required_argument, 0, 1089}, { "no-default-config", no_argument, 0, 1090}, { "tls-cacert", required_argument, 0, 1092}, diff --git a/criu/cr-dump.c b/criu/cr-dump.c index 745998afc..2b4c9ae82 100644 --- a/criu/cr-dump.c +++ b/criu/cr-dump.c @@ -82,6 +82,7 @@ #include "eventpoll.h" #include "memfd.h" #include "timens.h" +#include "img-streamer.h" /* * Architectures can overwrite this function to restore register sets that @@ -1759,6 +1760,7 @@ static int cr_dump_finish(int ret) free_userns_maps(); close_service_fd(CR_PROC_FD_OFF); + close_image_dir(); if (ret) { pr_err("Dumping FAILED.\n"); diff --git a/criu/cr-restore.c b/criu/cr-restore.c index ed4b95b91..f572f79a0 100644 --- a/criu/cr-restore.c +++ b/criu/cr-restore.c @@ -29,6 +29,7 @@ #include "cr_options.h" #include "servicefd.h" #include "image.h" +#include "img-streamer.h" #include "util.h" #include "util-pie.h" #include "criu-log.h" @@ -2355,6 +2356,9 @@ skip_ns_bouncing: pr_info("Restore finished successfully. 
Tasks resumed.\n"); write_stats(RESTORE_STATS); + /* This has the effect of dismissing the image streamer */ + close_image_dir(); + ret = run_scripts(ACT_POST_RESUME); if (ret != 0) pr_err("Post-resume script ret code %d\n", ret); diff --git a/criu/cr-service.c b/criu/cr-service.c index 279016bcd..53eadb1bc 100644 --- a/criu/cr-service.c +++ b/criu/cr-service.c @@ -343,7 +343,14 @@ static int setup_opts_from_req(int sk, CriuOpts *req) if (req->parent_img) SET_CHAR_OPTS(img_parent, req->parent_img); - if (open_image_dir(images_dir_path) < 0) { + /* + * Image streaming is not supported with CRIU's service feature as + * the streamer must be started for each dump/restore operation. + * It is unclear how to do that with RPC, so we punt for now. + * This explains why we provide the argument mode=-1 instead of + * O_RSTR or O_DUMP. + */ + if (open_image_dir(images_dir_path, -1) < 0) { pr_perror("Can't open images directory"); goto err; } diff --git a/criu/crtools.c b/criu/crtools.c index 7f72dde27..ad61fa9bb 100644 --- a/criu/crtools.c +++ b/criu/crtools.c @@ -54,6 +54,20 @@ void flush_early_log_to_stderr(void) flush_early_log_buffer(STDERR_FILENO); } +/* Returns O_DUMP or O_RSTR for the command at argv[optind], -1 for any other command */ +static int image_dir_mode(char *argv[], int optind) +{ + const char *cmd = argv[optind]; + /* A bare "cpuinfo" has no action: argv[optind + 1] is then NULL and must not reach strcmp() */ + const char *act = (!strcmp(cmd, "cpuinfo") && argv[optind + 1]) ? argv[optind + 1] : ""; + + if (!strcmp(cmd, "dump") || !strcmp(cmd, "pre-dump") || !strcmp(act, "dump")) + return O_DUMP; + if (!strcmp(cmd, "restore") || !strcmp(act, "restore")) + return O_RSTR; + return -1; +} + int main(int argc, char *argv[], char *envp[]) { int ret = -1; @@ -148,13 +162,30 @@ int main(int argc, char *argv[], char *envp[]) } } + if (opts.stream && image_dir_mode(argv, optind) == -1) { + pr_err("--stream cannot be used with the %s command\n", argv[optind]); + goto usage; + } + /* We must not open imgs dir, if service is called */ if (strcmp(argv[optind], "service")) { - ret = open_image_dir(opts.imgs_dir); + ret =
open_image_dir(opts.imgs_dir, image_dir_mode(argv, optind)); if (ret < 0) return 1; } + /* + * The kernel might send us lethal signals when writing to a pipe + * which reader has disappeared. We deal with write() failures on our + * own, and prefer not to get killed. So we ignore SIGPIPEs. + * + * Pipes are used in various places: + * 1) Receiving application page data + * 2) Transmitting data to the image streamer + * 3) Emitting logs (potentially to a pipe). + */ + signal(SIGPIPE, SIG_IGN); + /* * When a process group becomes an orphan, * its processes are sent a SIGHUP signal @@ -322,6 +353,7 @@ usage: " this requires running a second instance of criu\n" " in lazy-pages mode: 'criu lazy-pages -D DIR'\n" " --lazy-pages and lazy-pages mode require userfaultfd\n" +" --stream dump/restore images using criu-image-streamer\n" "\n" "* External resources support:\n" " --external RES dump objects from this list as external resources:\n" diff --git a/criu/files-reg.c b/criu/files-reg.c index b53e9b080..7e84addf2 100644 --- a/criu/files-reg.c +++ b/criu/files-reg.c @@ -154,7 +154,6 @@ static int trim_last_parent(char *path) static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) { - char *buf = NULL; int ret; while (len > 0) { @@ -167,7 +166,6 @@ static int copy_chunk_from_file(int fd, int img, off_t off, size_t len) len -= ret; } - xfree(buf); return 0; } @@ -213,7 +211,6 @@ static int copy_file_to_chunks(int fd, struct cr_img *img, size_t file_size) static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) { - char *buf = NULL; int ret; while (len > 0) { @@ -221,7 +218,11 @@ static int copy_chunk_to_file(int img, int fd, off_t off, size_t len) pr_perror("Can't seek file"); return -1; } - ret = sendfile(fd, img, NULL, len); + + if (opts.stream) + ret = splice(img, NULL, fd, NULL, len, SPLICE_F_MOVE); + else + ret = sendfile(fd, img, NULL, len); if (ret < 0) { pr_perror("Can't send data"); return -1; @@ -231,7 +232,6 @@ static int 
copy_chunk_to_file(int img, int fd, off_t off, size_t len) len -= ret; } - xfree(buf); return 0; } diff --git a/criu/image.c b/criu/image.c index 0225788b0..2bbb4dd02 100644 --- a/criu/image.c +++ b/criu/image.c @@ -17,6 +17,7 @@ #include "images/inventory.pb-c.h" #include "images/pagemap.pb-c.h" #include "proc_parse.h" +#include "img-streamer.h" #include "namespaces.h" bool ns_per_id = false; @@ -415,13 +416,16 @@ static int do_open_image(struct cr_img *img, int dfd, int type, unsigned long of flags = oflags & ~(O_NOBUF | O_SERVICE | O_FORCE_LOCAL); - /* - * For pages images dedup we need to open images read-write on - * restore, that may require proper capabilities, so we ask - * usernsd to do it for us - */ - if (root_ns_mask & CLONE_NEWUSER && - type == CR_FD_PAGES && oflags & O_RDWR) { + if (opts.stream && !(oflags & O_FORCE_LOCAL)) { + ret = img_streamer_open(path, flags); + errno = EIO; /* errno value is meaningless, only the ret value is meaningful */ + } else if (root_ns_mask & CLONE_NEWUSER && + type == CR_FD_PAGES && oflags & O_RDWR) { + /* + * For pages images dedup we need to open images read-write on + * restore, that may require proper capabilities, so we ask + * usernsd to do it for us + */ struct openat_args pa = { .flags = flags, .err = 0, @@ -520,7 +524,12 @@ struct cr_img *img_from_fd(int fd) return img; } -int open_image_dir(char *dir) +/* + * `mode` should be O_RSTR or O_DUMP depending on the intent. + * This is used when opts.stream is enabled for picking the right streamer + * socket name. `mode` is ignored when opts.stream is not enabled. 
+ */ +int open_image_dir(char *dir, int mode) { int fd, ret; @@ -535,7 +544,10 @@ int open_image_dir(char *dir) return -1; fd = ret; - if (opts.img_parent) { + if (opts.stream) { + if (img_streamer_init(dir, mode) < 0) + goto err; + } else if (opts.img_parent) { ret = symlinkat(opts.img_parent, fd, CR_PARENT_LINK); if (ret < 0 && errno != EEXIST) { pr_perror("Can't link parent snapshot"); @@ -556,6 +568,8 @@ err: void close_image_dir(void) { + if (opts.stream) + img_streamer_finish(); close_service_fd(IMG_FD_OFF); } diff --git a/criu/img-streamer.c b/criu/img-streamer.c new file mode 100644 index 000000000..e31b17dd9 --- /dev/null +++ b/criu/img-streamer.c @@ -0,0 +1,232 @@ +#include <sys/socket.h> +#include <sys/un.h> +#include <stdio.h> +#include <unistd.h> + +#include "cr_options.h" +#include "img-streamer.h" +#include "image.h" +#include "images/img-streamer.pb-c.h" +#include "protobuf.h" +#include "servicefd.h" +#include "rst-malloc.h" +#include "common/scm.h" +#include "common/lock.h" + +/* + * We use different path names for the dump and restore sockets because: + * 1) The user may want to perform both at the same time (akin to live + * migration). Specifying the same images-dir is convenient. + * 2) It fails quickly when the user mixes up the streamer and CRIU operations. + * (e.g., streamer is in capture mode, while CRIU is in restore mode). + */ +#define IMG_STREAMER_CAPTURE_SOCKET_NAME "streamer-capture.sock" +#define IMG_STREAMER_SERVE_SOCKET_NAME "streamer-serve.sock" + +/* All requests go through the same socket connection. We must synchronize */ +static mutex_t *img_streamer_fd_lock; + +/* Either O_DUMP or O_RSTR */ +static int img_streamer_mode; + +static const char *socket_name_for_mode(int mode) +{ + switch (mode) { + case O_DUMP: return IMG_STREAMER_CAPTURE_SOCKET_NAME; + case O_RSTR: return IMG_STREAMER_SERVE_SOCKET_NAME; + default: BUG(); return NULL; + } +} + +/* + * img_streamer_init() connects to the image streamer socket. + * mode should be either O_DUMP or O_RSTR.
+ */ +int img_streamer_init(const char *image_dir, int mode) +{ + struct sockaddr_un addr; + int sockfd; + + img_streamer_mode = mode; + + sockfd = socket(AF_UNIX, SOCK_STREAM, 0); + if (sockfd < 0) { + pr_perror("Unable to instantiate UNIX socket"); + return -1; + } + + memset(&addr, 0, sizeof(addr)); + addr.sun_family = AF_UNIX; + snprintf(addr.sun_path, sizeof(addr.sun_path), "%s/%s", + image_dir, socket_name_for_mode(mode)); + + if (connect(sockfd, (struct sockaddr *)&addr, sizeof(addr)) < 0) { + pr_perror("Unable to connect to image streamer socket: %s", addr.sun_path); + goto err; + } + + img_streamer_fd_lock = shmalloc(sizeof(*img_streamer_fd_lock)); + if (!img_streamer_fd_lock) { + pr_err("Failed to allocate memory\n"); + goto err; + } + mutex_init(img_streamer_fd_lock); + + if (install_service_fd(IMG_STREAMER_FD_OFF, sockfd) < 0) + goto err; + + return 0; + +err: + close(sockfd); + return -1; +} + +/* + * img_streamer_finish() indicates that no more files will be opened. + * In other words, img_streamer_open() will no longer be called. + */ +void img_streamer_finish(void) +{ + if (get_service_fd(IMG_STREAMER_FD_OFF) >= 0) { + pr_info("Dismissing the image streamer\n"); + close_service_fd(IMG_STREAMER_FD_OFF); + } +} + +/* + * The regular protobuf APIs pb_write_one() and pb_read_one() operate over a + * `struct cr_img` object. Sadly, we don't have such object. We just have a + * file descriptor. The following pb_write_one_fd() and pb_read_one_fd() + * provide a protobuf API over a file descriptor. The implementation is a bit + * of a hack, but should be fine. At some point we can revisit to have a + * proper protobuf API over fds. 
+ */ +static int pb_write_one_fd(int fd, void *obj, int type) +{ + int ret; + struct cr_img img; + memset(&img, 0, sizeof(img)); + + img._x.fd = fd; + ret = pb_write_one(&img, obj, type); + if (ret < 0) + pr_perror("Failed to communicate with the image streamer"); + return ret; +} + +static int pb_read_one_fd(int fd, void **pobj, int type) +{ + int ret; + struct cr_img img; + memset(&img, 0, sizeof(img)); + + img._x.fd = fd; + ret = pb_read_one(&img, pobj, type); + if (ret < 0) + pr_perror("Failed to communicate with the image streamer"); + return ret; +} + +static int send_file_request(char *filename) +{ + ImgStreamerRequestEntry req = IMG_STREAMER_REQUEST_ENTRY__INIT; + req.filename = filename; + return pb_write_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), + &req, PB_IMG_STREAMER_REQUEST); +} + +static int recv_file_reply(bool *exists) +{ + ImgStreamerReplyEntry *reply; + int ret = pb_read_one_fd(get_service_fd(IMG_STREAMER_FD_OFF), + (void **)&reply, PB_IMG_STREAMER_REPLY); + if (ret < 0) + return ret; + + *exists = reply->exists; + free(reply); + + return 0; +} + +/* + * Using a pipe for image file transfers allows the data to be spliced by the + * image streamer, greatly improving performance. + * Transfer rates of up to 15GB/s can be seen with this technique. + */ +#define READ_PIPE 0 /* index of the read pipe returned by pipe() */ +#define WRITE_PIPE 1 +static int establish_streamer_file_pipe(void) +{ + /* + * If the other end of the pipe closes, the kernel will want to kill + * us with a SIGPIPE. This signal must be ignored, which we do in + * crtools.c:main() with signal(SIGPIPE, SIG_IGN). + */ + int ret = -1; + int criu_pipe_direction = img_streamer_mode == O_DUMP ?
WRITE_PIPE : READ_PIPE; + int streamer_pipe_direction = 1 - criu_pipe_direction; + int fds[2]; + + if (pipe(fds) < 0) { + pr_perror("Unable to create pipe"); + return -1; + } + + if (send_fd(get_service_fd(IMG_STREAMER_FD_OFF), + NULL, 0, fds[streamer_pipe_direction]) < 0) + close(fds[criu_pipe_direction]); + else + ret = fds[criu_pipe_direction]; + + close(fds[streamer_pipe_direction]); + + return ret; +} + +static int _img_streamer_open(char *filename) +{ + if (send_file_request(filename) < 0) + return -1; + + if (img_streamer_mode == O_RSTR) { + /* The streamer replies whether the file exists */ + bool exists; + if (recv_file_reply(&exists) < 0) + return -1; + + if (!exists) + return -ENOENT; + } + + /* + * When the image streamer encounters a fatal error, it won't report + * errors via protobufs. Instead, CRIU will get a broken pipe error + * when trying to access a streaming pipe. This behavior is similar to + * what would happen if we were connecting criu and criu-image-streamer + * via a shell pipe. + */ + + return establish_streamer_file_pipe(); +} + +/* + * Opens an image file via a UNIX pipe with the image streamer. + * + * Return: + * A file descriptor on success + * -ENOENT when the file was not found. + * -1 on any other error.
+ */ +int img_streamer_open(char *filename, int flags) +{ + int ret; + + BUG_ON(flags != img_streamer_mode); + + mutex_lock(img_streamer_fd_lock); + ret = _img_streamer_open(filename); + mutex_unlock(img_streamer_fd_lock); + return ret; +} diff --git a/criu/include/cr_options.h b/criu/include/cr_options.h index ba405182e..d5655212d 100644 --- a/criu/include/cr_options.h +++ b/criu/include/cr_options.h @@ -143,6 +143,7 @@ struct cr_options { int weak_sysctls; int status_fd; bool orphan_pts_master; + int stream; pid_t tree_id; int log_level; char *imgs_dir; diff --git a/criu/include/image.h b/criu/include/image.h index 1c7cc5471..62c8d7ba0 100644 --- a/criu/include/image.h +++ b/criu/include/image.h @@ -145,7 +145,7 @@ static inline int img_raw_fd(struct cr_img *img) extern off_t img_raw_size(struct cr_img *img); -extern int open_image_dir(char *dir); +extern int open_image_dir(char *dir, int mode); extern void close_image_dir(void); extern struct cr_img *open_image_at(int dfd, int type, unsigned long flags, ...); diff --git a/criu/include/img-streamer.h b/criu/include/img-streamer.h new file mode 100644 index 000000000..0c380c915 --- /dev/null +++ b/criu/include/img-streamer.h @@ -0,0 +1,8 @@ +#ifndef IMAGE_STREAMER_H +#define IMAGE_STREAMER_H + +extern int img_streamer_init(const char *image_dir, int mode); +extern void img_streamer_finish(void); +extern int img_streamer_open(char *filename, int flags); + +#endif /* IMAGE_STREAMER_H */ diff --git a/criu/include/protobuf-desc.h b/criu/include/protobuf-desc.h index ee4135d65..43d961731 100644 --- a/criu/include/protobuf-desc.h +++ b/criu/include/protobuf-desc.h @@ -62,8 +62,10 @@ enum { PB_GHOST_CHUNK, PB_FILE, PB_MEMFD_FILE, - PB_MEMFD_INODE, /* 60 */ + PB_MEMFD_INODE, PB_TIMENS, + PB_IMG_STREAMER_REQUEST, + PB_IMG_STREAMER_REPLY, /* PB_AUTOGEN_STOP */ diff --git a/criu/include/servicefd.h b/criu/include/servicefd.h index 986c46af5..c11f89d37 100644 --- a/criu/include/servicefd.h +++ b/criu/include/servicefd.h @@ 
-14,6 +14,7 @@ enum sfd_type { LOG_FD_OFF, IMG_FD_OFF, + IMG_STREAMER_FD_OFF, PROC_FD_OFF, /* fd with /proc for all proc_ calls */ PROC_PID_FD_OFF, CR_PROC_FD_OFF, /* some other's proc fd: diff --git a/criu/mem.c b/criu/mem.c index 15aa0cbdb..167838b98 100644 --- a/criu/mem.c +++ b/criu/mem.c @@ -1406,9 +1406,9 @@ static int prepare_vma_ios(struct pstree_item *t, struct task_restore_args *ta) /* * We optimize the case when rsti(t)->vma_io is empty. * - * This is useful for for remote images, where all VMAs are premapped - * (pr->pieok is false). This avoids re-opening the CR_FD_PAGES file, - * which could be no longer be available. + * This is useful when using the image streamer, where all VMAs are + * premapped (pr->pieok is false). This avoids re-opening the + * CR_FD_PAGES file, which may be readable only once. */ if (list_empty(&rsti(t)->vma_io)) { ta->vma_ios = NULL; diff --git a/criu/page-xfer.c b/criu/page-xfer.c index 9affc2706..db8e5bec2 100644 --- a/criu/page-xfer.c +++ b/criu/page-xfer.c @@ -382,6 +382,10 @@ static int open_page_local_xfer(struct page_xfer *xfer, int fd_type, unsigned lo int pfd; int pr_flags = (fd_type == CR_FD_PAGEMAP) ?
PR_TASK : PR_SHMEM; + /* Image streaming lacks support for incremental images */ + if (opts.stream) + goto out; + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) goto out; @@ -928,6 +932,10 @@ int check_parent_local_xfer(int fd_type, unsigned long img_id) struct stat st; int ret, pfd; + /* Image streaming lacks support for incremental images */ + if (opts.stream) + return 0; + pfd = openat(get_service_fd(IMG_FD_OFF), CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) return 0; diff --git a/criu/pagemap.c b/criu/pagemap.c index 05f6b82b8..f1e1be91f 100644 --- a/criu/pagemap.c +++ b/criu/pagemap.c @@ -406,6 +406,49 @@ static int maybe_read_page_local(struct page_read *pr, unsigned long vaddr, return ret; } +/* + * We cannot use maybe_read_page_local() for streaming images as it uses + * pread(), seeking in the file. Instead, we use this custom page reader. + */ +static int maybe_read_page_img_streamer(struct page_read *pr, unsigned long vaddr, + int nr, void *buf, unsigned flags) +{ + unsigned long len = nr * PAGE_SIZE; + int fd = img_raw_fd(pr->pi); + int ret; + size_t curr = 0; + + pr_debug("\tpr%lu-%u Read page from self %lx/%"PRIx64"\n", + pr->img_id, pr->id, pr->cvaddr, pr->pi_off); + + /* We can't seek.
The requested address better match */ + BUG_ON(pr->cvaddr != vaddr); + + while (1) { + ret = read(fd, buf + curr, len - curr); + if (ret == 0) { + pr_err("Reached EOF unexpectedly while reading page from image\n"); + return -1; + } else if (ret < 0) { + pr_perror("Can't read mapping page %d", ret); + return -1; + } + curr += ret; + if (curr == len) + break; + } + + if (opts.auto_dedup) + pr_warn_once("Can't dedup when streaming images\n"); + + /* The loop only exits with ret > 0 (size of the last read()), so set the result explicitly */ + ret = pr->io_complete ? pr->io_complete(pr, vaddr, nr) : 0; + + pr->pi_off += len; + + return ret; +} + static int read_page_complete(unsigned long img_id, unsigned long vaddr, int nr_pages, void *priv) { int ret = 0; @@ -601,6 +644,10 @@ static int try_open_parent(int dfd, unsigned long id, struct page_read *pr, int int pfd, ret; struct page_read *parent = NULL; + /* Image streaming lacks support for incremental images */ + if (opts.stream) + goto out; + pfd = openat(dfd, CR_PARENT_LINK, O_RDONLY); if (pfd < 0 && errno == ENOENT) goto out; @@ -657,7 +704,19 @@ static int init_pagemaps(struct page_read *pr) off_t fsize; int nr_pmes, nr_realloc; - fsize = img_raw_size(pr->pmi); + if (opts.stream) { + /* + * TODO - There is no easy way to estimate the size of the + * pagemap that is still to be read from the pipe. Possible + * solution is to ask the image streamer for the size of the + * image. 1024 is a wild guess (more space is allocated if + * needed).
+ */ + fsize = 1024; + } else { + fsize = img_raw_size(pr->pmi); + } + if (fsize < 0) return -1; @@ -781,6 +840,8 @@ int open_page_read_at(int dfd, unsigned long img_id, struct page_read *pr, int p if (remote) pr->maybe_read_page = maybe_read_page_remote; + else if (opts.stream) + pr->maybe_read_page = maybe_read_page_img_streamer; else { pr->maybe_read_page = maybe_read_page_local; if (!pr->parent && !opts.lazy_pages) diff --git a/criu/protobuf-desc.c b/criu/protobuf-desc.c index 2ee81e5db..13655264a 100644 --- a/criu/protobuf-desc.c +++ b/criu/protobuf-desc.c @@ -63,6 +63,7 @@ #include "images/seccomp.pb-c.h" #include "images/binfmt-misc.pb-c.h" #include "images/autofs.pb-c.h" +#include "images/img-streamer.pb-c.h" struct cr_pb_message_desc cr_pb_descs[PB_MAX]; diff --git a/criu/util.c b/criu/util.c index 06c594ca9..b30dbc86c 100644 --- a/criu/util.c +++ b/criu/util.c @@ -423,13 +423,19 @@ int copy_file(int fd_in, int fd_out, size_t bytes) { ssize_t written = 0; size_t chunk = bytes ? bytes : 4096; + ssize_t ret; while (1) { - ssize_t ret; - - ret = sendfile(fd_out, fd_in, NULL, chunk); + /* + * When fd_out is a pipe, sendfile() returns -EINVAL, so we + * fallback to splice(). Not sure why. 
+ */ + if (opts.stream) + ret = splice(fd_in, NULL, fd_out, NULL, chunk, SPLICE_F_MOVE); + else + ret = sendfile(fd_out, fd_in, NULL, chunk); if (ret < 0) { - pr_perror("Can't send data to ghost file"); + pr_perror("Can't transfer data to ghost file from image"); return -1; } diff --git a/images/Makefile b/images/Makefile index 5ddd37664..bc67278e6 100644 --- a/images/Makefile +++ b/images/Makefile @@ -65,6 +65,7 @@ proto-obj-y += macvlan.o proto-obj-y += sit.o proto-obj-y += memfd.o proto-obj-y += timens.o +proto-obj-y += img-streamer.o CFLAGS += -iquote $(obj)/ diff --git a/images/img-streamer.proto b/images/img-streamer.proto new file mode 100644 index 000000000..d1bd4cc19 --- /dev/null +++ b/images/img-streamer.proto @@ -0,0 +1,16 @@ +syntax = "proto2"; + +// This message is sent from CRIU to the streamer. +// * During dump, it communicates the name of the file that is about to be sent +// to the streamer. +// * During restore, CRIU requests image files from the streamer. The message is +// used to communicate the name of the desired file. +message img_streamer_request_entry { + required string filename = 1; +} + +// This message is sent from the streamer to CRIU. It is only used during +// restore to report whether the requested file exists. +message img_streamer_reply_entry { + required bool exists = 1; +}